# NOTE: the original export carried non-Python status lines here
# ("Spaces:" / "Runtime error") -- artifacts of the paste, not source code.
#Title: TREC_COVID_Round1_OHSU.py
#Author: Jimmy Chen, School of Medicine, OHSU
#Description: Generate 1000 documents per topic in Round 1 TREC_COVID and get trec_eval metrics
#             To replicate OHSU_RUN2
#Results: https://ir.nist.gov/covidSubmit/archive/round1/OHSU_RUN2.pdf
#
# In root pyserini directory:
#
# 1. wget https://www.dropbox.com/s/gtq2c3xq81mjowk/lucene-index-covid-full-text-2020-04-10.tar.gz
# 2. tar xvfz lucene-index-covid-full-text-2020-04-10.tar.gz
# 3. python bin/generate_trec_covid_round1_OHSU_RUN2.py
# 4. trec_eval -c -q -M1000 -m all_trec qrels-rnd1.txt Round1_data/full_R1.txt
import sys
sys.path.insert(0, "./")

# Standard library
import json
import os
import urllib.request
import xml.etree.ElementTree as ET

# Third-party
import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm
#import torch

import nltk
from nltk.corpus import stopwords
from trectools import misc, TrecRun, TrecQrel, procedures

# Project (pyserini)
from pyserini.search import pysearch
from pyserini.analysis.pyanalysis import get_lucene_analyzer, Analyzer
#Round 1 indexes
# Path to the local Round 1 full-text Lucene index; replace with the
# directory holding your own index if it lives elsewhere.
R1_fulltext = 'lucene-index-covid-full-text-2020-04-10'
#Download round 1 topics and parse into dataframe
# Iterate per <topic> element so the four parallel lists cannot fall out of
# alignment when a topic is missing a sub-field (findtext returns None there,
# whereas the original tree.iter() walk would simply skip the append).
response = requests.get('https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml', timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
tree = ET.fromstring(response.text)
topicid = []
query = []
question = []
narrative = []
for topic in tree.findall('topic'):
    topicid.append(topic.attrib['number'])
    query.append(topic.findtext('query'))
    question.append(topic.findtext('question'))
    narrative.append(topic.findtext('narrative'))
#Join to CSV
# One row per topic, columns in fixed order for downstream indexing.
R1_topics = pd.DataFrame({'Topic': topicid, 'Query': query,
                          'Question': question, 'Narrative': narrative})
R1_topics = R1_topics[['Topic', 'Query', 'Question', 'Narrative']]
# Output directory for the generated run files.
curr_dir = os.getcwd()
Pyserini_files = os.path.join(curr_dir, 'Round1_data')
# makedirs with exist_ok avoids the race between the exists() check and
# mkdir() in the original (and replaces the `== False` anti-idiom).
os.makedirs(Pyserini_files, exist_ok=True)
#Topics
# Lucene searcher over the Round 1 full-text index.
# NOTE(review): in the April-2020 pyserini releases this class was exposed as
# pysearch.SimpleSearcher; confirm `LuceneSearcher` exists in the pinned version.
full_searcher = pysearch.LuceneSearcher(R1_fulltext)
#Configure searcher parameters
full_searcher.set_bm25_similarity(k1=1.5, b=0.4)
# NOTE(review): setting the Dirichlet similarity after BM25 presumably replaces
# it (last setter wins) -- confirm which scoring model this run intends.
full_searcher.set_lm_dirichlet_similarity(mu = 2000)
# RM3 pseudo-relevance-feedback query expansion on top of the base ranker.
full_searcher.set_rm3_reranker(fb_terms=10, fb_docs=10, original_query_weight=0.5)
#Stopwords for tokenization - manual review
# Fixed ' will' -> 'will': analyzer tokens never carry whitespace, so the
# original entry with a leading space could never match anything.
# NOTE(review): the multi-word entry 'severe acute respiratory syndrome' also
# cannot match a single token; kept as-is from the original run for fidelity.
stopwords_manual = ['seek', 'seeking', 'look', 'looking', 'studies', 'study', 'information',
                    'about', 'range', 'studies', 'its', 'coronaviru',
                    'other', '2', '19', 'well', 'will', 'from', 'have', 'more', 'covid', 'any', 'what',
                    'should', 'may', 'due', 'help', 'non', 's', 'those', 'people', 'ways', 'all', 'gain',
                    'possible', 'toward', 'specifically', 'learned', 'number', 'proportion', 'including',
                    'etc', 'still', 'while', 'human', 'specific', 'result', 'results', 'assess', 'need',
                    'between', 'take', 'taking', 'patient', 'type', 'cause', 'frequency', 'less', 'face',
                    'likely', 'infect', 'upon', 'develop', 'represent', 'promising', 'step', 'related',
                    'papers', 'describe', 'also', 'relevant', 'who', 'show', 'science', 'basic', 'complete',
                    'do', 'how', 'been', 'against', 'use', 'to', 'had', 'has', 'approach', 'Studies', 'Stud', 'Inst', 'Divi', 'Thomae',
                    'Brigham', 'Young', 'Univ', 'studies', 'volition', 'severe acute respiratory syndrome', 'affect', 'affected']
#NLTK stopwords
nltk.download('stopwords')
# The corpus fileid is lowercase and case-sensitive on Linux: 'English' raises
# LookupError, so use 'english'. Bind to a fresh name rather than shadowing
# the imported nltk `stopwords` module.
nltk_stopwords = list(set(stopwords.words('english')))
stopwords_manual = list(np.append(stopwords_manual, nltk_stopwords))
def _tokenize_column(texts):
    """Krovetz-stem each string in *texts*, drop stopwords, and dedupe.

    Returns a list of token lists, one per input string. Deduplication
    preserves first-seen token order (dict.fromkeys); the original
    list(set(...)) made token order depend on PYTHONHASHSEED, so the query
    strings built from these tokens were not reproducible across runs.
    """
    # Analyzer is loop-invariant: build it once instead of once per topic.
    analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
    token_lists = []
    for text in texts:
        tokens = analyzer.analyze(text)
        #Remove stopwords and duplicates from token
        kept = [w for w in tokens if w not in stopwords_manual]
        token_lists.append(list(dict.fromkeys(kept)))
    return token_lists

#Extract important narrative text
token_narrative_list = _tokenize_column(R1_topics['Narrative'])
#Extract important question text - NOT USED YET
token_question_list = _tokenize_column(R1_topics['Question'])
#Anserini searcher can take both query and keywords
#keywords_list = '2019-nCoV, SARS-CoV-2, COVID-19'
keywords_list = 'COVID-19'
#Extract search results from the searcher
docid_list = []
rank_list = []
score_list = []
topic_id_list = []
title_list = []
doi_list = []
print('Searching topics for documents')
#Search extra - will drop duplicates and excess to 1000
n_papers = 1100
for ii in R1_topics.index:
    query = R1_topics['Query'][ii]
    question = R1_topics['Question'][ii]
    topic_num = R1_topics['Topic'][ii]
    token_topic = ', '.join(token_narrative_list[ii])
    token_question = ','.join(token_question_list[ii])
    # Final query string: topic query + question tokens + narrative tokens
    # + the COVID keyword hint.
    input_query = query + '. ' + token_question + '. ' + token_topic + ' . ' + keywords_list
    hits = full_searcher.search(q=input_query, k=n_papers)
    print(topic_num)
    # Cap the loop at len(hits): the index can return fewer than n_papers
    # documents for a topic, in which case the original hits[i] indexing
    # over range(n_papers) raised IndexError.
    for i in tqdm(range(min(n_papers, len(hits))), position=0, leave=True):
        topic_id_list.append(topic_num)
        docid_list.append(hits[i].docid)
        rank_list.append(str(i + 1))
        score_list.append(hits[i].score)
        title_list.append(hits[i].lucene_document.get("title"))
        doi_list.append('https://doi.org/' + str(hits[i].lucene_document.get("doi")))
#Make dataframe from lists generated from search
def TREC_df(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, run_param):
    """Build a TREC-format run DataFrame from parallel per-hit lists.

    Parameters
    ----------
    topic_id_list, docid_list, rank_list, score_list, title_list, doi_list :
        Parallel lists produced by the search loop (one entry per hit).
    run_param : str
        Run tag written into the final column of each submission line.

    Returns
    -------
    pandas.DataFrame with columns ['topic', 'q0', 'docid', 'rank', 'score',
    'qid']: at most 1000 unique docids per topic, re-ranked by descending
    score.
    """
    #Run-tag for TREC run requirements
    n = len(topic_id_list)
    df = pd.DataFrame({'topic': topic_id_list, 'q0': ['q0'] * n,
                       'docid': docid_list, 'rank': rank_list,
                       'score': score_list, 'title': title_list,
                       'doi': doi_list, 'qid': [run_param] * n})
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'title', 'doi', 'qid']]
    #Remove duplicates - keep the first (best-scored) occurrence per docid
    df = df.drop_duplicates(subset=['topic', 'docid'], keep='first')
    # Re-rank by score within each topic. method='first' breaks ties by
    # retrieval order and guarantees unique integer ranks; the default
    # 'average' method produced tied fractional ranks that astype(int)
    # silently truncated into duplicate rank values -- invalid for a TREC run.
    df['rank'] = df.groupby('topic')['score'].rank(ascending=False, method='first').astype(int)
    df = df[df['rank'] <= 1000]
    #Reset index
    df.reset_index(drop=True, inplace=True)
    #Get columns for submission
    return df[['topic', 'q0', 'docid', 'rank', 'score', 'qid']]
# Format the full-text search results as a TREC run and write the
# space-separated run file (no header/index, as trec_eval expects).
run_tag = 'FullTxt_run'
full_df = TREC_df(topic_id_list, docid_list, rank_list, score_list,
                  title_list, doi_list, run_tag)
output_path = os.path.join(Pyserini_files, 'full_R1.txt')
full_df.to_csv(output_path, sep=' ', index=False, header=None)
#Use Trec Eval to evaluate initial runs
#Run TREC_Eval
print('Running trec_eval on search results')
# Download the Round 1 relevance judgments.
r = requests.get('https://ir.nist.gov/covidSubmit/data/qrels-rnd1.txt', timeout=30)
r.raise_for_status()  # don't silently write an HTTP error page as the qrels
qrels_file = os.path.join(os.getcwd(), 'qrels.txt')
with open(qrels_file, 'wb') as f:
    f.write(r.content)
qrels = TrecQrel(qrels_file)
#Generate metrics for all runs in Round1_data (1000 docs retrieved for each)
runs = procedures.list_of_runs_from_path(Pyserini_files, "*.txt")
results = procedures.evaluate_runs(runs, qrels, per_query=True)
# Pull the headline metrics out of the trectools evaluation results.
p5 = procedures.extract_metric_from_results(results, "P_5")
p10 = procedures.extract_metric_from_results(results, "P_10")
Bpref = procedures.extract_metric_from_results(results, "bpref")
Mean_avgP = procedures.extract_metric_from_results(results, 'map')
#Aggregate results to dataframe
runs_names = [os.path.basename(str(x)).split('.')[0] for x in runs]
# Each extract_metric_from_results entry is a (run, value) pair; keep the
# value. (The original also declared an unused ndcg_list -- removed.)
p5_list = [m[1] for m in p5]
p10_list = [m[1] for m in p10]
map_list = [m[1] for m in Mean_avgP]
bpref_list = [m[1] for m in Bpref]
Result_df = pd.DataFrame({'Run': runs_names, 'P@5': p5_list, 'P@10': p10_list,
                          'MAP': map_list, 'Bpref': bpref_list})
# Print the full table without pandas' row/column truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(Result_df)