# NOTE: the original export carried non-Python status lines here
# ("Spaces:" / "Runtime error") -- artifacts of the paste, not source code.
#Title: TREC_COVID_Round1_OHSU.py
#Author: Jimmy Chen, School of Medicine, OHSU
#Description: Generate 1000 documents per topic in Round 1 TREC_COVID and get trec_eval metrics
#             To replicate OHSU_RUN2
#Results: https://ir.nist.gov/covidSubmit/archive/round1/OHSU_RUN2.pdf
#
# In root pyserini directory:
#
# 1. wget https://www.dropbox.com/s/gtq2c3xq81mjowk/lucene-index-covid-full-text-2020-04-10.tar.gz
# 2. tar xvfz lucene-index-covid-full-text-2020-04-10.tar.gz
# 3. python bin/generate_trec_covid_round1_OHSU_RUN2.py
# 4. trec_eval -c -q -M1000 -m all_trec qrels-rnd1.txt Round1_data/full_R1.txt
import sys
sys.path.insert(0, "./")

# Standard library
import json
import os
import urllib.request
import xml.etree.ElementTree as ET

# Third-party
import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm
#import torch

import nltk
from nltk.corpus import stopwords
from trectools import misc, TrecRun, TrecQrel, procedures

# Project (pyserini)
from pyserini.search import pysearch
from pyserini.analysis.pyanalysis import get_lucene_analyzer, Analyzer
#Round 1 indexes
# Path to the local Round 1 full-text Lucene index; replace with the
# directory holding your own index if it lives elsewhere.
R1_fulltext = 'lucene-index-covid-full-text-2020-04-10'
#Download round 1 topics and parse into dataframe
# Iterate per <topic> element so the four parallel lists cannot fall out of
# alignment when a topic is missing a sub-field (findtext returns None there,
# whereas the original tree.iter() walk would simply skip the append).
response = requests.get('https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml', timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
tree = ET.fromstring(response.text)
topicid = []
query = []
question = []
narrative = []
for topic in tree.findall('topic'):
    topicid.append(topic.attrib['number'])
    query.append(topic.findtext('query'))
    question.append(topic.findtext('question'))
    narrative.append(topic.findtext('narrative'))
#Join to CSV
# One row per topic, columns in fixed order for downstream indexing.
R1_topics = pd.DataFrame({'Topic': topicid, 'Query': query,
                          'Question': question, 'Narrative': narrative})
R1_topics = R1_topics[['Topic', 'Query', 'Question', 'Narrative']]
# Output directory for the generated run files.
curr_dir = os.getcwd()
Pyserini_files = os.path.join(curr_dir, 'Round1_data')
# makedirs with exist_ok avoids the race between the exists() check and
# mkdir() in the original (and replaces the `== False` anti-idiom).
os.makedirs(Pyserini_files, exist_ok=True)
#Topics
# Lucene searcher over the Round 1 full-text index.
# NOTE(review): in the April-2020 pyserini releases this class was exposed as
# pysearch.SimpleSearcher; confirm `LuceneSearcher` exists in the pinned version.
full_searcher = pysearch.LuceneSearcher(R1_fulltext)
#Configure searcher parameters
full_searcher.set_bm25_similarity(k1=1.5, b=0.4)
# NOTE(review): setting the Dirichlet similarity after BM25 presumably replaces
# it (last setter wins) -- confirm which scoring model this run intends.
full_searcher.set_lm_dirichlet_similarity(mu = 2000)
# RM3 pseudo-relevance-feedback query expansion on top of the base ranker.
full_searcher.set_rm3_reranker(fb_terms=10, fb_docs=10, original_query_weight=0.5)
#Stopwords for tokenization - manual review
# Fixed ' will' -> 'will': analyzer tokens never carry whitespace, so the
# original entry with a leading space could never match anything.
# NOTE(review): the multi-word entry 'severe acute respiratory syndrome' also
# cannot match a single token; kept as-is from the original run for fidelity.
stopwords_manual = ['seek', 'seeking', 'look', 'looking', 'studies', 'study', 'information',
                    'about', 'range', 'studies', 'its', 'coronaviru',
                    'other', '2', '19', 'well', 'will', 'from', 'have', 'more', 'covid', 'any', 'what',
                    'should', 'may', 'due', 'help', 'non', 's', 'those', 'people', 'ways', 'all', 'gain',
                    'possible', 'toward', 'specifically', 'learned', 'number', 'proportion', 'including',
                    'etc', 'still', 'while', 'human', 'specific', 'result', 'results', 'assess', 'need',
                    'between', 'take', 'taking', 'patient', 'type', 'cause', 'frequency', 'less', 'face',
                    'likely', 'infect', 'upon', 'develop', 'represent', 'promising', 'step', 'related',
                    'papers', 'describe', 'also', 'relevant', 'who', 'show', 'science', 'basic', 'complete',
                    'do', 'how', 'been', 'against', 'use', 'to', 'had', 'has', 'approach', 'Studies', 'Stud', 'Inst', 'Divi', 'Thomae',
                    'Brigham', 'Young', 'Univ', 'studies', 'volition', 'severe acute respiratory syndrome', 'affect', 'affected']
#NLTK stopwords
nltk.download('stopwords')
# The corpus fileid is lowercase and case-sensitive on Linux: 'English' raises
# LookupError, so use 'english'. Bind to a fresh name rather than shadowing
# the imported nltk `stopwords` module.
nltk_stopwords = list(set(stopwords.words('english')))
stopwords_manual = list(np.append(stopwords_manual, nltk_stopwords))
def _tokenize_column(texts):
    """Krovetz-stem each string in *texts*, drop stopwords, and dedupe.

    Returns a list of token lists, one per input string. Deduplication
    preserves first-seen token order (dict.fromkeys); the original
    list(set(...)) made token order depend on PYTHONHASHSEED, so the query
    strings built from these tokens were not reproducible across runs.
    """
    # Analyzer is loop-invariant: build it once instead of once per topic.
    analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
    token_lists = []
    for text in texts:
        tokens = analyzer.analyze(text)
        #Remove stopwords and duplicates from token
        kept = [w for w in tokens if w not in stopwords_manual]
        token_lists.append(list(dict.fromkeys(kept)))
    return token_lists

#Extract important narrative text
token_narrative_list = _tokenize_column(R1_topics['Narrative'])
#Extract important question text - NOT USED YET
token_question_list = _tokenize_column(R1_topics['Question'])
#Anserini searcher can take both query and keywords
#keywords_list = '2019-nCoV, SARS-CoV-2, COVID-19'
keywords_list = 'COVID-19'
#Extract search results from the searcher
docid_list = []
rank_list = []
score_list = []
topic_id_list = []
title_list = []
doi_list = []
print('Searching topics for documents')
#Search extra - will drop duplicates and excess to 1000
n_papers = 1100
for ii in R1_topics.index:
    query = R1_topics['Query'][ii]
    question = R1_topics['Question'][ii]
    topic_num = R1_topics['Topic'][ii]
    token_topic = ', '.join(token_narrative_list[ii])
    token_question = ','.join(token_question_list[ii])
    # Final query string: topic query + question tokens + narrative tokens
    # + the COVID keyword hint.
    input_query = query + '. ' + token_question + '. ' + token_topic + ' . ' + keywords_list
    hits = full_searcher.search(q=input_query, k=n_papers)
    print(topic_num)
    # Cap the loop at len(hits): the index can return fewer than n_papers
    # documents for a topic, in which case the original hits[i] indexing
    # over range(n_papers) raised IndexError.
    for i in tqdm(range(min(n_papers, len(hits))), position=0, leave=True):
        topic_id_list.append(topic_num)
        docid_list.append(hits[i].docid)
        rank_list.append(str(i + 1))
        score_list.append(hits[i].score)
        title_list.append(hits[i].lucene_document.get("title"))
        doi_list.append('https://doi.org/' + str(hits[i].lucene_document.get("doi")))
#Make dataframe from lists generated from search
def TREC_df(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, run_param):
    """Build a TREC-format run DataFrame from parallel per-hit lists.

    Parameters
    ----------
    topic_id_list, docid_list, rank_list, score_list, title_list, doi_list :
        Parallel lists produced by the search loop (one entry per hit).
    run_param : str
        Run tag written into the final column of each submission line.

    Returns
    -------
    pandas.DataFrame with columns ['topic', 'q0', 'docid', 'rank', 'score',
    'qid']: at most 1000 unique docids per topic, re-ranked by descending
    score.
    """
    #Run-tag for TREC run requirements
    n = len(topic_id_list)
    df = pd.DataFrame({'topic': topic_id_list, 'q0': ['q0'] * n,
                       'docid': docid_list, 'rank': rank_list,
                       'score': score_list, 'title': title_list,
                       'doi': doi_list, 'qid': [run_param] * n})
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'title', 'doi', 'qid']]
    #Remove duplicates - keep the first (best-scored) occurrence per docid
    df = df.drop_duplicates(subset=['topic', 'docid'], keep='first')
    # Re-rank by score within each topic. method='first' breaks ties by
    # retrieval order and guarantees unique integer ranks; the default
    # 'average' method produced tied fractional ranks that astype(int)
    # silently truncated into duplicate rank values -- invalid for a TREC run.
    df['rank'] = df.groupby('topic')['score'].rank(ascending=False, method='first').astype(int)
    df = df[df['rank'] <= 1000]
    #Reset index
    df.reset_index(drop=True, inplace=True)
    #Get columns for submission
    return df[['topic', 'q0', 'docid', 'rank', 'score', 'qid']]
# Format the full-text search results as a TREC run and write the
# space-separated run file (no header/index, as trec_eval expects).
run_tag = 'FullTxt_run'
full_df = TREC_df(topic_id_list, docid_list, rank_list, score_list,
                  title_list, doi_list, run_tag)
output_path = os.path.join(Pyserini_files, 'full_R1.txt')
full_df.to_csv(output_path, sep=' ', index=False, header=None)
#Use Trec Eval to evaluate initial runs
#Run TREC_Eval
print('Running trec_eval on search results')
# Download the Round 1 relevance judgments.
r = requests.get('https://ir.nist.gov/covidSubmit/data/qrels-rnd1.txt', timeout=30)
r.raise_for_status()  # don't silently write an HTTP error page as the qrels
qrels_file = os.path.join(os.getcwd(), 'qrels.txt')
with open(qrels_file, 'wb') as f:
    f.write(r.content)
qrels = TrecQrel(qrels_file)
#Generate metrics for all runs in Round1_data (1000 docs retrieved for each)
runs = procedures.list_of_runs_from_path(Pyserini_files, "*.txt")
results = procedures.evaluate_runs(runs, qrels, per_query=True)
# Pull the headline metrics out of the trectools evaluation results.
p5 = procedures.extract_metric_from_results(results, "P_5")
p10 = procedures.extract_metric_from_results(results, "P_10")
Bpref = procedures.extract_metric_from_results(results, "bpref")
Mean_avgP = procedures.extract_metric_from_results(results, 'map')
#Aggregate results to dataframe
runs_names = [os.path.basename(str(x)).split('.')[0] for x in runs]
# Each extract_metric_from_results entry is a (run, value) pair; keep the
# value. (The original also declared an unused ndcg_list -- removed.)
p5_list = [m[1] for m in p5]
p10_list = [m[1] for m in p10]
map_list = [m[1] for m in Mean_avgP]
bpref_list = [m[1] for m in Bpref]
Result_df = pd.DataFrame({'Run': runs_names, 'P@5': p5_list, 'P@10': p10_list,
                          'MAP': map_list, 'Bpref': bpref_list})
# Print the full table without pandas' row/column truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(Result_df)