Spaces:

geonmin-kim
/

NetsPresso_QA

Runtime error

App Files Files Community

NetsPresso_QA / scripts /repro_matrix /run_all_odqa.py

geonmin-kim

Upload folder using huggingface_hub

d6585f5 over 2 years ago

raw

history blame contribute delete

6.68 kB

	#
	# Pyserini: Reproducible IR research with sparse and dense representations
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	import argparse
	import math
	import os
	import time
	from collections import defaultdict
	from string import Template

	import yaml

	from defs_odqa import models, evaluate_dpr_retrieval_metric_definitions
	from utils import run_dpr_retrieval_eval_and_return_metric, convert_trec_run_to_dpr_retrieval_json, run_fusion, ok_str, fail_str

	GARRRF_LS = ['answers','titles','sentences']
	HITS_1K = set(['GarT5-RRF', 'DPR-DKRR', 'DPR-Hybrid'])

	def print_results(metric, topics):
	    """Print the value of one metric for every model in the regression table.

	    Reads two module-level names: ``models`` (imported from ``defs_odqa``)
	    for the list of model names, and ``table`` (the nested ``defaultdict``
	    built by the ``__main__`` section) for the scores.

	    Args:
	        metric: Metric key to look up for each model in ``table``.
	        topics: Topic-set name; only echoed in the header line.
	    """
	    print(f'Metric = {metric}, Topics = {topics}')
	    left_margin = ' ' * 32
	    for model_name in models['models']:
	        # Label first (no newline), then the score, which ends the row.
	        print(left_margin + f'{model_name:30}', end='')
	        value = table[f'{model_name}'][metric]
	        print(f'{value:7.2f}')
	    print('')


	if __name__ == '__main__':
	    # Regression driver: for every condition listed in the topic-specific YAML
	    # file, run retrieval (and fusion for the RRF models), optionally convert
	    # and evaluate the run, compare against the expected scores stored in the
	    # YAML, and finally print one summary table per metric.
	    parser = argparse.ArgumentParser(
	        description='Generate regression matrix for GarDKRR')
	    parser.add_argument('--skip-eval', action='store_true',
	                        default=False, help='Skip running trec_eval.')
	    parser.add_argument('--topics', choices=['tqa', 'nq'],
	                        help='Topics to be run [tqa, nq]', required=True)
	    parser.add_argument('--full-topk', action='store_true',
	                        default=False, help='Run topk 5-1000, default is topk 5-100')
	    args = parser.parse_args()
	    hits = 1000 if args.full_topk else 100
	    yaml_path = 'pyserini/resources/triviaqa.yaml' if args.topics == "tqa" else 'pyserini/resources/naturalquestion.yaml'
	    topics = 'dpr-trivia-test' if args.topics == 'tqa' else 'nq-test'
	    start = time.time()
	    # Kept at module level on purpose: print_results() reads `table` as a global.
	    table = defaultdict(lambda: defaultdict(lambda: 0.0))

	    # Load the whole YAML up front so the file is closed before the
	    # long-running retrieval loop starts.
	    with open(yaml_path) as f:
	        yaml_data = yaml.safe_load(f)

	    for condition in yaml_data['conditions']:
	        name = condition['model_name']
	        cmd_template = condition['command']

	        if not args.full_topk:
	            # Top-100 mode: models in HITS_1K must still retrieve 1000 hits
	            # so that their scores match the ones in the reference table.
	            hits = 1000 if name in HITS_1K else 100

	        print(f'model {name}:')
	        # The DPR-topics BM25 condition uses its own topic set on NQ; every
	        # other NQ condition is switched back to the default one.
	        if topics == 'nq-test' and name == 'BM25-k1_0.9_b_0.4_dpr-topics':
	            topics = 'dpr-nq-test'
	        elif args.topics == 'nq':
	            topics = 'nq-test'
	        print(f' - Topics: {topics}')

	        # running retrieval
	        if name == "GarT5-RRF":
	            # One run file per GAR generation target; they are fused below.
	            runfile = [f'runs/run.odqa.{name}.{topics}.{i}.hits-{hits}.txt' for i in GARRRF_LS]
	        else:
	            runfile = [f'runs/run.odqa.{name}.{topics}.hits-{hits}.txt']

	        # GarT5RRF-DKRR-RRF has no retrieval step of its own: it only fuses
	        # run files that other conditions produce.
	        if name != "GarT5RRF-DKRR-RRF":
	            cmd = [Template(cmd_template[i]).substitute(output=path)
	                   for i, path in enumerate(runfile)]
	            if hits == 100:
	                cmd = [c + ' --hits 100' for c in cmd]
	            for path, command in zip(runfile, cmd):
	                # Existing run files are reused rather than regenerated.
	                if not os.path.exists(path):
	                    print(f' Running: {command}')
	                    status = os.system(command)
	                    # Fail fast, like the fusion and conversion stages do,
	                    # instead of continuing with a missing or partial run.
	                    if status != 0:
	                        raise RuntimeError('retrieval failed')

	        # fusion
	        if 'RRF' in name:
	            if name == 'GarT5-RRF':
	                runs = runfile
	                output = f'runs/run.odqa.{name}.{topics}.hits-{hits}.fusion.txt'
	            elif name == 'GarT5RRF-DKRR-RRF':
	                runs = [f'runs/run.odqa.DPR-DKRR.{topics}.hits-1000.txt', f'runs/run.odqa.GarT5-RRF.{topics}.hits-1000.fusion.txt']
	                output = runfile[0].replace('.txt','.fusion.txt')
	            else:
	                raise NameError('Unexpected model name')
	            if not os.path.exists(output):
	                if not args.full_topk and name != 'GarT5-RRF':
	                    # if using topk100, we change it back for methods that require topk1000 to generate runs
	                    hits = 100
	                status = run_fusion(runs, output, hits)
	                if status != 0:
	                    raise RuntimeError('fusion failed')
	            runfile = [output]

	        # trec conversion + evaluation
	        # `score` is only produced when evaluation runs; the comparison loop
	        # below must not touch it in --skip-eval mode.
	        score = None
	        if not args.skip_eval:
	            jsonfile = runfile[0].replace('.txt', '.json')
	            runfile = jsonfile.replace('.json','.txt')
	            if not os.path.exists(jsonfile):
	                status = convert_trec_run_to_dpr_retrieval_json(
	                    topics, 'wikipedia-dpr', runfile, jsonfile)
	                if status != 0:
	                    raise RuntimeError("dpr retrieval convertion failed")
	            topk_defs = evaluate_dpr_retrieval_metric_definitions['Top5-100']
	            if args.full_topk:
	                topk_defs = evaluate_dpr_retrieval_metric_definitions['Top5-1000']
	            score = run_dpr_retrieval_eval_and_return_metric(topk_defs, jsonfile)

	        # comparing ground truth scores with the generated ones
	        for expected in condition['scores']:
	            for metric, expected_score in expected.items():
	                if args.skip_eval:
	                    # Nothing was evaluated: report the reference score.
	                    table[name][metric] = expected_score
	                    continue
	                if metric not in score.keys():
	                    continue
	                if math.isclose(score[metric], float(expected_score),abs_tol=2e-2):
	                    result_str = ok_str
	                else:
	                    result_str = fail_str + \
	                        f' expected {expected_score:.4f}'
	                print(f' {metric:7}: {score[metric]:.2f} {result_str}')
	                table[name][metric] = score[metric]

	    print('')
	    metric_ls = ['Top5', 'Top20', 'Top100', 'Top500', 'Top1000']
	    # Only the first three cut-offs are reported unless --full-topk is given.
	    metric_ls = metric_ls[:3] if not args.full_topk else metric_ls
	    for metric in metric_ls:
	        print_results(metric, topics)

	    end = time.time()
	    print(f'Total elapsed time: {end - start:.0f}s')