Spaces:
Paused
Paused
add exact match metadata
Browse files
app.py
CHANGED
|
@@ -3,6 +3,8 @@ import os
|
|
| 3 |
import gradio as gr
|
| 4 |
import requests
|
| 5 |
from huggingface_hub import HfApi
|
|
|
|
|
|
|
| 6 |
|
| 7 |
hf_api = HfApi()
|
| 8 |
roots_datasets = {
|
|
@@ -52,11 +54,32 @@ def process_pii(text):
|
|
| 52 |
return text
|
| 53 |
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def process_results(results, highlight_terms):
|
| 56 |
if len(results) == 0:
|
| 57 |
return """<br><p style='font-family: Arial; color:Silver; text-align: center;'>
|
| 58 |
No results retrieved.</p><br><hr>"""
|
| 59 |
-
|
| 60 |
results_html = ""
|
| 61 |
for result in results:
|
| 62 |
tokens = result["text"].split()
|
|
@@ -68,136 +91,102 @@ def process_results(results, highlight_terms):
|
|
| 68 |
tokens_html.append(token)
|
| 69 |
tokens_html = " ".join(tokens_html)
|
| 70 |
tokens_html = process_pii(tokens_html)
|
| 71 |
-
meta_html = (
|
| 72 |
-
|
| 73 |
-
<p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
|
| 74 |
-
<a href='{}' target='_blank'>{}</a></p>""".format(
|
| 75 |
-
result["meta"]["url"], result["meta"]["url"]
|
| 76 |
-
)
|
| 77 |
-
if "meta" in result
|
| 78 |
-
and result["meta"] is not None
|
| 79 |
-
and "url" in result["meta"]
|
| 80 |
-
else ""
|
| 81 |
-
)
|
| 82 |
-
docid_html = get_docid_html(result["docid"])
|
| 83 |
-
results_html += """{}
|
| 84 |
-
<p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
|
| 85 |
-
<p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
|
| 86 |
<p style='font-family: Arial;'>{}</p>
|
| 87 |
<br>
|
| 88 |
""".format(
|
| 89 |
-
|
| 90 |
)
|
|
|
|
| 91 |
return results_html + "<hr>"
|
| 92 |
|
| 93 |
|
| 94 |
-
def
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
)
|
| 110 |
|
| 111 |
-
payload = json.loads(output.text)
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
+ process_results(results, highlight_terms)
|
| 134 |
)
|
|
|
|
|
|
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
collapsible_results = f"""
|
| 145 |
-
<details>
|
| 146 |
-
<summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
|
| 147 |
-
Results for language: <b>{lang}</b><hr>
|
| 148 |
-
</summary>
|
| 149 |
-
{process_results(results_for_lang, highlight_terms)}
|
| 150 |
-
</details>"""
|
| 151 |
-
results_html += collapsible_results
|
| 152 |
-
return results_html
|
| 153 |
-
|
| 154 |
-
return process_results(results, highlight_terms)
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
</
|
| 163 |
-
|
|
|
|
| 164 |
|
| 165 |
-
return
|
| 166 |
|
| 167 |
|
| 168 |
-
def
|
| 169 |
try:
|
| 170 |
-
print("perform_exact_search")
|
| 171 |
query = " ".join(query.split())
|
| 172 |
if query == "" or query is None:
|
| 173 |
return ""
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
|
|
|
| 179 |
output = requests.post(
|
| 180 |
-
|
| 181 |
headers={"Content-type": "application/json"},
|
| 182 |
data=json.dumps(post_data),
|
| 183 |
timeout=60,
|
| 184 |
)
|
| 185 |
-
|
| 186 |
payload = json.loads(output.text)
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
result_html = """<br><hr><br>"""
|
| 193 |
-
query_start = result.find(query)
|
| 194 |
-
query_end = query_start + len(query)
|
| 195 |
-
result_html += result[0:query_start]
|
| 196 |
-
result_html += "<b>{}</b>".format(result[query_start:query_end])
|
| 197 |
-
result_html += result[query_end:]
|
| 198 |
-
results_html += result_html
|
| 199 |
-
return results_html + "<hr>"
|
| 200 |
-
|
| 201 |
except Exception as e:
|
| 202 |
results_html = f"""
|
| 203 |
<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
|
|
@@ -206,6 +195,9 @@ def perform_exact_search(query, num_results=10):
|
|
| 206 |
Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
|
| 207 |
</p>
|
| 208 |
"""
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
|
| 211 |
def flag(query, language, num_results, issue_description):
|
|
@@ -308,12 +300,6 @@ if __name__ == "__main__":
|
|
| 308 |
query = query.strip()
|
| 309 |
if query is None or query == "":
|
| 310 |
return "", ""
|
| 311 |
-
|
| 312 |
-
if exact_search:
|
| 313 |
-
return {
|
| 314 |
-
results: perform_exact_search(query, k),
|
| 315 |
-
flagging_form: gr.update(visible=True),
|
| 316 |
-
}
|
| 317 |
return {
|
| 318 |
results: scisearch(query, lang, k, exact_search),
|
| 319 |
flagging_form: gr.update(visible=True),
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
import requests
|
| 5 |
from huggingface_hub import HfApi
|
| 6 |
+
import traceback
|
| 7 |
+
|
| 8 |
|
| 9 |
hf_api = HfApi()
|
| 10 |
roots_datasets = {
|
|
|
|
| 54 |
return text
|
| 55 |
|
| 56 |
|
| 57 |
+
def format_meta(result):
    """Build the HTML metadata header shown above a single search result.

    The header consists of an optional source-URL link, the document ID and
    the document language.

    Args:
        result: Mapping describing one search hit. ``result["docid"]`` is
            required and is rendered through ``get_docid_html``.
            ``result["meta"]`` is optional and may be ``None``; when it
            contains a ``"url"`` entry a link to that URL is emitted.
            ``result["lang"]`` is optional; ``None`` is rendered when it is
            missing.

    Returns:
        The HTML snippet as a string.
    """
    meta = result.get("meta")
    if meta is not None and "url" in meta:
        # NOTE(review): the URL is interpolated into the markup unescaped;
        # confirm the backend only returns trusted/sanitised URLs.
        url_html = """
            <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
            <a href='{}' target='_blank'>{}</a></p>""".format(
            meta["url"], meta["url"]
        )
    else:
        url_html = ""

    docid_html = get_docid_html(result["docid"])

    # The key must be looked up by its string name; the previous bare
    # ``lang`` identifier did not refer to the "lang" entry of ``result``.
    language = result.get("lang")

    return """{}
        <p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
        <p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
        """.format(
        url_html,
        docid_html,
        language,
    )
|
| 77 |
+
|
| 78 |
+
|
| 79 |
def process_results(results, highlight_terms):
|
| 80 |
if len(results) == 0:
|
| 81 |
return """<br><p style='font-family: Arial; color:Silver; text-align: center;'>
|
| 82 |
No results retrieved.</p><br><hr>"""
|
|
|
|
| 83 |
results_html = ""
|
| 84 |
for result in results:
|
| 85 |
tokens = result["text"].split()
|
|
|
|
| 91 |
tokens_html.append(token)
|
| 92 |
tokens_html = " ".join(tokens_html)
|
| 93 |
tokens_html = process_pii(tokens_html)
|
| 94 |
+
meta_html = format_meta(result)
|
| 95 |
+
meta_html += """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
<p style='font-family: Arial;'>{}</p>
|
| 97 |
<br>
|
| 98 |
""".format(
|
| 99 |
+
tokens_html
|
| 100 |
)
|
| 101 |
+
results_html += meta_html
|
| 102 |
return results_html + "<hr>"
|
| 103 |
|
| 104 |
|
| 105 |
+
def process_exact_match_payload(payload, query):
    """Render the results of an exact-match search as HTML.

    Each result is rendered as a separator, the metadata header produced by
    ``format_meta`` and the result text with the first occurrence of
    ``query`` wrapped in ``<b>`` tags.

    Args:
        payload: Decoded backend response; ``payload["results"]`` must be an
            iterable of result mappings, each with a ``"text"`` string plus
            whatever ``format_meta`` reads.
        query: The exact string that was searched for.

    Returns:
        The concatenated HTML for all results followed by ``"<hr>"``.
    """
    rendered = []
    for result in payload["results"]:
        text = result["text"]
        pieces = ["""<br><hr><br>""", format_meta(result)]

        query_start = text.find(query)
        if query_start == -1:
            # str.find signals "not found" with -1; slicing with it would
            # bold the wrong characters, so emit the text unhighlighted.
            pieces.append(text)
        else:
            query_end = query_start + len(query)
            pieces.append(text[:query_start])
            pieces.append("<b>{}</b>".format(text[query_start:query_end]))
            pieces.append(text[query_end:])

        rendered.append("".join(pieces))

    return "".join(rendered) + "<hr>"
|
|
|
|
| 120 |
|
|
|
|
| 121 |
|
| 122 |
+
def process_bm25_match_payload(payload, language):
    """Render the response of a BM25 search as HTML.

    Args:
        payload: Decoded backend response. On success it carries
            ``"results"`` and ``"highlight_terms"``; on failure it carries an
            ``"err"`` mapping with a ``"type"`` entry.
        language: The language selector that was sent with the query.
            ``"detect_language"`` prepends a banner naming the language of
            the first result; ``"all"`` expects ``payload["results"]`` to be
            a mapping from language to result list and renders one
            collapsible section per language; any other value renders the
            results directly.

    Returns:
        The HTML for the results, or an explanatory message when the backend
        reports an unsupported detected language.

    Raises:
        ValueError: If the backend reports an error of an unrecognised type
            and provides no results to render.
    """
    if "err" in payload:
        err = payload["err"]
        if err["type"] == "unsupported_lang":
            detected_lang = err["meta"]["detected_lang"]
            return f"""
                <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                Detected language <b>{detected_lang}</b> is not supported.<br>
                Please choose a language from the dropdown or type another query.
                </p><br><hr><br>"""
        if "results" not in payload:
            # Surface the backend error itself instead of failing later with
            # an uninformative KeyError on "results".
            raise ValueError(f"Search backend returned an unhandled error: {err!r}")

    results = payload["results"]
    highlight_terms = payload["highlight_terms"]

    if language == "detect_language":
        banner = ""
        if len(results) > 0:
            # The detected language is read from the first result.
            first_lang = results[0]["lang"]
            banner = f"""<p style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
                Detected language: <b>{first_lang}</b></p><br><hr><br>"""
        return banner + process_results(results, highlight_terms)

    if language == "all":
        sections = []
        for lang, results_for_lang in results.items():
            if len(results_for_lang) == 0:
                sections.append(
                    f"""<p style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
                    No results for language: <b>{lang}</b><hr></p>"""
                )
                continue

            sections.append(
                f"""
                <details>
                    <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
                        Results for language: <b>{lang}</b><hr>
                    </summary>
                    {process_results(results_for_lang, highlight_terms)}
                </details>"""
            )
        return "".join(sections)

    return process_results(results, highlight_terms)
|
| 165 |
|
| 166 |
|
| 167 |
+
def scisearch(query, language, num_results=10, exact_search=False):
|
| 168 |
try:
|
|
|
|
| 169 |
query = " ".join(query.split())
|
| 170 |
if query == "" or query is None:
|
| 171 |
return ""
|
| 172 |
+
post_data = {"query": query, "k": num_results}
|
| 173 |
+
if language != "detect_language":
|
| 174 |
+
post_data["lang"] = language
|
| 175 |
+
address = (
|
| 176 |
+
"http://34.105.160.81:8080" if exact_search else os.environ.get("address")
|
| 177 |
+
)
|
| 178 |
output = requests.post(
|
| 179 |
+
address,
|
| 180 |
headers={"Content-type": "application/json"},
|
| 181 |
data=json.dumps(post_data),
|
| 182 |
timeout=60,
|
| 183 |
)
|
|
|
|
| 184 |
payload = json.loads(output.text)
|
| 185 |
+
return (
|
| 186 |
+
process_bm25_match_payload(payload, language)
|
| 187 |
+
if not exact_search
|
| 188 |
+
else process_exact_match_payload(payload, query)
|
| 189 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
except Exception as e:
|
| 191 |
results_html = f"""
|
| 192 |
<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
|
|
|
|
| 195 |
Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
|
| 196 |
</p>
|
| 197 |
"""
|
| 198 |
+
print(e)
|
| 199 |
+
print(traceback.format_exc())
|
| 200 |
+
return results_html
|
| 201 |
|
| 202 |
|
| 203 |
def flag(query, language, num_results, issue_description):
|
|
|
|
| 300 |
query = query.strip()
|
| 301 |
if query is None or query == "":
|
| 302 |
return "", ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
return {
|
| 304 |
results: scisearch(query, lang, k, exact_search),
|
| 305 |
flagging_form: gr.update(visible=True),
|