#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
This file provides helpers to convert passages and queries.
"""
import re

import spacy


def read_stopwords(fileName='stopwords.txt', lower_case=True):
    """Reads a list of stopwords from a file. By default the words
    are read from a standard repo location and are lower-cased.

    :param fileName a stopword file name
    :param lower_case a boolean flag indicating if lowercasing is needed
    :return a set of stopwords
    """
    stopwords = set()
    with open(fileName) as f:
        for w in f:
            w = w.strip()
            if w:
                if lower_case:
                    w = w.lower()
                stopwords.add(w)

    return stopwords
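
# A minimal usage sketch for read_stopwords (it assumes a plain-text file named
# 'stopwords.txt', one word per line, exists in the working directory):
#
#     stop_words = read_stopwords('stopwords.txt', lower_case=True)
#     print(len(stop_words), 'the' in stop_words)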


def is_alpha_num(s):
    return s and (re.match("^[a-zA-Z-_.0-9]+$", s) is not None)


class SpacyTextParser:
    def __init__(self, model_name, stopwords,
                 remove_punct=True,
                 sent_split=False,
                 keep_only_alpha_num=False,
                 lower_case=True,
                 enable_POS=True):
        """Constructor.

        :param model_name a name of the spacy model to use, e.g., en_core_web_sm
        :param stopwords a list of stop words to be excluded (case insensitive);
               a token is also excluded when its lemma is in the stop word list.
        :param remove_punct a bool flag indicating if the punctuation tokens need to be removed
        :param sent_split a bool flag indicating if sentence splitting is necessary
        :param keep_only_alpha_num a bool flag indicating if we need to keep only alpha-numeric characters
        :param lower_case a bool flag indicating if the output tokens and lemmas need to be lower-cased
        :param enable_POS a bool flag that enables POS tagging (which, e.g., can improve lemmatization)
        """
        # Disable pipeline components we do not need; the POS tagger is kept unless
        # explicitly disabled, because it can improve lemmatization.
        disable_list = ['ner', 'parser']
        if not enable_POS:
            disable_list.append('tagger')
        print('Disabled Spacy components: ', disable_list)

        self._nlp = spacy.load(model_name, disable=disable_list)
        if sent_split:
            # Note: create_pipe()/add_pipe(component) is the spaCy 2.x API;
            # spaCy 3.x would use self._nlp.add_pipe("sentencizer") instead.
            sentencizer = self._nlp.create_pipe("sentencizer")
            self._nlp.add_pipe(sentencizer)

        self._remove_punct = remove_punct
        self._stopwords = frozenset([w.lower() for w in stopwords])
        self._keep_only_alpha_num = keep_only_alpha_num
        self._lower_case = lower_case

    @staticmethod
    def _basic_clean(text):
        # Normalize curly apostrophes so downstream tokenization is consistent.
        return text.replace("’", "'")

    def __call__(self, text):
        """A thin wrapper that merely calls spacy.

        :param text input text string
        :return a spacy Doc object
        """
        return self._nlp(SpacyTextParser._basic_clean(text))

    def proc_text(self, text):
        """Process text, remove stopwords and obtain lemmas, but do not split into sentences.
        This function should not emit newlines!

        :param text input text string
        :return a tuple (lemmatized text, original-form text). Text is white-space separated.
        """
        lemmas = []
        tokens = []
        doc = self(text)
        for tokObj in doc:
            if self._remove_punct and tokObj.is_punct:
                continue
            lemma = tokObj.lemma_
            text = tokObj.text
            if self._keep_only_alpha_num and not is_alpha_num(text):
                continue
            tok1 = text.lower()
            tok2 = lemma.lower()
            # Drop a token if either its surface form or its lemma is a stopword.
            if tok1 in self._stopwords or tok2 in self._stopwords:
                continue
            if self._lower_case:
                text = text.lower()
                lemma = lemma.lower()
            lemmas.append(lemma)
            tokens.append(text)

        return ' '.join(lemmas), ' '.join(tokens)
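
# A minimal usage sketch for SpacyTextParser (it assumes the en_core_web_sm model has been
# installed, e.g., via `python -m spacy download en_core_web_sm`; the tiny stopword list is
# for illustration only):
#
#     nlp = SpacyTextParser('en_core_web_sm', ['a', 'an', 'the'], keep_only_alpha_num=True)
#     lemmas, tokens = nlp.proc_text('The quick brown foxes jumped over the lazy dog!')
#     # lemmas is roughly 'quick brown fox jump over lazy dog' (exact lemmas depend on the model);
#     # tokens is 'quick brown foxes jumped over lazy dog'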


def get_retokenized(tokenizer, text):
    """Obtain a space-separated re-tokenized text.

    :param tokenizer: a tokenizer that has the function
                      tokenize that returns an array of tokens.
    :param text: a text to re-tokenize.
    """
    return ' '.join(tokenizer.tokenize(text))
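
# A minimal sketch of a compatible tokenizer. Using Hugging Face transformers here is an
# assumption; any object exposing a tokenize() method that returns a list of strings works:
#
#     from transformers import AutoTokenizer
#     bert_tok = AutoTokenizer.from_pretrained('bert-base-uncased')
#     print(get_retokenized(bert_tok, 'Reproducible IR research'))  # word pieces joined by spaces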


def add_retokenized_field(data_entry,
                          src_field,
                          dst_field,
                          tokenizer):
    """
    Create a re-tokenized field from an existing one.

    :param data_entry: a dictionary of entries (keys are field names, values are text items)
    :param src_field: a source field
    :param dst_field: a target field
    :param tokenizer: a tokenizer to use; if None, nothing is done
    """
    if tokenizer is not None:
        dst = ''
        if src_field in data_entry:
            dst = get_retokenized(tokenizer, data_entry[src_field])
        data_entry[dst_field] = dst
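

if __name__ == '__main__':
    # A small smoke test / usage sketch, not part of the library API. It assumes the
    # en_core_web_sm model is installed and uses a tiny in-line stopword list instead of a
    # stopword file; the field names below are chosen for illustration only.
    toy_stopwords = ['a', 'an', 'the', 'of', 'is', 'what']
    parser = SpacyTextParser('en_core_web_sm', toy_stopwords, keep_only_alpha_num=True)

    query = {'text': 'What is the capital of France?'}
    lemmas, tokens = parser.proc_text(query['text'])
    print('lemmas:', lemmas)
    print('tokens:', tokens)

    # add_retokenized_field is a no-op when the tokenizer is None; with a real sub-word
    # tokenizer (anything exposing a tokenize() method) it would add the 'text_tok' field.
    add_retokenized_field(query, 'text', 'text_tok', None)
    print(query)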