Spaces:
Runtime error
Runtime error
| # | |
| # Pyserini: Reproducible IR research with sparse and dense representations | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # | |
| from typing import List | |
| from ..pyclass import autoclass | |
# Wrappers around Lucene classes.
# Each name below is bound to the Java class that ``autoclass`` returns for
# the given fully qualified class name, so the classes can be instantiated
# from Python.
JAnalyzer = autoclass('org.apache.lucene.analysis.Analyzer')
JArabicAnalyzer = autoclass('org.apache.lucene.analysis.ar.ArabicAnalyzer')
JBengaliAnalyzer = autoclass('org.apache.lucene.analysis.bn.BengaliAnalyzer')
JCJKAnalyzer = autoclass('org.apache.lucene.analysis.cjk.CJKAnalyzer')
JDanishAnalyzer = autoclass('org.apache.lucene.analysis.da.DanishAnalyzer')
JDutchAnalyzer = autoclass('org.apache.lucene.analysis.nl.DutchAnalyzer')
JFinnishAnalyzer = autoclass('org.apache.lucene.analysis.fi.FinnishAnalyzer')
JFrenchAnalyzer = autoclass('org.apache.lucene.analysis.fr.FrenchAnalyzer')
JGermanAnalyzer = autoclass('org.apache.lucene.analysis.de.GermanAnalyzer')
JHindiAnalyzer = autoclass('org.apache.lucene.analysis.hi.HindiAnalyzer')
JHungarianAnalyzer = autoclass('org.apache.lucene.analysis.hu.HungarianAnalyzer')
JIndonesianAnalyzer = autoclass('org.apache.lucene.analysis.id.IndonesianAnalyzer')
JItalianAnalyzer = autoclass('org.apache.lucene.analysis.it.ItalianAnalyzer')
JJapaneseAnalyzer = autoclass('org.apache.lucene.analysis.ja.JapaneseAnalyzer')
JNorwegianAnalyzer = autoclass('org.apache.lucene.analysis.no.NorwegianAnalyzer')
JPortugueseAnalyzer = autoclass('org.apache.lucene.analysis.pt.PortugueseAnalyzer')
JRussianAnalyzer = autoclass('org.apache.lucene.analysis.ru.RussianAnalyzer')
JSpanishAnalyzer = autoclass('org.apache.lucene.analysis.es.SpanishAnalyzer')
JSwedishAnalyzer = autoclass('org.apache.lucene.analysis.sv.SwedishAnalyzer')
JTeluguAnalyzer = autoclass('org.apache.lucene.analysis.te.TeluguAnalyzer')
JThaiAnalyzer = autoclass('org.apache.lucene.analysis.th.ThaiAnalyzer')
JTurkishAnalyzer = autoclass('org.apache.lucene.analysis.tr.TurkishAnalyzer')
JWhiteSpaceAnalyzer = autoclass('org.apache.lucene.analysis.core.WhitespaceAnalyzer')
JCharArraySet = autoclass('org.apache.lucene.analysis.CharArraySet')

# Wrappers around Anserini classes.
# JDefaultEnglishAnalyzer lives in the ``io.anserini`` package, so it is bound
# once here rather than also in the Lucene section above.
JAnalyzerUtils = autoclass('io.anserini.analysis.AnalyzerUtils')
JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer')
JTweetAnalyzer = autoclass('io.anserini.analysis.TweetAnalyzer')
JHuggingFaceTokenizerAnalyzer = autoclass('io.anserini.analysis.HuggingFaceTokenizerAnalyzer')
# Analyzers that are built with their no-argument constructor, keyed by the
# lower-case language code accepted by ``get_lucene_analyzer``.
_NO_ARG_ANALYZERS = {
    'ar': JArabicAnalyzer,
    'bn': JBengaliAnalyzer,
    'zh': JCJKAnalyzer,
    'ko': JCJKAnalyzer,
    'da': JDanishAnalyzer,
    'nl': JDutchAnalyzer,
    'fi': JFinnishAnalyzer,
    'fr': JFrenchAnalyzer,
    'de': JGermanAnalyzer,
    'hi': JHindiAnalyzer,
    'hu': JHungarianAnalyzer,
    'id': JIndonesianAnalyzer,
    'it': JItalianAnalyzer,
    'ja': JJapaneseAnalyzer,
    'no': JNorwegianAnalyzer,
    'pt': JPortugueseAnalyzer,
    'ru': JRussianAnalyzer,
    'es': JSpanishAnalyzer,
    # The Swedish analyzer is wrapped by this module, so expose it as well.
    'sv': JSwedishAnalyzer,
    'te': JTeluguAnalyzer,
    'th': JThaiAnalyzer,
    'tr': JTurkishAnalyzer,
    'tweet': JTweetAnalyzer,
}


def get_lucene_analyzer(language: str = 'en',
                        stemming: bool = True,
                        stemmer: str = 'porter',
                        stopwords: bool = True,
                        huggingFaceTokenizer: str = None) -> JAnalyzer:
    """Create a Lucene ``Analyzer`` with specific settings.

    The ``stemming``, ``stemmer`` and ``stopwords`` options are only consulted
    when ``language`` is ``'en'``; every other analyzer is created with its
    default configuration.

    Parameters
    ----------
    language : str
        Name of analyzer, matched case-insensitively. Either a language code
        (``'ar'``, ``'bn'``, ``'zh'``, ``'ko'``, ``'da'``, ``'nl'``, ``'fi'``,
        ``'fr'``, ``'de'``, ``'hi'``, ``'hu'``, ``'id'``, ``'it'``, ``'ja'``,
        ``'no'``, ``'pt'``, ``'ru'``, ``'es'``, ``'sv'``, ``'te'``, ``'th'``,
        ``'tr'``, ``'en'``), ``'tweet'``, or ``'hgf_tokenizer'``.
    stemming : bool
        Set to stem (English only).
    stemmer : str
        Stemmer to use (English only, and only when ``stemming`` is set).
    stopwords : bool
        Set to filter stopwords (English only). When unset, an empty stopword
        set is handed to the English analyzer.
    huggingFaceTokenizer: str
        A huggingface model id or path to a tokenizer.json file. Only used
        when ``language`` is ``'hgf_tokenizer'``.

    Returns
    -------
    JAnalyzer
        Java ``Analyzer`` with specified settings.

    Raises
    ------
    ValueError
        If ``language`` is not one of the supported names.
    """
    # Normalise once instead of re-lowering the string for every comparison.
    language = language.lower()

    analyzer_class = _NO_ARG_ANALYZERS.get(language)
    if analyzer_class is not None:
        return analyzer_class()

    if language == 'hgf_tokenizer':
        return JHuggingFaceTokenizerAnalyzer(huggingFaceTokenizer)

    if language != 'en':
        raise ValueError('Invalid configuration.')

    # English: pick the factory from the stemming flag and, when stopword
    # filtering is disabled, pass an explicit empty stopword set.
    if stemming:
        if stopwords:
            return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer)
        return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer, JCharArraySet.EMPTY_SET)
    if stopwords:
        return JDefaultEnglishAnalyzer.newNonStemmingInstance()
    return JDefaultEnglishAnalyzer.newNonStemmingInstance(JCharArraySet.EMPTY_SET)
class Analyzer:
    """Thin Python facade over a Lucene ``Analyzer`` for tokenizing text.

    Parameters
    ----------
    analyzer : JAnalyzer
        The Lucene ``Analyzer`` instance that performs the analysis.

    Raises
    ------
    TypeError
        If ``analyzer`` is not an instance of ``JAnalyzer``.
    """

    def __init__(self, analyzer):
        if isinstance(analyzer, JAnalyzer):
            self.analyzer = analyzer
        else:
            raise TypeError('Invalid JAnalyzer!')

    def analyze(self, text: str) -> List[str]:
        """Run the wrapped analyzer over ``text`` and collect its tokens.

        Parameters
        ----------
        text : str
            Input text to be analyzed.

        Returns
        -------
        List[str]
            The tokens produced by the analyzer, in the order returned.
        """
        java_tokens = JAnalyzerUtils.analyze(self.analyzer, text)
        # Copy the Java result into a plain Python list.
        return list(java_tokens.toArray())