| |
| |
| |
| |
| |
| |
|
|
| import re |
| import unicodedata |
|
|
# Maps non-ASCII punctuation (full-width CJK forms, typographic quotes,
# dashes, brackets, ...) to an ASCII replacement string.
#
# Every key is a single non-ASCII character.  The full-width forms are written
# as explicit \uXXXX escapes because they are visually indistinguishable from
# their ASCII counterparts: if one of them is ever collapsed to ASCII, the
# entry either becomes a no-op ("," -> ",") or starts rewriting ordinary text
# ("." -> ". ", "1" -> '"').
UNICODE_PUNCT = {
    "\uff0c": ",",  # FULLWIDTH COMMA
    "。": ".",
    "、": ",",
    "„": '"',
    "”": '"',
    "“": '"',
    "«": '"',
    "»": '"',
    # FULLWIDTH DIGIT ONE.  Previously mapped to a double quote, which turned
    # a digit into punctuation; it now maps to the ASCII digit.
    "\uff11": "1",
    "」": '"',
    "「": '"',
    "《": '"',
    "》": '"',
    "´": "'",
    "∶": ":",
    "\uff1a": ":",  # FULLWIDTH COLON
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "\uff08": "(",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": ")",  # FULLWIDTH RIGHT PARENTHESIS
    "\uff1b": ";",  # FULLWIDTH SEMICOLON
    "–": "-",
    "—": " - ",
    "\uff0e": ". ",  # FULLWIDTH FULL STOP
    "\uff5e": "~",  # FULLWIDTH TILDE
    "’": "'",
    "…": "...",
    "━": "-",
    "〈": "<",
    "〉": ">",
    "【": "[",
    "】": "]",
    "\uff05": "%",  # FULLWIDTH PERCENT SIGN
    "►": "-",
}


# Single character class matching any key of UNICODE_PUNCT.  Keys are escaped
# so that a future key such as "]", "^" or "\\" cannot corrupt the class.
UNICODE_PUNCT_RE = re.compile("[" + "".join(map(re.escape, UNICODE_PUNCT)) + "]")


# A "$...$" or "$$...$$" span whose opening "$" is not preceded by a backslash.
MATH_RE = r"(?<!\\)(\$\$?.+?\$\$?)"
# A span delimited by one to three backticks on each side (non-greedy).
CODE_RE = r'\`{1,3}.*?\`{1,3}'
|
|
|
|
def replace_unicode_punct(text: str) -> str:
    """Return ``text`` with each character found in UNICODE_PUNCT replaced.

    Characters that are not keys of UNICODE_PUNCT are copied through
    unchanged; mapped characters are replaced by the table's value, which
    may be longer than one character.
    """
    pieces = []
    for ch in text:
        pieces.append(UNICODE_PUNCT.get(ch, ch))
    return "".join(pieces)
|
|
|
|
def remove_unicode_punct(text: str) -> str:
    """Delete every character matched by UNICODE_PUNCT_RE from ``text``.

    Unlike replace_unicode_punct, which substitutes a replacement, this
    drops the matched characters entirely using one compiled-regex pass.
    """
    cleaned = UNICODE_PUNCT_RE.sub("", text)
    return cleaned
|
|
|
|
def strip_accents(line: str) -> str:
    """Strip accents from a piece of text.

    The text is decomposed with Unicode NFD and every character of category
    "Mn" (nonspacing mark) is dropped.

    Args:
        line: Text to process.

    Returns:
        ``line`` itself, untouched, when it contains no nonspacing marks;
        otherwise the NFD-decomposed text with those marks removed.
    """
    nfd = unicodedata.normalize("NFD", line)
    output = [c for c in nfd if unicodedata.category(c) != "Mn"]
    # Compare against the decomposed length: if nothing was filtered out there
    # were no accents, so hand back the caller's string as-is instead of its
    # decomposed form.  (The previous check compared an int with the string
    # itself and therefore never succeeded.)
    if len(output) == len(nfd):
        return line
    return "".join(output)
|
|
|
|
| |
# Character class of code points 0-31 and 127-159.
NON_PRINTING_CHARS_RE = re.compile(
    f"[{''.join(chr(cp) for cp in (*range(0, 32), *range(127, 160)))}]"
)
# Any single digit.
DIGIT_RE = re.compile(r"\d")
# Union of the two character classes: the patterns are concatenated and the
# "][" seam between them is removed so they form a single class.
PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(
    "".join((UNICODE_PUNCT_RE.pattern, NON_PRINTING_CHARS_RE.pattern)).replace("][", "")
)
|
|
|
|
def remove_non_printing_char(text: str) -> str:
    """Return ``text`` with every match of NON_PRINTING_CHARS_RE deleted."""
    printable_only = NON_PRINTING_CHARS_RE.sub("", text)
    return printable_only
|
|
|
|
def normalize_spacing_for_tok(text: str, language: str = "en") -> str:
    """Normalize spacing and punctuation ahead of tokenization.

    The rules follow the Moses ``normalize-punctuation.perl`` script: tidy
    spacing around parentheses and punctuation, map typographic quotes and
    dashes to ASCII, resolve no-break ("pseudo") spaces, then apply
    language-specific quote/comma ordering and digit grouping.

    Args:
        text: Text to normalize.
        language: Language code.  "en" moves commas/periods inside a closing
            double quote; "cs"/"cz" leave quote order alone; any other value
            moves them outside.  "de", "es", "cz", "cs" and "fr" join digit
            groups with a comma, every other language with a period.

    Returns:
        The normalized text.
    """
    nbsp = "\u00a0"
    # Regex class for "a plain space or a no-break space".
    sp = "[ \u00a0]"

    res = text.replace("\r", "")

    # Tidy spacing around parentheses and before punctuation.
    res = res.replace("(", " (").replace(")", ") ")
    res = re.sub(r" +", " ", res)  # collapse runs of spaces (must be a regex)
    # Replacement templates contain no stray backslashes: in Python "\)" or
    # "\%" in a template is emitted literally, backslash included.
    res = re.sub(r"\) ([.!:?;,])", r")\1", res)
    res = res.replace("( ", "(").replace(" )", ")")
    res = re.sub(r"(\d) %", r"\1%", res)
    res = res.replace(" :", ":").replace(" ;", ";")
    res = res.replace("`", "'").replace("''", ' " ')

    # Typographic quotes and dashes.
    res = (
        res.replace("„", '"')
        .replace("“", '"')
        .replace("”", '"')
        .replace("–", "-")
        .replace("—", " - ")
    )
    res = re.sub(r" +", " ", res)
    res = res.replace("´", "'")
    # A curly quote between two letters is an apostrophe ("don’t"); handle it
    # before the remaining curly quotes are turned into double quotes.  The
    # two passes are kept separate so their matches cannot shadow each other.
    res = re.sub(r"([a-z])‘([a-z])", r"\1'\2", res, flags=re.IGNORECASE)
    res = re.sub(r"([a-z])’([a-z])", r"\1'\2", res, flags=re.IGNORECASE)
    res = (
        res.replace("‘", '"')
        .replace("‚", '"')
        .replace("’", '"')
        .replace("''", '"')
        .replace("´´", '"')
        .replace("…", "...")
    )

    # French quotes, with the inner spacing (plain or no-break) absorbed.
    res = re.sub(sp + "«" + sp, ' "', res)
    res = re.sub("«" + sp, '"', res)
    res = res.replace("«", '"')
    res = re.sub(sp + "»" + sp, '" ', res)
    res = re.sub(sp + "»", '"', res)
    res = res.replace("»", '"')

    # Pseudo-spaces: drop a (no-break) space in front of punctuation and turn
    # the remaining no-break spaces of these patterns into plain spaces.
    res = re.sub(sp + "%", "%", res)
    res = res.replace("nº" + nbsp, "nº ")
    res = re.sub(sp + ":", ":", res)
    res = res.replace(nbsp + "ºC", " ºC")
    res = res.replace(nbsp + "cm", " cm")
    res = re.sub(sp + r"\?", "?", res)
    res = re.sub(sp + "!", "!", res)
    res = re.sub(sp + ";", ";", res)
    res = res.replace("," + nbsp, ", ")
    res = re.sub(r" +", " ", res)
    # Only the FULLWIDTH FULL STOP gets a trailing space; ASCII periods
    # (decimals, abbreviations, URLs) are left untouched.
    res = res.replace("\uff0e", ". ")

    if language == "en":
        # English style: "quotation," -> comma/period inside the quote.
        res = re.sub(r'"([,.]+)', r'\1"', res)
    elif language in ("cs", "cz"):
        # Czech quote order is deliberately left as-is.
        pass
    else:
        # Other languages: "quotation", -> comma/period outside the quote.
        res = res.replace(',"', '",')
        # "[^<]" keeps a period directly before markup from being moved.
        res = re.sub(r'(\.+)"(\s*[^<])', r'"\1\2', res)

    # Digit groups separated by a no-break space ("1 000").  A plain space is
    # not treated as a group separator, since it usually separates two
    # distinct numbers.
    if language in ("de", "es", "cz", "cs", "fr"):
        res = re.sub(r"(\d)\u00a0(\d)", r"\1,\2", res)
    else:
        res = re.sub(r"(\d)\u00a0(\d)", r"\1.\2", res)
    return res
|
|
|
|
def normalize(line: str, accent=True, case=True, numbers=True, math=True, code=True, punct=1) -> str:
    """Apply the selected normalization steps to ``line``.

    Steps run in a fixed order; each one is skipped when its flag is falsy.

    Args:
        line: Text to normalize; surrounding whitespace is stripped first.
        accent: Strip accents with strip_accents.
        case: Lowercase the text.
        numbers: Replace every digit with "0".
        math: Replace spans matching MATH_RE with "[EQUATION]".
        code: Replace spans matching CODE_RE with "[CODE]".
        punct: 1 replaces Unicode punctuation, 2 removes it, any other value
            leaves punctuation alone.

    Returns:
        The normalized text, with "<s>" / "</s>" markers and non-printing
        characters removed.  A blank input yields the empty string.
    """
    text = line.strip()
    if text == "":
        return text

    if case:
        text = text.lower()
    if accent:
        text = strip_accents(text)
    if numbers:
        text = DIGIT_RE.sub("0", text)

    if punct == 1:
        text = replace_unicode_punct(text)
    elif punct == 2:
        text = remove_unicode_punct(text)

    # DOTALL lets a math or code span run across line breaks.
    if math:
        text = re.compile(MATH_RE, re.DOTALL).sub("[EQUATION]", text)
    if code:
        text = re.compile(CODE_RE, re.DOTALL).sub("[CODE]", text)

    # Drop sentence-boundary markers before the final control-character pass.
    for marker in ("<s>", "</s>"):
        text = text.replace(marker, "")
    return remove_non_printing_char(text)
|
|
|
|
def slow_normalize_for_dedup(line: str) -> str:
    """Normalize ``line`` for deduplication by delegating to normalize.

    Accents are kept; case folding, digit masking and Unicode-punctuation
    removal (punct=2) are enabled.  The math and code options keep their
    defaults.
    """
    options = {"accent": False, "case": True, "numbers": True, "punct": 2}
    return normalize(line, **options)
|
|
|
|
def normalize_for_dedup(line: str) -> str:
    """Produce a deduplication key for ``line``.

    The text is stripped, lowercased, every digit is replaced by "0", and all
    characters matched by PUNCT_OR_NON_PRINTING_CHARS_RE are deleted.  A blank
    input yields the empty string.
    """
    stripped = line.strip()
    if stripped:
        lowered = stripped.lower()
        masked = DIGIT_RE.sub("0", lowered)
        return PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", masked)
    return stripped