diff --git a/Dockerfile.dev b/Dockerfile.dev index 2d19d84f..8bd57cd0 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -1,5 +1,23 @@ FROM python:3.11-bullseye +# Update default packages +RUN apt-get -qq update + +# Get Ubuntu packages +RUN apt-get install -y -q \ + build-essential \ + curl \ + cmake + +# NOTE: no need to run update again at this point +# RUN apt-get update + +# Get Rust; NOTE: using sh for better compatibility with other base images +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y + +# Add .cargo/bin to PATH +ENV PATH="/root/.cargo/bin:${PATH}" + # Set the working directory to /app WORKDIR /app @@ -10,4 +28,4 @@ RUN make install # Install the app COPY . /app -RUN pip install -e . \ No newline at end of file +RUN pip install -e . diff --git a/pyproject.toml b/pyproject.toml index af570248..6b70cd2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,13 @@ classifiers = [ ] requires-python = ">=3.10" -dependencies = ["pydantic==1.8.2"] +dependencies = [ + "pydantic>=2.4.2", # dolma does not work with very old versions of pydantic + "dolma@git+https://github.com/allenai/dolma.git@5a010a2685914b1db7744426abfb4b9ece52da95", # Install from git until a 0.9.2 package is released + "kenlm>=0.2.0", # Used for perplexity tagging + "blingfire>=0.1.8", # Used for perplexity tagging + "requests>=2.31.0", +] [project.optional-dependencies] dev = ["black==23.9.1", "ruff==0.1.0", "pyright==1.1.331", "pre-commit==3.5.0"] diff --git a/src/dfm/__init__.py b/src/dfm/__init__.py new file mode 100644 index 00000000..d01c3abd --- /dev/null +++ b/src/dfm/__init__.py @@ -0,0 +1,4 @@ +import importlib.metadata + +# Fetches the version of the package as defined in pyproject.toml +__version__ = importlib.metadata.version(__package__) diff --git a/src/dfm/common/__init__.py b/src/dfm/common/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/dfm/common/data_cleaning/__init__.py b/src/dfm/common/data_cleaning/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/dfm/common/data_cleaning/ccnet_text_normalizer.py b/src/dfm/common/data_cleaning/ccnet_text_normalizer.py new file mode 100644 index 00000000..dab25c41 --- /dev/null +++ b/src/dfm/common/data_cleaning/ccnet_text_normalizer.py @@ -0,0 +1,203 @@ +# This file has initially been copied from the ccnet repository from Facebook. +# https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py +# The utility functions can be used to normalize text before processing it +# with ccnet models, but might not be the best general purpose implementation. +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# This file is full of ambigous characters, so disable ruff check for those. +# ruff: noqa: RUF001 + +import re +import unicodedata +from typing import Literal + +UNICODE_PUNCT = { + ",": ",", + "。": ".", + "、": ",", + "„": '"', + "”": '"', + "“": '"', + "«": '"', + "»": '"', + "1": '"', + "」": '"', + "「": '"', + "《": '"', + "》": '"', + "´": "'", + "∶": ":", + ":": ":", + "?": "?", + "!": "!", + "(": "(", + ")": ")", + ";": ";", + "–": "-", + "—": " - ", + ".": ". 
", + "~": "~", + "’": "'", + "…": "...", + "━": "-", + "〈": "<", + "〉": ">", + "【": "[", + "】": "]", + "%": "%", + "►": "-", +} + +UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]") + + +def replace_unicode_punct(text: str) -> str: + return "".join(UNICODE_PUNCT.get(c, c) for c in text) + + +def remove_unicode_punct(text: str) -> str: + """More aggressive version of replace_unicode_punct but also faster.""" + return UNICODE_PUNCT_RE.sub("", text) + + +def strip_accents(line: str) -> str: + """Strips accents from a piece of text.""" + nfd = unicodedata.normalize("NFD", line) + output = [c for c in nfd if unicodedata.category(c) != "Mn"] + return "".join(output) + + +# Build a regex matching all control characters. +NON_PRINTING_CHARS_RE = re.compile( + f"[{''.join(map(chr, list(range(32)) + list(range(127,160))))}]", +) +DIGIT_RE = re.compile(r"\d") +PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile( + (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", ""), +) + + +def remove_non_printing_char(text: str) -> str: + return NON_PRINTING_CHARS_RE.sub("", text) + + +def normalize_spacing_for_tok(text: str, language: str = "en") -> str: + res = ( + text.replace("\r", "") + # remove extra spaces + .replace("(", " (") + .replace(")", ") ") + .replace(" +", " ") + ) + res = re.sub(r"\) ([\.\!\:\?\;\,])", r"\)\1", res) + res = res.replace("( ", "(").replace(" )", ")") + res = re.sub(r"(\d) \%", r"\1\%", res) + res = res.replace(" :", ":").replace(" ;", ";") + res = res.replace("`", "'").replace("''", ' " ') + + res = ( + res.replace("„", '"') + .replace("“", '"') + .replace("”", '"') + .replace("–", "-") + .replace("—", " - ") + .replace(" +", " ") + .replace("´", "'") + .replace("([a-z])‘([a-z])", r"\1'\2/") + .replace("([a-z])’([a-z])", r"\1'\2/") + .replace("‘", '"') + .replace("‚", '"') + .replace("’", '"') + .replace("''", '"') + .replace("´´", '"') + .replace("…", "...") + # French quotes + .replace(" « ", ' "') + .replace("« ", '"') + .replace("«", '"') + .replace(" » ", '" ') + .replace(" »", '"') + .replace("»", '"') + # handle pseudo-spaces + .replace(" %", "%") + .replace("nº ", "nº ") + .replace(" :", ":") + .replace(" ºC", " ºC") + .replace(" cm", " cm") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ;", ";") + .replace(", ", ", ") + .replace(" +", " ") + .replace(".", ". 
") + ) + # English "quotation," followed by comma, style + if language == "en": + res = re.sub(r"\"([,\.]+)", r"\1\"", res) + # Czech is confused + elif language == "cs" or language == "cz": + pass + # German/Spanish/French "quotation", followed by comma, style + else: + res = res.replace(',"', '",') + res = re.sub( + r"(\.+)\"(\s*[^<])", + r"\"\1\2", + res, + ) # don't fix period at end of sentence + + if ( + language == "de" + or language == "es" + or language == "cz" + or language == "cs" + or language == "fr" + ): + res = re.sub(r"(\d) (\d)", r"\1,\2", res) + else: + res = re.sub(r"(\d) (\d)", r"\1.\2", res) + return res + + +def normalize( + line: str, + accent: bool = True, + case: bool = True, + numbers: bool = True, + punct: Literal[1, 2] = 1, +) -> str: + line = line.strip() + if not line: + return line + if case: + line = line.lower() + if accent: + line = strip_accents(line) + if numbers: + line = DIGIT_RE.sub("0", line) + if punct == 1: + line = replace_unicode_punct(line) + elif punct == 2: + line = remove_unicode_punct(line) + line = remove_non_printing_char(line) + return line + + +def slow_normalize_for_dedup(line: str) -> str: + return normalize(line, accent=False, case=True, numbers=True, punct=2) + + +def normalize_for_dedup(line: str) -> str: + line = line.strip() + if not line: + return line + # case + line = line.lower() + # numbers + line = DIGIT_RE.sub("0", line) + line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line) + return line diff --git a/src/dfm/common/data_cleaning/dolma_taggers/__init__.py b/src/dfm/common/data_cleaning/dolma_taggers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py b/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py new file mode 100644 index 00000000..1108abff --- /dev/null +++ b/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py @@ -0,0 +1,253 @@ +""" + +Dolma taggers for Scandinavian language detection. + +""" +from collections.abc import Iterable + +import pycld2 as cld2 +import regex +from anyascii import anyascii +from dolma.core.data_types import DocResult, Document, Span, TextSlice +from dolma.core.ft_tagger import BaseFastTextTagger, Prediction +from dolma.core.registry import TaggerRegistry +from dolma.core.taggers import BaseTagger +from dolma.core.utils import split_paragraphs + +LANGS = { + "ENGLISH": "en", + "DANISH": "da", + "SWEDISH": "sv", + "NORWEGIAN": "no", + "ICELANDIC": "is", + "FAROESE": "fo", # Note that FAROESE is not supported by cld2 or fasttext +} + + +@TaggerRegistry.add("cld2_scandi_doc") +class Cld2ScandiLanguageTagger(BaseTagger): + """This tagger runs the Compact Language Detect 2 model on a full document + and will return a score between 0 and 1 for each language in LANGS. 
+ It uses the pretrained model from the pycld2 package.""" + + RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+") + + def _sanitize_input(self, text: str) -> str: + return self.RE_BAD_CHARS.sub("", text) + + def _to_ascii_input(self, text: str) -> str: + return anyascii(text) + + def _identity_fn(self, text: str) -> str: + return text + + def _predict_text(self, text: str) -> dict[str, float]: + """Predict the language of a string and return the detected languages in a dictionary.""" + is_reliable = False + details: Iterable[tuple[str, str, int, float]] = [] + for fn in (self._identity_fn, self._to_ascii_input, self._sanitize_input): + try: + retvals = cld2.detect(fn(text)) + assert len(retvals) == 3 + is_reliable, _, details = retvals + # is_reliable is True if the detection is "high confidence" + # details is a Tuple of up to three detected languages, where each is + # tuple is (languageName, languageCode, percent, score). percent is + # what percentage of the original text was detected as this language + # and score is the confidence score for that language. + break + except cld2.error: + ... + + scores: dict[str, float] = {} + if is_reliable: + for lang, _, score, _ in details: + if lang in LANGS: + scores[LANGS[lang]] = score / 100.0 + + return scores + + def predict(self, doc: Document) -> DocResult: + lang_scores = self._predict_text(doc.text) + spans: list[Span] = [] + for lang_code in LANGS.values(): + # If the language was not detected we will still tag + # the sentence with a score of 0 + score = lang_scores.get(lang_code, 0) + + positive_span = Span( + start=0, + end=len(doc.text), + type=lang_code, + score=score, + ) + negative_span = Span( + start=0, + end=len(doc.text), + type=f"not_{lang_code}", + score=1.0 - score, + ) + spans.append(positive_span) + spans.append(negative_span) + return DocResult(doc=doc, spans=spans) + + +@TaggerRegistry.add("cld2_scandi_paragraph") +class Cld2ScandiLanguageParagraphTagger(Cld2ScandiLanguageTagger): + """This tagger runs the Compact Language Detect 2 model on each paragraph, + and will save a score between 0 and 1 for each language in LANGS""" + + def predict(self, doc: Document) -> DocResult: + paragraphs = split_paragraphs(doc.text) + spans: list[Span] = [] + for paragraph in paragraphs: + lang_scores = self._predict_text(paragraph.text) + for lang_code in LANGS.values(): + score = lang_scores.get(lang_code, 0.0) + + positive_span = Span( + start=paragraph.start, + end=paragraph.end, + type=lang_code, + score=score, + ) + negative_span = Span( + start=paragraph.start, + end=paragraph.end, + type=f"not_{lang_code}", + score=1.0 - score, + ) + spans.extend((positive_span, negative_span)) + return DocResult(doc=doc, spans=spans) + + +@TaggerRegistry.add("ft_lang_id_scandi_doc") +class FastTextScandiLanguageDocumentTagger(BaseFastTextTagger): + """This tagger runs the FastText language detection model on each document. + The score is between 0 and 1 and provided for each language in LANGS. 
+ + The method is described in the following papers: + + @article{joulin2016bag, + title={Bag of Tricks for Efficient Text Classification}, + author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas}, + journal={arXiv preprint arXiv:1607.01759}, + year={2016} + } + @article{joulin2016fasttext, + title={FastText.zip: Compressing text classification models}, + author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas}, + journal={arXiv preprint arXiv:1612.03651}, + year={2016} + } + + The pretrained model is automatically downloaded (link publically available at): + https://fasttext.cc/docs/en/language-identification.html + + """ + + MODEL_PATH = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin" + + def __init__(self): + super().__init__( + model_path=self.MODEL_PATH, + model_mode=self.DOCUMENT_LEVEL_TAGGER, + ) + + def predict_slice(self, text_slice: TextSlice) -> Iterable[Prediction]: + pred = self.classifier.predict( + text_slice.text.lower().replace("\n", " ").strip(), + k=-1, + ) + # Initialize scores to 0 + scores = {k: 0.0 for k in LANGS.values()} + + for label, score in zip(*pred): + # label is of the form __label__[code] + label_code = label[-2:] + if label_code in scores: + scores[label_code] = score + + predictions_positive = [Prediction(label=k, score=v) for k, v in scores.items()] + predictions_negative = [ + Prediction(label=f"not_{k}", score=1.0 - v) for k, v in scores.items() + ] + + return predictions_positive + predictions_negative + + +@TaggerRegistry.add("ft_lang_id_scandi_paragraph") +class FastTextScandiLanguageParagraphTagger(FastTextScandiLanguageDocumentTagger): + """This tagger runs the FastText language detection model on each paragraph. + The score is between 0 and 1 and provided for each language in LANGS. + """ + + def __init__(self): + BaseFastTextTagger.__init__( + self, + model_path=self.MODEL_PATH, + model_mode=self.PARAGRAPH_LEVEL_TAGGER, + ) + + +def add_global_language_score_from_slice_score(result: DocResult) -> DocResult: + # the total document score is # of characters in each "lang" span multiplied by the likelihood + # of said span being lang + for lang in LANGS.values(): + try: + doc_lang_score = sum( + (s.end - s.start) * s.score for s in result.spans if s.type == lang + ) / len( + result.doc.text, + ) + doc_not_lang_score = 1 - doc_lang_score + except ZeroDivisionError: + doc_lang_score = doc_not_lang_score = 0.0 + + doc_level = ( + Span( + start=0, + end=len(result.doc.text), + type=f"doc_{lang}", + score=doc_lang_score, + ), + Span( + start=0, + end=len(result.doc.text), + type=f"doc_not_{lang}", + score=doc_not_lang_score, + ), + ) + result.spans.extend(doc_level) + return result + + +# Composite tagger that provides both paragraph and doc scores +@TaggerRegistry.add("cld2_scandi_paragraph_with_doc_score") +class Cld2ScandiLanguageParagraphWithDocScoreTagger( + Cld2ScandiLanguageParagraphTagger, +): + """This tagger runs the Compact Language Detect 2 model on each paragraph + and will also provide a total score for each document. 
+ The score is between 0 and 1 and provided for each language in LANGS.""" + + def predict(self, doc: Document) -> DocResult: + doc_result = super().predict(doc) + doc_result = add_global_language_score_from_slice_score(doc_result) + return doc_result + + +# Composite tagger that provides both paragraph and doc scores +@TaggerRegistry.add("ft_lang_id_scandi_paragraph_with_doc_score") +class FastTextScandiLanguageParagraphWithDocScoreTagger( + FastTextScandiLanguageParagraphTagger, +): + """This tagger runs the FastText language detection model on each paragraph, + and will also provide a total score for each document. + The score is between 0 and 1 and provided for each language in LANGS. + """ + + def predict(self, doc: Document) -> DocResult: + doc_result = super().predict(doc) + doc_result = add_global_language_score_from_slice_score(doc_result) + return doc_result diff --git a/src/dfm/common/data_cleaning/dolma_taggers/perplexity.py b/src/dfm/common/data_cleaning/dolma_taggers/perplexity.py new file mode 100644 index 00000000..1825c5c7 --- /dev/null +++ b/src/dfm/common/data_cleaning/dolma_taggers/perplexity.py @@ -0,0 +1,194 @@ +""" +Perplexity taggers + +This module contain taggers based on language models +""" +import hashlib +import logging +from pathlib import Path +from typing import Any, Self + +import blingfire +import kenlm +import requests +from dolma.core.data_types import DocResult, Document, Span +from dolma.core.registry import TaggerRegistry +from dolma.core.taggers import BaseTagger +from dolma.core.utils import split_paragraphs + +from dfm.common.data_cleaning.ccnet_text_normalizer import normalize + +ccnet_sha256 = { + "af.arpa.bin": "7278e70cb22e29e94942b103c0ba49f406a9369c2949199fdf8d4bee4b0ce48e", + "ar.arpa.bin": "85739ba1e022a4abd9eb260e6c67e8a4e7646f0717e2800d8dde1ec039b7f5e2", + "az.arpa.bin": "247fd2355db94b4357d19c78c8ac38ce16299d1dac237745edeea8005d7771ba", + "be.arpa.bin": "b23a70aa0cec41555932e6b4aaa5a361c95d091fbd6d4c21e6a48c866b9cd1e8", + "bg.arpa.bin": "1edb68d25238d692cb9cc6b2e4f9fce0e99b49b421020c8e89d0781507dbcd38", + "bn.arpa.bin": "f21c8187eb77d2d7d17892b61dc3446dab79a61d3d0af4f0c90660f9df500cb2", + "ca.arpa.bin": "1e4e84639fd9a35cbfa47709ca2cd9eefc84dcee7ab7d91df11e5f89f88312d4", + "cs.arpa.bin": "4f89f980c12cae596b19fccd9aebea4be5be86c6f81a8b42fc975922ea656bb1", + "da.arpa.bin": "b7f754b56421944ada2c979d0b11e8eada8308e179cb60fbc1acc4318b03695b", + "de.arpa.bin": "a5bc18a9741dc57593d7cce469350d5d2db8ce1e87be6c2ec450850316e586ba", + "el.arpa.bin": "8a53a69835d0a8e88c720fc052180c54973d2b6ac3ed2ff83c666d432a0d3686", + "en.arpa.bin": "e90c9b25af01dcaa2667ed45d012d891269760fc6eccfe8dbbd161eb20e01d7d", + "es.arpa.bin": "00121ab8c31f275132fc67c292392a33ff81b8eae1015103e8a86f9df2e642d4", + "et.arpa.bin": "7c4b98dc3f7fff73611afdd0dc1379437cb0b3dd3addc0abadb65864cabb937f", + "fa.arpa.bin": "05d00d4fdb31e00295a63e4df4187954d43850a8bd7b61c717f809b19fc94cfe", + "fi.arpa.bin": "56aa4a6890c4152be3d594e7f7dc353e78881500803f36586c1c01d88f906618", + "fr.arpa.bin": "4a52387916be57551013df3f9052ee031c042445940a4d0e69b066597586c6aa", + "gu.arpa.bin": "4ad5be86ef47f3105eb9d7d178520a0cede5d02e4ca61a3aa2d32c8322ca5bd1", + "he.arpa.bin": "69d1ab538beb6c8aa646b7c611b701ad2d1a19dcce00d6690072fa9453ad2f00", + "hi.arpa.bin": "b7173df087ff5b24d759fdbf8d07d8e21a31c1b54c978c7c5c71f05b24e12f47", + "hr.arpa.bin": "3ba8caf473415c4d12be594c36892f1454a71a08441ad796bf105ebe4e957a8f", + "hu.arpa.bin": "ce82ceb8a1e808fc441d985c4249c08c67d527937d26e3e524404185803723cf", + "hy.arpa.bin": 
"3c5c3511a82538ab198536e54df4e770c40d78bf5929a7143ab42695641a0031", + "id.arpa.bin": "8e871368fb386180df09d1dfb45f0319dba7a1955b9d209e498c49d96d07b3dd", + "is.arpa.bin": "287f6f7bd8130d50df8966169427b236e9aa79ff2b4250c5bdfdc2c9a0c19f52", + "it.arpa.bin": "784efb647bd699041809d59dd309193f78a47ea347d13b0c93c3bd74f437a53b", + "ja.arpa.bin": "efa96d229e2a84be705f81bc4ea1c6da79505e5c7f001f92586e16481e5b586a", + "ka.arpa.bin": "07477bd9166bc2c748532f1c3af65aad42740231c0dc1f8a4410764e0d626199", + "kk.arpa.bin": "3cec2b6c9b3ae34919dd23ff59148e81b76593d7ec17feefcd5e2829cd1643c0", + "km.arpa.bin": "84a09db4e1e7a70e1cd7c347d9729339e3eaa993f42b4bba4ba91fe0a84ff763", + "kn.arpa.bin": "f1e0e469c8c78ac4e3b62d348e966e658cf7b8f683aafa4a2b4d55ca1e7d756c", + "ko.arpa.bin": "7e345046786a1ac6dbb0d3d0fdd65d2ff0e8a848395dbc84c6152acee1987f5f", + "lt.arpa.bin": "ecc1703e098477503035d980f6be841b5359f8f5f55cc4f78087232c7da15398", + "lv.arpa.bin": "5f6212551d5de115309674eed8ea595f1375973832917dd285942a0ef8d6c7e7", + "mk.arpa.bin": "0915b0c452f5bc6dd254c4145fd09f1252ea5e17f13f48991c72cb98fa2ed804", + "ml.arpa.bin": "3f0cfbf0bdc6935229d6903df8cb60b4ed2b9ed2cb9d4c253266b13bd3211297", + "mn.arpa.bin": "c8e57fcf604d178d45fbe3b1650c04e715c41cb8151bf8b115dc88c52ebfba56", + "mr.arpa.bin": "e00986484585cd67deba5902c7da78566452e3c40fc9aa285218152563d33303", + "my.arpa.bin": "ac3496e2981ea3ad85673ca52e04f5aa8e7be68d1d94c2e73ce26436864ae217", + "ne.arpa.bin": "7ef6c2d3e4e1858fb207e6c200e422833ccf072157a6a0148b408db3e760d22e", + "nl.arpa.bin": "aa017d97061e84f51d7f74b83a6a43aef246974fc9a502436043f6f0e9e12bbb", + "no.arpa.bin": "0ec663c264d6580beebe7e0e80a939dbe7082af55af3875f292ebd11ea5800de", + "pl.arpa.bin": "b97634bca2b28d95716b951ceadca3de4a170ff07639bcdc3c73fc0961362e98", + "pt.arpa.bin": "f5a10774d7b7125c6e887b62c56fea2d348adebc81ab1708d34f68de722090e0", + "ro.arpa.bin": "619b9a2d4d53bdb368bfdf2cc770e1e9549d52b22d1fd3afc0ee8a022543ed56", + "ru.arpa.bin": "588da7d3e160f61f7e821804bc4d518460687e1c4832c339bb3a28c03417ab53", + "uk.arpa.bin": "bfd09bdfe669a9fd5f8f8d9be519bdce3fb678214bc6afd5ccce499930b7d311", + "zh.arpa.bin": "f157d94cb2828bbb44b5dddf38e7eb7f62a47d317917646a73fe2af50a3dad68", +} + + +def _get_ccnet_pretrained_lm(lang: str) -> Path: + # Download pretrained model and save to the data folder + url = f"http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin" + data_folder = Path("data_lm") + + Path.mkdir(data_folder, parents=True, exist_ok=True) + + filename = f"{lang}.arpa.bin" + file_path = data_folder / filename + + # Check if the file already exists + if not Path.exists(file_path): + # If the file does not exist, download it + logging.info(f"Downloading {lang} model...") + response = requests.get(url) + if response.status_code == requests.codes.ok: + sha256 = hashlib.sha256(response.content).hexdigest() + if sha256 != ccnet_sha256[filename]: + raise RuntimeError( + f"Checksum mismatch {sha256} != {ccnet_sha256[filename]}", + ) + with Path.open(file_path, "wb") as file: + file.write(response.content) + logging.info(f"{lang} model downloaded and saved at {file_path}") + else: + raise RuntimeError( + f"Failed to download {lang} model. 
Status code: {response.status_code}", + ) + else: + logging.info(f"{lang} model already exists at {file_path}") + + return file_path + + +def pp(log_score: float, length: float) -> float: + """Convert total log-probability to perplexity""" + return 10.0 ** (-log_score / length) + + +class PerplexityBaseTagger(BaseTagger): + """Base class for CCNet based perplexity tagger""" + + @property + def model(self: Self) -> kenlm.Model: + return self._model + + @model.setter + def model(self: Self, model: kenlm.Model): + self._model = model + + +def create_ccnet_perplexity_tagger(lang: str) -> type[PerplexityBaseTagger]: + """Dynamically create perplexity tagger class for a given language. + The class for each language is based on a CCNet pretrained model [1]. + The pretrained models are available throught the Github project page https://github.com/facebookresearch/cc_net. + The models are small language models trained on the Wikipedia of the corresponding language. + + [1] + @inproceedings{wenzek2020ccnet, + title={CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data}, + author={Wenzek, Guillaume and Lachaux, Marie-Anne and Conneau, Alexis and Chaudhary, Vishrav and Guzm{\'a}n, Francisco and Joulin, Armand and Grave, {\'E}douard}, + booktitle={Proceedings of The 12th Language Resources and Evaluation Conference}, + pages={4003--4012}, + year={2020} + } + """ + + def __init__(self: Any) -> None: + model_bin_path = _get_ccnet_pretrained_lm(lang) + self.model = kenlm.Model(str(model_bin_path)) + + def predict(self: PerplexityBaseTagger, doc: Document) -> DocResult: + paragraphs = split_paragraphs(doc.text) + spans: list[Span] = [] + doc_log_prob: float = 0.0 + doc_length: float = 0.0 + for paragraph in paragraphs: + # To get proper scores from the language model we need to normalize the text + # Do not remove accents as it removes æøå and others. + normalized_text = blingfire.normalize_spaces( + normalize(paragraph.text, accent=False), + ) + # The kenlm model expects end of sentence punctuation to be separated from words with spaces + # so we separate the words using blingfire. + normalized_words = blingfire.text_to_words(normalized_text) + log_prob = self.model.score(normalized_words) + length = len(normalized_words.split()) + 1 + doc_log_prob += log_prob + doc_length += length + paragraph_span = Span( + start=paragraph.start, + end=paragraph.end, + type="perplexity", + score=pp(log_prob, length), + ) + spans.append(paragraph_span) + + paragraph_span = Span( + start=0, + end=len(doc.text), + type="doc_perplexity", + score=pp(doc_log_prob, doc_length), + ) + return DocResult(doc=doc, spans=spans) + + # Build the class dynamically from base class + # and methods. + cls = type( + f"CCNetPerplexity{lang}Tagger", + (PerplexityBaseTagger,), + { + "__init__": __init__, + "predict": predict, + }, + ) + # Add the class decorator explicitly to add the tagger to the registry + cls = TaggerRegistry.add(f"ccnet_perplexity_paragraph_w_doc_{lang}")(cls) + return cls + + +for lang in ["da", "en", "is", "no", "sv"]: + create_ccnet_perplexity_tagger(lang) diff --git a/typings/blingfire/__init__.pyi b/typings/blingfire/__init__.pyi new file mode 100644 index 00000000..20da73f7 --- /dev/null +++ b/typings/blingfire/__init__.pyi @@ -0,0 +1,54 @@ +""" +This type stub file was generated by pyright. +""" +# def text_to_sentences(s: str): # -> Any | Literal['']: +# ... +# +# def text_to_sentences_with_model(h, s): # -> Any | Literal['']: +# ... 
+ +def normalize_spaces(s: str, uSpace: int = 0x20) -> str: # -> Any | Literal['']: + ... + +def text_to_words(s: str) -> str: # -> Any | Literal['']: + ... + +# Uncomment lines that are used in project +# def text_to_words_with_model(h, s): # -> Any | Literal['']: +# ... +# +# def word_hyphenation_with_model(h, s, uHy=...): # -> Any | Literal['']: +# ... +# +# def get_blingfiretok_version(): # -> Any: +# ... +# +# def text_to_hashes(s, word_n_grams, bucketSize): # -> NDArray[Any] | None: +# ... +# +# def text_to_token_with_offsets(s, text_to_token_f, split_byte): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def text_to_words_with_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def text_to_sentences_and_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def load_model(file_name): # -> Any: +# ... +# +# def free_model(h): # -> None: +# ... +# +# def text_to_ids(h, s, max_len, unk=..., no_padding=...): # -> NDArray[Any]: +# ... +# +# def ids_to_text(h, ids, skip_special_tokens=..., output_buffer_size=...): # -> Any | Literal['']: +# ... +# +# def utf8text_to_ids_with_offsets(h, s_bytes, max_len, unk=..., no_padding=...): # -> tuple[NDArray[Any], NDArray[Any], NDArray[Any]]: +# ... +# +# def change_settings_dummy_prefix(h, add_prefix): # -> None: +# ... diff --git a/typings/fasttext/FastText.pyi b/typings/fasttext/FastText.pyi new file mode 100644 index 00000000..4dde0e7b --- /dev/null +++ b/typings/fasttext/FastText.pyi @@ -0,0 +1,264 @@ +""" +This type stub file was initially generated by pyright +""" +from typing import Iterable + +loss_name = ... +model_name = ... +EOS = ... +BOW = ... +EOW = ... +displayed_errors = ... + +def eprint(*args, **kwargs): # -> None: + ... + +class _Meter: + def __init__(self, fasttext_model, meter) -> None: ... + def score_vs_true(self, label): # -> tuple[NDArray[Unknown], NDArray[Any]]: + """Return scores and the gold of each sample for a specific label""" + ... + def precision_recall_curve( + self, label=... + ): # -> tuple[NDArray[Unknown], NDArray[Any]]: + """Return precision/recall curve""" + ... + def precision_at_recall(self, recall, label=...): + """Return precision for a given recall""" + ... + def recall_at_precision(self, precision, label=...): + """Return recall for a given precision""" + ... + +class _FastText: + """ + This class defines the API to inspect models and should not be used to + create objects. It will be returned by functions such as load_model or + train. + + In general this API assumes to be given only unicode for Python2 and the + Python3 equvalent called str for any string-like arguments. All unicode + strings are then encoded as UTF-8 and fed to the fastText C++ API. + """ + + def __init__(self, model_path=..., args=...) -> None: ... + def set_args(self, args=...): # -> None: + ... + def is_quantized(self): ... + def get_dimension(self): + """Get the dimension (size) of a lookup vector (hidden layer).""" + ... + def get_word_vector(self, word): # -> NDArray[Unknown]: + """Get the vector representation of word.""" + ... + def get_sentence_vector(self, text): # -> NDArray[Unknown]: + """ + Given a string, get a single vector represenation. This function + assumes to be given a single line of text. We split words on + whitespace (space, newline, tab, vertical tab) and the control + characters carriage return, formfeed and the null character. + """ + ... 
+ def get_nearest_neighbors(self, word, k=..., on_unicode_error=...): ... + def get_analogies(self, wordA, wordB, wordC, k=..., on_unicode_error=...): ... + def get_word_id(self, word): + """ + Given a word, get the word id within the dictionary. + Returns -1 if word is not in the dictionary. + """ + ... + def get_label_id(self, label): + """ + Given a label, get the label id within the dictionary. + Returns -1 if label is not in the dictionary. + """ + ... + def get_subword_id(self, subword): + """ + Given a subword, return the index (within input matrix) it hashes to. + """ + ... + def get_subwords( + self, word, on_unicode_error=... + ): # -> tuple[Unknown, NDArray[Unknown]]: + """ + Given a word, get the subwords and their indicies. + """ + ... + def get_input_vector(self, ind): # -> NDArray[Unknown]: + """ + Given an index, get the corresponding vector of the Input Matrix. + """ + ... + def predict( + self, + text: str, + k: int = ..., + threshold: float = ..., + on_unicode_error: str = ..., + ) -> Iterable[ + tuple[str, float] + ]: # -> tuple[Unknown, Unknown] | tuple[Any | tuple[()], NDArray[Unknown]]: + """ + Given a string, get a list of labels and a list of + corresponding probabilities. k controls the number + of returned labels. A choice of 5, will return the 5 + most probable labels. By default this returns only + the most likely label and probability. threshold filters + the returned labels by a threshold on probability. A + choice of 0.5 will return labels with at least 0.5 + probability. k and threshold will be applied together to + determine the returned labels. + + This function assumes to be given + a single line of text. We split words on whitespace (space, + newline, tab, vertical tab) and the control characters carriage + return, formfeed and the null character. + + If the model is not supervised, this function will throw a ValueError. + + If given a list of strings, it will return a list of results as usually + received for a single line of text. + """ + ... + def get_input_matrix(self): # -> NDArray[Unknown]: + """ + Get a reference to the full input matrix of a Model. This only + works if the model is not quantized. + """ + ... + def get_output_matrix(self): # -> NDArray[Unknown]: + """ + Get a reference to the full output matrix of a Model. This only + works if the model is not quantized. + """ + ... + def get_words( + self, include_freq=..., on_unicode_error=... + ): # -> tuple[Unknown, NDArray[Unknown]]: + """ + Get the entire list of words of the dictionary optionally + including the frequency of the individual words. This + does not include any subwords. For that please consult + the function get_subwords. + """ + ... + def get_labels( + self, include_freq=..., on_unicode_error=... + ): # -> tuple[Unknown, NDArray[Unknown]]: + """ + Get the entire list of labels of the dictionary optionally + including the frequency of the individual labels. Unsupervised + models use words as labels, which is why get_labels + will call and return get_words for this type of + model. + """ + ... + def get_line(self, text, on_unicode_error=...): + """ + Split a line of text into words and labels. Labels must start with + the prefix used to create the model (__label__ by default). + """ + ... + def save_model(self, path): # -> None: + """Save the model to the given path""" + ... + def test(self, path, k=..., threshold=...): + """Evaluate supervised model using file given by path""" + ... 
+ def test_label(self, path, k=..., threshold=...): + """ + Return the precision and recall score for each label. + + The returned value is a dictionary, where the key is the label. + For example: + f.test_label(...) + {'__label__italian-cuisine' : {'precision' : 0.7, 'recall' : 0.74}} + """ + ... + def get_meter(self, path, k=...): # -> _Meter: + ... + def quantize( + self, + input=..., + qout=..., + cutoff=..., + retrain=..., + epoch=..., + lr=..., + thread=..., + verbose=..., + dsub=..., + qnorm=..., + ): # -> None: + """ + Quantize the model reducing the size of the model and + it's memory footprint. + """ + ... + def set_matrices(self, input_matrix, output_matrix): # -> None: + """ + Set input and output matrices. This function assumes you know what you + are doing. + """ + ... + @property + def words(self): # -> tuple[Unknown, NDArray[Unknown]]: + ... + @property + def labels(self): # -> tuple[Unknown, NDArray[Unknown]]: + ... + def __getitem__(self, word): # -> NDArray[Unknown]: + ... + def __contains__(self, word): # -> bool: + ... + +def tokenize(text): + """Given a string of text, tokenize it and return a list of tokens""" + ... + +def load_model(path): # -> _FastText: + """Load a model given a filepath and return a model object.""" + ... + +unsupervised_default = ... + +def read_args( + arg_list, arg_dict, arg_names, default_values +): # -> tuple[dict[Unknown, Unknown], set[Unknown]]: + ... + +def train_supervised(*kargs, **kwargs): # -> _FastText: + """ + Train a supervised model and return a model object. + + input must be a filepath. The input text does not need to be tokenized + as per the tokenize function, but it must be preprocessed and encoded + as UTF-8. You might want to consult standard preprocessing scripts such + as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html + + The input file must must contain at least one label per line. For an + example consult the example datasets which are part of the fastText + repository such as the dataset pulled by classification-example.sh. + """ + ... + +def train_unsupervised(*kargs, **kwargs): # -> _FastText: + """ + Train an unsupervised model and return a model object. + + input must be a filepath. The input text does not need to be tokenized + as per the tokenize function, but it must be preprocessed and encoded + as UTF-8. You might want to consult standard preprocessing scripts such + as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html + + The input field must not contain any labels or use the specified label prefix + unless it is ok for those words to be ignored. For an example consult the + dataset pulled by the example script word-vector-example.sh, which is + part of the fastText repository. + """ + ... + +def cbow(*kargs, **kwargs): ... +def skipgram(*kargs, **kwargs): ... +def supervised(*kargs, **kwargs): ... diff --git a/typings/fasttext/__init__.pyi b/typings/fasttext/__init__.pyi new file mode 100644 index 00000000..cbf98ace --- /dev/null +++ b/typings/fasttext/__init__.pyi @@ -0,0 +1,17 @@ +""" +This type stub file was generated by pyright. 
+""" + +from __future__ import absolute_import, division, print_function, unicode_literals +from .FastText import ( + BOW, + EOS, + EOW, + cbow, + load_model, + skipgram, + supervised, + tokenize, + train_supervised, + train_unsupervised, +) diff --git a/typings/fasttext/tests/__init__.pyi b/typings/fasttext/tests/__init__.pyi new file mode 100644 index 00000000..8cec2d4a --- /dev/null +++ b/typings/fasttext/tests/__init__.pyi @@ -0,0 +1,7 @@ +""" +This type stub file was generated by pyright. +""" + +from __future__ import absolute_import, division, print_function, unicode_literals +from .test_configurations import get_supervised_models +from .test_script import gen_tests, gen_unit_tests diff --git a/typings/fasttext/tests/test_configurations.pyi b/typings/fasttext/tests/test_configurations.pyi new file mode 100644 index 00000000..6930c565 --- /dev/null +++ b/typings/fasttext/tests/test_configurations.pyi @@ -0,0 +1,26 @@ +""" +This type stub file was generated by pyright. +""" + +def max_thread(): # -> int: + ... + +def check_supervised_configuration(configuration, verbose=...): ... +def check_supervised_configurations(configurations, verbose=...): ... +def flickr_job(thread=...): # -> dict[Unknown, Unknown]: + ... + +def langid_job1(thread=...): # -> dict[Unknown, Unknown]: + ... + +def langid_job2(thread=...): # -> dict[Unknown, Unknown]: + ... + +def cooking_job1(thread=...): # -> dict[Unknown, Unknown]: + ... + +def cooking_job2(thread=...): # -> dict[Unknown, Unknown]: + ... + +def get_supervised_models(thread=..., verbose=...): # -> list[Unknown]: + ... diff --git a/typings/fasttext/tests/test_script.pyi b/typings/fasttext/tests/test_script.pyi new file mode 100644 index 00000000..632d8d85 --- /dev/null +++ b/typings/fasttext/tests/test_script.pyi @@ -0,0 +1,82 @@ +""" +This type stub file was generated by pyright. +""" + +import unittest + +def eprint(cls, *args, **kwargs): # -> None: + ... + +def get_random_unicode(length): # -> str: + ... + +def get_random_words(N, a=..., b=..., unique=...): # -> list[Unknown]: + ... + +def get_random_data( + num_lines=..., + max_vocab_size=..., + min_words_line=..., + max_words_line=..., + min_len_word=..., + max_len_word=..., + unique_words=..., +): # -> list[Unknown]: + ... + +def default_kwargs(kwargs): ... +def build_unsupervised_model(data, kwargs): # -> _FastText: + ... + +def build_supervised_model(data, kwargs): # -> _FastText: + ... + +def read_labels(data_file): # -> tuple[list[Unknown], list[Unknown]]: + ... + +class TestFastTextUnitPy(unittest.TestCase): + def gen_test_get_vector(self, kwargs): # -> None: + ... + def gen_test_multi_get_line(self, kwargs): # -> None: + ... + def gen_test_supervised_util_test(self, kwargs): # -> None: + ... + def gen_test_supervised_predict(self, kwargs): # -> None: + ... + def gen_test_supervised_multiline_predict(self, kwargs): # -> None: + ... + def gen_test_vocab(self, kwargs): # -> None: + ... + def gen_test_subwords(self, kwargs): # -> None: + ... + def gen_test_tokenize(self, kwargs): # -> None: + ... + def gen_test_unsupervised_dimension(self, kwargs): # -> None: + ... + def gen_test_supervised_dimension(self, kwargs): # -> None: + ... + def gen_test_subword_vector(self, kwargs): # -> None: + ... + def gen_test_unsupervised_get_words(self, kwargs): # -> None: + ... + def gen_test_supervised_get_words(self, kwargs): # -> None: + ... + def gen_test_unsupervised_get_labels(self, kwargs): # -> None: + ... + def gen_test_supervised_get_labels(self, kwargs): # -> None: + ... 
+ def gen_test_unsupervised_exercise_is_quant(self, kwargs): # -> None: + ... + def gen_test_supervised_exercise_is_quant(self, kwargs): # -> None: + ... + def gen_test_newline_predict_sentence(self, kwargs): # -> None: + ... + +def gen_sup_test(configuration, data_dir): # -> (self: Unknown) -> None: + ... + +def gen_unit_tests(verbose=...): # -> type[TestFastTextUnitPy]: + ... + +def gen_tests(data_dir, verbose=...): # -> type[TestFastTextPy]: + class TestFastTextPy(unittest.TestCase): ... diff --git a/typings/fasttext/util/__init__.pyi b/typings/fasttext/util/__init__.pyi new file mode 100644 index 00000000..87465a71 --- /dev/null +++ b/typings/fasttext/util/__init__.pyi @@ -0,0 +1,6 @@ +""" +This type stub file was generated by pyright. +""" + +from __future__ import absolute_import, division, print_function, unicode_literals +from .util import download_model, find_nearest_neighbor, reduce_model, test diff --git a/typings/fasttext/util/util.pyi b/typings/fasttext/util/util.pyi new file mode 100644 index 00000000..16d2c81a --- /dev/null +++ b/typings/fasttext/util/util.pyi @@ -0,0 +1,39 @@ +""" +This type stub file was generated by pyright. +""" + +valid_lang_ids = ... + +def test(predictions, labels, k=...): # -> tuple[float, float]: + """ + Return precision and recall modeled after fasttext's test + """ + ... + +def find_nearest_neighbor(query, vectors, ban_set, cossims=...): # -> Any: + """ + query is a 1d numpy array corresponding to the vector to which you want to + find the closest vector + vectors is a 2d numpy array corresponding to the vectors you want to consider + ban_set is a set of indicies within vectors you want to ignore for nearest match + cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency + + returns the index of the closest match to query within vectors + + """ + ... + +def reduce_model(ft_model, target_dim): + """ + ft_model is an instance of `_FastText` class + This function computes the PCA of the input and the output matrices + and sets the reduced ones. + """ + ... + +def download_model(lang_id, if_exists=..., dimension=...): # -> None: + """ + Download pre-trained common-crawl vectors from fastText's website + https://fasttext.cc/docs/en/crawl-vectors.html + """ + ... diff --git a/typings/kenlm/__init__.pyi b/typings/kenlm/__init__.pyi new file mode 100644 index 00000000..40f566db --- /dev/null +++ b/typings/kenlm/__init__.pyi @@ -0,0 +1,7 @@ +""" +Type stub for kenlm +""" + +class Model: + def __init__(self, model_bin_path: str) -> None: ... + def score(self, sentence: str) -> float: ... diff --git a/typings/pycld2/__init__.pyi b/typings/pycld2/__init__.pyi new file mode 100644 index 00000000..5b478a63 --- /dev/null +++ b/typings/pycld2/__init__.pyi @@ -0,0 +1,21 @@ +""" +Type stub file for pycld2 +""" + +from typing import Union, TypeAlias + +from pycld2 import DETECTED_LANGUAGES, ENCODINGS, LANGUAGES, VERSION, __version__, error + +IsReliable: TypeAlias = bool +TextBytesFound: TypeAlias = int +DetectDetails: TypeAlias = tuple[tuple[str, str, int, float], ...] +Vectors: TypeAlias = tuple[tuple[int, int, str, str], ...] + +def detect( + text: str, returnVectors: bool = False +) -> Union[ + tuple[IsReliable, TextBytesFound, DetectDetails], + tuple[IsReliable, TextBytesFound, DetectDetails, Vectors], +]: ... + +__all__ = ("DETECTED_LANGUAGES", "ENCODINGS", "LANGUAGES", "VERSION", "detect", "error")
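
For reviewers who want to smoke-test the new taggers locally after `pip install -e .`, here is a minimal sketch. It assumes two things not shown in this diff: that the Document constructor on the pinned dolma revision accepts source/version/id/text keyword arguments, and that TaggerRegistry exposes a get() lookup for registered tagger names; adjust those two calls if the actual dolma API differs. Everything else (module paths, class names, tagger registry names, the normalize() signature) comes from the files added above.

    from dolma.core.data_types import Document
    from dolma.core.registry import TaggerRegistry

    from dfm.common.data_cleaning.ccnet_text_normalizer import normalize
    from dfm.common.data_cleaning.dolma_taggers.language_scandi import (
        Cld2ScandiLanguageTagger,
    )

    # Importing the module runs the create_ccnet_perplexity_tagger() loop and
    # registers the ccnet_perplexity_paragraph_w_doc_* taggers.
    import dfm.common.data_cleaning.dolma_taggers.perplexity  # noqa: F401

    # NOTE: the keyword names below are an assumption about the pinned dolma revision.
    doc = Document(
        source="example",
        version=None,
        id="0",
        text="Dette er en kort dansk tekst om sprogmodeller.",
    )

    # Document-level CLD2 scores: one span per language code in LANGS plus the
    # matching "not_<lang>" span.
    for span in Cld2ScandiLanguageTagger().predict(doc).spans:
        print(span.type, round(span.score, 3))

    # The perplexity taggers are generated dynamically, so the registry lookup
    # (assumed to be TaggerRegistry.get) is the easiest way to reach one.
    # Instantiation downloads and checksums the CCNet KenLM model on first use.
    danish_ppl = TaggerRegistry.get("ccnet_perplexity_paragraph_w_doc_da")()
    print(danish_ppl.predict(doc).spans)

    # The CCNet normalizer is called with accent=False so æ/ø/å survive before
    # the text reaches the KenLM model; digits are mapped to 0 and case is folded.
    print(normalize("Østergade 12, København Ø!", accent=False))

Note that the FastText-based taggers and the perplexity taggers both fetch their pretrained models over the network on first instantiation, so the first run of the sketch above needs internet access and some disk space under data_lm/.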