From 530305f5f8d349f1e2f8b3c452983f6d64adc776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Bj=C3=B8rn=20J=C3=B8rgensen?= Date: Mon, 13 Nov 2023 15:45:08 +0100 Subject: [PATCH] Add type stubs for external libraries (only partially typed for the functions we use) --- .../dolma_taggers/language_scandi.py | 6 +- typings/blingfire/__init__.pyi | 54 ++++ typings/fasttext/FastText.pyi | 264 ++++++++++++++++++ typings/fasttext/__init__.pyi | 17 ++ typings/fasttext/tests/__init__.pyi | 7 + .../fasttext/tests/test_configurations.pyi | 26 ++ typings/fasttext/tests/test_script.pyi | 82 ++++++ typings/fasttext/util/__init__.pyi | 6 + typings/fasttext/util/util.pyi | 39 +++ typings/kenlm/__init__.pyi | 7 + typings/pycld2/__init__.pyi | 21 ++ 11 files changed, 527 insertions(+), 2 deletions(-) create mode 100644 typings/blingfire/__init__.pyi create mode 100644 typings/fasttext/FastText.pyi create mode 100644 typings/fasttext/__init__.pyi create mode 100644 typings/fasttext/tests/__init__.pyi create mode 100644 typings/fasttext/tests/test_configurations.pyi create mode 100644 typings/fasttext/tests/test_script.pyi create mode 100644 typings/fasttext/util/__init__.pyi create mode 100644 typings/fasttext/util/util.pyi create mode 100644 typings/kenlm/__init__.pyi create mode 100644 typings/pycld2/__init__.pyi diff --git a/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py b/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py index cfc124aa..11915793 100644 --- a/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py +++ b/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py @@ -38,11 +38,13 @@ def _identity_fn(self, text: str) -> str: return text def _predict_text(self, text: str) -> dict[str, float]: - details = [] is_reliable = False + details: Iterable[tuple[str, str, int, float]] = [] for fn in (self._identity_fn, self._to_ascii_input, self._sanitize_input): try: - is_reliable, _, details = cld2.detect(fn(text)) + retvals = cld2.detect(fn(text)) + assert len(retvals) == 3 + is_reliable, _, details = retvals break except cld2.error: ... diff --git a/typings/blingfire/__init__.pyi b/typings/blingfire/__init__.pyi new file mode 100644 index 00000000..20da73f7 --- /dev/null +++ b/typings/blingfire/__init__.pyi @@ -0,0 +1,54 @@ +""" +This type stub file was generated by pyright. +""" +# def text_to_sentences(s: str): # -> Any | Literal['']: +# ... +# +# def text_to_sentences_with_model(h, s): # -> Any | Literal['']: +# ... + +def normalize_spaces(s: str, uSpace: int = 0x20) -> str: # -> Any | Literal['']: + ... + +def text_to_words(s: str) -> str: # -> Any | Literal['']: + ... + +# Uncomment lines that are used in project +# def text_to_words_with_model(h, s): # -> Any | Literal['']: +# ... +# +# def word_hyphenation_with_model(h, s, uHy=...): # -> Any | Literal['']: +# ... +# +# def get_blingfiretok_version(): # -> Any: +# ... +# +# def text_to_hashes(s, word_n_grams, bucketSize): # -> NDArray[Any] | None: +# ... +# +# def text_to_token_with_offsets(s, text_to_token_f, split_byte): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def text_to_words_with_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def text_to_sentences_and_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def load_model(file_name): # -> Any: +# ... +# +# def free_model(h): # -> None: +# ... 
+#
# def text_to_ids(h, s, max_len, unk=..., no_padding=...): # -> NDArray[Any]:
#     ...
#
# def ids_to_text(h, ids, skip_special_tokens=..., output_buffer_size=...): # -> Any | Literal['']:
#     ...
#
# def utf8text_to_ids_with_offsets(h, s_bytes, max_len, unk=..., no_padding=...): # -> tuple[NDArray[Any], NDArray[Any], NDArray[Any]]:
#     ...
#
# def change_settings_dummy_prefix(h, add_prefix): # -> None:
#     ...
diff --git a/typings/fasttext/FastText.pyi b/typings/fasttext/FastText.pyi
new file mode 100644
index 00000000..4dde0e7b
--- /dev/null
+++ b/typings/fasttext/FastText.pyi
@@ -0,0 +1,264 @@
+"""
This type stub file was initially generated by pyright.
"""
from typing import Iterable

loss_name = ...
model_name = ...
EOS = ...
BOW = ...
EOW = ...
displayed_errors = ...

def eprint(*args, **kwargs): # -> None:
    ...

class _Meter:
    def __init__(self, fasttext_model, meter) -> None: ...
    def score_vs_true(self, label): # -> tuple[NDArray[Unknown], NDArray[Any]]:
        """Return scores and the gold of each sample for a specific label"""
        ...
    def precision_recall_curve(
        self, label=...
    ): # -> tuple[NDArray[Unknown], NDArray[Any]]:
        """Return precision/recall curve"""
        ...
    def precision_at_recall(self, recall, label=...):
        """Return precision for a given recall"""
        ...
    def recall_at_precision(self, precision, label=...):
        """Return recall for a given precision"""
        ...

class _FastText:
    """
    This class defines the API to inspect models and should not be used to
    create objects. It will be returned by functions such as load_model or
    train.

    In general this API assumes it is given only unicode for Python 2 and the
    Python 3 equivalent, str, for any string-like arguments. All unicode
    strings are then encoded as UTF-8 and fed to the fastText C++ API.
    """

    def __init__(self, model_path=..., args=...) -> None: ...
    def set_args(self, args=...): # -> None:
        ...
    def is_quantized(self): ...
    def get_dimension(self):
        """Get the dimension (size) of a lookup vector (hidden layer)."""
        ...
    def get_word_vector(self, word): # -> NDArray[Unknown]:
        """Get the vector representation of word."""
        ...
    def get_sentence_vector(self, text): # -> NDArray[Unknown]:
        """
        Given a string, get a single vector representation. This function
        assumes it is given a single line of text. We split words on
        whitespace (space, newline, tab, vertical tab) and the control
        characters carriage return, formfeed and the null character.
        """
        ...
    def get_nearest_neighbors(self, word, k=..., on_unicode_error=...): ...
    def get_analogies(self, wordA, wordB, wordC, k=..., on_unicode_error=...): ...
    def get_word_id(self, word):
        """
        Given a word, get the word id within the dictionary.
        Returns -1 if word is not in the dictionary.
        """
        ...
    def get_label_id(self, label):
        """
        Given a label, get the label id within the dictionary.
        Returns -1 if label is not in the dictionary.
        """
        ...
    def get_subword_id(self, subword):
        """
        Given a subword, return the index (within input matrix) it hashes to.
        """
        ...
    def get_subwords(
        self, word, on_unicode_error=...
    ): # -> tuple[Unknown, NDArray[Unknown]]:
        """
        Given a word, get the subwords and their indices.
        """
        ...
    def get_input_vector(self, ind): # -> NDArray[Unknown]:
        """
        Given an index, get the corresponding vector of the Input Matrix.
        """
        ...
+    def predict(
        self,
        text: str,
        k: int = ...,
        threshold: float = ...,
        on_unicode_error: str = ...,
    ) -> Iterable[
        tuple[str, float]
    ]: # -> tuple[Unknown, Unknown] | tuple[Any | tuple[()], NDArray[Unknown]]:
        """
        Given a string, get a list of labels and a list of
        corresponding probabilities. k controls the number
        of returned labels. A choice of 5 will return the 5
        most probable labels. By default this returns only
        the most likely label and probability. threshold filters
        the returned labels by a threshold on probability. A
        choice of 0.5 will return labels with at least 0.5
        probability. k and threshold will be applied together to
        determine the returned labels.

        This function assumes it is given
        a single line of text. We split words on whitespace (space,
        newline, tab, vertical tab) and the control characters carriage
        return, formfeed and the null character.

        If the model is not supervised, this function will throw a ValueError.

        If given a list of strings, it will return a list of results,
        each as would be received for a single line of text.
        """
        ...
    def get_input_matrix(self): # -> NDArray[Unknown]:
        """
        Get a reference to the full input matrix of a Model. This only
        works if the model is not quantized.
        """
        ...
    def get_output_matrix(self): # -> NDArray[Unknown]:
        """
        Get a reference to the full output matrix of a Model. This only
        works if the model is not quantized.
        """
        ...
    def get_words(
        self, include_freq=..., on_unicode_error=...
    ): # -> tuple[Unknown, NDArray[Unknown]]:
        """
        Get the entire list of words of the dictionary, optionally
        including the frequency of the individual words. This
        does not include any subwords. For that please consult
        the function get_subwords.
        """
        ...
    def get_labels(
        self, include_freq=..., on_unicode_error=...
    ): # -> tuple[Unknown, NDArray[Unknown]]:
        """
        Get the entire list of labels of the dictionary, optionally
        including the frequency of the individual labels. Unsupervised
        models use words as labels, which is why get_labels
        will call and return get_words for this type of
        model.
        """
        ...
    def get_line(self, text, on_unicode_error=...):
        """
        Split a line of text into words and labels. Labels must start with
        the prefix used to create the model (__label__ by default).
        """
        ...
    def save_model(self, path): # -> None:
        """Save the model to the given path"""
        ...
    def test(self, path, k=..., threshold=...):
        """Evaluate supervised model using file given by path"""
        ...
    def test_label(self, path, k=..., threshold=...):
        """
        Return the precision and recall score for each label.

        The returned value is a dictionary, where the key is the label.
        For example:
        f.test_label(...)
        {'__label__italian-cuisine' : {'precision' : 0.7, 'recall' : 0.74}}
        """
        ...
    def get_meter(self, path, k=...): # -> _Meter:
        ...
    def quantize(
        self,
        input=...,
        qout=...,
        cutoff=...,
        retrain=...,
        epoch=...,
        lr=...,
        thread=...,
        verbose=...,
        dsub=...,
        qnorm=...,
    ): # -> None:
        """
        Quantize the model, reducing its size and memory footprint.
        """
        ...
    def set_matrices(self, input_matrix, output_matrix): # -> None:
        """
        Set input and output matrices. This function assumes you know what you
        are doing.
        """
        ...
    @property
    def words(self): # -> tuple[Unknown, NDArray[Unknown]]:
        ...
    @property
    def labels(self): # -> tuple[Unknown, NDArray[Unknown]]:
        ...
+    def __getitem__(self, word): # -> NDArray[Unknown]:
        ...
    def __contains__(self, word): # -> bool:
        ...

def tokenize(text):
    """Given a string of text, tokenize it and return a list of tokens"""
    ...

def load_model(path): # -> _FastText:
    """Load a model given a filepath and return a model object."""
    ...

unsupervised_default = ...

def read_args(
    arg_list, arg_dict, arg_names, default_values
): # -> tuple[dict[Unknown, Unknown], set[Unknown]]:
    ...

def train_supervised(*kargs, **kwargs): # -> _FastText:
    """
    Train a supervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must contain at least one label per line. For an
    example consult the example datasets which are part of the fastText
    repository such as the dataset pulled by classification-example.sh.
    """
    ...

def train_unsupervised(*kargs, **kwargs): # -> _FastText:
    """
    Train an unsupervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must not contain any labels or use the specified label prefix
    unless it is ok for those words to be ignored. For an example consult the
    dataset pulled by the example script word-vector-example.sh, which is
    part of the fastText repository.
    """
    ...

def cbow(*kargs, **kwargs): ...
def skipgram(*kargs, **kwargs): ...
def supervised(*kargs, **kwargs): ...
diff --git a/typings/fasttext/__init__.pyi b/typings/fasttext/__init__.pyi
new file mode 100644
index 00000000..cbf98ace
--- /dev/null
+++ b/typings/fasttext/__init__.pyi
@@ -0,0 +1,17 @@
+"""
This type stub file was generated by pyright.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from .FastText import (
    BOW,
    EOS,
    EOW,
    cbow,
    load_model,
    skipgram,
    supervised,
    tokenize,
    train_supervised,
    train_unsupervised,
)
diff --git a/typings/fasttext/tests/__init__.pyi b/typings/fasttext/tests/__init__.pyi
new file mode 100644
index 00000000..8cec2d4a
--- /dev/null
+++ b/typings/fasttext/tests/__init__.pyi
@@ -0,0 +1,7 @@
+"""
This type stub file was generated by pyright.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from .test_configurations import get_supervised_models
from .test_script import gen_tests, gen_unit_tests
diff --git a/typings/fasttext/tests/test_configurations.pyi b/typings/fasttext/tests/test_configurations.pyi
new file mode 100644
index 00000000..6930c565
--- /dev/null
+++ b/typings/fasttext/tests/test_configurations.pyi
@@ -0,0 +1,26 @@
+"""
This type stub file was generated by pyright.
"""

def max_thread(): # -> int:
    ...

def check_supervised_configuration(configuration, verbose=...): ...
def check_supervised_configurations(configurations, verbose=...): ...
def flickr_job(thread=...): # -> dict[Unknown, Unknown]:
    ...

def langid_job1(thread=...): # -> dict[Unknown, Unknown]:
    ...

def langid_job2(thread=...): # -> dict[Unknown, Unknown]:
    ...
+ +def cooking_job1(thread=...): # -> dict[Unknown, Unknown]: + ... + +def cooking_job2(thread=...): # -> dict[Unknown, Unknown]: + ... + +def get_supervised_models(thread=..., verbose=...): # -> list[Unknown]: + ... diff --git a/typings/fasttext/tests/test_script.pyi b/typings/fasttext/tests/test_script.pyi new file mode 100644 index 00000000..632d8d85 --- /dev/null +++ b/typings/fasttext/tests/test_script.pyi @@ -0,0 +1,82 @@ +""" +This type stub file was generated by pyright. +""" + +import unittest + +def eprint(cls, *args, **kwargs): # -> None: + ... + +def get_random_unicode(length): # -> str: + ... + +def get_random_words(N, a=..., b=..., unique=...): # -> list[Unknown]: + ... + +def get_random_data( + num_lines=..., + max_vocab_size=..., + min_words_line=..., + max_words_line=..., + min_len_word=..., + max_len_word=..., + unique_words=..., +): # -> list[Unknown]: + ... + +def default_kwargs(kwargs): ... +def build_unsupervised_model(data, kwargs): # -> _FastText: + ... + +def build_supervised_model(data, kwargs): # -> _FastText: + ... + +def read_labels(data_file): # -> tuple[list[Unknown], list[Unknown]]: + ... + +class TestFastTextUnitPy(unittest.TestCase): + def gen_test_get_vector(self, kwargs): # -> None: + ... + def gen_test_multi_get_line(self, kwargs): # -> None: + ... + def gen_test_supervised_util_test(self, kwargs): # -> None: + ... + def gen_test_supervised_predict(self, kwargs): # -> None: + ... + def gen_test_supervised_multiline_predict(self, kwargs): # -> None: + ... + def gen_test_vocab(self, kwargs): # -> None: + ... + def gen_test_subwords(self, kwargs): # -> None: + ... + def gen_test_tokenize(self, kwargs): # -> None: + ... + def gen_test_unsupervised_dimension(self, kwargs): # -> None: + ... + def gen_test_supervised_dimension(self, kwargs): # -> None: + ... + def gen_test_subword_vector(self, kwargs): # -> None: + ... + def gen_test_unsupervised_get_words(self, kwargs): # -> None: + ... + def gen_test_supervised_get_words(self, kwargs): # -> None: + ... + def gen_test_unsupervised_get_labels(self, kwargs): # -> None: + ... + def gen_test_supervised_get_labels(self, kwargs): # -> None: + ... + def gen_test_unsupervised_exercise_is_quant(self, kwargs): # -> None: + ... + def gen_test_supervised_exercise_is_quant(self, kwargs): # -> None: + ... + def gen_test_newline_predict_sentence(self, kwargs): # -> None: + ... + +def gen_sup_test(configuration, data_dir): # -> (self: Unknown) -> None: + ... + +def gen_unit_tests(verbose=...): # -> type[TestFastTextUnitPy]: + ... + +def gen_tests(data_dir, verbose=...): # -> type[TestFastTextPy]: + class TestFastTextPy(unittest.TestCase): ... diff --git a/typings/fasttext/util/__init__.pyi b/typings/fasttext/util/__init__.pyi new file mode 100644 index 00000000..87465a71 --- /dev/null +++ b/typings/fasttext/util/__init__.pyi @@ -0,0 +1,6 @@ +""" +This type stub file was generated by pyright. +""" + +from __future__ import absolute_import, division, print_function, unicode_literals +from .util import download_model, find_nearest_neighbor, reduce_model, test diff --git a/typings/fasttext/util/util.pyi b/typings/fasttext/util/util.pyi new file mode 100644 index 00000000..16d2c81a --- /dev/null +++ b/typings/fasttext/util/util.pyi @@ -0,0 +1,39 @@ +""" +This type stub file was generated by pyright. +""" + +valid_lang_ids = ... + +def test(predictions, labels, k=...): # -> tuple[float, float]: + """ + Return precision and recall modeled after fasttext's test + """ + ... 
+
def find_nearest_neighbor(query, vectors, ban_set, cossims=...): # -> Any:
    """
    query is a 1d numpy array corresponding to the vector to which you want to
    find the closest vector
    vectors is a 2d numpy array corresponding to the vectors you want to consider
    ban_set is a set of indices within vectors you want to ignore for nearest match
    cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency

    returns the index of the closest match to query within vectors

    """
    ...

def reduce_model(ft_model, target_dim):
    """
    ft_model is an instance of `_FastText` class
    This function computes the PCA of the input and the output matrices
    and sets the reduced ones.
    """
    ...

def download_model(lang_id, if_exists=..., dimension=...): # -> None:
    """
    Download pre-trained common-crawl vectors from fastText's website
    https://fasttext.cc/docs/en/crawl-vectors.html
    """
    ...
diff --git a/typings/kenlm/__init__.pyi b/typings/kenlm/__init__.pyi
new file mode 100644
index 00000000..40f566db
--- /dev/null
+++ b/typings/kenlm/__init__.pyi
@@ -0,0 +1,7 @@
+"""
Type stub for kenlm
"""

class Model:
    def __init__(self, model_bin_path: str) -> None: ...
    def score(self, sentence: str) -> float: ...
diff --git a/typings/pycld2/__init__.pyi b/typings/pycld2/__init__.pyi
new file mode 100644
index 00000000..5b478a63
--- /dev/null
+++ b/typings/pycld2/__init__.pyi
@@ -0,0 +1,30 @@
+"""
Type stub file for pycld2
"""

from typing import Any, TypeAlias, Union

# These names exist in pycld2 at runtime but cannot be re-imported here,
# since this stub shadows the pycld2 package for the type checker; their
# exact types are deliberately left loose (the stubs are only partially
# typed).
DETECTED_LANGUAGES: Any
ENCODINGS: Any
LANGUAGES: Any
VERSION: Any
__version__: str

class error(Exception): ...

IsReliable: TypeAlias = bool
TextBytesFound: TypeAlias = int
DetectDetails: TypeAlias = tuple[tuple[str, str, int, float], ...]
Vectors: TypeAlias = tuple[tuple[int, int, str, str], ...]

def detect(
    text: str, returnVectors: bool = False
) -> Union[
    tuple[IsReliable, TextBytesFound, DetectDetails],
    tuple[IsReliable, TextBytesFound, DetectDetails, Vectors],
]: ...

__all__ = ("DETECTED_LANGUAGES", "ENCODINGS", "LANGUAGES", "VERSION", "detect", "error")
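
Usage sketches for the new stubs follow; they are not part of the patch. First, the blingfire stub: the two typed functions are the only ones the cleaning code calls. The sample strings and the outputs in the comments are illustrative, not guaranteed byte-for-byte:

from blingfire import normalize_spaces, text_to_words

# text_to_words returns the input with detected tokens separated by single
# spaces, e.g. "Hello , world !".
print(text_to_words("Hello, world!"))
# normalize_spaces collapses runs of whitespace into the given space
# character (0x20 by default), e.g. "a b".
print(normalize_spaces("a  \t b"))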
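For the kenlm stub, only the two members the cleaning code needs are declared; a minimal sketch, with a hypothetical model path:

import kenlm

# Model wraps a KenLM binary language model; score() returns the total
# log10 probability of the whitespace-tokenized sentence under the model.
model = kenlm.Model("models/da_5gram.bin")  # hypothetical path
print(model.score("dette er en test"))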
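Finally, the pycld2 stub declares detect() as returning either a 3-tuple or, with returnVectors=True, a 4-tuple, so callers narrow the union by checking the length, exactly as the language_scandi.py change above does. A minimal sketch; the sample sentence is a placeholder:

import pycld2 as cld2

def detected_languages(text: str) -> dict[str, float]:
    """Map detected ISO language codes to the fraction of text they cover."""
    try:
        retvals = cld2.detect(text)  # returnVectors defaults to False: 3-tuple
    except cld2.error:
        return {}
    # The length check narrows the 3-tuple/4-tuple union, so the unpacking
    # below is well typed under the stub.
    assert len(retvals) == 3
    _is_reliable, _bytes_found, details = retvals
    return {code: percent / 100 for _name, code, percent, _score in details}

print(detected_languages("Dette er en dansk sætning."))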