From 530305f5f8d349f1e2f8b3c452983f6d64adc776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Bj=C3=B8rn=20J=C3=B8rgensen?= Date: Mon, 13 Nov 2023 15:45:08 +0100 Subject: [PATCH] Add type stubs for external libraries (only partially typed for the functions we use) --- .../dolma_taggers/language_scandi.py | 6 +- typings/blingfire/__init__.pyi | 54 ++++ typings/fasttext/FastText.pyi | 264 ++++++++++++++++++ typings/fasttext/__init__.pyi | 17 ++ typings/fasttext/tests/__init__.pyi | 7 + .../fasttext/tests/test_configurations.pyi | 26 ++ typings/fasttext/tests/test_script.pyi | 82 ++++++ typings/fasttext/util/__init__.pyi | 6 + typings/fasttext/util/util.pyi | 39 +++ typings/kenlm/__init__.pyi | 7 + typings/pycld2/__init__.pyi | 21 ++ 11 files changed, 527 insertions(+), 2 deletions(-) create mode 100644 typings/blingfire/__init__.pyi create mode 100644 typings/fasttext/FastText.pyi create mode 100644 typings/fasttext/__init__.pyi create mode 100644 typings/fasttext/tests/__init__.pyi create mode 100644 typings/fasttext/tests/test_configurations.pyi create mode 100644 typings/fasttext/tests/test_script.pyi create mode 100644 typings/fasttext/util/__init__.pyi create mode 100644 typings/fasttext/util/util.pyi create mode 100644 typings/kenlm/__init__.pyi create mode 100644 typings/pycld2/__init__.pyi diff --git a/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py b/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py index cfc124aa..11915793 100644 --- a/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py +++ b/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py @@ -38,11 +38,13 @@ def _identity_fn(self, text: str) -> str: return text def _predict_text(self, text: str) -> dict[str, float]: - details = [] is_reliable = False + details: Iterable[tuple[str, str, int, float]] = [] for fn in (self._identity_fn, self._to_ascii_input, self._sanitize_input): try: - is_reliable, _, details = cld2.detect(fn(text)) + retvals = cld2.detect(fn(text)) + assert len(retvals) == 3 + is_reliable, _, details = retvals break except cld2.error: ... diff --git a/typings/blingfire/__init__.pyi b/typings/blingfire/__init__.pyi new file mode 100644 index 00000000..20da73f7 --- /dev/null +++ b/typings/blingfire/__init__.pyi @@ -0,0 +1,54 @@ +""" +This type stub file was generated by pyright. +""" +# def text_to_sentences(s: str): # -> Any | Literal['']: +# ... +# +# def text_to_sentences_with_model(h, s): # -> Any | Literal['']: +# ... + +def normalize_spaces(s: str, uSpace: int = 0x20) -> str: # -> Any | Literal['']: + ... + +def text_to_words(s: str) -> str: # -> Any | Literal['']: + ... + +# Uncomment lines that are used in project +# def text_to_words_with_model(h, s): # -> Any | Literal['']: +# ... +# +# def word_hyphenation_with_model(h, s, uHy=...): # -> Any | Literal['']: +# ... +# +# def get_blingfiretok_version(): # -> Any: +# ... +# +# def text_to_hashes(s, word_n_grams, bucketSize): # -> NDArray[Any] | None: +# ... +# +# def text_to_token_with_offsets(s, text_to_token_f, split_byte): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def text_to_words_with_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def text_to_sentences_and_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def load_model(file_name): # -> Any: +# ... +# +# def free_model(h): # -> None: +# ... 
+#
# def text_to_ids(h, s, max_len, unk=..., no_padding=...): # -> NDArray[Any]:
#     ...
#
# def ids_to_text(h, ids, skip_special_tokens=..., output_buffer_size=...): # -> Any | Literal['']:
#     ...
#
# def utf8text_to_ids_with_offsets(h, s_bytes, max_len, unk=..., no_padding=...): # -> tuple[NDArray[Any], NDArray[Any], NDArray[Any]]:
#     ...
#
# def change_settings_dummy_prefix(h, add_prefix): # -> None:
#     ...
diff --git a/typings/fasttext/FastText.pyi b/typings/fasttext/FastText.pyi
new file mode 100644
index 00000000..4dde0e7b
--- /dev/null
+++ b/typings/fasttext/FastText.pyi
@@ -0,0 +1,264 @@
+"""
This type stub file was initially generated by pyright.
"""
from typing import Iterable

loss_name = ...
model_name = ...
EOS = ...
BOW = ...
EOW = ...
displayed_errors = ...

def eprint(*args, **kwargs): # -> None:
    ...

class _Meter:
    def __init__(self, fasttext_model, meter) -> None: ...
    def score_vs_true(self, label): # -> tuple[NDArray[Unknown], NDArray[Any]]:
        """Return scores and the gold of each sample for a specific label"""
        ...
    def precision_recall_curve(
        self, label=...
    ): # -> tuple[NDArray[Unknown], NDArray[Any]]:
        """Return precision/recall curve"""
        ...
    def precision_at_recall(self, recall, label=...):
        """Return precision for a given recall"""
        ...
    def recall_at_precision(self, precision, label=...):
        """Return recall for a given precision"""
        ...

class _FastText:
    """
    This class defines the API to inspect models and should not be used to
    create objects. It will be returned by functions such as load_model or
    train.

    In general this API assumes it is given only unicode for Python 2 and the
    Python 3 equivalent, str, for any string-like arguments. All unicode
    strings are then encoded as UTF-8 and fed to the fastText C++ API.
    """

    def __init__(self, model_path=..., args=...) -> None: ...
    def set_args(self, args=...): # -> None:
        ...
    def is_quantized(self): ...
    def get_dimension(self):
        """Get the dimension (size) of a lookup vector (hidden layer)."""
        ...
    def get_word_vector(self, word): # -> NDArray[Unknown]:
        """Get the vector representation of word."""
        ...
    def get_sentence_vector(self, text): # -> NDArray[Unknown]:
        """
        Given a string, get a single vector representation. This function
        assumes it is given a single line of text. We split words on
        whitespace (space, newline, tab, vertical tab) and the control
        characters carriage return, formfeed and the null character.
        """
        ...
    def get_nearest_neighbors(self, word, k=..., on_unicode_error=...): ...
    def get_analogies(self, wordA, wordB, wordC, k=..., on_unicode_error=...): ...
    def get_word_id(self, word):
        """
        Given a word, get the word id within the dictionary.
        Returns -1 if word is not in the dictionary.
        """
        ...
    def get_label_id(self, label):
        """
        Given a label, get the label id within the dictionary.
        Returns -1 if label is not in the dictionary.
        """
        ...
    def get_subword_id(self, subword):
        """
        Given a subword, return the index (within input matrix) it hashes to.
        """
        ...
    def get_subwords(
        self, word, on_unicode_error=...
    ): # -> tuple[Unknown, NDArray[Unknown]]:
        """
        Given a word, get the subwords and their indices.
        """
        ...
    def get_input_vector(self, ind): # -> NDArray[Unknown]:
        """
        Given an index, get the corresponding vector of the Input Matrix.
        """
        ...
+    def predict(
        self,
        text: str,
        k: int = ...,
        threshold: float = ...,
        on_unicode_error: str = ...,
    ) -> Iterable[
        tuple[str, float]
    ]: # -> tuple[Unknown, Unknown] | tuple[Any | tuple[()], NDArray[Unknown]]:
        """
        Given a string, get a list of labels and a list of
        corresponding probabilities. k controls the number
        of returned labels. A choice of 5 will return the 5
        most probable labels. By default this returns only
        the most likely label and probability. threshold filters
        the returned labels by a threshold on probability. A
        choice of 0.5 will return labels with at least 0.5
        probability. k and threshold will be applied together to
        determine the returned labels.

        This function assumes it is given
        a single line of text. We split words on whitespace (space,
        newline, tab, vertical tab) and the control characters carriage
        return, formfeed and the null character.

        If the model is not supervised, this function will throw a ValueError.

        If given a list of strings, it will return a list of results,
        each as would be received for a single line of text.
        """
        ...
    def get_input_matrix(self): # -> NDArray[Unknown]:
        """
        Get a reference to the full input matrix of a Model. This only
        works if the model is not quantized.
        """
        ...
    def get_output_matrix(self): # -> NDArray[Unknown]:
        """
        Get a reference to the full output matrix of a Model. This only
        works if the model is not quantized.
        """
        ...
    def get_words(
        self, include_freq=..., on_unicode_error=...
    ): # -> tuple[Unknown, NDArray[Unknown]]:
        """
        Get the entire list of words of the dictionary, optionally
        including the frequency of the individual words. This
        does not include any subwords. For that please consult
        the function get_subwords.
        """
        ...
    def get_labels(
        self, include_freq=..., on_unicode_error=...
    ): # -> tuple[Unknown, NDArray[Unknown]]:
        """
        Get the entire list of labels of the dictionary, optionally
        including the frequency of the individual labels. Unsupervised
        models use words as labels, which is why get_labels
        will call and return get_words for this type of
        model.
        """
        ...
    def get_line(self, text, on_unicode_error=...):
        """
        Split a line of text into words and labels. Labels must start with
        the prefix used to create the model (__label__ by default).
        """
        ...
    def save_model(self, path): # -> None:
        """Save the model to the given path"""
        ...
    def test(self, path, k=..., threshold=...):
        """Evaluate supervised model using file given by path"""
        ...
    def test_label(self, path, k=..., threshold=...):
        """
        Return the precision and recall score for each label.

        The returned value is a dictionary, where the key is the label.
        For example:
        f.test_label(...)
        {'__label__italian-cuisine' : {'precision' : 0.7, 'recall' : 0.74}}
        """
        ...
    def get_meter(self, path, k=...): # -> _Meter:
        ...
    def quantize(
        self,
        input=...,
        qout=...,
        cutoff=...,
        retrain=...,
        epoch=...,
        lr=...,
        thread=...,
        verbose=...,
        dsub=...,
        qnorm=...,
    ): # -> None:
        """
        Quantize the model, reducing its size and memory footprint.
        """
        ...
    def set_matrices(self, input_matrix, output_matrix): # -> None:
        """
        Set input and output matrices. This function assumes you know what you
        are doing.
        """
        ...
    @property
    def words(self): # -> tuple[Unknown, NDArray[Unknown]]:
        ...
    @property
    def labels(self): # -> tuple[Unknown, NDArray[Unknown]]:
        ...
+    def __getitem__(self, word): # -> NDArray[Unknown]:
        ...
    def __contains__(self, word): # -> bool:
        ...

def tokenize(text):
    """Given a string of text, tokenize it and return a list of tokens"""
    ...

def load_model(path): # -> _FastText:
    """Load a model given a filepath and return a model object."""
    ...

unsupervised_default = ...

def read_args(
    arg_list, arg_dict, arg_names, default_values
): # -> tuple[dict[Unknown, Unknown], set[Unknown]]:
    ...

def train_supervised(*kargs, **kwargs): # -> _FastText:
    """
    Train a supervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must contain at least one label per line. For an
    example consult the example datasets which are part of the fastText
    repository such as the dataset pulled by classification-example.sh.
    """
    ...

def train_unsupervised(*kargs, **kwargs): # -> _FastText:
    """
    Train an unsupervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must not contain any labels or use the specified label prefix
    unless it is ok for those words to be ignored. For an example consult the
    dataset pulled by the example script word-vector-example.sh, which is
    part of the fastText repository.
    """
    ...

def cbow(*kargs, **kwargs): ...
def skipgram(*kargs, **kwargs): ...
def supervised(*kargs, **kwargs): ...
diff --git a/typings/fasttext/__init__.pyi b/typings/fasttext/__init__.pyi
new file mode 100644
index 00000000..cbf98ace
--- /dev/null
+++ b/typings/fasttext/__init__.pyi
@@ -0,0 +1,17 @@
+"""
This type stub file was generated by pyright.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from .FastText import (
    BOW,
    EOS,
    EOW,
    cbow,
    load_model,
    skipgram,
    supervised,
    tokenize,
    train_supervised,
    train_unsupervised,
)
diff --git a/typings/fasttext/tests/__init__.pyi b/typings/fasttext/tests/__init__.pyi
new file mode 100644
index 00000000..8cec2d4a
--- /dev/null
+++ b/typings/fasttext/tests/__init__.pyi
@@ -0,0 +1,7 @@
+"""
This type stub file was generated by pyright.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from .test_configurations import get_supervised_models
from .test_script import gen_tests, gen_unit_tests
diff --git a/typings/fasttext/tests/test_configurations.pyi b/typings/fasttext/tests/test_configurations.pyi
new file mode 100644
index 00000000..6930c565
--- /dev/null
+++ b/typings/fasttext/tests/test_configurations.pyi
@@ -0,0 +1,26 @@
+"""
This type stub file was generated by pyright.
"""

def max_thread(): # -> int:
    ...

def check_supervised_configuration(configuration, verbose=...): ...
def check_supervised_configurations(configurations, verbose=...): ...
def flickr_job(thread=...): # -> dict[Unknown, Unknown]:
    ...

def langid_job1(thread=...): # -> dict[Unknown, Unknown]:
    ...

def langid_job2(thread=...): # -> dict[Unknown, Unknown]:
    ...
+ +def cooking_job1(thread=...): # -> dict[Unknown, Unknown]: + ... + +def cooking_job2(thread=...): # -> dict[Unknown, Unknown]: + ... + +def get_supervised_models(thread=..., verbose=...): # -> list[Unknown]: + ... diff --git a/typings/fasttext/tests/test_script.pyi b/typings/fasttext/tests/test_script.pyi new file mode 100644 index 00000000..632d8d85 --- /dev/null +++ b/typings/fasttext/tests/test_script.pyi @@ -0,0 +1,82 @@ +""" +This type stub file was generated by pyright. +""" + +import unittest + +def eprint(cls, *args, **kwargs): # -> None: + ... + +def get_random_unicode(length): # -> str: + ... + +def get_random_words(N, a=..., b=..., unique=...): # -> list[Unknown]: + ... + +def get_random_data( + num_lines=..., + max_vocab_size=..., + min_words_line=..., + max_words_line=..., + min_len_word=..., + max_len_word=..., + unique_words=..., +): # -> list[Unknown]: + ... + +def default_kwargs(kwargs): ... +def build_unsupervised_model(data, kwargs): # -> _FastText: + ... + +def build_supervised_model(data, kwargs): # -> _FastText: + ... + +def read_labels(data_file): # -> tuple[list[Unknown], list[Unknown]]: + ... + +class TestFastTextUnitPy(unittest.TestCase): + def gen_test_get_vector(self, kwargs): # -> None: + ... + def gen_test_multi_get_line(self, kwargs): # -> None: + ... + def gen_test_supervised_util_test(self, kwargs): # -> None: + ... + def gen_test_supervised_predict(self, kwargs): # -> None: + ... + def gen_test_supervised_multiline_predict(self, kwargs): # -> None: + ... + def gen_test_vocab(self, kwargs): # -> None: + ... + def gen_test_subwords(self, kwargs): # -> None: + ... + def gen_test_tokenize(self, kwargs): # -> None: + ... + def gen_test_unsupervised_dimension(self, kwargs): # -> None: + ... + def gen_test_supervised_dimension(self, kwargs): # -> None: + ... + def gen_test_subword_vector(self, kwargs): # -> None: + ... + def gen_test_unsupervised_get_words(self, kwargs): # -> None: + ... + def gen_test_supervised_get_words(self, kwargs): # -> None: + ... + def gen_test_unsupervised_get_labels(self, kwargs): # -> None: + ... + def gen_test_supervised_get_labels(self, kwargs): # -> None: + ... + def gen_test_unsupervised_exercise_is_quant(self, kwargs): # -> None: + ... + def gen_test_supervised_exercise_is_quant(self, kwargs): # -> None: + ... + def gen_test_newline_predict_sentence(self, kwargs): # -> None: + ... + +def gen_sup_test(configuration, data_dir): # -> (self: Unknown) -> None: + ... + +def gen_unit_tests(verbose=...): # -> type[TestFastTextUnitPy]: + ... + +def gen_tests(data_dir, verbose=...): # -> type[TestFastTextPy]: + class TestFastTextPy(unittest.TestCase): ... diff --git a/typings/fasttext/util/__init__.pyi b/typings/fasttext/util/__init__.pyi new file mode 100644 index 00000000..87465a71 --- /dev/null +++ b/typings/fasttext/util/__init__.pyi @@ -0,0 +1,6 @@ +""" +This type stub file was generated by pyright. +""" + +from __future__ import absolute_import, division, print_function, unicode_literals +from .util import download_model, find_nearest_neighbor, reduce_model, test diff --git a/typings/fasttext/util/util.pyi b/typings/fasttext/util/util.pyi new file mode 100644 index 00000000..16d2c81a --- /dev/null +++ b/typings/fasttext/util/util.pyi @@ -0,0 +1,39 @@ +""" +This type stub file was generated by pyright. +""" + +valid_lang_ids = ... + +def test(predictions, labels, k=...): # -> tuple[float, float]: + """ + Return precision and recall modeled after fasttext's test + """ + ... 
+
def find_nearest_neighbor(query, vectors, ban_set, cossims=...): # -> Any:
    """
    query is a 1d numpy array corresponding to the vector to which you want to
    find the closest vector
    vectors is a 2d numpy array corresponding to the vectors you want to consider
    ban_set is a set of indices within vectors you want to ignore for nearest match
    cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency

    returns the index of the closest match to query within vectors

    """
    ...

def reduce_model(ft_model, target_dim):
    """
    ft_model is an instance of `_FastText` class
    This function computes the PCA of the input and the output matrices
    and sets the reduced ones.
    """
    ...

def download_model(lang_id, if_exists=..., dimension=...): # -> None:
    """
    Download pre-trained common-crawl vectors from fastText's website
    https://fasttext.cc/docs/en/crawl-vectors.html
    """
    ...
diff --git a/typings/kenlm/__init__.pyi b/typings/kenlm/__init__.pyi
new file mode 100644
index 00000000..40f566db
--- /dev/null
+++ b/typings/kenlm/__init__.pyi
@@ -0,0 +1,7 @@
+"""
Type stub for kenlm
"""

class Model:
    def __init__(self, model_bin_path: str) -> None: ...
    def score(self, sentence: str) -> float: ...
diff --git a/typings/pycld2/__init__.pyi b/typings/pycld2/__init__.pyi
new file mode 100644
index 00000000..5b478a63
--- /dev/null
+++ b/typings/pycld2/__init__.pyi
@@ -0,0 +1,30 @@
+"""
Type stub file for pycld2
"""

from typing import Any, TypeAlias, Union

# These names exist in pycld2 at runtime but cannot be re-imported here,
# since this stub shadows the pycld2 package for the type checker; their
# exact types are deliberately left loose (the stubs are only partially
# typed).
DETECTED_LANGUAGES: Any
ENCODINGS: Any
LANGUAGES: Any
VERSION: Any
__version__: str

class error(Exception): ...

IsReliable: TypeAlias = bool
TextBytesFound: TypeAlias = int
DetectDetails: TypeAlias = tuple[tuple[str, str, int, float], ...]
Vectors: TypeAlias = tuple[tuple[int, int, str, str], ...]

def detect(
    text: str, returnVectors: bool = False
) -> Union[
    tuple[IsReliable, TextBytesFound, DetectDetails],
    tuple[IsReliable, TextBytesFound, DetectDetails, Vectors],
]: ...

__all__ = ("DETECTED_LANGUAGES", "ENCODINGS", "LANGUAGES", "VERSION", "detect", "error")
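
Usage sketches for the new stubs follow; they are not part of the patch. First, the blingfire stub: the two typed functions are the only ones the cleaning code calls. The sample strings and the outputs in the comments are illustrative, not guaranteed byte-for-byte:

from blingfire import normalize_spaces, text_to_words

# text_to_words returns the input with detected tokens separated by single
# spaces, e.g. "Hello , world !".
print(text_to_words("Hello, world!"))
# normalize_spaces collapses runs of whitespace into the given space
# character (0x20 by default), e.g. "a b".
print(normalize_spaces("a  \t b"))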
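For the kenlm stub, only the two members the cleaning code needs are declared; a minimal sketch, with a hypothetical model path:

import kenlm

# Model wraps a KenLM binary language model; score() returns the total
# log10 probability of the whitespace-tokenized sentence under the model.
model = kenlm.Model("models/da_5gram.bin")  # hypothetical path
print(model.score("dette er en test"))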
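Finally, the pycld2 stub declares detect() as returning either a 3-tuple or, with returnVectors=True, a 4-tuple, so callers narrow the union by checking the length, exactly as the language_scandi.py change above does. A minimal sketch; the sample sentence is a placeholder:

import pycld2 as cld2

def detected_languages(text: str) -> dict[str, float]:
    """Map detected ISO language codes to the fraction of text they cover."""
    try:
        retvals = cld2.detect(text)  # returnVectors defaults to False: 3-tuple
    except cld2.error:
        return {}
    # The length check narrows the 3-tuple/4-tuple union, so the unpacking
    # below is well typed under the stub.
    assert len(retvals) == 3
    _is_reliable, _bytes_found, details = retvals
    return {code: percent / 100 for _name, code, percent, _score in details}

print(detected_languages("Dette er en dansk sætning."))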