Skip to content

Commit

Permalink
Add type stubs for external libraries (only partially typed for the f…
Browse files Browse the repository at this point in the history
…unctions we use)
  • Loading branch information
peterbjorgensen committed Nov 13, 2023
1 parent ddf43fa commit 530305f
Show file tree
Hide file tree
Showing 11 changed files with 527 additions and 2 deletions.
6 changes: 4 additions & 2 deletions src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ def _identity_fn(self, text: str) -> str:
return text

def _predict_text(self, text: str) -> dict[str, float]:
details = []
is_reliable = False
details: Iterable[tuple[str, str, int, float]] = []
for fn in (self._identity_fn, self._to_ascii_input, self._sanitize_input):
try:
is_reliable, _, details = cld2.detect(fn(text))
retvals = cld2.detect(fn(text))
assert len(retvals) == 3
is_reliable, _, details = retvals
break
except cld2.error:
...
Expand Down
54 changes: 54 additions & 0 deletions typings/blingfire/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""
This type stub file was generated by pyright.
"""
# def text_to_sentences(s: str): # -> Any | Literal['']:
# ...
#
# def text_to_sentences_with_model(h, s): # -> Any | Literal['']:
# ...

def normalize_spaces(s: str, uSpace: int = 0x20) -> str: # -> Any | Literal['']:
    """Stub for ``blingfire.normalize_spaces``.

    Args:
        s: Input text.
        uSpace: Integer code point, defaulting to 0x20 (ASCII space).
            Presumably the character that spaces are normalized to --
            TODO confirm against the blingfire documentation.

    Returns:
        Declared as ``str``. The trailing comment is the return type the
        stub generator inferred on its own (``Any | Literal['']``); the
        explicit annotation takes precedence.
    """
    ...

def text_to_words(s: str) -> str: # -> Any | Literal['']:
    """Stub for ``blingfire.text_to_words``.

    Args:
        s: Input text.

    Returns:
        Declared as ``str``. The trailing comment is the return type the
        stub generator inferred on its own (``Any | Literal['']``); the
        explicit annotation takes precedence.
        NOTE(review): presumably the words of ``s`` joined into a single
        string -- confirm against the blingfire documentation.
    """
    ...

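# The commented-out stubs in this file are not used by the project yet;
# uncomment (and type) a stub when the project starts calling that function.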
# def text_to_words_with_model(h, s): # -> Any | Literal['']:
# ...
#
# def word_hyphenation_with_model(h, s, uHy=...): # -> Any | Literal['']:
# ...
#
# def get_blingfiretok_version(): # -> Any:
# ...
#
# def text_to_hashes(s, word_n_grams, bucketSize): # -> NDArray[Any] | None:
# ...
#
# def text_to_token_with_offsets(s, text_to_token_f, split_byte): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]:
# ...
#
# def text_to_words_with_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]:
# ...
#
# def text_to_sentences_and_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]:
# ...
#
# def load_model(file_name): # -> Any:
# ...
#
# def free_model(h): # -> None:
# ...
#
# def text_to_ids(h, s, max_len, unk=..., no_padding=...): # -> NDArray[Any]:
# ...
#
# def ids_to_text(h, ids, skip_special_tokens=..., output_buffer_size=...): # -> Any | Literal['']:
# ...
#
# def utf8text_to_ids_with_offsets(h, s_bytes, max_len, unk=..., no_padding=...): # -> tuple[NDArray[Any], NDArray[Any], NDArray[Any]]:
# ...
#
# def change_settings_dummy_prefix(h, add_prefix): # -> None:
# ...
264 changes: 264 additions & 0 deletions typings/fasttext/FastText.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
"""
This type stub file was initially generated by pyright
"""
from typing import Iterable

# Module-level names whose values were elided (``...``) by the stub
# generator; no types are declared for them here.
loss_name = ...
model_name = ...
EOS = ...
BOW = ...
EOW = ...
displayed_errors = ...

def eprint(*args, **kwargs) -> None:
    """Accept arbitrary arguments; the stub generator inferred a ``None`` return."""
    ...

class _Meter:
    """Stub for the meter object that ``_FastText.get_meter`` is noted to return.

    Trailing ``# -> T:`` comments are the return types inferred by the stub
    generator; they are kept as comments because they reference types
    (``NDArray``, ``Unknown``) that are not imported in this stub.
    """

    def __init__(self, fasttext_model, meter) -> None: ...
    def score_vs_true(self, label): # -> tuple[NDArray[Unknown], NDArray[Any]]:
        """Return the scores and the gold values of each sample for a specific label."""
        ...
    def precision_recall_curve(
        self, label=...
    ): # -> tuple[NDArray[Unknown], NDArray[Any]]:
        """Return the precision/recall curve."""
        ...
    def precision_at_recall(self, recall, label=...):
        """Return the precision for a given recall."""
        ...
    def recall_at_precision(self, precision, label=...):
        """Return the recall for a given precision."""
        ...

class _FastText:
    """
    This class defines the API to inspect models and should not be used to
    create objects. It will be returned by functions such as load_model or
    train.
    In general this API assumes to be given only unicode for Python2 and the
    Python3 equivalent called str for any string-like arguments. All unicode
    strings are then encoded as UTF-8 and fed to the fastText C++ API.

    Stub notes: trailing ``# -> T:`` comments are the return types inferred
    by the stub generator; they stay as comments where they reference types
    (``NDArray``, ``Unknown``) that are not imported in this stub.
    """

    def __init__(self, model_path=..., args=...) -> None: ...
    def set_args(self, args=...) -> None:
        ...
    def is_quantized(self): ...
    def get_dimension(self):
        """Get the dimension (size) of a lookup vector (hidden layer)."""
        ...
    def get_word_vector(self, word): # -> NDArray[Unknown]:
        """Get the vector representation of word."""
        ...
    def get_sentence_vector(self, text): # -> NDArray[Unknown]:
        """
        Given a string, get a single vector representation. This function
        assumes to be given a single line of text. We split words on
        whitespace (space, newline, tab, vertical tab) and the control
        characters carriage return, formfeed and the null character.
        """
        ...
    def get_nearest_neighbors(self, word, k=..., on_unicode_error=...): ...
    def get_analogies(self, wordA, wordB, wordC, k=..., on_unicode_error=...): ...
    def get_word_id(self, word):
        """
        Given a word, get the word id within the dictionary.
        Returns -1 if word is not in the dictionary.
        """
        ...
    def get_label_id(self, label):
        """
        Given a label, get the label id within the dictionary.
        Returns -1 if label is not in the dictionary.
        """
        ...
    def get_subword_id(self, subword):
        """
        Given a subword, return the index (within input matrix) it hashes to.
        """
        ...
    def get_subwords(
        self, word, on_unicode_error=...
    ): # -> tuple[Unknown, NDArray[Unknown]]:
        """
        Given a word, get the subwords and their indices.
        """
        ...
    def get_input_vector(self, ind): # -> NDArray[Unknown]:
        """
        Given an index, get the corresponding vector of the Input Matrix.
        """
        ...
    def predict(
        self,
        text: str,
        k: int = ...,
        threshold: float = ...,
        on_unicode_error: str = ...,
    ) -> Iterable[
        tuple[str, float]
    ]: # -> tuple[Unknown, Unknown] | tuple[Any | tuple[()], NDArray[Unknown]]:
        """
        Given a string, get a list of labels and a list of
        corresponding probabilities. k controls the number
        of returned labels. A choice of 5, will return the 5
        most probable labels. By default this returns only
        the most likely label and probability. threshold filters
        the returned labels by a threshold on probability. A
        choice of 0.5 will return labels with at least 0.5
        probability. k and threshold will be applied together to
        determine the returned labels.
        This function assumes to be given
        a single line of text. We split words on whitespace (space,
        newline, tab, vertical tab) and the control characters carriage
        return, formfeed and the null character.
        If the model is not supervised, this function will throw a ValueError.
        If given a list of strings, it will return a list of results as usually
        received for a single line of text.
        """
        # NOTE(review): the declared return type, Iterable[tuple[str, float]],
        # does not obviously match either the generator-inferred 2-tuple in
        # the trailing comment or the docstring ("a list of labels and a list
        # of corresponding probabilities"). Likewise ``text: str`` excludes
        # the list-of-strings input the docstring mentions. Confirm against
        # the callers before relying on these annotations.
        ...
    def get_input_matrix(self): # -> NDArray[Unknown]:
        """
        Get a reference to the full input matrix of a Model. This only
        works if the model is not quantized.
        """
        ...
    def get_output_matrix(self): # -> NDArray[Unknown]:
        """
        Get a reference to the full output matrix of a Model. This only
        works if the model is not quantized.
        """
        ...
    def get_words(
        self, include_freq=..., on_unicode_error=...
    ): # -> tuple[Unknown, NDArray[Unknown]]:
        """
        Get the entire list of words of the dictionary optionally
        including the frequency of the individual words. This
        does not include any subwords. For that please consult
        the function get_subwords.
        """
        ...
    def get_labels(
        self, include_freq=..., on_unicode_error=...
    ): # -> tuple[Unknown, NDArray[Unknown]]:
        """
        Get the entire list of labels of the dictionary optionally
        including the frequency of the individual labels. Unsupervised
        models use words as labels, which is why get_labels
        will call and return get_words for this type of
        model.
        """
        ...
    def get_line(self, text, on_unicode_error=...):
        """
        Split a line of text into words and labels. Labels must start with
        the prefix used to create the model (__label__ by default).
        """
        ...
    def save_model(self, path) -> None:
        """Save the model to the given path"""
        ...
    def test(self, path, k=..., threshold=...):
        """Evaluate supervised model using file given by path"""
        ...
    def test_label(self, path, k=..., threshold=...):
        """
        Return the precision and recall score for each label.
        The returned value is a dictionary, where the key is the label.
        For example:
        f.test_label(...)
        {'__label__italian-cuisine' : {'precision' : 0.7, 'recall' : 0.74}}
        """
        ...
    def get_meter(self, path, k=...) -> _Meter:
        ...
    def quantize(
        self,
        input=...,
        qout=...,
        cutoff=...,
        retrain=...,
        epoch=...,
        lr=...,
        thread=...,
        verbose=...,
        dsub=...,
        qnorm=...,
    ) -> None:
        """
        Quantize the model, reducing the size of the model and
        its memory footprint.
        """
        ...
    def set_matrices(self, input_matrix, output_matrix) -> None:
        """
        Set input and output matrices. This function assumes you know what you
        are doing.
        """
        ...
    @property
    def words(self): # -> tuple[Unknown, NDArray[Unknown]]:
        ...
    @property
    def labels(self): # -> tuple[Unknown, NDArray[Unknown]]:
        ...
    def __getitem__(self, word): # -> NDArray[Unknown]:
        ...
    def __contains__(self, word) -> bool:
        ...

def tokenize(text: str):
    """Given a string of text, tokenize it and return a list of tokens."""
    ...

def load_model(path) -> _FastText:
    """Load a model given a filepath and return a model object."""
    ...

# Value elided (``...``) by the stub generator; no type is declared here.
unsupervised_default = ...

def read_args(
    arg_list, arg_dict, arg_names, default_values
): # -> tuple[dict[Unknown, Unknown], set[Unknown]]:
    """Untyped stub; the generator inferred a ``(dict, set)`` tuple as the return type."""
    ...

def train_supervised(*kargs, **kwargs) -> _FastText:
    """
    Train a supervised model and return a model object.
    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html
    The input file must contain at least one label per line. For an
    example consult the example datasets which are part of the fastText
    repository such as the dataset pulled by classification-example.sh.
    """
    ...

def train_unsupervised(*kargs, **kwargs) -> _FastText:
    """
    Train an unsupervised model and return a model object.
    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html
    The input field must not contain any labels or use the specified label prefix
    unless it is ok for those words to be ignored. For an example consult the
    dataset pulled by the example script word-vector-example.sh, which is
    part of the fastText repository.
    """
    ...

# Untyped stub: accepts arbitrary arguments, return type not declared.
def cbow(*kargs, **kwargs): ...
# Untyped stub: accepts arbitrary arguments, return type not declared.
def skipgram(*kargs, **kwargs): ...
# Untyped stub: accepts arbitrary arguments, return type not declared.
def supervised(*kargs, **kwargs): ...
17 changes: 17 additions & 0 deletions typings/fasttext/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""
This type stub file was generated by pyright.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from .FastText import (
BOW,
EOS,
EOW,
cbow,
load_model,
skipgram,
supervised,
tokenize,
train_supervised,
train_unsupervised,
)
7 changes: 7 additions & 0 deletions typings/fasttext/tests/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""
This type stub file was generated by pyright.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from .test_configurations import get_supervised_models
from .test_script import gen_tests, gen_unit_tests
26 changes: 26 additions & 0 deletions typings/fasttext/tests/test_configurations.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""
This type stub file was generated by pyright.
"""

def max_thread() -> int:
    ...

# Untyped stub: parameter and return types were not inferred by the generator.
def check_supervised_configuration(configuration, verbose=...): ...
# Untyped stub: parameter and return types were not inferred by the generator.
def check_supervised_configurations(configurations, verbose=...): ...
def flickr_job(thread=...): # -> dict[Unknown, Unknown]:
    """Untyped stub; the generator inferred a ``dict`` with unknown key/value types."""
    ...

def langid_job1(thread=...): # -> dict[Unknown, Unknown]:
    """Untyped stub; the generator inferred a ``dict`` with unknown key/value types."""
    ...

def langid_job2(thread=...): # -> dict[Unknown, Unknown]:
    """Untyped stub; the generator inferred a ``dict`` with unknown key/value types."""
    ...

def cooking_job1(thread=...): # -> dict[Unknown, Unknown]:
    """Untyped stub; the generator inferred a ``dict`` with unknown key/value types."""
    ...

def cooking_job2(thread=...): # -> dict[Unknown, Unknown]:
    """Untyped stub; the generator inferred a ``dict`` with unknown key/value types."""
    ...

def get_supervised_models(thread=..., verbose=...): # -> list[Unknown]:
    """Untyped stub; the generator inferred a ``list`` with unknown element type."""
    ...
Loading

0 comments on commit 530305f

Please sign in to comment.