diff --git a/Dockerfile.dev b/Dockerfile.dev index 2d19d84f..8bd57cd0 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -1,5 +1,23 @@ FROM python:3.11-bullseye +# Update default packages +RUN apt-get -qq update + +# Get Ubuntu packages +RUN apt-get install -y -q \ + build-essential \ + curl \ + cmake + +# NOTE: no need to run update again at this point +# RUN apt-get update + +# Get Rust; NOTE: using sh for better compatibility with other base images +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y + +# Add .cargo/bin to PATH +ENV PATH="/root/.cargo/bin:${PATH}" + # Set the working directory to /app WORKDIR /app @@ -10,4 +28,4 @@ RUN make install # Install the app COPY . /app -RUN pip install -e . \ No newline at end of file +RUN pip install -e . diff --git a/pyproject.toml b/pyproject.toml index af570248..6b70cd2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,13 @@ classifiers = [ ] requires-python = ">=3.10" -dependencies = ["pydantic==1.8.2"] +dependencies = [ + "pydantic>=2.4.2", # dolma does not work with very old versions of pydantic + "dolma@git+https://github.com/allenai/dolma.git@5a010a2685914b1db7744426abfb4b9ece52da95", # Install from git until a 0.9.2 package is released + "kenlm>=0.2.0", # Used for perplexity tagging + "blingfire>=0.1.8", # Used for perplexity tagging + "requests>=2.31.0", +] [project.optional-dependencies] dev = ["black==23.9.1", "ruff==0.1.0", "pyright==1.1.331", "pre-commit==3.5.0"] diff --git a/src/dfm/__init__.py b/src/dfm/__init__.py new file mode 100644 index 00000000..d01c3abd --- /dev/null +++ b/src/dfm/__init__.py @@ -0,0 +1,4 @@ +import importlib.metadata + +# Fetches the version of the package as defined in pyproject.toml +__version__ = importlib.metadata.version(__package__) diff --git a/src/dfm/common/__init__.py b/src/dfm/common/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/dfm/common/data_cleaning/__init__.py b/src/dfm/common/data_cleaning/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/dfm/common/data_cleaning/ccnet_text_normalizer.py b/src/dfm/common/data_cleaning/ccnet_text_normalizer.py new file mode 100644 index 00000000..dab25c41 --- /dev/null +++ b/src/dfm/common/data_cleaning/ccnet_text_normalizer.py @@ -0,0 +1,203 @@ +# This file has initially been copied from the ccnet repository from Facebook. +# https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py +# The utility functions can be used to normalize text before processing it +# with ccnet models, but might not be the best general purpose implementation. +# +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# This file is full of ambigous characters, so disable ruff check for those. +# ruff: noqa: RUF001 + +import re +import unicodedata +from typing import Literal + +UNICODE_PUNCT = { + ",": ",", + "。": ".", + "、": ",", + "„": '"', + "”": '"', + "“": '"', + "«": '"', + "»": '"', + "1": '"', + "」": '"', + "「": '"', + "《": '"', + "》": '"', + "´": "'", + "∶": ":", + ":": ":", + "?": "?", + "!": "!", + "(": "(", + ")": ")", + ";": ";", + "–": "-", + "—": " - ", + ".": ". 
", + "~": "~", + "’": "'", + "…": "...", + "━": "-", + "〈": "<", + "〉": ">", + "【": "[", + "】": "]", + "%": "%", + "►": "-", +} + +UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]") + + +def replace_unicode_punct(text: str) -> str: + return "".join(UNICODE_PUNCT.get(c, c) for c in text) + + +def remove_unicode_punct(text: str) -> str: + """More aggressive version of replace_unicode_punct but also faster.""" + return UNICODE_PUNCT_RE.sub("", text) + + +def strip_accents(line: str) -> str: + """Strips accents from a piece of text.""" + nfd = unicodedata.normalize("NFD", line) + output = [c for c in nfd if unicodedata.category(c) != "Mn"] + return "".join(output) + + +# Build a regex matching all control characters. +NON_PRINTING_CHARS_RE = re.compile( + f"[{''.join(map(chr, list(range(32)) + list(range(127,160))))}]", +) +DIGIT_RE = re.compile(r"\d") +PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile( + (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", ""), +) + + +def remove_non_printing_char(text: str) -> str: + return NON_PRINTING_CHARS_RE.sub("", text) + + +def normalize_spacing_for_tok(text: str, language: str = "en") -> str: + res = ( + text.replace("\r", "") + # remove extra spaces + .replace("(", " (") + .replace(")", ") ") + .replace(" +", " ") + ) + res = re.sub(r"\) ([\.\!\:\?\;\,])", r"\)\1", res) + res = res.replace("( ", "(").replace(" )", ")") + res = re.sub(r"(\d) \%", r"\1\%", res) + res = res.replace(" :", ":").replace(" ;", ";") + res = res.replace("`", "'").replace("''", ' " ') + + res = ( + res.replace("„", '"') + .replace("“", '"') + .replace("”", '"') + .replace("–", "-") + .replace("—", " - ") + .replace(" +", " ") + .replace("´", "'") + .replace("([a-z])‘([a-z])", r"\1'\2/") + .replace("([a-z])’([a-z])", r"\1'\2/") + .replace("‘", '"') + .replace("‚", '"') + .replace("’", '"') + .replace("''", '"') + .replace("´´", '"') + .replace("…", "...") + # French quotes + .replace(" « ", ' "') + .replace("« ", '"') + .replace("«", '"') + .replace(" » ", '" ') + .replace(" »", '"') + .replace("»", '"') + # handle pseudo-spaces + .replace(" %", "%") + .replace("nº ", "nº ") + .replace(" :", ":") + .replace(" ºC", " ºC") + .replace(" cm", " cm") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ;", ";") + .replace(", ", ", ") + .replace(" +", " ") + .replace(".", ". 
") + ) + # English "quotation," followed by comma, style + if language == "en": + res = re.sub(r"\"([,\.]+)", r"\1\"", res) + # Czech is confused + elif language == "cs" or language == "cz": + pass + # German/Spanish/French "quotation", followed by comma, style + else: + res = res.replace(',"', '",') + res = re.sub( + r"(\.+)\"(\s*[^<])", + r"\"\1\2", + res, + ) # don't fix period at end of sentence + + if ( + language == "de" + or language == "es" + or language == "cz" + or language == "cs" + or language == "fr" + ): + res = re.sub(r"(\d) (\d)", r"\1,\2", res) + else: + res = re.sub(r"(\d) (\d)", r"\1.\2", res) + return res + + +def normalize( + line: str, + accent: bool = True, + case: bool = True, + numbers: bool = True, + punct: Literal[1, 2] = 1, +) -> str: + line = line.strip() + if not line: + return line + if case: + line = line.lower() + if accent: + line = strip_accents(line) + if numbers: + line = DIGIT_RE.sub("0", line) + if punct == 1: + line = replace_unicode_punct(line) + elif punct == 2: + line = remove_unicode_punct(line) + line = remove_non_printing_char(line) + return line + + +def slow_normalize_for_dedup(line: str) -> str: + return normalize(line, accent=False, case=True, numbers=True, punct=2) + + +def normalize_for_dedup(line: str) -> str: + line = line.strip() + if not line: + return line + # case + line = line.lower() + # numbers + line = DIGIT_RE.sub("0", line) + line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line) + return line diff --git a/src/dfm/common/data_cleaning/dolma_taggers/__init__.py b/src/dfm/common/data_cleaning/dolma_taggers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py b/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py new file mode 100644 index 00000000..1108abff --- /dev/null +++ b/src/dfm/common/data_cleaning/dolma_taggers/language_scandi.py @@ -0,0 +1,253 @@ +""" + +Dolma taggers for Scandinavian language detection. + +""" +from collections.abc import Iterable + +import pycld2 as cld2 +import regex +from anyascii import anyascii +from dolma.core.data_types import DocResult, Document, Span, TextSlice +from dolma.core.ft_tagger import BaseFastTextTagger, Prediction +from dolma.core.registry import TaggerRegistry +from dolma.core.taggers import BaseTagger +from dolma.core.utils import split_paragraphs + +LANGS = { + "ENGLISH": "en", + "DANISH": "da", + "SWEDISH": "sv", + "NORWEGIAN": "no", + "ICELANDIC": "is", + "FAROESE": "fo", # Note that FAROESE is not supported by cld2 or fasttext +} + + +@TaggerRegistry.add("cld2_scandi_doc") +class Cld2ScandiLanguageTagger(BaseTagger): + """This tagger runs the Compact Language Detect 2 model on a full document + and will return a score between 0 and 1 for each language in LANGS. 
+ It uses the pretrained model from the pycld2 package.""" + + RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+") + + def _sanitize_input(self, text: str) -> str: + return self.RE_BAD_CHARS.sub("", text) + + def _to_ascii_input(self, text: str) -> str: + return anyascii(text) + + def _identity_fn(self, text: str) -> str: + return text + + def _predict_text(self, text: str) -> dict[str, float]: + """Predict the language of a string and return the detected languages in a dictionary.""" + is_reliable = False + details: Iterable[tuple[str, str, int, float]] = [] + for fn in (self._identity_fn, self._to_ascii_input, self._sanitize_input): + try: + retvals = cld2.detect(fn(text)) + assert len(retvals) == 3 + is_reliable, _, details = retvals + # is_reliable is True if the detection is "high confidence" + # details is a Tuple of up to three detected languages, where each is + # tuple is (languageName, languageCode, percent, score). percent is + # what percentage of the original text was detected as this language + # and score is the confidence score for that language. + break + except cld2.error: + ... + + scores: dict[str, float] = {} + if is_reliable: + for lang, _, score, _ in details: + if lang in LANGS: + scores[LANGS[lang]] = score / 100.0 + + return scores + + def predict(self, doc: Document) -> DocResult: + lang_scores = self._predict_text(doc.text) + spans: list[Span] = [] + for lang_code in LANGS.values(): + # If the language was not detected we will still tag + # the sentence with a score of 0 + score = lang_scores.get(lang_code, 0) + + positive_span = Span( + start=0, + end=len(doc.text), + type=lang_code, + score=score, + ) + negative_span = Span( + start=0, + end=len(doc.text), + type=f"not_{lang_code}", + score=1.0 - score, + ) + spans.append(positive_span) + spans.append(negative_span) + return DocResult(doc=doc, spans=spans) + + +@TaggerRegistry.add("cld2_scandi_paragraph") +class Cld2ScandiLanguageParagraphTagger(Cld2ScandiLanguageTagger): + """This tagger runs the Compact Language Detect 2 model on each paragraph, + and will save a score between 0 and 1 for each language in LANGS""" + + def predict(self, doc: Document) -> DocResult: + paragraphs = split_paragraphs(doc.text) + spans: list[Span] = [] + for paragraph in paragraphs: + lang_scores = self._predict_text(paragraph.text) + for lang_code in LANGS.values(): + score = lang_scores.get(lang_code, 0.0) + + positive_span = Span( + start=paragraph.start, + end=paragraph.end, + type=lang_code, + score=score, + ) + negative_span = Span( + start=paragraph.start, + end=paragraph.end, + type=f"not_{lang_code}", + score=1.0 - score, + ) + spans.extend((positive_span, negative_span)) + return DocResult(doc=doc, spans=spans) + + +@TaggerRegistry.add("ft_lang_id_scandi_doc") +class FastTextScandiLanguageDocumentTagger(BaseFastTextTagger): + """This tagger runs the FastText language detection model on each document. + The score is between 0 and 1 and provided for each language in LANGS. 
+ + The method is described in the following papers: + + @article{joulin2016bag, + title={Bag of Tricks for Efficient Text Classification}, + author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas}, + journal={arXiv preprint arXiv:1607.01759}, + year={2016} + } + @article{joulin2016fasttext, + title={FastText.zip: Compressing text classification models}, + author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas}, + journal={arXiv preprint arXiv:1612.03651}, + year={2016} + } + + The pretrained model is automatically downloaded (link publically available at): + https://fasttext.cc/docs/en/language-identification.html + + """ + + MODEL_PATH = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin" + + def __init__(self): + super().__init__( + model_path=self.MODEL_PATH, + model_mode=self.DOCUMENT_LEVEL_TAGGER, + ) + + def predict_slice(self, text_slice: TextSlice) -> Iterable[Prediction]: + pred = self.classifier.predict( + text_slice.text.lower().replace("\n", " ").strip(), + k=-1, + ) + # Initialize scores to 0 + scores = {k: 0.0 for k in LANGS.values()} + + for label, score in zip(*pred): + # label is of the form __label__[code] + label_code = label[-2:] + if label_code in scores: + scores[label_code] = score + + predictions_positive = [Prediction(label=k, score=v) for k, v in scores.items()] + predictions_negative = [ + Prediction(label=f"not_{k}", score=1.0 - v) for k, v in scores.items() + ] + + return predictions_positive + predictions_negative + + +@TaggerRegistry.add("ft_lang_id_scandi_paragraph") +class FastTextScandiLanguageParagraphTagger(FastTextScandiLanguageDocumentTagger): + """This tagger runs the FastText language detection model on each paragraph. + The score is between 0 and 1 and provided for each language in LANGS. + """ + + def __init__(self): + BaseFastTextTagger.__init__( + self, + model_path=self.MODEL_PATH, + model_mode=self.PARAGRAPH_LEVEL_TAGGER, + ) + + +def add_global_language_score_from_slice_score(result: DocResult) -> DocResult: + # the total document score is # of characters in each "lang" span multiplied by the likelihood + # of said span being lang + for lang in LANGS.values(): + try: + doc_lang_score = sum( + (s.end - s.start) * s.score for s in result.spans if s.type == lang + ) / len( + result.doc.text, + ) + doc_not_lang_score = 1 - doc_lang_score + except ZeroDivisionError: + doc_lang_score = doc_not_lang_score = 0.0 + + doc_level = ( + Span( + start=0, + end=len(result.doc.text), + type=f"doc_{lang}", + score=doc_lang_score, + ), + Span( + start=0, + end=len(result.doc.text), + type=f"doc_not_{lang}", + score=doc_not_lang_score, + ), + ) + result.spans.extend(doc_level) + return result + + +# Composite tagger that provides both paragraph and doc scores +@TaggerRegistry.add("cld2_scandi_paragraph_with_doc_score") +class Cld2ScandiLanguageParagraphWithDocScoreTagger( + Cld2ScandiLanguageParagraphTagger, +): + """This tagger runs the Compact Language Detect 2 model on each paragraph + and will also provide a total score for each document. 
+ The score is between 0 and 1 and provided for each language in LANGS.""" + + def predict(self, doc: Document) -> DocResult: + doc_result = super().predict(doc) + doc_result = add_global_language_score_from_slice_score(doc_result) + return doc_result + + +# Composite tagger that provides both paragraph and doc scores +@TaggerRegistry.add("ft_lang_id_scandi_paragraph_with_doc_score") +class FastTextScandiLanguageParagraphWithDocScoreTagger( + FastTextScandiLanguageParagraphTagger, +): + """This tagger runs the FastText language detection model on each paragraph, + and will also provide a total score for each document. + The score is between 0 and 1 and provided for each language in LANGS. + """ + + def predict(self, doc: Document) -> DocResult: + doc_result = super().predict(doc) + doc_result = add_global_language_score_from_slice_score(doc_result) + return doc_result diff --git a/src/dfm/common/data_cleaning/dolma_taggers/perplexity.py b/src/dfm/common/data_cleaning/dolma_taggers/perplexity.py new file mode 100644 index 00000000..1825c5c7 --- /dev/null +++ b/src/dfm/common/data_cleaning/dolma_taggers/perplexity.py @@ -0,0 +1,194 @@ +""" +Perplexity taggers + +This module contain taggers based on language models +""" +import hashlib +import logging +from pathlib import Path +from typing import Any, Self + +import blingfire +import kenlm +import requests +from dolma.core.data_types import DocResult, Document, Span +from dolma.core.registry import TaggerRegistry +from dolma.core.taggers import BaseTagger +from dolma.core.utils import split_paragraphs + +from dfm.common.data_cleaning.ccnet_text_normalizer import normalize + +ccnet_sha256 = { + "af.arpa.bin": "7278e70cb22e29e94942b103c0ba49f406a9369c2949199fdf8d4bee4b0ce48e", + "ar.arpa.bin": "85739ba1e022a4abd9eb260e6c67e8a4e7646f0717e2800d8dde1ec039b7f5e2", + "az.arpa.bin": "247fd2355db94b4357d19c78c8ac38ce16299d1dac237745edeea8005d7771ba", + "be.arpa.bin": "b23a70aa0cec41555932e6b4aaa5a361c95d091fbd6d4c21e6a48c866b9cd1e8", + "bg.arpa.bin": "1edb68d25238d692cb9cc6b2e4f9fce0e99b49b421020c8e89d0781507dbcd38", + "bn.arpa.bin": "f21c8187eb77d2d7d17892b61dc3446dab79a61d3d0af4f0c90660f9df500cb2", + "ca.arpa.bin": "1e4e84639fd9a35cbfa47709ca2cd9eefc84dcee7ab7d91df11e5f89f88312d4", + "cs.arpa.bin": "4f89f980c12cae596b19fccd9aebea4be5be86c6f81a8b42fc975922ea656bb1", + "da.arpa.bin": "b7f754b56421944ada2c979d0b11e8eada8308e179cb60fbc1acc4318b03695b", + "de.arpa.bin": "a5bc18a9741dc57593d7cce469350d5d2db8ce1e87be6c2ec450850316e586ba", + "el.arpa.bin": "8a53a69835d0a8e88c720fc052180c54973d2b6ac3ed2ff83c666d432a0d3686", + "en.arpa.bin": "e90c9b25af01dcaa2667ed45d012d891269760fc6eccfe8dbbd161eb20e01d7d", + "es.arpa.bin": "00121ab8c31f275132fc67c292392a33ff81b8eae1015103e8a86f9df2e642d4", + "et.arpa.bin": "7c4b98dc3f7fff73611afdd0dc1379437cb0b3dd3addc0abadb65864cabb937f", + "fa.arpa.bin": "05d00d4fdb31e00295a63e4df4187954d43850a8bd7b61c717f809b19fc94cfe", + "fi.arpa.bin": "56aa4a6890c4152be3d594e7f7dc353e78881500803f36586c1c01d88f906618", + "fr.arpa.bin": "4a52387916be57551013df3f9052ee031c042445940a4d0e69b066597586c6aa", + "gu.arpa.bin": "4ad5be86ef47f3105eb9d7d178520a0cede5d02e4ca61a3aa2d32c8322ca5bd1", + "he.arpa.bin": "69d1ab538beb6c8aa646b7c611b701ad2d1a19dcce00d6690072fa9453ad2f00", + "hi.arpa.bin": "b7173df087ff5b24d759fdbf8d07d8e21a31c1b54c978c7c5c71f05b24e12f47", + "hr.arpa.bin": "3ba8caf473415c4d12be594c36892f1454a71a08441ad796bf105ebe4e957a8f", + "hu.arpa.bin": "ce82ceb8a1e808fc441d985c4249c08c67d527937d26e3e524404185803723cf", + "hy.arpa.bin": 
"3c5c3511a82538ab198536e54df4e770c40d78bf5929a7143ab42695641a0031", + "id.arpa.bin": "8e871368fb386180df09d1dfb45f0319dba7a1955b9d209e498c49d96d07b3dd", + "is.arpa.bin": "287f6f7bd8130d50df8966169427b236e9aa79ff2b4250c5bdfdc2c9a0c19f52", + "it.arpa.bin": "784efb647bd699041809d59dd309193f78a47ea347d13b0c93c3bd74f437a53b", + "ja.arpa.bin": "efa96d229e2a84be705f81bc4ea1c6da79505e5c7f001f92586e16481e5b586a", + "ka.arpa.bin": "07477bd9166bc2c748532f1c3af65aad42740231c0dc1f8a4410764e0d626199", + "kk.arpa.bin": "3cec2b6c9b3ae34919dd23ff59148e81b76593d7ec17feefcd5e2829cd1643c0", + "km.arpa.bin": "84a09db4e1e7a70e1cd7c347d9729339e3eaa993f42b4bba4ba91fe0a84ff763", + "kn.arpa.bin": "f1e0e469c8c78ac4e3b62d348e966e658cf7b8f683aafa4a2b4d55ca1e7d756c", + "ko.arpa.bin": "7e345046786a1ac6dbb0d3d0fdd65d2ff0e8a848395dbc84c6152acee1987f5f", + "lt.arpa.bin": "ecc1703e098477503035d980f6be841b5359f8f5f55cc4f78087232c7da15398", + "lv.arpa.bin": "5f6212551d5de115309674eed8ea595f1375973832917dd285942a0ef8d6c7e7", + "mk.arpa.bin": "0915b0c452f5bc6dd254c4145fd09f1252ea5e17f13f48991c72cb98fa2ed804", + "ml.arpa.bin": "3f0cfbf0bdc6935229d6903df8cb60b4ed2b9ed2cb9d4c253266b13bd3211297", + "mn.arpa.bin": "c8e57fcf604d178d45fbe3b1650c04e715c41cb8151bf8b115dc88c52ebfba56", + "mr.arpa.bin": "e00986484585cd67deba5902c7da78566452e3c40fc9aa285218152563d33303", + "my.arpa.bin": "ac3496e2981ea3ad85673ca52e04f5aa8e7be68d1d94c2e73ce26436864ae217", + "ne.arpa.bin": "7ef6c2d3e4e1858fb207e6c200e422833ccf072157a6a0148b408db3e760d22e", + "nl.arpa.bin": "aa017d97061e84f51d7f74b83a6a43aef246974fc9a502436043f6f0e9e12bbb", + "no.arpa.bin": "0ec663c264d6580beebe7e0e80a939dbe7082af55af3875f292ebd11ea5800de", + "pl.arpa.bin": "b97634bca2b28d95716b951ceadca3de4a170ff07639bcdc3c73fc0961362e98", + "pt.arpa.bin": "f5a10774d7b7125c6e887b62c56fea2d348adebc81ab1708d34f68de722090e0", + "ro.arpa.bin": "619b9a2d4d53bdb368bfdf2cc770e1e9549d52b22d1fd3afc0ee8a022543ed56", + "ru.arpa.bin": "588da7d3e160f61f7e821804bc4d518460687e1c4832c339bb3a28c03417ab53", + "uk.arpa.bin": "bfd09bdfe669a9fd5f8f8d9be519bdce3fb678214bc6afd5ccce499930b7d311", + "zh.arpa.bin": "f157d94cb2828bbb44b5dddf38e7eb7f62a47d317917646a73fe2af50a3dad68", +} + + +def _get_ccnet_pretrained_lm(lang: str) -> Path: + # Download pretrained model and save to the data folder + url = f"http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin" + data_folder = Path("data_lm") + + Path.mkdir(data_folder, parents=True, exist_ok=True) + + filename = f"{lang}.arpa.bin" + file_path = data_folder / filename + + # Check if the file already exists + if not Path.exists(file_path): + # If the file does not exist, download it + logging.info(f"Downloading {lang} model...") + response = requests.get(url) + if response.status_code == requests.codes.ok: + sha256 = hashlib.sha256(response.content).hexdigest() + if sha256 != ccnet_sha256[filename]: + raise RuntimeError( + f"Checksum mismatch {sha256} != {ccnet_sha256[filename]}", + ) + with Path.open(file_path, "wb") as file: + file.write(response.content) + logging.info(f"{lang} model downloaded and saved at {file_path}") + else: + raise RuntimeError( + f"Failed to download {lang} model. 
Status code: {response.status_code}", + ) + else: + logging.info(f"{lang} model already exists at {file_path}") + + return file_path + + +def pp(log_score: float, length: float) -> float: + """Convert total log-probability to perplexity""" + return 10.0 ** (-log_score / length) + + +class PerplexityBaseTagger(BaseTagger): + """Base class for CCNet based perplexity tagger""" + + @property + def model(self: Self) -> kenlm.Model: + return self._model + + @model.setter + def model(self: Self, model: kenlm.Model): + self._model = model + + +def create_ccnet_perplexity_tagger(lang: str) -> type[PerplexityBaseTagger]: + """Dynamically create perplexity tagger class for a given language. + The class for each language is based on a CCNet pretrained model [1]. + The pretrained models are available throught the Github project page https://github.com/facebookresearch/cc_net. + The models are small language models trained on the Wikipedia of the corresponding language. + + [1] + @inproceedings{wenzek2020ccnet, + title={CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data}, + author={Wenzek, Guillaume and Lachaux, Marie-Anne and Conneau, Alexis and Chaudhary, Vishrav and Guzm{\'a}n, Francisco and Joulin, Armand and Grave, {\'E}douard}, + booktitle={Proceedings of The 12th Language Resources and Evaluation Conference}, + pages={4003--4012}, + year={2020} + } + """ + + def __init__(self: Any) -> None: + model_bin_path = _get_ccnet_pretrained_lm(lang) + self.model = kenlm.Model(str(model_bin_path)) + + def predict(self: PerplexityBaseTagger, doc: Document) -> DocResult: + paragraphs = split_paragraphs(doc.text) + spans: list[Span] = [] + doc_log_prob: float = 0.0 + doc_length: float = 0.0 + for paragraph in paragraphs: + # To get proper scores from the language model we need to normalize the text + # Do not remove accents as it removes æøå and others. + normalized_text = blingfire.normalize_spaces( + normalize(paragraph.text, accent=False), + ) + # The kenlm model expects end of sentence punctuation to be separated from words with spaces + # so we separate the words using blingfire. + normalized_words = blingfire.text_to_words(normalized_text) + log_prob = self.model.score(normalized_words) + length = len(normalized_words.split()) + 1 + doc_log_prob += log_prob + doc_length += length + paragraph_span = Span( + start=paragraph.start, + end=paragraph.end, + type="perplexity", + score=pp(log_prob, length), + ) + spans.append(paragraph_span) + + paragraph_span = Span( + start=0, + end=len(doc.text), + type="doc_perplexity", + score=pp(doc_log_prob, doc_length), + ) + return DocResult(doc=doc, spans=spans) + + # Build the class dynamically from base class + # and methods. + cls = type( + f"CCNetPerplexity{lang}Tagger", + (PerplexityBaseTagger,), + { + "__init__": __init__, + "predict": predict, + }, + ) + # Add the class decorator explicitly to add the tagger to the registry + cls = TaggerRegistry.add(f"ccnet_perplexity_paragraph_w_doc_{lang}")(cls) + return cls + + +for lang in ["da", "en", "is", "no", "sv"]: + create_ccnet_perplexity_tagger(lang) diff --git a/typings/blingfire/__init__.pyi b/typings/blingfire/__init__.pyi new file mode 100644 index 00000000..20da73f7 --- /dev/null +++ b/typings/blingfire/__init__.pyi @@ -0,0 +1,54 @@ +""" +This type stub file was generated by pyright. +""" +# def text_to_sentences(s: str): # -> Any | Literal['']: +# ... +# +# def text_to_sentences_with_model(h, s): # -> Any | Literal['']: +# ... 
+ +def normalize_spaces(s: str, uSpace: int = 0x20) -> str: # -> Any | Literal['']: + ... + +def text_to_words(s: str) -> str: # -> Any | Literal['']: + ... + +# Uncomment lines that are used in project +# def text_to_words_with_model(h, s): # -> Any | Literal['']: +# ... +# +# def word_hyphenation_with_model(h, s, uHy=...): # -> Any | Literal['']: +# ... +# +# def get_blingfiretok_version(): # -> Any: +# ... +# +# def text_to_hashes(s, word_n_grams, bucketSize): # -> NDArray[Any] | None: +# ... +# +# def text_to_token_with_offsets(s, text_to_token_f, split_byte): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def text_to_words_with_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def text_to_sentences_and_offsets(s): # -> tuple[Literal[''], list[Unknown]] | tuple[Any, list[tuple[Unknown, Unknown]]]: +# ... +# +# def load_model(file_name): # -> Any: +# ... +# +# def free_model(h): # -> None: +# ... +# +# def text_to_ids(h, s, max_len, unk=..., no_padding=...): # -> NDArray[Any]: +# ... +# +# def ids_to_text(h, ids, skip_special_tokens=..., output_buffer_size=...): # -> Any | Literal['']: +# ... +# +# def utf8text_to_ids_with_offsets(h, s_bytes, max_len, unk=..., no_padding=...): # -> tuple[NDArray[Any], NDArray[Any], NDArray[Any]]: +# ... +# +# def change_settings_dummy_prefix(h, add_prefix): # -> None: +# ... diff --git a/typings/fasttext/FastText.pyi b/typings/fasttext/FastText.pyi new file mode 100644 index 00000000..4dde0e7b --- /dev/null +++ b/typings/fasttext/FastText.pyi @@ -0,0 +1,264 @@ +""" +This type stub file was initially generated by pyright +""" +from typing import Iterable + +loss_name = ... +model_name = ... +EOS = ... +BOW = ... +EOW = ... +displayed_errors = ... + +def eprint(*args, **kwargs): # -> None: + ... + +class _Meter: + def __init__(self, fasttext_model, meter) -> None: ... + def score_vs_true(self, label): # -> tuple[NDArray[Unknown], NDArray[Any]]: + """Return scores and the gold of each sample for a specific label""" + ... + def precision_recall_curve( + self, label=... + ): # -> tuple[NDArray[Unknown], NDArray[Any]]: + """Return precision/recall curve""" + ... + def precision_at_recall(self, recall, label=...): + """Return precision for a given recall""" + ... + def recall_at_precision(self, precision, label=...): + """Return recall for a given precision""" + ... + +class _FastText: + """ + This class defines the API to inspect models and should not be used to + create objects. It will be returned by functions such as load_model or + train. + + In general this API assumes to be given only unicode for Python2 and the + Python3 equvalent called str for any string-like arguments. All unicode + strings are then encoded as UTF-8 and fed to the fastText C++ API. + """ + + def __init__(self, model_path=..., args=...) -> None: ... + def set_args(self, args=...): # -> None: + ... + def is_quantized(self): ... + def get_dimension(self): + """Get the dimension (size) of a lookup vector (hidden layer).""" + ... + def get_word_vector(self, word): # -> NDArray[Unknown]: + """Get the vector representation of word.""" + ... + def get_sentence_vector(self, text): # -> NDArray[Unknown]: + """ + Given a string, get a single vector represenation. This function + assumes to be given a single line of text. We split words on + whitespace (space, newline, tab, vertical tab) and the control + characters carriage return, formfeed and the null character. + """ + ... 
+ def get_nearest_neighbors(self, word, k=..., on_unicode_error=...): ... + def get_analogies(self, wordA, wordB, wordC, k=..., on_unicode_error=...): ... + def get_word_id(self, word): + """ + Given a word, get the word id within the dictionary. + Returns -1 if word is not in the dictionary. + """ + ... + def get_label_id(self, label): + """ + Given a label, get the label id within the dictionary. + Returns -1 if label is not in the dictionary. + """ + ... + def get_subword_id(self, subword): + """ + Given a subword, return the index (within input matrix) it hashes to. + """ + ... + def get_subwords( + self, word, on_unicode_error=... + ): # -> tuple[Unknown, NDArray[Unknown]]: + """ + Given a word, get the subwords and their indicies. + """ + ... + def get_input_vector(self, ind): # -> NDArray[Unknown]: + """ + Given an index, get the corresponding vector of the Input Matrix. + """ + ... + def predict( + self, + text: str, + k: int = ..., + threshold: float = ..., + on_unicode_error: str = ..., + ) -> Iterable[ + tuple[str, float] + ]: # -> tuple[Unknown, Unknown] | tuple[Any | tuple[()], NDArray[Unknown]]: + """ + Given a string, get a list of labels and a list of + corresponding probabilities. k controls the number + of returned labels. A choice of 5, will return the 5 + most probable labels. By default this returns only + the most likely label and probability. threshold filters + the returned labels by a threshold on probability. A + choice of 0.5 will return labels with at least 0.5 + probability. k and threshold will be applied together to + determine the returned labels. + + This function assumes to be given + a single line of text. We split words on whitespace (space, + newline, tab, vertical tab) and the control characters carriage + return, formfeed and the null character. + + If the model is not supervised, this function will throw a ValueError. + + If given a list of strings, it will return a list of results as usually + received for a single line of text. + """ + ... + def get_input_matrix(self): # -> NDArray[Unknown]: + """ + Get a reference to the full input matrix of a Model. This only + works if the model is not quantized. + """ + ... + def get_output_matrix(self): # -> NDArray[Unknown]: + """ + Get a reference to the full output matrix of a Model. This only + works if the model is not quantized. + """ + ... + def get_words( + self, include_freq=..., on_unicode_error=... + ): # -> tuple[Unknown, NDArray[Unknown]]: + """ + Get the entire list of words of the dictionary optionally + including the frequency of the individual words. This + does not include any subwords. For that please consult + the function get_subwords. + """ + ... + def get_labels( + self, include_freq=..., on_unicode_error=... + ): # -> tuple[Unknown, NDArray[Unknown]]: + """ + Get the entire list of labels of the dictionary optionally + including the frequency of the individual labels. Unsupervised + models use words as labels, which is why get_labels + will call and return get_words for this type of + model. + """ + ... + def get_line(self, text, on_unicode_error=...): + """ + Split a line of text into words and labels. Labels must start with + the prefix used to create the model (__label__ by default). + """ + ... + def save_model(self, path): # -> None: + """Save the model to the given path""" + ... + def test(self, path, k=..., threshold=...): + """Evaluate supervised model using file given by path""" + ... 
+ def test_label(self, path, k=..., threshold=...): + """ + Return the precision and recall score for each label. + + The returned value is a dictionary, where the key is the label. + For example: + f.test_label(...) + {'__label__italian-cuisine' : {'precision' : 0.7, 'recall' : 0.74}} + """ + ... + def get_meter(self, path, k=...): # -> _Meter: + ... + def quantize( + self, + input=..., + qout=..., + cutoff=..., + retrain=..., + epoch=..., + lr=..., + thread=..., + verbose=..., + dsub=..., + qnorm=..., + ): # -> None: + """ + Quantize the model reducing the size of the model and + it's memory footprint. + """ + ... + def set_matrices(self, input_matrix, output_matrix): # -> None: + """ + Set input and output matrices. This function assumes you know what you + are doing. + """ + ... + @property + def words(self): # -> tuple[Unknown, NDArray[Unknown]]: + ... + @property + def labels(self): # -> tuple[Unknown, NDArray[Unknown]]: + ... + def __getitem__(self, word): # -> NDArray[Unknown]: + ... + def __contains__(self, word): # -> bool: + ... + +def tokenize(text): + """Given a string of text, tokenize it and return a list of tokens""" + ... + +def load_model(path): # -> _FastText: + """Load a model given a filepath and return a model object.""" + ... + +unsupervised_default = ... + +def read_args( + arg_list, arg_dict, arg_names, default_values +): # -> tuple[dict[Unknown, Unknown], set[Unknown]]: + ... + +def train_supervised(*kargs, **kwargs): # -> _FastText: + """ + Train a supervised model and return a model object. + + input must be a filepath. The input text does not need to be tokenized + as per the tokenize function, but it must be preprocessed and encoded + as UTF-8. You might want to consult standard preprocessing scripts such + as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html + + The input file must must contain at least one label per line. For an + example consult the example datasets which are part of the fastText + repository such as the dataset pulled by classification-example.sh. + """ + ... + +def train_unsupervised(*kargs, **kwargs): # -> _FastText: + """ + Train an unsupervised model and return a model object. + + input must be a filepath. The input text does not need to be tokenized + as per the tokenize function, but it must be preprocessed and encoded + as UTF-8. You might want to consult standard preprocessing scripts such + as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html + + The input field must not contain any labels or use the specified label prefix + unless it is ok for those words to be ignored. For an example consult the + dataset pulled by the example script word-vector-example.sh, which is + part of the fastText repository. + """ + ... + +def cbow(*kargs, **kwargs): ... +def skipgram(*kargs, **kwargs): ... +def supervised(*kargs, **kwargs): ... diff --git a/typings/fasttext/__init__.pyi b/typings/fasttext/__init__.pyi new file mode 100644 index 00000000..cbf98ace --- /dev/null +++ b/typings/fasttext/__init__.pyi @@ -0,0 +1,17 @@ +""" +This type stub file was generated by pyright. 
+""" + +from __future__ import absolute_import, division, print_function, unicode_literals +from .FastText import ( + BOW, + EOS, + EOW, + cbow, + load_model, + skipgram, + supervised, + tokenize, + train_supervised, + train_unsupervised, +) diff --git a/typings/fasttext/tests/__init__.pyi b/typings/fasttext/tests/__init__.pyi new file mode 100644 index 00000000..8cec2d4a --- /dev/null +++ b/typings/fasttext/tests/__init__.pyi @@ -0,0 +1,7 @@ +""" +This type stub file was generated by pyright. +""" + +from __future__ import absolute_import, division, print_function, unicode_literals +from .test_configurations import get_supervised_models +from .test_script import gen_tests, gen_unit_tests diff --git a/typings/fasttext/tests/test_configurations.pyi b/typings/fasttext/tests/test_configurations.pyi new file mode 100644 index 00000000..6930c565 --- /dev/null +++ b/typings/fasttext/tests/test_configurations.pyi @@ -0,0 +1,26 @@ +""" +This type stub file was generated by pyright. +""" + +def max_thread(): # -> int: + ... + +def check_supervised_configuration(configuration, verbose=...): ... +def check_supervised_configurations(configurations, verbose=...): ... +def flickr_job(thread=...): # -> dict[Unknown, Unknown]: + ... + +def langid_job1(thread=...): # -> dict[Unknown, Unknown]: + ... + +def langid_job2(thread=...): # -> dict[Unknown, Unknown]: + ... + +def cooking_job1(thread=...): # -> dict[Unknown, Unknown]: + ... + +def cooking_job2(thread=...): # -> dict[Unknown, Unknown]: + ... + +def get_supervised_models(thread=..., verbose=...): # -> list[Unknown]: + ... diff --git a/typings/fasttext/tests/test_script.pyi b/typings/fasttext/tests/test_script.pyi new file mode 100644 index 00000000..632d8d85 --- /dev/null +++ b/typings/fasttext/tests/test_script.pyi @@ -0,0 +1,82 @@ +""" +This type stub file was generated by pyright. +""" + +import unittest + +def eprint(cls, *args, **kwargs): # -> None: + ... + +def get_random_unicode(length): # -> str: + ... + +def get_random_words(N, a=..., b=..., unique=...): # -> list[Unknown]: + ... + +def get_random_data( + num_lines=..., + max_vocab_size=..., + min_words_line=..., + max_words_line=..., + min_len_word=..., + max_len_word=..., + unique_words=..., +): # -> list[Unknown]: + ... + +def default_kwargs(kwargs): ... +def build_unsupervised_model(data, kwargs): # -> _FastText: + ... + +def build_supervised_model(data, kwargs): # -> _FastText: + ... + +def read_labels(data_file): # -> tuple[list[Unknown], list[Unknown]]: + ... + +class TestFastTextUnitPy(unittest.TestCase): + def gen_test_get_vector(self, kwargs): # -> None: + ... + def gen_test_multi_get_line(self, kwargs): # -> None: + ... + def gen_test_supervised_util_test(self, kwargs): # -> None: + ... + def gen_test_supervised_predict(self, kwargs): # -> None: + ... + def gen_test_supervised_multiline_predict(self, kwargs): # -> None: + ... + def gen_test_vocab(self, kwargs): # -> None: + ... + def gen_test_subwords(self, kwargs): # -> None: + ... + def gen_test_tokenize(self, kwargs): # -> None: + ... + def gen_test_unsupervised_dimension(self, kwargs): # -> None: + ... + def gen_test_supervised_dimension(self, kwargs): # -> None: + ... + def gen_test_subword_vector(self, kwargs): # -> None: + ... + def gen_test_unsupervised_get_words(self, kwargs): # -> None: + ... + def gen_test_supervised_get_words(self, kwargs): # -> None: + ... + def gen_test_unsupervised_get_labels(self, kwargs): # -> None: + ... + def gen_test_supervised_get_labels(self, kwargs): # -> None: + ... 
+ def gen_test_unsupervised_exercise_is_quant(self, kwargs): # -> None: + ... + def gen_test_supervised_exercise_is_quant(self, kwargs): # -> None: + ... + def gen_test_newline_predict_sentence(self, kwargs): # -> None: + ... + +def gen_sup_test(configuration, data_dir): # -> (self: Unknown) -> None: + ... + +def gen_unit_tests(verbose=...): # -> type[TestFastTextUnitPy]: + ... + +def gen_tests(data_dir, verbose=...): # -> type[TestFastTextPy]: + class TestFastTextPy(unittest.TestCase): ... diff --git a/typings/fasttext/util/__init__.pyi b/typings/fasttext/util/__init__.pyi new file mode 100644 index 00000000..87465a71 --- /dev/null +++ b/typings/fasttext/util/__init__.pyi @@ -0,0 +1,6 @@ +""" +This type stub file was generated by pyright. +""" + +from __future__ import absolute_import, division, print_function, unicode_literals +from .util import download_model, find_nearest_neighbor, reduce_model, test diff --git a/typings/fasttext/util/util.pyi b/typings/fasttext/util/util.pyi new file mode 100644 index 00000000..16d2c81a --- /dev/null +++ b/typings/fasttext/util/util.pyi @@ -0,0 +1,39 @@ +""" +This type stub file was generated by pyright. +""" + +valid_lang_ids = ... + +def test(predictions, labels, k=...): # -> tuple[float, float]: + """ + Return precision and recall modeled after fasttext's test + """ + ... + +def find_nearest_neighbor(query, vectors, ban_set, cossims=...): # -> Any: + """ + query is a 1d numpy array corresponding to the vector to which you want to + find the closest vector + vectors is a 2d numpy array corresponding to the vectors you want to consider + ban_set is a set of indicies within vectors you want to ignore for nearest match + cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency + + returns the index of the closest match to query within vectors + + """ + ... + +def reduce_model(ft_model, target_dim): + """ + ft_model is an instance of `_FastText` class + This function computes the PCA of the input and the output matrices + and sets the reduced ones. + """ + ... + +def download_model(lang_id, if_exists=..., dimension=...): # -> None: + """ + Download pre-trained common-crawl vectors from fastText's website + https://fasttext.cc/docs/en/crawl-vectors.html + """ + ... diff --git a/typings/kenlm/__init__.pyi b/typings/kenlm/__init__.pyi new file mode 100644 index 00000000..40f566db --- /dev/null +++ b/typings/kenlm/__init__.pyi @@ -0,0 +1,7 @@ +""" +Type stub for kenlm +""" + +class Model: + def __init__(self, model_bin_path: str) -> None: ... + def score(self, sentence: str) -> float: ... diff --git a/typings/pycld2/__init__.pyi b/typings/pycld2/__init__.pyi new file mode 100644 index 00000000..5b478a63 --- /dev/null +++ b/typings/pycld2/__init__.pyi @@ -0,0 +1,21 @@ +""" +Type stub file for pycld2 +""" + +from typing import Union, TypeAlias + +from pycld2 import DETECTED_LANGUAGES, ENCODINGS, LANGUAGES, VERSION, __version__, error + +IsReliable: TypeAlias = bool +TextBytesFound: TypeAlias = int +DetectDetails: TypeAlias = tuple[tuple[str, str, int, float], ...] +Vectors: TypeAlias = tuple[tuple[int, int, str, str], ...] + +def detect( + text: str, returnVectors: bool = False +) -> Union[ + tuple[IsReliable, TextBytesFound, DetectDetails], + tuple[IsReliable, TextBytesFound, DetectDetails, Vectors], +]: ... + +__all__ = ("DETECTED_LANGUAGES", "ENCODINGS", "LANGUAGES", "VERSION", "detect", "error")
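
For reviewers who want to smoke-test the new taggers locally after `pip install -e .`, here is a minimal sketch. It assumes two things not shown in this diff: that the Document constructor on the pinned dolma revision accepts source/version/id/text keyword arguments, and that TaggerRegistry exposes a get() lookup for registered tagger names; adjust those two calls if the actual dolma API differs. Everything else (module paths, class names, tagger registry names, the normalize() signature) comes from the files added above.

    from dolma.core.data_types import Document
    from dolma.core.registry import TaggerRegistry

    from dfm.common.data_cleaning.ccnet_text_normalizer import normalize
    from dfm.common.data_cleaning.dolma_taggers.language_scandi import (
        Cld2ScandiLanguageTagger,
    )

    # Importing the module runs the create_ccnet_perplexity_tagger() loop and
    # registers the ccnet_perplexity_paragraph_w_doc_* taggers.
    import dfm.common.data_cleaning.dolma_taggers.perplexity  # noqa: F401

    # NOTE: the keyword names below are an assumption about the pinned dolma revision.
    doc = Document(
        source="example",
        version=None,
        id="0",
        text="Dette er en kort dansk tekst om sprogmodeller.",
    )

    # Document-level CLD2 scores: one span per language code in LANGS plus the
    # matching "not_<lang>" span.
    for span in Cld2ScandiLanguageTagger().predict(doc).spans:
        print(span.type, round(span.score, 3))

    # The perplexity taggers are generated dynamically, so the registry lookup
    # (assumed to be TaggerRegistry.get) is the easiest way to reach one.
    # Instantiation downloads and checksums the CCNet KenLM model on first use.
    danish_ppl = TaggerRegistry.get("ccnet_perplexity_paragraph_w_doc_da")()
    print(danish_ppl.predict(doc).spans)

    # The CCNet normalizer is called with accent=False so æ/ø/å survive before
    # the text reaches the KenLM model; digits are mapped to 0 and case is folded.
    print(normalize("Østergade 12, København Ø!", accent=False))

Note that the FastText-based taggers and the perplexity taggers both fetch their pretrained models over the network on first instantiation, so the first run of the sketch above needs internet access and some disk space under data_lm/.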