Merge pull request #186 from centre-for-humanities-computing/dolma_taggers

Dolma taggers
peterbjorgensen authored Nov 15, 2023
2 parents 1988e05 + ac4ebd2 commit 29aecb3
Showing 19 changed files with 1,203 additions and 2 deletions.
20 changes: 19 additions & 1 deletion Dockerfile.dev
@@ -1,5 +1,23 @@
FROM python:3.11-bullseye

# Update the package index
RUN apt-get -qq update

# Get Debian packages (the base image is Debian bullseye)
RUN apt-get install -y -q \
    build-essential \
    curl \
    cmake

# NOTE: no need to run update again at this point
# RUN apt-get update

# Get Rust; NOTE: using sh for better compatibility with other base images
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y

# Add .cargo/bin to PATH
ENV PATH="/root/.cargo/bin:${PATH}"

# Set the working directory to /app
WORKDIR /app

@@ -10,4 +28,4 @@ RUN make install

# Install the app
COPY . /app
RUN pip install -e .
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -16,7 +16,13 @@ classifiers = [
]
requires-python = ">=3.10"

dependencies = ["pydantic==1.8.2"]
dependencies = [
"pydantic>=2.4.2", # dolma does not work with very old versions of pydantic
"dolma@git+https://github.com/allenai/dolma.git@5a010a2685914b1db7744426abfb4b9ece52da95", # Install from git until a 0.9.2 package is released
"kenlm>=0.2.0", # Used for perplexity tagging
"blingfire>=0.1.8", # Used for perplexity tagging
"requests>=2.31.0",
]

[project.optional-dependencies]
dev = ["black==23.9.1", "ruff==0.1.0", "pyright==1.1.331", "pre-commit==3.5.0"]
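The two new tagging dependencies divide the work: blingfire splits documents into sentences, and kenlm scores those sentences under an n-gram language model. Below is a minimal sketch of how they are typically combined for perplexity tagging; the model filename is hypothetical, and this is an illustration rather than the tagger code added by this commit.

import kenlm
from blingfire import text_to_sentences

model = kenlm.Model("wikipedia.da.arpa.bin")  # hypothetical KenLM model file

def document_perplexity(text: str) -> float:
    """Average per-word perplexity of a document under the KenLM model."""
    log10_prob = 0.0
    n_words = 0
    for sentence in text_to_sentences(text).split("\n"):
        log10_prob += model.score(sentence)  # log10 P(sentence), with BOS/EOS
        n_words += len(sentence.split()) + 1  # +1 for the end-of-sentence token
    return 10.0 ** (-log10_prob / max(n_words, 1))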
4 changes: 4 additions & 0 deletions src/dfm/__init__.py
@@ -0,0 +1,4 @@
import importlib.metadata

# Fetches the version of the package as defined in pyproject.toml
__version__ = importlib.metadata.version(__package__)
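Once the package is installed (pip install -e ., as in Dockerfile.dev above), the version can be read back from the installed metadata:

import dfm
print(dfm.__version__)  # the version string declared in pyproject.toml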
Empty file added src/dfm/common/__init__.py
203 changes: 203 additions & 0 deletions src/dfm/common/data_cleaning/ccnet_text_normalizer.py
@@ -0,0 +1,203 @@
# This file has initially been copied from the ccnet repository from Facebook.
# https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py
# The utility functions can be used to normalize text before processing it
# with ccnet models, but might not be the best general purpose implementation.
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# This file is full of ambiguous characters, so disable ruff check for those.
# ruff: noqa: RUF001

import re
import unicodedata
from typing import Literal

UNICODE_PUNCT = {
    "，": ",",
    "。": ".",
    "、": ",",
    "„": '"',
    "”": '"',
    "“": '"',
    "«": '"',
    "»": '"',
    "１": '"',
    "」": '"',
    "「": '"',
    "《": '"',
    "》": '"',
    "´": "'",
    "∶": ":",
    "：": ":",
    "？": "?",
    "！": "!",
    "（": "(",
    "）": ")",
    "；": ";",
    "–": "-",
    "—": " - ",
    "．": ". ",
    "～": "~",
    "’": "'",
    "…": "...",
    "━": "-",
    "〈": "<",
    "〉": ">",
    "【": "[",
    "】": "]",
    "％": "%",
    "►": "-",
}

UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]")


def replace_unicode_punct(text: str) -> str:
    return "".join(UNICODE_PUNCT.get(c, c) for c in text)


def remove_unicode_punct(text: str) -> str:
    """More aggressive version of replace_unicode_punct but also faster."""
    return UNICODE_PUNCT_RE.sub("", text)
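# For example (illustrative): replace_unicode_punct('«Hej»') gives '"Hej"',
# whereas remove_unicode_punct('«Hej»') gives 'Hej'.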


def strip_accents(line: str) -> str:
    """Strips accents from a piece of text."""
    nfd = unicodedata.normalize("NFD", line)
    output = [c for c in nfd if unicodedata.category(c) != "Mn"]
    return "".join(output)


# Build a regex matching all control characters.
NON_PRINTING_CHARS_RE = re.compile(
    f"[{''.join(map(chr, list(range(32)) + list(range(127, 160))))}]",
)
DIGIT_RE = re.compile(r"\d")
# Concatenating the two patterns and dropping the inner "][" merges the two
# character classes into a single one.
PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(
    (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", ""),
)


def remove_non_printing_char(text: str) -> str:
    return NON_PRINTING_CHARS_RE.sub("", text)


def normalize_spacing_for_tok(text: str, language: str = "en") -> str:
    res = (
        text.replace("\r", "")
        # remove extra spaces
        .replace("(", " (")
        .replace(")", ") ")
        .replace(" +", " ")
    )
    res = re.sub(r"\) ([\.\!\:\?\;\,])", r"\)\1", res)
    res = res.replace("( ", "(").replace(" )", ")")
    res = re.sub(r"(\d) \%", r"\1\%", res)
    res = res.replace(" :", ":").replace(" ;", ";")
    res = res.replace("`", "'").replace("''", ' " ')

    # NOTE: several patterns below (e.g. " +" and "([a-z])‘([a-z])") are passed
    # to str.replace, so they match literally rather than as regexes; likewise
    # the r"\1\%" replacement above keeps its backslash, since Python leaves
    # unknown escapes like \% alone. These quirks are inherited verbatim from
    # the ccnet port of the Moses normalization scripts. "\u00a0" denotes a
    # non-breaking space.
    res = (
        res.replace("„", '"')
        .replace("“", '"')
        .replace("”", '"')
        .replace("–", "-")
        .replace("—", " - ")
        .replace(" +", " ")
        .replace("´", "'")
        .replace("([a-z])‘([a-z])", r"\1'\2/")
        .replace("([a-z])’([a-z])", r"\1'\2/")
        .replace("‘", '"')
        .replace("‚", '"')
        .replace("’", '"')
        .replace("''", '"')
        .replace("´´", '"')
        .replace("…", "...")
        # French quotes
        .replace(" « ", ' "')
        .replace("« ", '"')
        .replace("«", '"')
        .replace(" » ", '" ')
        .replace(" »", '"')
        .replace("»", '"')
        # handle pseudo-spaces (non-breaking spaces in the source text)
        .replace("\u00a0%", "%")
        .replace("nº\u00a0", "nº ")
        .replace("\u00a0:", ":")
        .replace("\u00a0ºC", " ºC")
        .replace("\u00a0cm", " cm")
        .replace("\u00a0?", "?")
        .replace("\u00a0!", "!")
        .replace("\u00a0;", ";")
        .replace("， ", ", ")
        .replace(" +", " ")
        .replace("．", ". ")
    )
    # English "quotation," followed by comma, style
    if language == "en":
        res = re.sub(r"\"([,\.]+)", r"\1\"", res)
    # Czech is confused
    elif language == "cs" or language == "cz":
        pass
    # German/Spanish/French "quotation", followed by comma, style
    else:
        res = res.replace(',"', '",')
        res = re.sub(
            r"(\.+)\"(\s*[^<])",
            r"\"\1\2",
            res,
        )  # don't fix period at end of sentence

    if (
        language == "de"
        or language == "es"
        or language == "cz"
        or language == "cs"
        or language == "fr"
    ):
        res = re.sub(r"(\d) (\d)", r"\1,\2", res)
    else:
        res = re.sub(r"(\d) (\d)", r"\1.\2", res)
    return res


def normalize(
    line: str,
    accent: bool = True,
    case: bool = True,
    numbers: bool = True,
    punct: Literal[1, 2] = 1,
) -> str:
    line = line.strip()
    if not line:
        return line
    if case:
        line = line.lower()
    if accent:
        line = strip_accents(line)
    if numbers:
        line = DIGIT_RE.sub("0", line)
    if punct == 1:
        line = replace_unicode_punct(line)
    elif punct == 2:
        line = remove_unicode_punct(line)
    line = remove_non_printing_char(line)
    return line


def slow_normalize_for_dedup(line: str) -> str:
    return normalize(line, accent=False, case=True, numbers=True, punct=2)


def normalize_for_dedup(line: str) -> str:
    line = line.strip()
    if not line:
        return line
    # case
    line = line.lower()
    # numbers
    line = DIGIT_RE.sub("0", line)
    line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line)
    return line
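As a quick illustration of normalize (worked out by hand from the mapping tables above, not taken from this repository's tests), the default settings lowercase the text, strip accents, mask digits with 0, and map typographic punctuation to ASCII:

normalize("Café Pingüino – 25 años…")  # -> "cafe pinguino - 00 anos..."

normalize_for_dedup instead deletes the matched punctuation outright, which is faster and sufficient for building deduplication keys.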