Merge pull request #186 from centre-for-humanities-computing/dolma_taggers

Dolma taggers
peterbjorgensen authored Nov 15, 2023
2 parents 1988e05 + ac4ebd2 commit 29aecb3
Showing 19 changed files with 1,203 additions and 2 deletions.
20 changes: 19 additions & 1 deletion Dockerfile.dev
@@ -1,5 +1,23 @@
FROM python:3.11-bullseye

# Update the package index
RUN apt-get -qq update

# Get Debian packages (the base image is Debian bullseye)
RUN apt-get install -y -q \
    build-essential \
    curl \
    cmake

# NOTE: no need to run update again at this point
# RUN apt-get update

# Get Rust; NOTE: using sh for better compatibility with other base images
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y

# Add .cargo/bin to PATH
ENV PATH="/root/.cargo/bin:${PATH}"

# Set the working directory to /app
WORKDIR /app

@@ -10,4 +28,4 @@ RUN make install

# Install the app
COPY . /app
RUN pip install -e .
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -16,7 +16,13 @@ classifiers = [
]
requires-python = ">=3.10"

dependencies = ["pydantic==1.8.2"]
dependencies = [
"pydantic>=2.4.2", # dolma does not work with very old versions of pydantic
"dolma@git+https://github.com/allenai/dolma.git@5a010a2685914b1db7744426abfb4b9ece52da95", # Install from git until a 0.9.2 package is released
"kenlm>=0.2.0", # Used for perplexity tagging
"blingfire>=0.1.8", # Used for perplexity tagging
"requests>=2.31.0",
]

[project.optional-dependencies]
dev = ["black==23.9.1", "ruff==0.1.0", "pyright==1.1.331", "pre-commit==3.5.0"]
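The two new tagging dependencies divide the work: blingfire splits documents into sentences, and kenlm scores those sentences under an n-gram language model. Below is a minimal sketch of how they are typically combined for perplexity tagging; the model filename is hypothetical, and this is an illustration rather than the tagger code added by this commit.

import kenlm
from blingfire import text_to_sentences

model = kenlm.Model("wikipedia.da.arpa.bin")  # hypothetical KenLM model file

def document_perplexity(text: str) -> float:
    """Average per-word perplexity of a document under the KenLM model."""
    log10_prob = 0.0
    n_words = 0
    for sentence in text_to_sentences(text).split("\n"):
        log10_prob += model.score(sentence)  # log10 P(sentence), with BOS/EOS
        n_words += len(sentence.split()) + 1  # +1 for the end-of-sentence token
    return 10.0 ** (-log10_prob / max(n_words, 1))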
4 changes: 4 additions & 0 deletions src/dfm/__init__.py
@@ -0,0 +1,4 @@
import importlib.metadata

# Fetches the version of the package as defined in pyproject.toml
__version__ = importlib.metadata.version(__package__)
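Once the package is installed (pip install -e ., as in Dockerfile.dev above), the version can be read back from the installed metadata:

import dfm
print(dfm.__version__)  # the version string declared in pyproject.toml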
Empty file added src/dfm/common/__init__.py
203 changes: 203 additions & 0 deletions src/dfm/common/data_cleaning/ccnet_text_normalizer.py
@@ -0,0 +1,203 @@
# This file has initially been copied from the ccnet repository from Facebook.
# https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py
# The utility functions can be used to normalize text before processing it
# with ccnet models, but might not be the best general purpose implementation.
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# This file is full of ambiguous characters, so disable ruff check for those.
# ruff: noqa: RUF001

import re
import unicodedata
from typing import Literal

UNICODE_PUNCT = {
    "，": ",",
    "。": ".",
    "、": ",",
    "„": '"',
    "”": '"',
    "“": '"',
    "«": '"',
    "»": '"',
    "１": '"',
    "」": '"',
    "「": '"',
    "《": '"',
    "》": '"',
    "´": "'",
    "∶": ":",
    "：": ":",
    "？": "?",
    "！": "!",
    "（": "(",
    "）": ")",
    "；": ";",
    "–": "-",
    "—": " - ",
    "．": ". ",
    "～": "~",
    "’": "'",
    "…": "...",
    "━": "-",
    "〈": "<",
    "〉": ">",
    "【": "[",
    "】": "]",
    "％": "%",
    "►": "-",
}

UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]")


def replace_unicode_punct(text: str) -> str:
    return "".join(UNICODE_PUNCT.get(c, c) for c in text)


def remove_unicode_punct(text: str) -> str:
    """More aggressive version of replace_unicode_punct but also faster."""
    return UNICODE_PUNCT_RE.sub("", text)
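# For example (illustrative): replace_unicode_punct('«Hej»') gives '"Hej"',
# whereas remove_unicode_punct('«Hej»') gives 'Hej'.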


def strip_accents(line: str) -> str:
    """Strips accents from a piece of text."""
    nfd = unicodedata.normalize("NFD", line)
    output = [c for c in nfd if unicodedata.category(c) != "Mn"]
    return "".join(output)


# Build a regex matching all control characters.
NON_PRINTING_CHARS_RE = re.compile(
    f"[{''.join(map(chr, list(range(32)) + list(range(127, 160))))}]",
)
DIGIT_RE = re.compile(r"\d")
# Concatenating the two patterns and dropping the inner "][" merges the two
# character classes into a single one.
PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(
    (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", ""),
)


def remove_non_printing_char(text: str) -> str:
    return NON_PRINTING_CHARS_RE.sub("", text)


def normalize_spacing_for_tok(text: str, language: str = "en") -> str:
    res = (
        text.replace("\r", "")
        # remove extra spaces
        .replace("(", " (")
        .replace(")", ") ")
        .replace(" +", " ")
    )
    res = re.sub(r"\) ([\.\!\:\?\;\,])", r"\)\1", res)
    res = res.replace("( ", "(").replace(" )", ")")
    res = re.sub(r"(\d) \%", r"\1\%", res)
    res = res.replace(" :", ":").replace(" ;", ";")
    res = res.replace("`", "'").replace("''", ' " ')

    # NOTE: several patterns below (e.g. " +" and "([a-z])‘([a-z])") are passed
    # to str.replace, so they match literally rather than as regexes; likewise
    # the r"\1\%" replacement above keeps its backslash, since Python leaves
    # unknown escapes like \% alone. These quirks are inherited verbatim from
    # the ccnet port of the Moses normalization scripts. "\u00a0" denotes a
    # non-breaking space.
    res = (
        res.replace("„", '"')
        .replace("“", '"')
        .replace("”", '"')
        .replace("–", "-")
        .replace("—", " - ")
        .replace(" +", " ")
        .replace("´", "'")
        .replace("([a-z])‘([a-z])", r"\1'\2/")
        .replace("([a-z])’([a-z])", r"\1'\2/")
        .replace("‘", '"')
        .replace("‚", '"')
        .replace("’", '"')
        .replace("''", '"')
        .replace("´´", '"')
        .replace("…", "...")
        # French quotes
        .replace(" « ", ' "')
        .replace("« ", '"')
        .replace("«", '"')
        .replace(" » ", '" ')
        .replace(" »", '"')
        .replace("»", '"')
        # handle pseudo-spaces (non-breaking spaces in the source text)
        .replace("\u00a0%", "%")
        .replace("nº\u00a0", "nº ")
        .replace("\u00a0:", ":")
        .replace("\u00a0ºC", " ºC")
        .replace("\u00a0cm", " cm")
        .replace("\u00a0?", "?")
        .replace("\u00a0!", "!")
        .replace("\u00a0;", ";")
        .replace("， ", ", ")
        .replace(" +", " ")
        .replace("．", ". ")
    )
    # English "quotation," followed by comma, style
    if language == "en":
        res = re.sub(r"\"([,\.]+)", r"\1\"", res)
    # Czech is confused
    elif language == "cs" or language == "cz":
        pass
    # German/Spanish/French "quotation", followed by comma, style
    else:
        res = res.replace(',"', '",')
        res = re.sub(
            r"(\.+)\"(\s*[^<])",
            r"\"\1\2",
            res,
        )  # don't fix period at end of sentence

    if (
        language == "de"
        or language == "es"
        or language == "cz"
        or language == "cs"
        or language == "fr"
    ):
        res = re.sub(r"(\d) (\d)", r"\1,\2", res)
    else:
        res = re.sub(r"(\d) (\d)", r"\1.\2", res)
    return res


def normalize(
    line: str,
    accent: bool = True,
    case: bool = True,
    numbers: bool = True,
    punct: Literal[1, 2] = 1,
) -> str:
    line = line.strip()
    if not line:
        return line
    if case:
        line = line.lower()
    if accent:
        line = strip_accents(line)
    if numbers:
        line = DIGIT_RE.sub("0", line)
    if punct == 1:
        line = replace_unicode_punct(line)
    elif punct == 2:
        line = remove_unicode_punct(line)
    line = remove_non_printing_char(line)
    return line


def slow_normalize_for_dedup(line: str) -> str:
    return normalize(line, accent=False, case=True, numbers=True, punct=2)


def normalize_for_dedup(line: str) -> str:
    line = line.strip()
    if not line:
        return line
    # case
    line = line.lower()
    # numbers
    line = DIGIT_RE.sub("0", line)
    line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line)
    return line
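As a quick illustration of normalize (worked out by hand from the mapping tables above, not taken from this repository's tests), the default settings lowercase the text, strip accents, mask digits with 0, and map typographic punctuation to ASCII:

normalize("Café Pingüino – 25 años…")  # -> "cafe pinguino - 00 anos..."

normalize_for_dedup instead deletes the matched punctuation outright, which is faster and sufficient for building deduplication keys.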