Merge pull request #259 from centre-for-humanities-computing/gopher_tagger

add gopher tagger with scandi stop words instead of english
peterbjorgensen authored Apr 15, 2024
2 parents 8503b25 + 041168f commit 2a7fc85
Showing 1 changed file with 257 additions and 0 deletions.

data-processing/src/dfm_data/dolma_taggers/gopher_scandi.py
@@ -0,0 +1,257 @@
"""
Module for tagging with Gopher properties.
It's a copy of the original dolma tagger, but uses stop words from spacy
instead of the English stop words.
"""
import logging
from collections import Counter
from dataclasses import dataclass
from statistics import median
from typing import Counter as CounterType
from typing import List, Tuple, Union

from dolma.core.data_types import DocResult, Document, Span
from dolma.core.registry import TaggerRegistry
from dolma.core.taggers import BaseTagger
import spacy

# "Required word" sets: spaCy's built-in stop-word lists for each language.
# These sets are lowercase, so the membership checks below are case-sensitive.
REQUIRED_DANISH_WORDS = spacy.blank("da").Defaults.stop_words
REQUIRED_ENGLISH_WORDS = spacy.blank("en").Defaults.stop_words
REQUIRED_ICELANDIC_WORDS = spacy.blank("is").Defaults.stop_words
REQUIRED_NORWEGIAN_WORDS = spacy.blank("no").Defaults.stop_words
REQUIRED_SWEDISH_WORDS = spacy.blank("sv").Defaults.stop_words

SYMBOLS = {"#", "\u2026"}
BULLET_POINTS = {"*", "-"}


def robust_median(values: List[Union[int, float]]) -> float:
    """Return the median of ``values``, or 0.0 for an empty list."""
    if not values:
        return 0.0
    return float(median(values))


@dataclass
class GopherAttributes:
    """Gopher quality statistics for one document."""

fraction_of_characters_in_most_common_ngram: List[Tuple[int, float]]
fraction_of_characters_in_duplicate_ngrams: List[Tuple[int, float]]
character_count: int = 0
word_count: int = 0
median_word_length: float = 0.0
symbol_to_word_ratio: float = 0.0
fraction_of_words_with_alpha_character: float = 0.0
required_word_count_da: int = 0
required_word_count_en: int = 0
required_word_count_is: int = 0
required_word_count_no: int = 0
required_word_count_sv: int = 0
fraction_of_lines_starting_with_bullet_point: float = 0.0
fraction_of_lines_ending_with_ellipsis: float = 0.0
fraction_of_duplicate_lines: float = 0.0
fraction_of_characters_in_duplicate_lines: float = 0.0

    def as_spans(self) -> List[Span]:
        """Express every statistic as a document-wide Span scored with its value."""
spans = []
spans.extend(
[
Span(
0,
self.character_count,
f"fraction_of_characters_in_most_common_{n}grams",
v,
)
for n, v in self.fraction_of_characters_in_most_common_ngram
]
)
spans.extend(
[
Span(
0,
self.character_count,
f"fraction_of_characters_in_duplicate_{n}grams",
v,
)
for n, v in self.fraction_of_characters_in_duplicate_ngrams
]
)
spans.append(
Span(
0,
self.character_count,
type="character_count",
score=self.character_count,
)
)
spans.append(Span(0, self.character_count, type="word_count", score=self.word_count))
spans.append(
Span(
0,
self.character_count,
type="median_word_length",
score=self.median_word_length,
)
)
spans.append(
Span(
0,
self.character_count,
type="symbol_to_word_ratio",
score=self.symbol_to_word_ratio,
)
)
spans.append(
Span(
0,
self.character_count,
type="fraction_of_words_with_alpha_character",
score=self.fraction_of_words_with_alpha_character,
)
)
spans.append(
Span(
0,
self.character_count,
type="required_word_count_da",
score=self.required_word_count_da,
)
)
spans.append(
Span(
0,
self.character_count,
type="required_word_count_en",
score=self.required_word_count_en,
)
)
spans.append(
Span(
0,
self.character_count,
type="required_word_count_is",
score=self.required_word_count_is,
)
)
spans.append(
Span(
0,
self.character_count,
type="required_word_count_no",
score=self.required_word_count_no,
)
)
spans.append(
Span(
0,
self.character_count,
type="required_word_count_sv",
score=self.required_word_count_sv,
)
)
spans.append(
Span(
0,
self.character_count,
type="fraction_of_lines_starting_with_bullet_point",
score=self.fraction_of_lines_starting_with_bullet_point,
)
)
spans.append(
Span(
0,
self.character_count,
type="fraction_of_lines_ending_with_ellipsis",
score=self.fraction_of_lines_ending_with_ellipsis,
)
)
spans.append(
Span(
0,
self.character_count,
type="fraction_of_duplicate_lines",
score=self.fraction_of_duplicate_lines,
)
)
spans.append(
Span(
0,
self.character_count,
type="fraction_of_characters_in_duplicate_lines",
score=self.fraction_of_characters_in_duplicate_lines,
)
)
return spans


def get_attributes(text: str) -> GopherAttributes:
    """Compute all Gopher quality statistics for a single document's text."""
    attrs = GopherAttributes([], [])
attrs.character_count = len(text)
if attrs.character_count == 0:
return attrs

try:
words = text.split()
word_count = len(words)
character_count = sum(len(word) for word in words)

attrs.word_count = word_count
attrs.median_word_length = robust_median([len(word) for word in words])
attrs.symbol_to_word_ratio = sum(1 for word in words if any(s in word for s in SYMBOLS)) / max(word_count, 1)
attrs.fraction_of_words_with_alpha_character = (
sum(1 for word in words if any(c.isalpha() for c in word)) / max(word_count, 1)
)
attrs.required_word_count_da = sum(1 for word in words if word in REQUIRED_DANISH_WORDS)
attrs.required_word_count_en = sum(1 for word in words if word in REQUIRED_ENGLISH_WORDS)
attrs.required_word_count_is = sum(1 for word in words if word in REQUIRED_ICELANDIC_WORDS)
attrs.required_word_count_no = sum(1 for word in words if word in REQUIRED_NORWEGIAN_WORDS)
attrs.required_word_count_sv = sum(1 for word in words if word in REQUIRED_SWEDISH_WORDS)
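        # Tokens come from str.split(), so punctuation stays attached and
        # e.g. "og," will not match the stop word "og"; counts are conservative.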

all_counts = all_ngram_counts(words)

count_most_common_ngrams = {2, 3, 4}
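        # Per the Gopher repetition heuristics: for n in {2, 3, 4}, record the
        # fraction of characters covered by the single most common n-gram; for
        # n in 5..10, the fraction of characters inside n-grams that repeat.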
for n, ngram_counts in all_counts:
if not ngram_counts:
continue
if n in count_most_common_ngrams:
most_common_ngram, count = ngram_counts.most_common(1)[0]
value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
else:
ng_char_count = sum(count * sum(len(w) for w in ng) for ng, count in ngram_counts.items())
value = (
sum(count * sum(len(w) for w in ng) for ng, count in ngram_counts.items() if count > 1)
/ max(ng_char_count, 1)
)
attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))

lines = text.split("\n")
line_count = len(lines)
for line in lines:
if any(line.startswith(s) for s in BULLET_POINTS):
attrs.fraction_of_lines_starting_with_bullet_point += 1
if line.endswith("\u2026"):
attrs.fraction_of_lines_ending_with_ellipsis += 1
attrs.fraction_of_lines_starting_with_bullet_point /= max(line_count, 1)
attrs.fraction_of_lines_ending_with_ellipsis /= max(line_count, 1)

line_counts = Counter(lines)
attrs.fraction_of_duplicate_lines = (
sum(count for line, count in line_counts.items() if count > 1) / max(line_count, 1)
)
attrs.fraction_of_characters_in_duplicate_lines = (
sum(len(line) * count for line, count in line_counts.items() if count > 1) / max(character_count, 1)
)
    except Exception:
        logging.exception(f"Error processing text: {text[:200]}")

return attrs


def all_ngram_counts(words: List[str]) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
    """Count all n-grams of ``words`` for n from 2 to 10."""
    return [(n, Counter(zip(*[words[i:] for i in range(n)]))) for n in range(2, 11)]
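# For example, all_ngram_counts(["a", "b", "a", "b"]) begins with
# (2, Counter({("a", "b"): 2, ("b", "a"): 1})); values of n exceeding the
# number of words yield empty Counters.
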

@TaggerRegistry.add("gopher_scandi_v1")
class GopherTagger(BaseTagger):
    def predict(self, doc: Document) -> DocResult:
        attrs = get_attributes(doc.text)
        return DocResult(doc=doc, spans=attrs.as_spans())
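
# Example usage (a sketch, assuming dolma's Document accepts source, id, and
# text; not verified against the installed dolma version):
#
#     doc = Document(source="example", id="0", text="Dette er kun en test.")
#     for span in GopherTagger().predict(doc).spans:
#         print(span.type, span.score)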
