Commit 4cbf2a3

Merge branch 'mb/update_repo_structure_for_v2' of https://github.com/…
MartinBernstorff committed Oct 12, 2023
2 parents 17aee21 + b952421 commit 4cbf2a3
Showing 43 changed files with 934 additions and 816 deletions.
7 changes: 5 additions & 2 deletions archive_v1/src/applications/dagw_reddit/apply_filter.py
@@ -14,13 +14,16 @@
 ds = load_from_disk(path)
 
 ds_filtered = ds.filter(
-    lambda example: example["is_13_gram_duplicate"] is False, num_proc=16
+    lambda example: example["is_13_gram_duplicate"] is False,
+    num_proc=16,
 )
 assert len(set(ds_filtered["is_13_gram_duplicate"])) == 1
 
 # write dataset with added metadata
 save_path = os.path.join(
-    "/work", "dagw-clean", f"dagw_reddit_filtered_v{ds.version}.arrow"
+    "/work",
+    "dagw-clean",
+    f"dagw_reddit_filtered_v{ds.version}.arrow",
 )
 msg.info(f"Saving to disk: {save_path}")
 ds_filtered.save_to_disk(save_path)
10 changes: 6 additions & 4 deletions archive_v1/src/applications/dagw_reddit/preprocess.py
@@ -7,9 +7,8 @@
 from pathlib import Path
 
 from datasets import concatenate_datasets, load_dataset
-from wasabi import msg
-
 from dfm.cleaning import Deduper, QualityFilter
+from wasabi import msg
 
 
 def q_filter(batch):
@@ -75,7 +74,9 @@ def dedupe_batch(batch, deduper: Deduper):
     return batch
 
 
-def filter_categories(examples, remove_cat={"danavis"}):
+def filter_categories(examples, remove_cat=None):
+    if remove_cat is None:
+        remove_cat = {"danavis"}
     i = 0
     while i < len(examples["source"]):
         s = examples["source"][i]
@@ -99,7 +100,8 @@ def main(
     reddit_da = reddit_da.rename_columns({"id": "doc_id"})
     reddit_da = reddit_da.add_column("LICENSE", ["MIT"] * len(reddit_da))
     reddit_da = reddit_da.add_column(
-        "date_built", ["Wed Dec 15 00:00:00 2021 CEST +0200"] * len(reddit_da)
+        "date_built",
+        ["Wed Dec 15 00:00:00 2021 CEST +0200"] * len(reddit_da),
     )
     reddit_da = reddit_da.add_column("source", ["reddit-da"] * len(reddit_da))
     reddit_da = reddit_da.add_column("uri", ["NA"] * len(reddit_da))
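
The `filter_categories` change above is the standard fix for a mutable default argument: a default like `remove_cat={"danavis"}` is built once at function-definition time and shared between calls, so a `None` sentinel that rebuilds the set per call is safer. A simplified sketch of the pattern (the real function mutates the batch in place with a while loop; this version rebuilds the batch, which is shorter to show, and the demo data is invented):

```python
def filter_categories(examples, remove_cat=None):
    # None sentinel instead of a mutable default: the set is recreated on
    # every call, so no single default object is shared between callers.
    if remove_cat is None:
        remove_cat = {"danavis"}
    return {
        key: [v for v, s in zip(values, examples["source"]) if s not in remove_cat]
        for key, values in examples.items()
    }


# Illustrative batch in the "batched map" format used by the scripts.
batch = {"text": ["a", "b"], "source": ["danavis", "reddit-da"]}
print(filter_categories(batch))  # {'text': ['b'], 'source': ['reddit-da']}
```
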
4 changes: 2 additions & 2 deletions archive_v1/src/applications/danews/add_metadata.py
@@ -272,8 +272,8 @@ def word_count(batch):
 news_sub = news.remove_columns(
     [
         c
-        for c in news.features.keys()
+        for c in news.features
         if c not in {"n_tokens", "is_duplicate", "passed_quality_filter", "Source"}
-    ]
+    ],
 )
 news_sub.to_csv("news_meta.csv")
8 changes: 4 additions & 4 deletions archive_v1/src/applications/danews/dedupe.py
@@ -12,9 +12,8 @@
 from functools import partial
 
 from datasets import load_from_disk
-from wasabi import msg
-
 from dfm.cleaning import Deduper
+from wasabi import msg
 
 
 def filter_batch(batch, i):
@@ -54,7 +53,6 @@ def __extract_is_duplicate(mask):
 def main(
     path,
 ) -> None:
-
     deduper = Deduper()
 
     msg.info("Loading Dataset")
@@ -63,7 +61,9 @@
 
     msg.info("Starting deduping")
     deduper = partial(
-        dedupe, deduper=deduper, dedupe_path=os.path.join(path, "deduplicated")
+        dedupe,
+        deduper=deduper,
+        dedupe_path=os.path.join(path, "deduplicated"),
     )
     # dedupe dataset
     ds = ds.map(
3 changes: 1 addition & 2 deletions archive_v1/src/applications/danews/quality_filter.py
@@ -14,9 +14,8 @@
 from pathlib import Path
 
 from datasets import load_dataset
-from wasabi import msg
-
 from dfm.cleaning import QualityFilter
+from wasabi import msg
 
 
 def filter_batch(batch, i):
(Additional changed file; file name not captured in this view)
@@ -47,7 +47,7 @@
 ## Text proportions
 ----
-"""
+""",
 )
@@ -61,7 +61,7 @@ def get_proportions(taggers, md):
     md.add(
         f"- *Date: {date}*"
         + f"\n- *Sentences tagged: {len(df)}*"
-        + f"\n- *Documents tagged: {n_docs}*"
+        + f"\n- *Documents tagged: {n_docs}*",
     )
 
     n_char = sum(len(t) for t in df["text"].values)
@@ -100,20 +100,21 @@ def get_proportions(taggers, md):
     tagger1_name, _, session_n1, __, n_docs1, date1 = tagger1.split("_")
     tagger2_name, _, session_n2, __, n_docs2, date2 = tagger2.split("_")
     md.add(
-        f"**{tagger1_name.capitalize()}** (Session: {session_n1}) vs **{tagger2_name.capitalize()}** - (Session: {session_n2})"
+        f"**{tagger1_name.capitalize()}** (Session: {session_n1}) vs **{tagger2_name.capitalize()}** - (Session: {session_n2})",
     )
     # merge
     df = pd.merge(taggers[pair[0]], taggers[pair[1]], on="text", suffixes=("_1", "_2"))
     kappa = cohen_kappa_score(df["category_1"], df["category_2"])
     md.add(
-        f"- Cohen's Kappa (all categories): {kappa:.4f} (Overlap in sentences: {df.shape[0]})"
+        f"- Cohen's Kappa (all categories): {kappa:.4f} (Overlap in sentences: {df.shape[0]})",
     )
 
     kappa = cohen_kappa_score(
-        df["category_1"] == "correct_language", df["category_2"] == "correct_language"
+        df["category_1"] == "correct_language",
+        df["category_2"] == "correct_language",
     )
     md.add(
-        f"- Cohen's Kappa (correct_language vs not correct_language): {kappa:.4f} (Overlap in sentences: {df.shape[0]})"
+        f"- Cohen's Kappa (correct_language vs not correct_language): {kappa:.4f} (Overlap in sentences: {df.shape[0]})",
     )
@@ -131,7 +132,7 @@ def get_proportions(taggers, md):
 ```
 While non-language texts in NAT was often menu bars, contact information, or navigation.
-"""
+""",
 )
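
The hunks above only add trailing commas and split long arguments, but the computation they touch is worth spelling out: two annotators' sentence labels are aligned on the text column and compared with Cohen's kappa, once over all categories and once collapsed to a binary correct_language decision. A rough, self-contained sketch of that computation (column names follow the diff; the toy sessions and the non-correct category names are invented):

```python
import pandas as pd
from sklearn.metrics import cohen_kappa_score

# Invented annotations from two tagging sessions.
tagger_1 = pd.DataFrame(
    {
        "text": ["s1", "s2", "s3", "s4"],
        "category": ["correct_language", "wrong_language", "not_language", "correct_language"],
    },
)
tagger_2 = pd.DataFrame(
    {
        "text": ["s1", "s2", "s3", "s4"],
        "category": ["correct_language", "wrong_language", "correct_language", "correct_language"],
    },
)

# Align the two sessions on the sentence text, as in the diff.
df = pd.merge(tagger_1, tagger_2, on="text", suffixes=("_1", "_2"))

# Agreement over all categories.
kappa_all = cohen_kappa_score(df["category_1"], df["category_2"])

# Agreement on the binary decision: correct_language vs everything else.
kappa_binary = cohen_kappa_score(
    df["category_1"] == "correct_language",
    df["category_2"] == "correct_language",
)

print(f"all categories: {kappa_all:.4f}, binary: {kappa_binary:.4f}")
```
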
7 changes: 5 additions & 2 deletions archive_v1/src/applications/dataset_validation/main.py
@@ -1,6 +1,7 @@
 """Script for rating text quality of NAT."""
+from collections.abc import Iterable
 from datetime import date
-from typing import Iterable, Optional
+from typing import Optional
 
 from dfm.cleaning import QualityFilter, SentenceFilter
 from dfm.dataset_validation.rating_interface import ExampleRater
@@ -10,7 +11,9 @@
 
 
 def text_generator(
-    seed, n_texts: Optional[int], max_texts: Optional[int]
+    seed,
+    n_texts: Optional[int],
+    max_texts: Optional[int],
 ) -> Iterable[str]:
     """
     Create text generator
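
The import change in this file follows PEP 585: the abstract container aliases in `typing` (such as `typing.Iterable`) are deprecated from Python 3.9, and annotations should use `collections.abc` instead. A small sketch of the resulting signature style (the body is a stand-in, not the real text generator):

```python
from collections.abc import Iterable
from typing import Optional


def text_generator(
    seed,
    n_texts: Optional[int],
    max_texts: Optional[int],
) -> Iterable[str]:
    # Stand-in body: yield a bounded number of dummy texts.
    limit = n_texts if n_texts is not None else (max_texts or 0)
    for i in range(limit):
        yield f"text {seed}-{i}"
```
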
1 change: 0 additions & 1 deletion archive_v1/src/applications/dataset_validation/mc4.py
@@ -2,7 +2,6 @@
 from datetime import date
 
 from datasets import load_dataset
-
 from dfm.dataset_validation.rating_interface import ExampleRater
 
 
8 changes: 6 additions & 2 deletions archive_v1/src/applications/hopetwitter/add_metadata.py
@@ -15,10 +15,14 @@
 from wasabi import msg
 
 path = os.path.join(
-    "/work", "twitter_cleaned", "twitter_da_stopwords_2019-01-01_2021-04-30"
+    "/work",
+    "twitter_cleaned",
+    "twitter_da_stopwords_2019-01-01_2021-04-30",
 )
 write_path = os.path.join(
-    "/work", "twitter_cleaned", "twitter_da_stopwords_2019-01-01_2021-04-30.arrow"
+    "/work",
+    "twitter_cleaned",
+    "twitter_da_stopwords_2019-01-01_2021-04-30.arrow",
 )
 ds = load_from_disk(path)
 
7 changes: 5 additions & 2 deletions archive_v1/src/applications/hopetwitter/apply_filter.py
@@ -14,14 +14,17 @@
 
 if __name__ == "__main__":
     path = os.path.join(
-        "/work", "twitter_cleaned", "twitter_da_stopwords_2019-01-01_2021-04-30.arrow"
+        "/work",
+        "twitter_cleaned",
+        "twitter_da_stopwords_2019-01-01_2021-04-30.arrow",
     )
 
     msg.info(f"loading: {path}")
     ds = load_from_disk(path)
 
     ds_filtered = ds.filter(
-        lambda example: example["is_duplicate"] is False, num_proc=16
+        lambda example: example["is_duplicate"] is False,
+        num_proc=16,
     )
     assert len(set(ds_filtered["is_duplicate"])) == 1
 
6 changes: 3 additions & 3 deletions archive_v1/src/applications/hopetwitter/dedupe.py
@@ -14,9 +14,8 @@
 from pathlib import Path
 
 from datasets import load_dataset
-from wasabi import msg
-
 from dfm.cleaning import Deduper
+from wasabi import msg
 
 
 def filter_batch(batch, i):
@@ -67,7 +66,8 @@ def main(
     json_files = glob.glob(path, recursive=True)
 
     w_path = os.path.join(
-        write_path, "twitter_da_stopwords_2019-01-01_2021-04-30.arrow"
+        write_path,
+        "twitter_da_stopwords_2019-01-01_2021-04-30.arrow",
     )
     deduper = Deduper(ngram_size=10)
 
13 changes: 11 additions & 2 deletions archive_v1/src/applications/hopetwitter/flatten_ndjson.py
@@ -15,8 +15,17 @@
 
 def flatten_post(
     post: dict,
-    keys_to_keep=["text", "id", "possibly_sensitive", "author_id", "source", "lang"],
+    keys_to_keep=None,
 ):
+    if keys_to_keep is None:
+        keys_to_keep = [
+            "text",
+            "id",
+            "possibly_sensitive",
+            "author_id",
+            "source",
+            "lang",
+        ]
     return {k: post[k] for k in keys_to_keep}
 
 
@@ -29,7 +38,7 @@ def flatten_ndjson(path: str, write_folder: str):
     print(f"Flattening: {path} to {write_path}")
 
     # stream in json from orgin to write_path
-    with open(path, "r") as f:
+    with open(path) as f:
         reader = ndjson.reader(f)
 
         with open(write_path, "w") as f:
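
Besides the same `None`-sentinel fix for the default key list, this file streams NDJSON line by line, which keeps memory flat regardless of file size. A rough sketch of that flattening step (the reader comes from the `ndjson` package as in the diff; the output naming and the `json.dumps`-based writing are assumptions, not the script's own logic):

```python
import json
import os

import ndjson


def flatten_post(post: dict, keys_to_keep=None):
    # None sentinel again: the default key list is rebuilt on each call.
    if keys_to_keep is None:
        keys_to_keep = ["text", "id", "possibly_sensitive", "author_id", "source", "lang"]
    return {k: post[k] for k in keys_to_keep}


def flatten_ndjson(path: str, write_folder: str) -> None:
    # Hypothetical output name; the real script derives its write path differently.
    write_path = os.path.join(write_folder, "flattened_" + os.path.basename(path))
    print(f"Flattening: {path} to {write_path}")

    # Stream posts one at a time so the whole file never sits in memory.
    with open(path) as f_in, open(write_path, "w") as f_out:
        for post in ndjson.reader(f_in):
            f_out.write(json.dumps(flatten_post(post)) + "\n")
```
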
6 changes: 3 additions & 3 deletions archive_v1/src/applications/hopetwitter/quality_filter.py
@@ -14,9 +14,8 @@
 from pathlib import Path
 
 from datasets import load_dataset
-from wasabi import msg
-
 from dfm.cleaning import QualityFilter
+from wasabi import msg
 
 
 def filter_batch(batch, i):
@@ -97,7 +96,8 @@ def main(
     json_files = glob.glob(path, recursive=True)
 
     w_path = os.path.join(
-        write_path, "twitter_da_stopwords_2019-01-01_2021-04-30.jsonl"
+        write_path,
+        "twitter_da_stopwords_2019-01-01_2021-04-30.jsonl",
    )
     if os.path.exists(w_path):
         raise Exception(f"File {w_path} already exist")
(Additional changed file; file name not captured in this view)
@@ -42,7 +42,7 @@ def main(netarkivet_path=Path("/work/netarkivet-cleaned")):
     meta = load_dataset("csv", data_files=meta_path)
     meta = meta["train"]
     assert len(meta) == len(
-        ds
+        ds,
     ), "length of dataset and its metadata is not the same."
     ds = ds.add_column("is_duplicate", meta["is_duplicate"])
     ds.to_json(jsonl_file)
(Additional changed file; file name not captured in this view)
@@ -10,9 +10,10 @@
 import glob
 import os
 import random
+from collections.abc import Iterable
 from contextlib import ExitStack
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import Optional, Union
 
 import ndjson
 from wasabi import msg
@@ -55,7 +56,7 @@ def shuffle_buffer(x: Iterable, buffer_size: int) -> Iterable:
 
 
 def jsonl_merge(  # noqa C901
-    jsonl_files: List[Union[Path, str]],
+    jsonl_files: list[Union[Path, str]],
     buffer_size: Optional[int] = None,
     sample: bool = True,
 ) -> Iterable[dict]:
@@ -81,8 +82,7 @@ def __sample_yield(readers: list) -> Iterable:
 
     def __iterative_yield(readers: list) -> Iterable:
         for reader in readers:
-            for sample in reader:
-                yield sample
+            yield from reader
 
     yield_fn = __sample_yield if sample is True else __iterative_yield
 
@@ -102,7 +102,9 @@ def __iterative_yield(readers: list) -> Iterable:
             yield sample
 
 
-def apply_filter(dataset=Iterable[dict], columns_to_keep=["text"]) -> Iterable[dict]:
+def apply_filter(dataset=Iterable[dict], columns_to_keep=None) -> Iterable[dict]:
+    if columns_to_keep is None:
+        columns_to_keep = ["text"]
     for sample in dataset:
         if sample["is_duplicate"] is False:
             yield {k: sample[k] for k in columns_to_keep}
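
The `yield from reader` edit above is behaviour-preserving: delegating to a sub-iterator replaces the inner `for sample in reader: yield sample` loop. In the sampled branch, the merge instead keeps drawing the next line from a randomly chosen reader, which is what interleaves the source files. A stripped-down sketch of both strategies over plain Python iterables (helper names are invented; the real `jsonl_merge` also handles buffering and an `ExitStack` of open files):

```python
import random
from collections.abc import Iterable


def iterative_merge(readers: list) -> Iterable[dict]:
    # Exhaust each reader in turn; `yield from` delegates to the sub-iterator.
    for reader in readers:
        yield from reader


def sampled_merge(readers: list) -> Iterable[dict]:
    # Interleave readers by repeatedly picking one at random and taking its
    # next item, dropping readers as they run out.
    iterators = [iter(r) for r in readers]
    while iterators:
        it = random.choice(iterators)
        try:
            yield next(it)
        except StopIteration:
            iterators.remove(it)


files = [
    [{"text": "a1"}, {"text": "a2"}],
    [{"text": "b1"}],
]
print(list(iterative_merge(files)))  # [{'text': 'a1'}, {'text': 'a2'}, {'text': 'b1'}]
print(list(sampled_merge(files)))    # same items, in a randomly interleaved order
```
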
(The remaining changed files were not loaded in this view.)
