From ad2d85d50686fd12a1acaae02d372ea85c1caa83 Mon Sep 17 00:00:00 2001
From: jankounchained
Date: Thu, 18 Apr 2024 13:32:04 +0200
Subject: [PATCH 1/3] add ncc loader

refactor ncc conversion format and refactor
add a cache cleanup
add a few lines of docstrings
---
 .../scripts/convert_ncc_to_jsonlgz.py | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 data-processing/scripts/convert_ncc_to_jsonlgz.py

diff --git a/data-processing/scripts/convert_ncc_to_jsonlgz.py b/data-processing/scripts/convert_ncc_to_jsonlgz.py
new file mode 100644
index 00000000..525600ee
--- /dev/null
+++ b/data-processing/scripts/convert_ncc_to_jsonlgz.py
@@ -0,0 +1,109 @@
+"""
+Download the Norwegian Colossal Corpus and convert it to jsonl.gz, with each document following the format:
+
+{
+    "id": "...", # MANDATORY: source-specific identifier
+    "text": "foo", # MANDATORY: textual content of the document
+    "source": "...", # MANDATORY: source of the data, such as peS2o, common-crawl, etc.
+    "added": "...", # OPTIONAL: timestamp ai2 acquired this data
+    "created": "...", # OPTIONAL: timestamp when the original document was created (best-guess if not available)
+    "metadata": {...} # OPTIONAL: source-specific metadata
+}
+"""
+
+import os
+import datetime
+from functools import partial
+from datasets import load_dataset, Dataset, IterableDataset
+
+
+EXPORT_PATH = "ncc_sample.jsonl.gz"
+date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
+
+
+def convert_from_iterable_to_ds(iterable_ds: IterableDataset) -> Dataset:
+    """Iterate through an IterableDataset, creating a Dataset.
+    Needed for debug mode (cleaning a smaller subset of the dataset)."""
+
+    def _gen_from_iterable_dataset(iterable_ds):
+        yield from iterable_ds
+
+    return Dataset.from_generator(
+        partial(_gen_from_iterable_dataset, iterable_ds), features=iterable_ds.features
+    )
+
+
+def _remove_existing_jsonlgz() -> None:
+    """Check whether EXPORT_PATH exists and remove the existing file if so.
+    This is needed because `ds.to_json` does not overwrite existing files.
+    """
+    if os.path.exists(EXPORT_PATH):
+        os.remove(EXPORT_PATH)
+        print(f"Removing existing export: {EXPORT_PATH}")
+
+
+def _structure_records(obs: dict) -> dict:
+    """Structure a single observation into Dolma format"""
+
+    # NCC has a publish year (YYYY), which is used to construct the `created` column.
+    # It is assumed that documents were created on YYYY-01-01 at midnight.
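+    # e.g. a publish_year of 1984 yields created == "1984-01-01T00:00:00.000Z"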
+    publish_year = obs["publish_year"]
+
+    # structure into Dolma format
+    obs = {
+        "id": obs["id"],
+        "text": obs["text"],
+        "source": "NCC",
+        "added": date_added,
+        "created": f"{publish_year}-01-01T00:00:00.000Z",
+        "metadata": {
+            "doc_type": obs["doc_type"],
+            "lang_fasttext": obs["lang_fasttext"],
+            "lang_fasttext_conf": obs["lang_fasttext_conf"],
+        },
+    }
+
+    return obs
+
+
+def main(debug: bool = False) -> None:
+    # remove the existing export file, because `ds.to_json` does not overwrite it
+    _remove_existing_jsonlgz()
+
+    if debug:
+        ncc = load_dataset(
+            "NbAiLab/NCC", streaming=True, split="train", trust_remote_code=True
+        )
+        ds = ncc.take(1000)
+        ds = convert_from_iterable_to_ds(ds)
+    else:
+        ds = load_dataset("NbAiLab/NCC", split="train", trust_remote_code=True)
+
+    # clean up cached files so `ds.map` recomputes records instead of reusing a stale cache
+    ds.cleanup_cache_files()
+    # structure records & drop the columns that were folded into the `metadata` key
+    ds = ds.map(_structure_records)
+    ds = ds.remove_columns(
+        column_names=["doc_type", "publish_year", "lang_fasttext", "lang_fasttext_conf"]
+    )
+
+    # export
+    ds.to_json(EXPORT_PATH, orient="records", lines=True, compression="gzip")
+
+
+if __name__ == "__main__":
+    # run the full dataset
+    main(debug=False)
+
+    # test the type of a single observation
+    ds = load_dataset("json", data_files=EXPORT_PATH, split="train")
+    assert isinstance(ds[0], dict)
+    # test that the right number of features are exported
+    # (id, text, source, added, created, metadata)
+    assert len(ds.features) == 6
+    # test that the export can be streamed
+    ds = load_dataset("json", data_files=EXPORT_PATH, split="train", streaming=True)
+    example = next(iter(ds))  # type: ignore
+    assert isinstance(example, dict)

From 223b2620f61930233b339a5c06d1d0a8ac279a39 Mon Sep 17 00:00:00 2001
From: jankounchained
Date: Mon, 22 Apr 2024 17:21:26 +0200
Subject: [PATCH 2/3] add ncc paths to config

---
 data-processing/configs/2024-v1/dolma_dedupe_v1.yaml      | 1 +
 .../configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/data-processing/configs/2024-v1/dolma_dedupe_v1.yaml b/data-processing/configs/2024-v1/dolma_dedupe_v1.yaml
index 02ad3b76..4cd21d01 100644
--- a/data-processing/configs/2024-v1/dolma_dedupe_v1.yaml
+++ b/data-processing/configs/2024-v1/dolma_dedupe_v1.yaml
@@ -15,4 +15,5 @@ documents:
 - /work/dfm-data/pre-training/hplt/documents/*.jsonl.gz
 - /work/dfm-data/pre-training/dagw/documents/*.jsonl.gz
 - /work/dfm-data/pre-training/mC4_da/documents/*.json.gz
+- /work/dfm-data/pre-training/ncc/documents/*.jsonl.gz
 processes: 16

diff --git a/data-processing/configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml b/data-processing/configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml
index 4dbb37f8..81e0e050 100644
--- a/data-processing/configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml
+++ b/data-processing/configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml
@@ -3,6 +3,7 @@ destination: null
 documents:
 - /work/dfm-data/pre-training/mC4_da/documents/*.json.gz
 - /work/dfm-data/pre-training/hplt/documents/*.jsonl.gz
+- /work/dfm-data/pre-training/ncc/documents/*.jsonl.gz
 dryrun: false
 experiment: v1blockurltaggers
 ignore_existing: false

From 847027764896332e3705ad87062ac6969984dfe2 Mon Sep 17 00:00:00 2001
From: jankounchained
Date: Tue, 23 Apr 2024 12:42:35 +0200
Subject: [PATCH 3/3] add license to metadata

---
 .../scripts/convert_ncc_to_jsonlgz.py | 41 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/data-processing/scripts/convert_ncc_to_jsonlgz.py b/data-processing/scripts/convert_ncc_to_jsonlgz.py
index 525600ee..134771b5 100644
--- a/data-processing/scripts/convert_ncc_to_jsonlgz.py
+++ b/data-processing/scripts/convert_ncc_to_jsonlgz.py
@@ -7,7 +7,12 @@
     "source": "...", # MANDATORY: source of the data, such as peS2o, common-crawl, etc.
     "added": "...", # OPTIONAL: timestamp ai2 acquired this data
     "created": "...", # OPTIONAL: timestamp when the original document was created (best-guess if not available)
-    "metadata": {...} # OPTIONAL: source-specific metadata
+    "metadata": { # OPTIONAL: source-specific metadata
+        "doc_type": "...", # OPTIONAL: see NCC/Document Types
+        "lang_fasttext": "...", # OPTIONAL: see NCC/Languages
+        "lang_fasttext_conf": float, # OPTIONAL: see NCC/Languages
+        "license": "..." # OPTIONAL: see NCC/License
+    }
 }
 """
@@ -17,7 +22,7 @@ from datasets import load_dataset, Dataset, IterableDataset
 
 
-EXPORT_PATH = "ncc_sample.jsonl.gz"
+EXPORT_PATH = "ncc.jsonl.gz"
 date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
 
 
@@ -42,6 +47,37 @@ def _remove_existing_jsonlgz() -> None:
         print(f"Removing existing export: {EXPORT_PATH}")
 
 
+def _convert_doctype_to_license(source: str) -> str | None:
+    """Hardcode licenses based on
+    https://huggingface.co/datasets/NbAiLab/NCC#license
+
+    `source` is an NCC `doc_type` value.
+    """
+
+    source2license = {
+        "newspaper_ocr": "CC0 1.0",
+        "newspaper_pdf": "CC0 1.0",
+        "books": "CC0 1.0",
+        "newspapers_online_nb": "CC BY-NC 2.0",
+        "newspapers_online_nn": "CC BY-NC 2.0",
+        "opensubtitles": "CC BY-SA 3.0",
+        "wikipedia": "CC BY-SA 3.0",
+        "government_nb": "NLOD 2.0",
+        "government_nn": "NLOD 2.0",
+        "parliament": "NLOD 2.0",
+        "publicreports": "NLOD 2.0",
+    }
+
+    if source in source2license:
+        license = source2license[source]
+    # doc_types not listed above are matched by prefix
+    elif source.startswith("lovdata_cd_") or source.startswith("maalfrid_"):
+        license = "NLOD 2.0"
+    elif source.startswith("wikipedia"):
+        license = "CC BY-SA 3.0"
+    else:
+        license = None
+
+    return license
+
+
 def _structure_records(obs: dict) -> dict:
     """Structure a single observation into Dolma format"""
 
@@ -60,6 +96,7 @@ def _structure_records(obs: dict) -> dict:
             "doc_type": obs["doc_type"],
             "lang_fasttext": obs["lang_fasttext"],
             "lang_fasttext_conf": obs["lang_fasttext_conf"],
+            "license": _convert_doctype_to_license(obs["doc_type"]),
         },
     }
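
A minimal sketch of how the mapping above behaves, assuming `_convert_doctype_to_license` is importable (i.e. the script's directory is on PYTHONPATH); `maalfrid_regjeringen` and `unknown_doc_type` are illustrative values, not confirmed NCC doc_types:

    from convert_ncc_to_jsonlgz import _convert_doctype_to_license

    # exact match in the hardcoded table
    assert _convert_doctype_to_license("newspaper_ocr") == "CC0 1.0"
    # prefix match for maalfrid_* sources (hypothetical value)
    assert _convert_doctype_to_license("maalfrid_regjeringen") == "NLOD 2.0"
    # unmapped doc_types carry no license
    assert _convert_doctype_to_license("unknown_doc_type") is None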