From ad2d85d50686fd12a1acaae02d372ea85c1caa83 Mon Sep 17 00:00:00 2001
From: jankounchained
Date: Thu, 18 Apr 2024 13:32:04 +0200
Subject: [PATCH 1/3] add ncc loader

refactor ncc conversion format and refactor
add a cache cleanup
add a few lines of docstrings
---
 .../scripts/convert_ncc_to_jsonlgz.py | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 data-processing/scripts/convert_ncc_to_jsonlgz.py

diff --git a/data-processing/scripts/convert_ncc_to_jsonlgz.py b/data-processing/scripts/convert_ncc_to_jsonlgz.py
new file mode 100644
index 00000000..525600ee
--- /dev/null
+++ b/data-processing/scripts/convert_ncc_to_jsonlgz.py
@@ -0,0 +1,109 @@
+"""
+Download the Norwegian Colossal Corpus and convert it to jsonl.gz, with each document following the format:
+
+{
+    "id": "...", # MANDATORY: source-specific identifier
+    "text": "foo", # MANDATORY: textual content of the document
+    "source": "...", # MANDATORY: source of the data, such as peS2o, common-crawl, etc.
+    "added": "...", # OPTIONAL: timestamp ai2 acquired this data
+    "created": "...", # OPTIONAL: timestamp when the original document was created (best-guess if not available)
+    "metadata": {...} # OPTIONAL: source-specific metadata
+}
+"""
+
+import os
+import datetime
+from functools import partial
+from datasets import load_dataset, Dataset, IterableDataset
+
+
+EXPORT_PATH = "ncc_sample.jsonl.gz"
+date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
+
+
+def convert_from_iterable_to_ds(iterable_ds: IterableDataset) -> Dataset:
+    """Iterate through an IterableDataset, creating a Dataset.
+    Needed for debug mode (cleaning a smaller subset of the dataset)."""
+
+    def _gen_from_iterable_dataset(iterable_ds):
+        yield from iterable_ds
+
+    return Dataset.from_generator(
+        partial(_gen_from_iterable_dataset, iterable_ds), features=iterable_ds.features
+    )
+
+
+def _remove_existing_jsonlgz() -> None:
+    """Check whether EXPORT_PATH exists and remove the existing file if so.
+    This is needed because `ds.to_json` does not overwrite existing files.
+    """
+    if os.path.exists(EXPORT_PATH):
+        os.remove(EXPORT_PATH)
+        print(f"Removing existing export: {EXPORT_PATH}")
+
+
+def _structure_records(obs: dict) -> dict:
+    """Structure a single observation into Dolma format"""
+
+    # NCC has a publish year (YYYY), which is used to construct the `created` column.
+    # It is assumed that documents were created on YYYY-01-01 at midnight.
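+    # e.g. a publish_year of 1984 yields created == "1984-01-01T00:00:00.000Z"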
+    publish_year = obs["publish_year"]
+
+    # structure into Dolma format
+    obs = {
+        "id": obs["id"],
+        "text": obs["text"],
+        "source": "NCC",
+        "added": date_added,
+        "created": f"{publish_year}-01-01T00:00:00.000Z",
+        "metadata": {
+            "doc_type": obs["doc_type"],
+            "lang_fasttext": obs["lang_fasttext"],
+            "lang_fasttext_conf": obs["lang_fasttext_conf"],
+        },
+    }
+
+    return obs
+
+
+def main(debug: bool = False) -> None:
+    # remove the existing export file, because `ds.to_json` does not overwrite it
+    _remove_existing_jsonlgz()
+
+    if debug:
+        ncc = load_dataset(
+            "NbAiLab/NCC", streaming=True, split="train", trust_remote_code=True
+        )
+        ds = ncc.take(1000)
+        ds = convert_from_iterable_to_ds(ds)
+    else:
+        ds = load_dataset("NbAiLab/NCC", split="train", trust_remote_code=True)
+
+    # clean up cached files so `ds.map` recomputes records instead of reusing a stale cache
+    ds.cleanup_cache_files()
+    # structure records & drop the columns that were folded into the `metadata` key
+    ds = ds.map(_structure_records)
+    ds = ds.remove_columns(
+        column_names=["doc_type", "publish_year", "lang_fasttext", "lang_fasttext_conf"]
+    )
+
+    # export
+    ds.to_json(EXPORT_PATH, orient="records", lines=True, compression="gzip")
+
+
+if __name__ == "__main__":
+    # run the full dataset
+    main(debug=False)
+
+    # test the type of a single observation
+    ds = load_dataset("json", data_files=EXPORT_PATH, split="train")
+    assert isinstance(ds[0], dict)
+    # test that the right number of features are exported
+    # (id, text, source, added, created, metadata)
+    assert len(ds.features) == 6
+    # test that the export can be streamed
+    ds = load_dataset("json", data_files=EXPORT_PATH, split="train", streaming=True)
+    example = next(iter(ds))  # type: ignore
+    assert isinstance(example, dict)

From 223b2620f61930233b339a5c06d1d0a8ac279a39 Mon Sep 17 00:00:00 2001
From: jankounchained
Date: Mon, 22 Apr 2024 17:21:26 +0200
Subject: [PATCH 2/3] add ncc paths to config

---
 data-processing/configs/2024-v1/dolma_dedupe_v1.yaml      | 1 +
 .../configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/data-processing/configs/2024-v1/dolma_dedupe_v1.yaml b/data-processing/configs/2024-v1/dolma_dedupe_v1.yaml
index 02ad3b76..4cd21d01 100644
--- a/data-processing/configs/2024-v1/dolma_dedupe_v1.yaml
+++ b/data-processing/configs/2024-v1/dolma_dedupe_v1.yaml
@@ -15,4 +15,5 @@ documents:
 - /work/dfm-data/pre-training/hplt/documents/*.jsonl.gz
 - /work/dfm-data/pre-training/dagw/documents/*.jsonl.gz
 - /work/dfm-data/pre-training/mC4_da/documents/*.json.gz
+- /work/dfm-data/pre-training/ncc/documents/*.jsonl.gz
 processes: 16

diff --git a/data-processing/configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml b/data-processing/configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml
index 4dbb37f8..81e0e050 100644
--- a/data-processing/configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml
+++ b/data-processing/configs/2024-v1/dolma_run_url_taggers_mc4da_hplt.yaml
@@ -3,6 +3,7 @@ destination: null
 documents:
 - /work/dfm-data/pre-training/mC4_da/documents/*.json.gz
 - /work/dfm-data/pre-training/hplt/documents/*.jsonl.gz
+- /work/dfm-data/pre-training/ncc/documents/*.jsonl.gz
 dryrun: false
 experiment: v1blockurltaggers
 ignore_existing: false

From 847027764896332e3705ad87062ac6969984dfe2 Mon Sep 17 00:00:00 2001
From: jankounchained
Date: Tue, 23 Apr 2024 12:42:35 +0200
Subject: [PATCH 3/3] add license to metadata

---
 .../scripts/convert_ncc_to_jsonlgz.py | 41 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/data-processing/scripts/convert_ncc_to_jsonlgz.py b/data-processing/scripts/convert_ncc_to_jsonlgz.py
index 525600ee..134771b5 100644
--- a/data-processing/scripts/convert_ncc_to_jsonlgz.py
+++ b/data-processing/scripts/convert_ncc_to_jsonlgz.py
@@ -7,7 +7,12 @@
     "source": "...", # MANDATORY: source of the data, such as peS2o, common-crawl, etc.
     "added": "...", # OPTIONAL: timestamp ai2 acquired this data
     "created": "...", # OPTIONAL: timestamp when the original document was created (best-guess if not available)
-    "metadata": {...} # OPTIONAL: source-specific metadata
+    "metadata": { # OPTIONAL: source-specific metadata
+        "doc_type": "...", # OPTIONAL: see NCC/Document Types
+        "lang_fasttext": "...", # OPTIONAL: see NCC/Languages
+        "lang_fasttext_conf": float, # OPTIONAL: see NCC/Languages
+        "license": "..." # OPTIONAL: see NCC/License
+    }
 }
 """
@@ -17,7 +22,7 @@ from datasets import load_dataset, Dataset, IterableDataset
 
 
-EXPORT_PATH = "ncc_sample.jsonl.gz"
+EXPORT_PATH = "ncc.jsonl.gz"
 date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
 
 
@@ -42,6 +47,37 @@ def _remove_existing_jsonlgz() -> None:
         print(f"Removing existing export: {EXPORT_PATH}")
 
 
+def _convert_doctype_to_license(source: str) -> str | None:
+    """Hardcode licenses based on
+    https://huggingface.co/datasets/NbAiLab/NCC#license
+
+    `source` is an NCC `doc_type` value.
+    """
+
+    source2license = {
+        "newspaper_ocr": "CC0 1.0",
+        "newspaper_pdf": "CC0 1.0",
+        "books": "CC0 1.0",
+        "newspapers_online_nb": "CC BY-NC 2.0",
+        "newspapers_online_nn": "CC BY-NC 2.0",
+        "opensubtitles": "CC BY-SA 3.0",
+        "wikipedia": "CC BY-SA 3.0",
+        "government_nb": "NLOD 2.0",
+        "government_nn": "NLOD 2.0",
+        "parliament": "NLOD 2.0",
+        "publicreports": "NLOD 2.0",
+    }
+
+    if source in source2license:
+        license = source2license[source]
+    # doc_types not listed above are matched by prefix
+    elif source.startswith("lovdata_cd_") or source.startswith("maalfrid_"):
+        license = "NLOD 2.0"
+    elif source.startswith("wikipedia"):
+        license = "CC BY-SA 3.0"
+    else:
+        license = None
+
+    return license
+
+
 def _structure_records(obs: dict) -> dict:
     """Structure a single observation into Dolma format"""
 
@@ -60,6 +96,7 @@ def _structure_records(obs: dict) -> dict:
             "doc_type": obs["doc_type"],
             "lang_fasttext": obs["lang_fasttext"],
             "lang_fasttext_conf": obs["lang_fasttext_conf"],
+            "license": _convert_doctype_to_license(obs["doc_type"]),
         },
     }
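
A minimal sketch of how the mapping above behaves, assuming `_convert_doctype_to_license` is importable (i.e. the script's directory is on PYTHONPATH); `maalfrid_regjeringen` and `unknown_doc_type` are illustrative values, not confirmed NCC doc_types:

    from convert_ncc_to_jsonlgz import _convert_doctype_to_license

    # exact match in the hardcoded table
    assert _convert_doctype_to_license("newspaper_ocr") == "CC0 1.0"
    # prefix match for maalfrid_* sources (hypothetical value)
    assert _convert_doctype_to_license("maalfrid_regjeringen") == "NLOD 2.0"
    # unmapped doc_types carry no license
    assert _convert_doctype_to_license("unknown_doc_type") is None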