From 847027764896332e3705ad87062ac6969984dfe2 Mon Sep 17 00:00:00 2001
From: jankounchained <honza.anfas@gmail.com>
Date: Tue, 23 Apr 2024 12:42:35 +0200
Subject: [PATCH] add license to metadata

---
 .../scripts/convert_ncc_to_jsonlgz.py         | 41 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/data-processing/scripts/convert_ncc_to_jsonlgz.py b/data-processing/scripts/convert_ncc_to_jsonlgz.py
index 525600ee..134771b5 100644
--- a/data-processing/scripts/convert_ncc_to_jsonlgz.py
+++ b/data-processing/scripts/convert_ncc_to_jsonlgz.py
@@ -7,7 +7,12 @@
     "source": "...",         # MANDATORY: source of the data, such as peS2o, common-crawl, etc.
     "added": "...",          # OPTIONAL: timestamp ai2 acquired this data
     "created": "..."         # OPTIONAL: timestamp when orig document was created (best-guess if not available)
-    "metadata": {...}        # OPTIONAL: source-specific metadata
+    "metadata": {            # OPTIONAL: source-specific metadata
+        "doc_type": "...",              # OPTIONAL: see NCC/Document Types
+        "lang_fasttext": "...",         # OPTIONAL: see NCC/Languages
+        "lang_fasttext_conf": float,    # OPTIONAL: see NCC/Languages
+        "license": "..."                # OPTIONAL: see NCC/License
+    }
 }
 """
 
@@ -17,7 +22,7 @@
 from datasets import load_dataset, Dataset, IterableDataset
 
 
-EXPORT_PATH = "ncc_sample.jsonl.gz"
+EXPORT_PATH = "ncc.jsonl.gz"
 date_added = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z")
 
 
@@ -42,6 +47,37 @@ def _remove_existing_jsonlgz() -> None:
         print(f"Removing existing export: {EXPORT_PATH}")
 
 
+def _convert_doctype_to_license(source: str) -> str | None:
+    """Return the license for an NCC ``doc_type``, or None if it is unknown.
+
+    Non-string input yields None. Licenses are hardcoded from
+    https://huggingface.co/datasets/NbAiLab/NCC#license
+    """
+    if not isinstance(source, str):
+        return None  # a missing doc_type cannot be matched to a license
+
+    exact_licenses = {
+        "newspaper_ocr": "CC0 1.0",
+        "newspaper_pdf": "CC0 1.0",
+        "books": "CC0 1.0",
+        "newspapers_online_nb": "CC BY-NC 2.0",
+        "newspapers_online_nn": "CC BY-NC 2.0",
+        "opensubtitles": "CC BY-SA 3.0",
+        "government_nb": "NLOD 2.0",
+        "government_nn": "NLOD 2.0",
+        "parliament": "NLOD 2.0",
+        "publicreports": "NLOD 2.0",
+    }
+    if source in exact_licenses:
+        return exact_licenses[source]
+    # Some document types come in families sharing a prefix and a license.
+    if source.startswith(("lovdata_cd_", "maalfrid_")):
+        return "NLOD 2.0"
+    if source.startswith("wikipedia"):
+        return "CC BY-SA 3.0"
+    return None
+
+
 def _structure_records(obs: dict) -> dict:
     """Structure a single observation to Dolma format"""
 
@@ -60,6 +96,7 @@ def _structure_records(obs: dict) -> dict:
             "doc_type": obs["doc_type"],
             "lang_fasttext": obs["lang_fasttext"],
             "lang_fasttext_conf": obs["lang_fasttext_conf"],
+            "license": _convert_doctype_to_license(obs["doc_type"]),
         },
     }