From 1f25b558d910dc81874f0b827273218a73767aad Mon Sep 17 00:00:00 2001
From: Rasmus Larsen
Date: Mon, 15 Jan 2024 14:04:16 +0100
Subject: [PATCH] Update and rename convert_dataset_json.py to jsonl_to_mds.py

---
 ...onvert_dataset_json.py => jsonl_to_mds.py} | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)
 rename scripts/data/{convert_dataset_json.py => jsonl_to_mds.py} (98%)

diff --git a/scripts/data/convert_dataset_json.py b/scripts/data/jsonl_to_mds.py
similarity index 98%
rename from scripts/data/convert_dataset_json.py
rename to scripts/data/jsonl_to_mds.py
index 0a9f797b..cd694e58 100644
--- a/scripts/data/convert_dataset_json.py
+++ b/scripts/data/jsonl_to_mds.py
@@ -2,7 +2,7 @@
 # Modified by @rlrs for Danish Foundation Models
 # SPDX-License-Identifier: Apache-2.0
 
-"""Streaming dataset conversion scripts for json files."""
+"""Convert jsonl data to streaming MDS format, while tokenizing and concatenating."""
 import os
 from argparse import ArgumentParser, Namespace
 from glob import glob
@@ -16,21 +16,6 @@
 import numpy as np
 
 
-def generate_chunks(dataset: Union[hf_datasets.IterableDataset, hf_datasets.Dataset],
-                    bos_tokens: list[int], eos_tokens: list[int], chunk_length: int) -> Iterable[Dict[str, bytes]]:
-    buffer = np.empty(0, dtype=np.int64, order='C')
-    for sample in dataset:
-        iids = sample['input_ids']
-        buffer = np.append(buffer, [*bos_tokens, *iids, *eos_tokens])
-        while len(buffer) >= chunk_length:
-            concat_sample = buffer[:chunk_length]
-            buffer = buffer[chunk_length:] #if should_wrap else np.empty(0, dtype=np.int64, order='C')
-            yield {
-                # convert to bytes to store in MDS binary format
-                'tokens': np.asarray(concat_sample, dtype=np.int64).tobytes() # unsure why the np.asarray is necessary, tbh, but it is
-            }
-
-
 def parse_args() -> Namespace:
     """Parse commandline arguments."""
     parser = ArgumentParser(
@@ -62,6 +47,20 @@ def parse_args() -> Namespace:
         parsed.eos_text = ''
     return parsed
 
+def generate_chunks(dataset: Union[hf_datasets.IterableDataset, hf_datasets.Dataset],
+                    bos_tokens: list[int], eos_tokens: list[int], chunk_length: int) -> Iterable[Dict[str, bytes]]:
+    buffer = np.empty(0, dtype=np.int64, order='C')
+    for sample in dataset:
+        iids = sample['input_ids']
+        buffer = np.append(buffer, [*bos_tokens, *iids, *eos_tokens])
+        while len(buffer) >= chunk_length:
+            concat_sample = buffer[:chunk_length]
+            buffer = buffer[chunk_length:]  # drop consumed tokens; a final tail shorter than chunk_length is discarded
+            yield {
+                # convert to bytes to store in MDS binary format
+                'tokens': np.asarray(concat_sample, dtype=np.int64).tobytes()  # np.asarray ensures int64 dtype before serializing
+            }
+
 
 def build_hf_dataset(
     path: str,
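
For context, a minimal usage sketch of the relocated generate_chunks (not part of the patch; the import path, token ids, and chunk_length below are made up for illustration):

import numpy as np
from jsonl_to_mds import generate_chunks  # assuming scripts/data is on PYTHONPATH

# Stand-in for a tokenized HF dataset: any iterable of {'input_ids': [...]} works.
samples = [{'input_ids': [5, 6, 7]}, {'input_ids': [8, 9, 10, 11, 12]}]

for chunk in generate_chunks(samples, bos_tokens=[1], eos_tokens=[2], chunk_length=8):
    tokens = np.frombuffer(chunk['tokens'], dtype=np.int64)
    print(tokens)  # [1 5 6 7 2 1 8 9] -- BOS/EOS delimit documents packed into one chunk

Note that samples are concatenated across document boundaries, and any leftover tokens shorter than chunk_length are dropped when the iterator is exhausted.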