Skip to content

Commit

Permalink
Update and rename convert_dataset_json.py to jsonl_to_mds.py
Browse files Browse the repository at this point in the history
  • Loading branch information
rlrs authored Jan 15, 2024
1 parent 1d32dbd commit 1f25b55
Showing 1 changed file with 15 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Modified by @rlrs for Danish Foundation Models
# SPDX-License-Identifier: Apache-2.0

"""Streaming dataset conversion scripts for json files."""
"""Convert jsonl data to streaming MDS format, while tokenizing and concatenating."""
import os
from argparse import ArgumentParser, Namespace
from glob import glob
Expand All @@ -16,21 +16,6 @@
import numpy as np


def generate_chunks(dataset: Union[hf_datasets.IterableDataset, hf_datasets.Dataset],
                    bos_tokens: list[int], eos_tokens: list[int], chunk_length: int) -> Iterable[Dict[str, bytes]]:
    """Concatenate tokenized samples and yield fixed-length chunks.

    Every sample's ``input_ids`` is wrapped with the given BOS/EOS token ids
    and appended to a running buffer; whenever the buffer holds at least
    ``chunk_length`` tokens, one exact-length chunk is emitted.  Tokens left
    over after the final sample (fewer than ``chunk_length``) are dropped.

    Args:
        dataset: iterable of samples, each a mapping with an ``input_ids``
            sequence of token ids.
        bos_tokens: token ids prepended to every sample (may be empty).
        eos_tokens: token ids appended to every sample (may be empty).
        chunk_length: exact number of tokens per emitted chunk.

    Yields:
        Dicts with a single ``'tokens'`` key holding the chunk serialized as
        native-endian int64 bytes for the MDS binary format.
    """
    buffer = np.empty(0, dtype=np.int64, order='C')
    for sample in dataset:
        iids = sample['input_ids']
        buffer = np.append(buffer, [*bos_tokens, *iids, *eos_tokens])
        while len(buffer) >= chunk_length:
            concat_sample = buffer[:chunk_length]
            # Carry the remainder into the next iteration; a trailing partial
            # chunk (shorter than chunk_length) is never yielded.
            buffer = buffer[chunk_length:]
            yield {
                # convert to bytes to store in MDS binary format; the asarray
                # re-wrap also pins the dtype to int64 before serializing
                'tokens': np.asarray(concat_sample, dtype=np.int64).tobytes()
            }


def parse_args() -> Namespace:
"""Parse commandline arguments."""
parser = ArgumentParser(
Expand Down Expand Up @@ -62,6 +47,20 @@ def parse_args() -> Namespace:
parsed.eos_text = ''
return parsed

def generate_chunks(dataset: "Union[hf_datasets.IterableDataset, hf_datasets.Dataset]",
                    bos_tokens: list[int], eos_tokens: list[int], chunk_length: int) -> "Iterable[Dict[str, bytes]]":
    """Concatenate tokenized samples and yield fixed-length chunks.

    Every sample's ``input_ids`` is wrapped with the given BOS/EOS token ids
    and appended to a running buffer; whenever the buffer holds at least
    ``chunk_length`` tokens, one exact-length chunk is emitted.  Tokens left
    over after the final sample (fewer than ``chunk_length``) are dropped.

    Args:
        dataset: iterable of samples, each a mapping with an ``input_ids``
            sequence of token ids.
        bos_tokens: token ids prepended to every sample (may be empty).
        eos_tokens: token ids appended to every sample (may be empty).
        chunk_length: exact number of tokens per emitted chunk.

    Yields:
        Dicts with a single ``'tokens'`` key holding the chunk serialized as
        native-endian int64 bytes for the MDS binary format.
    """
    buffer = np.empty(0, dtype=np.int64, order='C')
    for sample in dataset:
        # Force int64 explicitly rather than relying on np.append's dtype
        # promotion, so token width is 8 bytes on every platform.
        wrapped = np.asarray([*bos_tokens, *sample['input_ids'], *eos_tokens],
                             dtype=np.int64)
        buffer = np.concatenate((buffer, wrapped))
        while len(buffer) >= chunk_length:
            chunk = buffer[:chunk_length]
            # Carry the remainder into the next iteration; a trailing partial
            # chunk (shorter than chunk_length) is never yielded.
            buffer = buffer[chunk_length:]
            yield {
                # Serialize to raw bytes for the MDS binary format; the slice
                # is already an int64 ndarray, so no extra conversion needed.
                'tokens': chunk.tobytes()
            }


def build_hf_dataset(
path: str,
Expand Down

0 comments on commit 1f25b55

Please sign in to comment.