Skip to content

Commit

Permalink
Add concatenate_datasets from huggingface as well as the outlines for using the datasets mentioned in the LongRoPE paper
Browse files Browse the repository at this point in the history
  • Loading branch information
jshuadvd committed Jul 8, 2024
1 parent f03ea61 commit f885fc7
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion train.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from torch.optim.lr_scheduler import CosineAnnealingLR
import gzip
from transformers import GPT2Tokenizer
from datasets import load_dataset
from datasets import load_dataset, concatenate_datasets
from importlib import reload
import src.main
from accelerate import Accelerator
Expand Down Expand Up @@ -308,6 +308,11 @@ def main():
# Load the raw data
data = load_data("../data/raw/enwik8.gz")

# Load datasets mentioned in the LongRoPE paper
pg19_dataset = load_dataset("pg19", split="train")
arxiv_dataset = load_dataset("arxiv_dataset", split="train")
github_dataset = load_dataset("github_dataset", split="train")

# Set parameters for data preprocessing
max_length = 65536
overlap = 4096
Expand Down

0 comments on commit f885fc7

Please sign in to comment.