Implement a caching mechanism for tokenized sequences
jshuadvd committed Jul 11, 2024
1 parent 287957e commit 1c1a00b
Showing 1 changed file with 1 addition and 0 deletions.
train.py

```diff
@@ -132,6 +132,7 @@ def preprocess_data(data, tokenizer, max_length, overlap):
         end = start + max_length
         chunk = data[start:end]
         # tokenized_chunk = tokenizer.encode(chunk)
+        # Cache the tokenized chunk
         tokenized_chunk = cached_tokenize(chunk, tokenizer)
 
         # Create sliding window sequences from the tokenized chunk
```
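The hunk calls `cached_tokenize`, whose definition is not shown in this diff. As a rough sketch of what such a helper might look like (the memoization details below are assumptions, not the repository's actual implementation), a module-level dictionary keyed by the chunk text avoids re-encoding text that recurs across overlapping windows:

```python
# Hypothetical sketch of cached_tokenize; the real definition is not
# part of this diff. A dict keyed by the chunk string memoizes
# tokenizer.encode() results so repeated chunks are encoded only once.
_token_cache = {}

def cached_tokenize(chunk, tokenizer):
    # Keyed on the text alone, since tokenizer objects are often
    # unhashable and the tokenizer is fixed for the whole run.
    if chunk not in _token_cache:
        _token_cache[chunk] = tokenizer.encode(chunk)
    return _token_cache[chunk]
```

A cache like this trades memory for speed and grows without bound over a long run; if that matters, a bounded alternative would be to wrap a string-keyed helper in `functools.lru_cache(maxsize=...)` so old entries are evicted automatically.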
