Skip to content

Commit

Permalink
test
Browse files Browse the repository at this point in the history
  • Loading branch information
ydli-ai committed Dec 24, 2023
1 parent 45c39bc commit 51350e6
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions tencentpretrain/utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ class MlmDataset(Dataset):
# Constructor for MlmDataset: runs the parent initializer, then copies two corpus-handling options from args onto the instance.
def __init__(self, args, vocab, tokenizer):
# Forward all three arguments unchanged to the parent class initializer.
super(MlmDataset, self).__init__(args, vocab, tokenizer)
# Copied verbatim from args.full_sentences; how it is consumed is not visible in this excerpt.
self.full_sentences = args.full_sentences
# When truthy, worker() parses each line it reads with json.loads and replaces the line with the
# concatenation of the record's "text" and "content" fields (each defaulting to "" when absent).
self.json_format_corpus = args.json_format_corpus

def worker(self, proc_id, start, end):
print("Worker %d is building dataset ... " % proc_id)
Expand All @@ -243,7 +244,13 @@ def worker(self, proc_id, start, end):
pos += 1
while True:
line = f.readline()
if self.json_format_corpus:
data = json.loads(line)
line = data.get("text", "") + data.get("content", "")

pos += 1
if len(line) < 5:
continue

document = [self.vocab.get(CLS_TOKEN)] + self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(line)) + [self.vocab.get(SEP_TOKEN)]

Expand Down

0 comments on commit 51350e6

Please sign in to comment.