Skip to content

Commit

Permalink
test
Browse files (browse the repository at this point in the history)
  • Loading branch information
ydli-ai committed Dec 25, 2023
1 parent 51350e6 commit d1af253
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions tencentpretrain/utils/dataset.py
Original file line number Diff line number Diff line change
@@ -247,10 +247,12 @@ def worker(self, proc_id, start, end):
if self.json_format_corpus:
data = json.loads(line)
line = data.get("text", "") + data.get("content", "")
if len(line) < 5:
continue
if pos >= end:
break

pos += 1
if len(line) < 5:
continue

document = [self.vocab.get(CLS_TOKEN)] + self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(line)) + [self.vocab.get(SEP_TOKEN)]

@@ -281,8 +283,7 @@ def worker(self, proc_id, start, end):
for instance in instances:
pickle.dump(instance, dataset_writer)

if pos >= end:
break


dataset_writer.close()

Expand Down

0 comments on commit d1af253

Please sign in to comment.