From e06c3db2020c2f06ee36580fd119a6ab40b2ce8a Mon Sep 17 00:00:00 2001
From: mr
Date: Tue, 16 Jul 2024 18:01:10 +0800
Subject: [PATCH] modify cache path rule for pretrain datasets to support
 different models and re-tokenize when the source file changes

---
 train.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index 32bd084..0241491 100644
--- a/train.py
+++ b/train.py
@@ -1,6 +1,7 @@
 import argparse
 from loguru import logger
 import os
+import time
 from os.path import join
 import torch
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
@@ -140,7 +141,9 @@ def group_texts(examples):
         logger.info(f'Loading file: {file}')
         file_name = os.path.basename(file)
         file_name = file_name.replace('.jsonl', '')
-        cache_path = os.path.join(cache_dir, file_name)
+
+        mtime = time.strftime("%Y%m%d%H%M%S", time.localtime(os.stat(file).st_mtime))
+        cache_path = os.path.join(cache_dir, tokenizer.name_or_path, file_name, mtime)
         os.makedirs(cache_path, exist_ok=True)
 
         try:
@@ -148,7 +151,7 @@ def group_texts(examples):
             logger.info(f'Finished loading datasets-{file_name} from cache')
         except Exception:
             tmp_cache_path = join(cache_path, 'tmp')  # temporary cache directory, deleted automatically
-            logger.info(f'There is no cache of file {file_name}, start preprocessing...')
+            logger.info(f'No cache found for file {file_name} (modified @ {mtime}) with tokenizer {tokenizer.name_or_path}, start preprocessing...')
             raw_dataset = load_dataset("json", data_files=file, cache_dir=tmp_cache_path, keep_in_memory=False)
             tokenized_dataset = raw_dataset.map(
                 tokenize_function,
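
Note on the new cache layout (a minimal sketch, not part of the patch): the change keys each dataset cache by tokenizer name and by the source file's mtime, so switching models or editing the source .jsonl produces a fresh cache directory, which makes the `load_from_disk` lookup miss and triggers re-tokenization instead of reusing a stale cache. The helper below reproduces the path rule in isolation; the function name and the paths in the usage comment are hypothetical.

```python
import os
import time

def build_cache_path(cache_dir: str, tokenizer_name: str, data_file: str) -> str:
    """Reproduce the patched cache-path rule: cache_dir/<tokenizer>/<file>/<mtime>."""
    file_name = os.path.basename(data_file).replace('.jsonl', '')
    # Timestamp of the source file; a new mtime yields a new cache directory,
    # so any modification to the source file invalidates the old cache.
    mtime = time.strftime("%Y%m%d%H%M%S", time.localtime(os.stat(data_file).st_mtime))
    return os.path.join(cache_dir, tokenizer_name, file_name, mtime)

# Hypothetical example: two tokenizers no longer collide on the same file.
# build_cache_path('cache', 'baichuan-inc/Baichuan-7B', 'data/pretrain.jsonl')
# -> 'cache/baichuan-inc/Baichuan-7B/pretrain/20240716180110'
```

One side effect of this scheme worth knowing: `tokenizer.name_or_path` may itself contain a `/` (e.g. an org/model Hub id), so `os.path.join` nests it as two directory levels, and stale caches for old mtimes are never cleaned up automatically.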