From e06c3db2020c2f06ee36580fd119a6ab40b2ce8a Mon Sep 17 00:00:00 2001
From: mr
Date: Tue, 16 Jul 2024 18:01:10 +0800
Subject: [PATCH] modify cache path rule for pretrain datasets to support
 different models and re-tokenize when the source file changes

---
 train.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index 32bd084..0241491 100644
--- a/train.py
+++ b/train.py
@@ -1,6 +1,7 @@
 import argparse
 from loguru import logger
 import os
+import time
 from os.path import join
 import torch
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
@@ -140,7 +141,9 @@ def group_texts(examples):
         logger.info(f'Loading file: {file}')
         file_name = os.path.basename(file)
         file_name = file_name.replace('.jsonl', '')
-        cache_path = os.path.join(cache_dir, file_name)
+
+        mtime = time.strftime("%Y%m%d%H%M%S", time.localtime(os.stat(file).st_mtime))
+        cache_path = os.path.join(cache_dir, tokenizer.name_or_path, file_name, mtime)
         os.makedirs(cache_path, exist_ok=True)
 
         try:
@@ -148,7 +151,7 @@ def group_texts(examples):
             logger.info(f'Finished loading datasets-{file_name} from cache')
         except Exception:
             tmp_cache_path = join(cache_path, 'tmp')  # temporary cache directory, deleted automatically
-            logger.info(f'There is no cache of file {file_name}, start preprocessing...')
+            logger.info(f'No cache found for file {file_name} (modified @ {mtime}) with tokenizer {tokenizer.name_or_path}, start preprocessing...')
             raw_dataset = load_dataset("json", data_files=file, cache_dir=tmp_cache_path, keep_in_memory=False)
             tokenized_dataset = raw_dataset.map(
                 tokenize_function,
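
Note on the new cache layout (a minimal sketch, not part of the patch): the change keys each dataset cache by tokenizer name and by the source file's mtime, so switching models or editing the source .jsonl produces a fresh cache directory, which makes the `load_from_disk` lookup miss and triggers re-tokenization instead of reusing a stale cache. The helper below reproduces the path rule in isolation; the function name and the paths in the usage comment are hypothetical.

```python
import os
import time

def build_cache_path(cache_dir: str, tokenizer_name: str, data_file: str) -> str:
    """Reproduce the patched cache-path rule: cache_dir/<tokenizer>/<file>/<mtime>."""
    file_name = os.path.basename(data_file).replace('.jsonl', '')
    # Timestamp of the source file; a new mtime yields a new cache directory,
    # so any modification to the source file invalidates the old cache.
    mtime = time.strftime("%Y%m%d%H%M%S", time.localtime(os.stat(data_file).st_mtime))
    return os.path.join(cache_dir, tokenizer_name, file_name, mtime)

# Hypothetical example: two tokenizers no longer collide on the same file.
# build_cache_path('cache', 'baichuan-inc/Baichuan-7B', 'data/pretrain.jsonl')
# -> 'cache/baichuan-inc/Baichuan-7B/pretrain/20240716180110'
```

One side effect of this scheme worth knowing: `tokenizer.name_or_path` may itself contain a `/` (e.g. an org/model Hub id), so `os.path.join` nests it as two directory levels, and stale caches for old mtimes are never cleaned up automatically.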