add code generation scripts

yyDing1 · yyDing1 · commit c1f3791510aa · 2024-10-25T16:43:06.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -158,16 +158,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-automatic_gen/data/
-query_sft/models/
-query_sft/data/
-query_dpo/models
-dart-math/
-evaluation/code/
-evaluation/outputs/
-evol_instruct/output
-temp.py
-automatic_gen/*data
-baselines
-check_solvability_difficulty
-test_rm
+src/data_generation/generation
+src/train_question_generator/models
diff --git a/README.md b/README.md
@@ -9,24 +9,26 @@
     <a href="https://opennlg.cn/"><img src="https://img.shields.io/badge/Organization-OpenNLG%20Group-blueviolet"></a>
 </p>
 
-We introduce ScaleQuest, a scalable, cost-effective, and novel data synthesis method that utilizes small-size open-source models to generate questions from scratch without the need for seed data with complex augmentation constraints.
+We introduce ScaleQuest, a scalable, cost-effective, and novel data synthesis method that utilizes small-size open-source models to generate questions from scratch without the need for seed questions with complex augmentation constraints.
 
 ![](img/results.png)
 
 This repository contains our complete data synthesis method, including:
 
-1. Training a question generator through question fine-tuning (code in the `qft_train` folder).
-2. Constructing preference data (code in the `question_optim` folder) and performing question preference optimization (code in the `qpo_train` folder).
-3. Using the trained question generator to synthesize questions (code in the `data_generation` folder).
-4. Applying a filtering process to the generated questions (code in the `question_filtering` folder).
-5. Generating responses (code in the `data_generation` folder) and applying a reward filtering strategy (code in the `reward_filtering` folder).
-6. For instruction-tuning and evaluation, we directly use the DART-Math framework.
+(the individual steps are listed under **Method Overview** below)
+
 
 We randomly sampled 100 generated data points and placed them in `data_samples/samples.jsonl`
 
 ## Method Overview
 
 ![](img/method.png)
 
-
+1. Training a question generator:
+   - Question fine-tuning (code in the `qft_train` folder).
+   - Constructing preference data (code in the `question_optim` folder) and performing question preference optimization (code in the `qpo_train` folder).
+2. Using the trained question generator to synthesize questions (code in the `data_generation` folder).
+   - Applying a filtering process to the generated questions (code in the `question_filtering` folder).
+   - Generating responses (code in the `data_generation` folder) and applying a reward filtering strategy (code in the `reward_filtering` folder).
+3. For instruction-tuning and evaluation, we directly use the DART-Math framework.
 
diff --git a/samples/samples.jsonl b/samples/samples.jsonl
diff --git a/src/data_generation/gen.py b/src/data_generation/gen.py
@@ -81,6 +81,9 @@ def get_args():
     elif "deepseek" in args.qry_prompt_type:
         pre_query_template = "<｜begin▁of▁sentence｜>User: "
         stop_tokens = ["<｜begin▁of▁sentence｜>", "<｜end▁of▁sentence｜>"]
+    elif "qwen2.5-code" in args.qry_prompt_type:
+        pre_query_template = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
+        stop_tokens = ["<|im_start|>", "<|im_end|>", "<|endoftext|>"]
     else:
         raise NotImplementedError(
             f"Query prompt type {args.qry_prompt_type} is not implemented"
@@ -155,6 +158,13 @@ def flatten_batch_and_strip(line_data):
             "and put your final answer within \\boxed{{}}.\n\nAssistant:"
         )
         stop_tokens = ["<｜begin▁of▁sentence｜>", "<｜end▁of▁sentence｜>"]
+    elif "qwen2.5-code" in args.res_prompt_type:
+        res_generation_template = (
+            "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n"
+            "<|im_start|>user\n{input}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        stop_tokens = ["<|im_start|>", "<|im_end|>", "<|endoftext|>"]
     else:
         raise NotImplementedError(
             f"Response prompt type {args.res_prompt_type} is not implemented"
@@ -203,7 +213,9 @@ def filter_data(line_data):
         has_answer = "boxed" in response or "he answer is" in response or "final answer is" in response
         return has_answer
 
-    dataset = dataset.map(strip_data, concurrency=4).filter(filter_data, concurrency=4)
+    dataset = dataset.map(strip_data, concurrency=4)
+    if "math" in args.res_prompt_type:
+        dataset = dataset.filter(filter_data, concurrency=4)
 
     res_gen_output_path = os.path.join(
         args.output_folder,
diff --git a/src/train_question_generator/qft_train/train.py b/src/train_question_generator/qft_train/train.py
@@ -128,6 +128,11 @@ def formatting_prompts_func(example):
         text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['query'].strip()}<|im_end|>"
     elif script_args.prompt_type == "deepseek-math":
         text = f"User: {example['query'].strip()}\n\n<｜end▁of▁sentence｜>"
+    elif script_args.prompt_type == "deepseek-code":
+        text = f"You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n" \
+                f"### Instruction:\n{example['query'].strip()}\n### Response:\n"
+    elif script_args.prompt_type == "qwen2.5-code":
+        text = f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['query'].strip()}<|im_end|>"
     else:
         raise NotImplementedError(
             f"Prompt type {script_args.prompt_type} not implemented."
@@ -140,7 +145,7 @@ def formatting_prompts_func(example):
 train_dataset = dataset["train"]
 eval_dataset = dataset["test"] if "test" in dataset else None
 if script_args.max_training_samples > 0:
-    train_dataset = train_dataset.select(range(script_args.max_training_samples))
+    train_dataset = train_dataset.shuffle(seed=42).select(range(script_args.max_training_samples))
 
 
 # formatting_prompts_func
diff --git a/src/train_question_generator/scripts/run_dscode_qft.sh b/src/train_question_generator/scripts/run_dscode_qft.sh
@@ -0,0 +1,37 @@
+# Step 1: QFT
+ACCELERATE_LOG_LEVEL=info accelerate launch \
+--config_file ./zero3.yaml \
+--main_process_port 29600 \
+qft_train/train.py \
+    --model_path deepseek-ai/deepseek-coder-6.7b-instruct \
+    --dataset_path /path/to/CodeFeedback-Filtered-Instruction \
+    --prompt_type deepseek-code \
+    --num_train_epochs 1 \
+    --gradient_checkpointing false \
+    --max_length 256 \
+    --output_dir models/Deepseek-Coder-7B-QFT \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 4 \
+
+# Step 2: QPO
+ACCELERATE_LOG_LEVEL=info accelerate launch \
+--config_file ./zero3.yaml \
+--main_process_port 29601 \
+train.py \
+    --model_path models/Deepseek-Coder-7B-QFT \
+    --ref_model models/Deepseek-Coder-7B-QFT \
+    --dataset_path /path/to/qpo_data \
+    --prompt_type deepseek-code \
+    --run_name deepseek-code-qgen-sft-dpo \
+    --learning_rate 5e-7 \
+    --lr_scheduler_type cosine \
+    --loss_type sigmoid \
+    --warmup_steps 20 \
+    --num_train_epochs 1 \
+    --gradient_checkpointing true \
+    --max_length 1024 \
+    --output_dir models/Deepseek-Coder-7B-QGen \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 8 \
+    --gradient_accumulation_steps 2 \
diff --git a/src/train_question_generator/scripts/run_dsmath_qft.sh b/src/train_question_generator/scripts/run_dsmath_qft.sh
diff --git a/src/train_question_generator/scripts/run_qwen2code_qft.sh b/src/train_question_generator/scripts/run_qwen2code_qft.sh
@@ -0,0 +1,38 @@
+# Step 1: QFT
+ACCELERATE_LOG_LEVEL=info accelerate launch \
+--config_file scripts/zero3.yaml \
+--main_process_port 29500 \
+qft_train/train.py \
+    --model_path Qwen/Qwen2.5-Coder-7B-Instruct \
+    --dataset_path /path/to/CodeFeedback-Filtered-Instruction \
+    --prompt_type qwen2.5-code \
+    --num_train_epochs 1 \
+    --gradient_checkpointing false \
+    --max_length 256 \
+    --max_training_samples 20000 \
+    --output_dir models/Qwen2.5-Coder-7B-QFT \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 4 \
+
+# Step 2: QPO
+ACCELERATE_LOG_LEVEL=info accelerate launch \
+--config_file ./zero3.yaml \
+--main_process_port 29051 \
+train.py \
+    --model_path models/Qwen2.5-Coder-7B-QFT \
+    --ref_model models/Qwen2.5-Coder-7B-QFT \
+    --dataset_path /path/to/qpo_data \
+    --prompt_type qwen2.5-code \
+    --run_name qwen2-code-qgen-sft-dpo \
+    --learning_rate 5e-7 \
+    --lr_scheduler_type cosine \
+    --loss_type sigmoid \
+    --warmup_steps 20 \
+    --num_train_epochs 1 \
+    --gradient_checkpointing true \
+    --max_length 1024 \
+    --output_dir models/Qwen2-Coder-7B-QGen \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 8 \
+    --gradient_accumulation_steps 2 \
diff --git a/src/train_question_generator/scripts/run_qwen2math_qft.sh b/src/train_question_generator/scripts/run_qwen2math_qft.sh
@@ -14,6 +14,7 @@ train.py \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 4 \
 
+# Step 2: QPO
 ACCELERATE_LOG_LEVEL=info accelerate launch \
 --config_file ./zero3.yaml \
 --main_process_port 29051 \