InternLM · vansin · Dec 19, 2023 · Dec 19, 2023 · Dec 19, 2023 · Dec 19, 2023
diff --git a/xtuner/MedQA2019-structured-test.jsonl b/xtuner/MedQA2019-structured-test.jsonl
diff --git a/xtuner/MedQA2019-structured-train.jsonl b/xtuner/MedQA2019-structured-train.jsonl
diff --git a/xtuner/MedQA2019-structured.jsonl b/xtuner/MedQA2019-structured.jsonl
diff --git a/xtuner/MedQA2019.xlsx b/xtuner/MedQA2019.xlsx
diff --git a/xtuner/README.md b/xtuner/README.md
diff --git a/xtuner/imgs/afterFT.png b/xtuner/imgs/afterFT.png
diff --git a/xtuner/imgs/beforeFT.png b/xtuner/imgs/beforeFT.png
diff --git a/xtuner/imgs/bugfix1.png b/xtuner/imgs/bugfix1.png
diff --git a/xtuner/imgs/bugfix2.png b/xtuner/imgs/bugfix2.png
diff --git a/xtuner/imgs/cat_fly.png b/xtuner/imgs/cat_fly.png
diff --git a/xtuner/imgs/cfgs.png b/xtuner/imgs/cfgs.png
diff --git a/xtuner/imgs/dataProcessed.png b/xtuner/imgs/dataProcessed.png
diff --git a/xtuner/imgs/head.png b/xtuner/imgs/head.png
diff --git a/xtuner/imgs/medqa2019samples.png b/xtuner/imgs/medqa2019samples.png
diff --git a/xtuner/imgs/msagent_data.png b/xtuner/imgs/msagent_data.png
diff --git a/xtuner/imgs/serper.png b/xtuner/imgs/serper.png
diff --git a/xtuner/imgs/ysqd.png b/xtuner/imgs/ysqd.png
diff --git a/xtuner/split2train_and_test.py b/xtuner/split2train_and_test.py
@@ -0,0 +1,35 @@
+import json
+import random
+
+def split_conversations(input_file, train_output_file, test_output_file):
+    # Read the input JSONL file
+    with open(input_file, 'r', encoding='utf-8') as jsonl_file:
+        data = json.load(jsonl_file)
+
+    # Count the number of conversation elements
+    num_conversations = len(data)
+
+    # Shuffle the data randomly
+    random.shuffle(data)
+    random.shuffle(data)
+    random.shuffle(data)
+
+    # Calculate the split points for train and test
+    split_point = int(num_conversations * 0.7)
+
+    # Split the data into train and test
+    train_data = data[:split_point]
+    test_data = data[split_point:]
+
+    # Write the train data to a new JSONL file
+    with open(train_output_file, 'w', encoding='utf-8') as train_jsonl_file:
+        json.dump(train_data, train_jsonl_file, indent=4)
+
+    # Write the test data to a new JSONL file
+    with open(test_output_file, 'w', encoding='utf-8') as test_jsonl_file:
+        json.dump(test_data, test_jsonl_file, indent=4)
+
+    print(f"Split complete. Train data written to {train_output_file}, Test data written to {test_output_file}")
+
+# Replace 'input.jsonl', 'train.jsonl', and 'test.jsonl' with your actual file names
+split_conversations('MedQA2019-structured.jsonl', 'MedQA2019-structured-train.jsonl', 'MedQA2019-structured-test.jsonl')
diff --git a/xtuner/xlsx2jsonl.py b/xtuner/xlsx2jsonl.py
@@ -0,0 +1,35 @@
+import openpyxl
+import json
+
+def process_excel_to_json(input_file, output_file):
+    # Load the workbook
+    wb = openpyxl.load_workbook(input_file)
+
+    # Select the "DrugQA" sheet
+    sheet = wb["DrugQA"]
+
+    # Initialize the output data structure
+    output_data = []
+
+    # Iterate through each row in column A and D
+    for row in sheet.iter_rows(min_row=2, max_col=4, values_only=True):
+        system_value = "You are a professional, highly experienced doctor professor. You always provide accurate, comprehensive, and detailed answers based on the patients' questions."
+
+        # Create the conversation dictionary
+        conversation = {
+            "system": system_value,
+            "input": row[0],
+            "output": row[3]
+        }
+
+        # Append the conversation to the output data
+        output_data.append({"conversation": [conversation]})
+
+    # Write the output data to a JSON file
+    with open(output_file, 'w', encoding='utf-8') as json_file:
+        json.dump(output_data, json_file, indent=4)
+
+    print(f"Conversion complete. Output written to {output_file}")
+
+# Replace 'MedQA2019.xlsx' and 'output.jsonl' with your actual input and output file names
+process_excel_to_json('MedQA2019.xlsx', 'output.jsonl')