Skip to content

Commit

Permalink
modified merge_jsonl and merge_jsonl_r
Browse files Browse the repository at this point in the history
merge_jsonl merges the jsonl files in a folder
merge_jsonl_r merges the jsonl files in each subfolder of a folder

usage:
python merge_jsonl_r.py > qwen2.txt
python merge_jsonl_r.py > zhipuai.txt

python merge_jsonl.py > curr.txt

│   学业_merge.json
│   家人_merge.json
│   就业_merge.json
│   工作_merge.json
│   恋爱_merge.json
│   朋友_merge.json
│   环境_merge.json
│   生活_merge.json
│   社交_merge.json
│   责任_merge.json
│   身体_merge.json
│   隐私_merge.json
│
├───学业
│       兴奋.jsonl
│       冷静.jsonl
│       厌倦.jsonl
│       厌恶.jsonl
│       同情.jsonl
│       困惑.jsonl
│       娱乐.jsonl
│       嫉妒.jsonl
│       尴尬.jsonl
│       崇拜.jsonl
│       快乐.jsonl
│       怀旧.jsonl
│       性欲.jsonl
│       恐惧.jsonl
│       悲伤.jsonl
│       敬畏.jsonl
│       有趣.jsonl
│       欣赏.jsonl
│       浪漫.jsonl
│       渴望.jsonl
│       满意.jsonl
│       满足.jsonl
│       焦虑.jsonl
│       痛恨.jsonl
│       痛苦.jsonl
│       着迷.jsonl
│       钦佩.jsonl
│
├───家人
│       兴奋.jsonl
│       冷静.jsonl
│       厌倦.jsonl
│       厌恶.jsonl
│       同情.jsonl
│       困惑.jsonl
│       娱乐.jsonl
│       嫉妒.jsonl
  • Loading branch information
chg0901 committed Mar 18, 2024
1 parent 0553c38 commit a38ef60
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ zhipuai/
data/

*.jsonl
*.json
# ./generate_data/*.jsonl
# ./generate_data/*/*/*.jsonl

Expand Down
60 changes: 60 additions & 0 deletions generate_data/final_data/merge_jsonl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import json
import os


def save_merge_json(data_lis, file_path):
    """Write ``data_lis`` to ``file_path`` as UTF-8 encoded JSON.

    Non-ASCII characters are written as-is rather than escaped, and the
    item separator is a comma followed by a newline, so consecutive items
    start on their own line.

    Args:
        data_lis: JSON-serialisable object (the callers pass a list).
        file_path: Destination path; an existing file is overwritten.
    """
    # (item separator, key separator) handed to the JSON encoder.
    item_and_key_separators = (',\n', ':')
    with open(file_path, 'wt', encoding='utf-8') as out_file:
        json.dump(
            data_lis,
            out_file,
            ensure_ascii=False,
            separators=item_and_key_separators,
        )


def get_all_file_paths(folder_path, file_type='.jsonl'):
    """Return the paths of the regular files in a folder with a given suffix.

    Only the direct children of ``folder_path`` are considered; subfolders
    are not descended into.

    Args:
        folder_path: Directory to scan.
        file_type: File-name suffix to keep (default ``'.jsonl'``).

    Returns:
        A list of ``os.path.join(folder_path, name)`` paths, sorted by
        file name.

    Raises:
        ValueError: If ``folder_path`` is not an existing directory.
    """
    # Make sure the argument really is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    file_paths = []
    # os.listdir() returns names in arbitrary order; sorting makes the
    # result reproducible across runs and platforms.
    for name in sorted(os.listdir(folder_path)):
        # Match on the suffix rather than a substring, so that names such
        # as "data.jsonl.bak" are not mistaken for ".jsonl" files.
        if not name.endswith(file_type):
            continue
        full_path = os.path.join(folder_path, name)
        # Skip directories that happen to carry the suffix.
        if os.path.isfile(full_path):
            file_paths.append(full_path)
    return file_paths


if __name__ == '__main__':
    # Merge every .jsonl file found directly inside `folder_path` into a
    # single JSON array and write it to one "*_merge.json" file.
    conversion_lis = []

    folder_path = r'./'

    # The output name is derived from the last two '/'-separated components
    # of `folder_path`; a leading '.' is not treated as a parent folder.
    path_parts = folder_path.split('/')
    merge_path = path_parts[-1]
    if len(path_parts) >= 2 and path_parts[-2] != '.':
        merge_last_path = path_parts[-2]
    else:
        # Either there is no parent component or it is the current directory.
        merge_last_path = ''
    print(f'merge_path={merge_path},merge_last_path={merge_last_path}')

    for path in get_all_file_paths(folder_path):
        print(path)

        with open(path, 'rt', encoding='utf-8') as file:
            for line in file:
                # Drop the trailing newline and surrounding whitespace;
                # blank lines carry no record and are skipped instead of
                # being reported as decode errors.
                line = line.strip()
                if not line:
                    continue
                # Parse one JSON record per line.
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    # Report the malformed line and keep going.
                    print(f"Error decoding JSON: {e}")
                else:
                    conversion_lis.append(data)

    # Choose where the merged file goes, based on the derived path parts.
    if merge_last_path != '':
        save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
    elif merge_path != '':
        save_merge_json_path = rf'./{merge_path}_merge.json'
    else:
        save_merge_json_path = rf'./curr_merge.json'

    save_merge_json(data_lis=conversion_lis,
                    file_path=save_merge_json_path)
    print(len(conversion_lis), save_merge_json_path)
75 changes: 75 additions & 0 deletions generate_data/final_data/merge_jsonl_r.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import json
import os


def save_merge_json(data_lis, file_path):
    """Write ``data_lis`` to ``file_path`` as UTF-8 encoded JSON.

    Non-ASCII characters are written as-is rather than escaped, and the
    item separator is a comma followed by a newline, so consecutive items
    start on their own line.

    Args:
        data_lis: JSON-serialisable object (the callers pass a list).
        file_path: Destination path; an existing file is overwritten.
    """
    # (item separator, key separator) handed to the JSON encoder.
    item_and_key_separators = (',\n', ':')
    with open(file_path, 'wt', encoding='utf-8') as out_file:
        json.dump(
            data_lis,
            out_file,
            ensure_ascii=False,
            separators=item_and_key_separators,
        )


def get_all_file_paths(folder_path, file_type='.jsonl'):
    """Return the paths of the regular files in a folder with a given suffix.

    Only the direct children of ``folder_path`` are considered; subfolders
    are not descended into.

    Args:
        folder_path: Directory to scan.
        file_type: File-name suffix to keep (default ``'.jsonl'``).

    Returns:
        A list of ``os.path.join(folder_path, name)`` paths, sorted by
        file name.

    Raises:
        ValueError: If ``folder_path`` is not an existing directory.
    """
    # Make sure the argument really is a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"{folder_path} is not a valid directory")

    file_paths = []
    # os.listdir() returns names in arbitrary order; sorting makes the
    # result reproducible across runs and platforms.
    for name in sorted(os.listdir(folder_path)):
        # Match on the suffix rather than a substring, so that names such
        # as "data.jsonl.bak" are not mistaken for ".jsonl" files.
        if not name.endswith(file_type):
            continue
        full_path = os.path.join(folder_path, name)
        # Skip directories that happen to carry the suffix.
        if os.path.isfile(full_path):
            file_paths.append(full_path)
    return file_paths


if __name__ == '__main__':
    # For every subfolder of ./<data_ai>/, merge its .jsonl files into one
    # "<subfolder>_merge.json", then write all records from all subfolders
    # into "<data_ai>_final_merge.json".

    data_ai = 'qwen'  # python merge_jsonl_r.py > qwen.txt
    # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt
    root_dir = rf'./{data_ai}/'

    save_final_merge_json_path = f'{data_ai}_final_merge.json'

    # os.listdir() returns names in arbitrary order; sorting keeps the order
    # of records in the final merged file reproducible.
    subfolders = [
        os.path.join(root_dir, d)
        for d in sorted(os.listdir(root_dir))
        if os.path.isdir(os.path.join(root_dir, d))
    ]

    final_list = []
    for folder_path in subfolders:
        conversion_lis = []

        # The per-folder output name is derived from the last two
        # '/'-separated components of `folder_path`; a leading '.' is not
        # treated as a parent folder.
        path_parts = folder_path.split('/')
        merge_path = path_parts[-1]
        if len(path_parts) >= 2 and path_parts[-2] != '.':
            merge_last_path = path_parts[-2]
        else:
            # Either there is no parent component or it is the current
            # directory.
            merge_last_path = ''
        print(f'merge_path={merge_path},merge_last_path={merge_last_path}')

        for path in get_all_file_paths(folder_path):
            print(path)

            with open(path, 'rt', encoding='utf-8') as file:
                for line in file:
                    # Drop the trailing newline and surrounding whitespace;
                    # blank lines carry no record and are skipped instead
                    # of being reported as decode errors.
                    line = line.strip()
                    if not line:
                        continue
                    # Parse one JSON record per line.
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError as e:
                        # Report the malformed line and keep going.
                        print(f"Error decoding JSON: {e}")
                    else:
                        conversion_lis.append(data)

        # Choose where this folder's merged file goes.
        if merge_last_path != '':
            save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json'
        elif merge_path != '':
            save_merge_json_path = rf'./{merge_path}_merge.json'
        else:
            save_merge_json_path = rf'./curr_merge.json'

        save_merge_json(data_lis=conversion_lis,
                        file_path=save_merge_json_path)

        # Extend in place; `final_list + conversion_lis` would copy the whole
        # accumulated list again for every subfolder.
        final_list.extend(conversion_lis)
        print(len(conversion_lis), len(final_list), save_merge_json_path)

    save_merge_json(data_lis=final_list, file_path=save_final_merge_json_path)
    print(save_final_merge_json_path)


0 comments on commit a38ef60

Please sign in to comment.