From 76790394c4659530243897fb62d74454a564dbdc Mon Sep 17 00:00:00 2001
From: XuhuiRen
Date: Tue, 10 Sep 2024 09:57:38 +0800
Subject: [PATCH 1/2] fixjsonl

Signed-off-by: XuhuiRen
---
NOTE(review): load_jsonl appends the objects parsed by json.loads(), whereas
load_json (visible in the hunk context) returns json.dumps() strings; confirm
that callers of document_loader accept both shapes. json.loads() also raises
on blank lines, so a .jsonl file containing an empty line will fail to load.

 comps/dataprep/utils.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
index 571d5e8a4..b3daabf63 100644
--- a/comps/dataprep/utils.py
+++ b/comps/dataprep/utils.py
@@ -284,6 +284,15 @@ def load_json(json_path):
     content_list = [json.dumps(item) for item in data]
     return content_list
 
+def load_jsonl(jsonl_path):
+    """Load and process jsonl file."""
+    content_list = []
+    with open(jsonl_path, 'r') as file:
+        for line in file:
+            json_obj = json.loads(line)
+            content_list.append(json_obj)
+    return content_list
+
 
 def load_yaml(yaml_path):
     """Load and process yaml file."""
@@ -351,8 +360,10 @@ def document_loader(doc_path):
         return load_md(doc_path)
     elif doc_path.endswith(".xml"):
         return load_xml(doc_path)
-    elif doc_path.endswith(".json") or doc_path.endswith(".jsonl"):
+    elif doc_path.endswith(".json"):
         return load_json(doc_path)
+    elif doc_path.endswith(".jsonl"):
+        return load_jsonl(doc_path)
     elif doc_path.endswith(".yaml"):
         return load_yaml(doc_path)
     elif doc_path.endswith(".xlsx") or doc_path.endswith(".xls"):

From d4ef35a874cb99afdf5cc4ded371ca3cc64c6405 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 10 Sep 2024 02:00:06 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 comps/dataprep/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
index b3daabf63..f48d97157 100644
--- a/comps/dataprep/utils.py
+++ b/comps/dataprep/utils.py
@@ -284,10 +284,11 @@ def load_json(json_path):
     content_list = [json.dumps(item) for item in data]
     return content_list
 
+
 def load_jsonl(jsonl_path):
     """Load and process jsonl file."""
     content_list = []
-    with open(jsonl_path, 'r') as file:
+    with open(jsonl_path, "r") as file:
         for line in file:
             json_obj = json.loads(line)
             content_list.append(json_obj)