From f21137b44f1115d25d1ff8ecaf7780c36498c5de Mon Sep 17 00:00:00 2001 From: Edwin Zhang Date: Wed, 15 Jan 2020 17:48:09 -0500 Subject: [PATCH] Specify line endings and file encoding so MSMARCO passage works on Windows (#953) --- src/main/python/msmarco/convert_collection_to_jsonl.py | 4 ++-- src/main/python/msmarco/filter_queries.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/python/msmarco/convert_collection_to_jsonl.py b/src/main/python/msmarco/convert_collection_to_jsonl.py index a2bf495ab5..10d0e3dd84 100644 --- a/src/main/python/msmarco/convert_collection_to_jsonl.py +++ b/src/main/python/msmarco/convert_collection_to_jsonl.py @@ -22,7 +22,7 @@ def convert_collection(args): print('Converting collection...') file_index = 0 - with open(args.collection_path) as f: + with open(args.collection_path, encoding='utf-8') as f: for i, line in enumerate(f): doc_id, doc_text = line.rstrip().split('\t') @@ -30,7 +30,7 @@ def convert_collection(args): if i > 0: output_jsonl_file.close() output_path = os.path.join(args.output_folder, 'docs{:02d}.json'.format(file_index)) - output_jsonl_file = open(output_path, 'w') + output_jsonl_file = open(output_path, 'w', encoding='utf-8', newline='\n') file_index += 1 output_dict = {'id': doc_id, 'contents': doc_text} output_jsonl_file.write(json.dumps(output_dict) + '\n') diff --git a/src/main/python/msmarco/filter_queries.py b/src/main/python/msmarco/filter_queries.py index bb2035f1fe..38e8e52c4b 100644 --- a/src/main/python/msmarco/filter_queries.py +++ b/src/main/python/msmarco/filter_queries.py @@ -31,8 +31,8 @@ query_id, _, _, _ = line.rstrip().split('\t') qrels.add(query_id) - with open(args.output_queries, 'w') as fout: - with open(args.queries) as f: + with open(args.output_queries, 'w', encoding='utf-8', newline='\n') as fout: + with open(args.queries, encoding='utf-8') as f: for line in f: query_id, _ = line.rstrip().split('\t') if query_id in qrels: