Skip to content

Commit

Permalink
Specify line endings and file encoding so MSMARCO passage works on Windows (#953)
Browse files Browse the repository at this point in the history
  • Loading branch information
edwinzhng authored and lintool committed Jan 15, 2020
1 parent 51879e0 commit f21137b
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
4 changes: 2 additions & 2 deletions src/main/python/msmarco/convert_collection_to_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@
def convert_collection(args):
print('Converting collection...')
file_index = 0
with open(args.collection_path) as f:
with open(args.collection_path, encoding='utf-8') as f:
for i, line in enumerate(f):
doc_id, doc_text = line.rstrip().split('\t')

if i % args.max_docs_per_file == 0:
if i > 0:
output_jsonl_file.close()
output_path = os.path.join(args.output_folder, 'docs{:02d}.json'.format(file_index))
output_jsonl_file = open(output_path, 'w')
output_jsonl_file = open(output_path, 'w', encoding='utf-8', newline='\n')
file_index += 1
output_dict = {'id': doc_id, 'contents': doc_text}
output_jsonl_file.write(json.dumps(output_dict) + '\n')
Expand Down
4 changes: 2 additions & 2 deletions src/main/python/msmarco/filter_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
query_id, _, _, _ = line.rstrip().split('\t')
qrels.add(query_id)

with open(args.output_queries, 'w') as fout:
with open(args.queries) as f:
with open(args.output_queries, 'w', encoding='utf-8', newline='\n') as fout:
with open(args.queries, encoding='utf-8') as f:
for line in f:
query_id, _ = line.rstrip().split('\t')
if query_id in qrels:
Expand Down

0 comments on commit f21137b

Please sign in to comment.