-
Notifications
You must be signed in to change notification settings - Fork 4
/
to_csv.py
executable file
·96 lines (84 loc) · 3.65 KB
/
to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#! /usr/bin/env python3
import argparse
import gzip
from typing import Union, Any, Tuple
from ciff_toolkit.read import CiffReader
class ToCSV:
    """
    Class for creating csv files that represent tables in the old dog paper:
    - https://dl.acm.org/doi/10.1145/2600428.2609460
    The files are created from a CIFF as described in:
    - https://arxiv.org/abs/2003.08276
    """

    def __init__(self, **kwargs: Any) -> None:
        """Resolve the output options and immediately write the csv files."""
        self.arguments = self.get_arguments(kwargs)
        self.create_csv_files()

    @staticmethod
    def get_arguments(kwargs: Any) -> dict:
        """Merge caller-supplied options over the defaults.

        Only keys present in the defaults are considered; a value of None
        (e.g. an argparse option that was not given) keeps the default.
        Returns the merged argument dict.
        """
        arguments = {
            'protobuf_file': None,
            'output_docs': 'docs.csv',
            'output_term_dict': 'term_dict.csv',
            'output_term_doc': 'term_docs.csv'
        }
        # Iterate over keys only; values are overwritten in place (keys are
        # never added or removed, so mutating during iteration is safe).
        for key in arguments:
            if kwargs.get(key) is not None:
                arguments[key] = kwargs[key]
        return arguments

    @staticmethod
    def decode(buffer: bytes, pos: int) -> Tuple[int, int]:
        """Decode a protobuf base-128 varint from ``buffer`` starting at ``pos``.

        The decoded value is truncated to 32 bits (unsigned), matching
        protobuf's varint32 semantics.

        :param buffer: raw bytes to decode from (indexing must yield ints,
            so this must be ``bytes``/``bytearray``, not ``str``).
        :param pos: offset of the varint's first byte.
        :return: tuple ``(value, new_pos)`` where ``new_pos`` is the offset
            just past the varint.
        :raises IOError: if more than 64 bits of payload are encountered.
        """
        mask = (1 << 32) - 1
        result = 0
        shift = 0
        while True:
            b = buffer[pos]
            result |= (b & 0x7f) << shift
            pos += 1
            if not (b & 0x80):
                # High bit clear marks the final byte; truncate to 32 bits.
                return result & mask, pos
            shift += 7
            if shift >= 64:
                raise IOError('Too many bytes when decoding.')

    def create_csv_files(self) -> None:
        """Write the term dictionary, term-doc, and docs csv files.

        All three files are pipe-separated. CIFF postings store docids as
        gaps, so a running sum reconstructs the absolute docid per posting.
        (The previous version also slurped the whole — possibly gzipped —
        protobuf file into memory without ever using it; that dead read has
        been removed.)
        """
        with CiffReader(self.arguments['protobuf_file']) as reader:
            with open(self.arguments['output_term_dict'], 'w') as term_dict_writer, \
                    open(self.arguments['output_term_doc'], 'w') as term_doc_writer:
                for term_id, postings_list in enumerate(reader.read_postings_lists()):
                    term_dict_writer.write(f'{term_id}|{postings_list.term}|{postings_list.df}\n')
                    docid = 0
                    for posting in postings_list.postings:
                        docid += posting.docid  # gap -> absolute docid
                        term_doc_writer.write(f'{term_id}|{docid}|{posting.tf}\n')
            with open(self.arguments['output_docs'], 'w') as docs_writer:
                for doc_record in reader.read_documents():
                    docs_writer.write(f'{doc_record.collection_docid}|{doc_record.docid}|{doc_record.doclength}\n')
if __name__ == '__main__':
    # Command-line entry point: collect the protobuf input path and the
    # three optional csv output paths, then hand everything to ToCSV.
    arg_parser = argparse.ArgumentParser()
    # (short flag, long flag, required, help text) — all options take a file.
    option_specs = [
        ('-p', '--protobuf_file', True,
         'Location of the protobuf file, if this is included '
         'output files for term related files should also be specified.'),
        ('-o', '--output_docs', False,
         'Output csv file for the docs table.'),
        ('-t', '--output_term_dict', False,
         'Output csv file for the term dictionary table.'),
        ('-e', '--output_term_doc', False,
         'Output csv file for the term doc mapper table.'),
    ]
    for short_flag, long_flag, is_required, help_text in option_specs:
        arg_parser.add_argument(short_flag,
                                long_flag,
                                required=is_required,
                                metavar='[file]',
                                help=help_text)
    ToCSV(**vars(arg_parser.parse_args()))