io_utils.py

import os
import glob
import json
import logging
from itertools import islice
from typing import Any, Mapping, Iterable, Union, List, Callable, Optional

from tqdm.auto import tqdm


def resolve_globs(glob_paths: Union[str, Iterable[str]]) -> List[str]:
"""Returns filepaths corresponding to input filepath pattern(s)."""
filepaths = []
if isinstance(glob_paths, str):
glob_paths = [glob_paths]
for path in glob_paths:
filepaths.extend(glob.glob(path))
return filepaths
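
# Example (a sketch added for illustration, not from the original module): resolve_globs
# accepts either a single glob pattern or an iterable of patterns; the paths below are
# hypothetical.
#
#     train_files = resolve_globs("data/train/*.jsonl")
#     all_files = resolve_globs(["data/train/*.jsonl", "data/dev/*.jsonl"])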


def read_jsonlines(filename: str) -> Iterable[Mapping[str, Any]]:
    """Yields Python dicts read line by line from a jsonlines file."""
    with open(filename) as fp:
        for line in tqdm(fp, desc=f"Reading JSON lines from {filename}", unit="lines"):
            try:
                yield json.loads(line)
            except json.JSONDecodeError as ex:
                logging.error(f'Input text: "{line}"')
                logging.error(ex.args)
                raise


def hf_read_jsonlines(
    filename: str,
    n: Optional[int] = None,
    minimal_questions: bool = False,
    unique_questions: bool = False,
) -> Callable[[], Iterable[Mapping[str, Any]]]:
    """Returns a generator function that yields Python dicts read from a jsonlines file.

    Optionally reads only the first n lines, keeps only a minimal set of question fields,
    and/or skips duplicate questions (keyed by qc_id). Note that the generator *function*
    itself is returned, not a generator; callers invoke it to obtain the iterator.
    """

    def line_generator():
        unique_qc_ids = set()
        with open(filename) as fp:
            # islice handles both n=None (read everything) and an explicit line limit.
            for line in tqdm(
                islice(fp, n),
                desc=f"Reading JSON lines from {filename}",
                unit="lines",
            ):
                try:
                    full_example = json.loads(line)
                except json.JSONDecodeError as ex:
                    logging.error(f'Input text: "{line}"')
                    logging.error(ex.args)
                    raise
                if unique_questions:
                    qc_id = full_example["object"]["qc_id"]
                    if qc_id in unique_qc_ids:
                        continue
                    unique_qc_ids.add(qc_id)
                if not minimal_questions:
                    example = full_example
                else:
                    q_object = full_example["object"]
                    q_object.pop("question_info")
                    example = {
                        "object": {
                            "answer": q_object["answer"],
                            "clue_spans": q_object["clue_spans"],
                            "qc_id": q_object["qc_id"],
                            "question_text": q_object["question_text"],
                        }
                    }
                yield example

    return line_generator
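
# Usage sketch (an assumption, not from the original code): the "hf_" prefix suggests the
# returned generator function is meant for Hugging Face `datasets`, whose
# `Dataset.from_generator` expects a callable it can invoke to rebuild the iterator.
# The file path below is hypothetical.
#
#     from datasets import Dataset
#
#     gen_fn = hf_read_jsonlines("questions.jsonl", n=1000, minimal_questions=True)
#     dataset = Dataset.from_generator(gen_fn)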


def load_jsonlines(filename: str) -> List[Mapping[str, Any]]:
"""Returns a list of Python dicts after reading jsonlines from the input file."""
return list(read_jsonlines(filename))


def write_jsonlines(
    objs: Iterable[Mapping[str, Any]], filename: str, to_dict: Callable = lambda x: x
):
    """Writes an iterable of Python Mappings to the given file in jsonlines format."""
with open(filename, "w") as fp:
for obj in tqdm(objs, desc=f"Writing JSON lines at {filename}"):
fp.write(json.dumps(to_dict(obj)))
fp.write("\n")


def read_json(filename: str) -> Mapping[str, Any]:
    """Returns a Python dict representation of the JSON object in the given file."""
with open(filename) as fp:
return json.load(fp)


def write_json(obj: Mapping[str, Any], filename: str, indent: Optional[int] = None):
    """Writes a Python Mapping to the given file in JSON format."""
with open(filename, "w") as fp:
json.dump(obj, fp, indent=indent)


def print_json(d, indent=4):
    """Pretty-prints a Python object as indented JSON."""
print(json.dumps(d, indent=indent))
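

# Minimal usage sketch (added for illustration, not part of the original module): round-trips
# a few records through write_jsonlines / resolve_globs / load_jsonlines in a temporary
# directory, so it runs without any pre-existing data files.
if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        records = [{"id": i, "text": f"example {i}"} for i in range(3)]
        path = os.path.join(tmp_dir, "examples.jsonl")

        # Write the records as jsonlines, then resolve the glob back to the single file.
        write_jsonlines(records, path)
        matched = resolve_globs(os.path.join(tmp_dir, "*.jsonl"))
        assert matched == [path]

        # Read everything back and pretty-print the first record.
        loaded = load_jsonlines(matched[0])
        assert loaded == records
        print_json(loaded[0])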