Skip to content

Commit d462ea8

Browse files
authored
Fixes bug while reading json (iterator failed to be created twice) (#26)
1 parent e4f8575 commit d462ea8

File tree

5 files changed

+48
-4
lines changed

5 files changed

+48
-4
lines changed

_unittests/ut_df/data/example.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"a": 1, "b": 2}
2+
{"a": 3, "b": 4}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{"a":1,"b":2},{"a":3,"b":4}]

_unittests/ut_df/test_dataframe_io_helpers.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ def test_read_json_raw_head(self):
145145
h1 = it.head()
146146
h2 = it.head()
147147
self.assertEqualDataFrame(h1, h2)
148+
self.assertGreater(h1.shape[0], 1)
149+
self.assertGreater(h2.shape[0], 1)
148150

149151
def test_pandas_json_chunksize(self):
150152
jsonl = '''{"a": 1, "b": 2}
@@ -186,6 +188,28 @@ def test_read_json_rows2_head(self):
186188
self.assertNotEmpty(h2)
187189
self.assertEqualDataFrame(h1, h2)
188190

191+
def test_read_json_rows_file_head(self):
192+
data = self.abs_path_join(__file__, 'data', 'example2.json')
193+
dfs = pandas.read_json(data, orient='records')
194+
self.assertEqual(dfs.shape, (2, 2))
195+
it = StreamingDataFrame.read_json(data)
196+
h1 = it.head()
197+
h2 = it.head()
198+
self.assertNotEmpty(h1)
199+
self.assertNotEmpty(h2)
200+
self.assertEqualDataFrame(h1, h2)
201+
202+
def test_read_json_rows_file_lines_head(self):
203+
data = self.abs_path_join(__file__, 'data', 'example.json')
204+
dfs = pandas.read_json(data, orient='records', lines=True)
205+
self.assertEqual(dfs.shape, (2, 2))
206+
it = StreamingDataFrame.read_json(data, lines="stream")
207+
h1 = it.head()
208+
h2 = it.head()
209+
self.assertNotEmpty(h1)
210+
self.assertNotEmpty(h2)
211+
self.assertEqualDataFrame(h1, h2)
212+
189213
def test_read_json_ijson(self):
190214
it = StreamingDataFrame.read_json(
191215
BytesIO(TestDataFrameIOHelpers.text_json))

pandas_streaming/df/dataframe.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def read_json(*args, chunksize=100000, flatten=False, **kwargs) -> 'StreamingDat
178178
and it must be defined to return an iterator.
179179
If *lines* is True, the function falls back into
180180
:epkg:`pandas:read_json`, otherwise it used
181-
@see fn enumerate_json_items. If *lines is ``'stream'``,
181+
@see fn enumerate_json_items. If *lines* is ``'stream'``,
182182
*enumerate_json_items* is called with parameter
183183
``lines=True``.
184184
Parameter *flatten* uses the trick described at
@@ -212,6 +212,13 @@ def read_json(*args, chunksize=100000, flatten=False, **kwargs) -> 'StreamingDat
212212
it = StreamingDataFrame.read_json(BytesIO(data))
213213
dfs = list(it)
214214
print(dfs)
215+
216+
.. index:: IncompleteJSONError
217+
218+
The parsed json must have an empty line at the end otherwise
219+
the following exception is raised:
220+
`ijson.common.IncompleteJSONError: `
221+
`parse error: unallowed token at this point in JSON text`.
215222
"""
216223
if not isinstance(chunksize, int) or chunksize <= 0:
217224
raise ValueError( # pragma: no cover
@@ -228,7 +235,8 @@ def read_json(*args, chunksize=100000, flatten=False, **kwargs) -> 'StreamingDat
228235
del kwargs['lines']
229236

230237
def localf(a0=args[0]):
231-
a0.seek(0)
238+
if hasattr(a0, 'seek'):
239+
a0.seek(0)
232240
return enumerate_json_items(
233241
a0, encoding=kwargs.get('encoding', None), lines=True,
234242
flatten=flatten)
@@ -280,6 +288,7 @@ def fct2(args=args, chunksize=chunksize, kw=kwargs.copy()):
280288
**kwargs_create)
281289

282290
def fct3(st=st, args=args, chunksize=chunksize, kw=kwargs.copy()):
291+
st.seek(0)
283292
for r in pandas.read_json(
284293
st, *args, chunksize=chunksize, nrows=chunksize,
285294
lines=True, **kw):
@@ -920,8 +929,9 @@ def __getitem__(self, *args):
920929

921930
def iterate_col():
922931
"iterate on one column"
932+
one_col = [cols]
923933
for df in iter_creation():
924-
yield df[[cols]]
934+
yield df[one_col]
925935
return StreamingSeries(iterate_col, **self.get_kwargs())
926936

927937
if not isinstance(cols, list):

pandas_streaming/df/dataframe_io_helpers.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,11 +113,13 @@ def flatten_dictionary(dico, sep="_"):
113113
"""
114114
Flattens a dictionary with nested structure to a dictionary with no
115115
hierarchy.
116+
116117
:param dico: dictionary to flatten
117118
:param sep: string to separate dictionary keys by
118119
:return: flattened dictionary
119120
120-
Inspired from `flatten_json <https://github.com/amirziai/flatten/blob/master/flatten_json.py>`_.
121+
Inspired from `flatten_json
122+
<https://github.com/amirziai/flatten/blob/master/flatten_json.py>`_.
121123
"""
122124
flattened_dict = {}
123125

@@ -223,6 +225,11 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fLOG=None):
223225
224226
for item in enumerate_json_items(text_json):
225227
print(item)
228+
229+
The parsed json must have an empty line at the end otherwise
230+
the following exception is raised:
231+
`ijson.common.IncompleteJSONError: `
232+
`parse error: unallowed token at this point in JSON text`.
226233
"""
227234
if isinstance(filename, str):
228235
if "{" not in filename and os.path.exists(filename):

0 commit comments

Comments (0)