Skip to content

Commit 03008b6

Browse files
committed
2 parents 64d9203 + 71b83b7 commit 03008b6

File tree

6 files changed

+109
-19
lines changed

6 files changed

+109
-19
lines changed

HISTORY.rst

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,33 @@
55
History
66
=======
77

8-
current - 2021-01-01 - 0.00Mb
8+
current - 2021-10-26 - 0.00Mb
99
=============================
1010

11+
* #27: Fixes json parser when input is a stream (2021-10-26)
12+
* #26: Fixes bug while reading json (iterator failed to be created twice) (2021-10-26)
13+
* #25: Fixes documentation (2021-10-18)
14+
* #24: Implements a first version of sort_values. (2021-10-18)
15+
* #23: First version of operator __setitem__ (2021-10-16)
16+
* #22: Fixes nan values after pandas update, add documentation example to the unit tests (2021-07-11)
17+
* #21: Fixes grouping by nan values after update pandas to 1.3.0 (2021-07-10)
18+
* #17: Implements method describe (2021-04-08)
19+
1120
0.2.175 - 2020-08-06 - 0.03Mb
1221
=============================
1322

14-
* `16`: Unit tests failing with pandas 1.1.0. (2020-08-06)
15-
* `15`: implements parameter lines, flatten for read_json (2018-11-21)
16-
* `14`: implements fillna (2018-10-29)
17-
* `13`: implement concat for axis=0,1 (2018-10-26)
18-
* `12`: add groupby_streaming (2018-10-26)
19-
* `11`: add method add_column (2018-10-26)
20-
* `10`: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compared to the number of columns (2018-10-26)
21-
* `9`: head is very slow (2018-10-26)
22-
* `8`: fix pandas_streaming for pandas 0.23.1 (2018-07-31)
23-
* `7`: implement read_json (2018-05-17)
24-
* `6`: add pandas_groupby_nan from pyensae (2018-05-17)
25-
* `5`: add random_state parameter to splitting functions (2018-02-04)
26-
* `2`: add method sample, reservoir sampling (2017-11-05)
27-
* `3`: method train_test_split for out-of-memory datasets (2017-10-21)
28-
* `1`: Excited for your project (2017-10-10)
23+
* #16: Unit tests failing with pandas 1.1.0. (2020-08-06)
24+
* #15: implements parameter lines, flatten for read_json (2018-11-21)
25+
* #14: implements fillna (2018-10-29)
26+
* #13: implement concat for axis=0,1 (2018-10-26)
27+
* #12: add groupby_streaming (2018-10-26)
28+
* #11: add method add_column (2018-10-26)
29+
* #10: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compared to the number of columns (2018-10-26)
30+
* #9: head is very slow (2018-10-26)
31+
* #8: fix pandas_streaming for pandas 0.23.1 (2018-07-31)
32+
* #7: implement read_json (2018-05-17)
33+
* #6: add pandas_groupby_nan from pyensae (2018-05-17)
34+
* #5: add random_state parameter to splitting functions (2018-02-04)
35+
* #2: add method sample, reservoir sampling (2017-11-05)
36+
* #3: method train_test_split for out-of-memory datasets (2017-10-21)
37+
* #1: Excited for your project (2017-10-10)

_unittests/ut_df/data/classic.json

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
[{"name":"cudaGetDeviceCount",
2+
"ph":"X",
3+
"cat":"cuda",
4+
"ts":1634290065724226794,
5+
"dur":800,
6+
"tid":"Thread 2080429824: Runtime API",
7+
"pid":"[89792] Process",
8+
"args":{}},
9+
{"name":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii",
10+
"ph":"X",
11+
"cat":"cuda",
12+
"ts":1634290112071305413,
13+
"dur":1888,
14+
"tid":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii",
15+
"pid":"[0:1] Compute",
16+
"args":{"Grid size":"[ 1, 1, 1 ]",
17+
"Block size":"[ 256, 1, 1 ]"}},
18+
{"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E",
19+
"ph":"X",
20+
"cat":"cuda",
21+
"ts":1634290112071308133,
22+
"dur":1440,
23+
"tid":"Compute",
24+
"pid":"[0:1] Overview",
25+
"args":{"Grid size":"[ 1, 1, 1 ]",
26+
"Block size":"[ 1, 256, 1 ]"}},
27+
{"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E",
28+
"ph":"X",
29+
"cat":"cuda",
30+
"ts":1634290112071308133,
31+
"dur":1440,
32+
"tid":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E",
33+
"pid":"[0:1] Compute",
34+
"args":{"Grid size":"[ 1, 1, 1 ]",
35+
"Block size":"[ 1, 256, 1 ]"}}]

_unittests/ut_df/test_dataframe_io_helpers.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,51 @@ def test_bug_documentation(self):
285285
items.append(item)
286286
self.assertEqual(len(items), 2)
287287

288+
def test_read_json_classic(self):
289+
data = self.abs_path_join(__file__, 'data', 'classic.json')
290+
dfs = pandas.read_json(data, orient='records')
291+
dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9)
292+
self.assertEqual(dfs.shape[1], 9)
293+
self.assertGreater(dfs.shape[0], 2)
294+
it = StreamingDataFrame.read_json(data)
295+
it['ts2'] = it['ts'].apply(lambda t: t / 1e9)
296+
h1 = it.to_df()
297+
h2 = it.to_df()
298+
self.assertNotEmpty(h1)
299+
self.assertNotEmpty(h2)
300+
self.assertEqualDataFrame(h1, h2)
301+
self.assertEqual(h1.shape[1], 9)
302+
303+
def test_read_json_classic_file(self):
304+
data = self.abs_path_join(__file__, 'data', 'classic.json')
305+
dfs = pandas.read_json(data, orient='records')
306+
self.assertEqual(dfs.shape[1], 8)
307+
self.assertGreater(dfs.shape[0], 2)
308+
with open(data, "r", encoding="utf-8") as f:
309+
it = StreamingDataFrame.read_json(f, orient='records')
310+
h1 = it.to_df()
311+
h2 = it.to_df()
312+
self.assertNotEmpty(h1)
313+
self.assertNotEmpty(h2)
314+
self.assertEqualDataFrame(h1, h2)
315+
self.assertEqual(h1.shape[1], 8)
316+
317+
def test_read_json_classic_file_formula(self):
318+
data = self.abs_path_join(__file__, 'data', 'classic.json')
319+
dfs = pandas.read_json(data, orient='records')
320+
dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9)
321+
self.assertEqual(dfs.shape[1], 9)
322+
self.assertGreater(dfs.shape[0], 2)
323+
with open(data, "r", encoding="utf-8") as f:
324+
it = StreamingDataFrame.read_json(f)
325+
it['ts2'] = it['ts'].apply(lambda t: t / 1e9)
326+
h1 = it.to_df()
327+
h2 = it.to_df()
328+
self.assertNotEmpty(h1)
329+
self.assertNotEmpty(h2)
330+
self.assertEqualDataFrame(h1, h2)
331+
self.assertEqual(h1.shape[1], 9)
332+
288333

289334
if __name__ == "__main__":
290335
unittest.main()

pandas_streaming/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
:epkg:`pandas` functionalities.
88
"""
99

10-
__version__ = "0.2.184"
10+
__version__ = "0.3.218"
1111
__author__ = "Xavier Dupré"
1212
__github__ = "https://github.com/sdpython/pandas_streaming"
1313
__url__ = "http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/index.html"

pandas_streaming/df/dataframe_io_helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,8 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fL
256256
encoding=encoding, lines=False, flatten=flatten, fLOG=fLOG):
257257
yield el
258258
else:
259+
if hasattr(filename, 'seek'):
260+
filename.seek(0)
259261
parser = ijson.parse(filename)
260262
current = None
261263
curkey = None

setup.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,8 @@
3131
# data
3232
#######
3333

34-
here = os.path.dirname(__file__)
3534
packages = find_packages()
36-
package_dir = {k: os.path.join(here, k.replace(".", "/")) for k in packages}
35+
package_dir = {k: os.path.join('.', k.replace(".", "/")) for k in packages}
3736
package_data = {}
3837

3938

0 commit comments

Comments
 (0)