Skip to content

Commit 03008b6

Browse files
committed
2 parents 64d9203 + 71b83b7 commit 03008b6

File tree

6 files changed

+109
-19
lines changed

6 files changed

+109
-19
lines changed

HISTORY.rst

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,33 @@
55
History
66
=======
77

8-
current - 2021-01-01 - 0.00Mb
8+
current - 2021-10-26 - 0.00Mb
99
=============================
1010

11+
* #27: Fixes json parser when input is a stream (2021-10-26)
12+
* #26: Fixes bug while reading json (iterator failed to be created twice) (2021-10-26)
13+
* #25: Fixes documentation (2021-10-18)
14+
* #24: Implements a first version of sort_values. (2021-10-18)
15+
* #23: First version of operator __setitem__ (2021-10-16)
16+
* #22: Fixes nan values after pandas update, add documentation example to the unit tests (2021-07-11)
17+
* #21: Fixes grouping by nan values after update pandas to 1.3.0 (2021-07-10)
18+
* #17: Implements method describe (2021-04-08)
19+
1120
0.2.175 - 2020-08-06 - 0.03Mb
1221
=============================
1322

14-
* `16`: Unit tests failing with pandas 1.1.0. (2020-08-06)
15-
* `15`: implements parameter lines, flatten for read_json (2018-11-21)
16-
* `14`: implements fillna (2018-10-29)
17-
* `13`: implement concat for axis=0,1 (2018-10-26)
18-
* `12`: add groupby_streaming (2018-10-26)
19-
* `11`: add method add_column (2018-10-26)
20-
* `10`: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compared to the number of columns (2018-10-26)
21-
* `9`: head is very slow (2018-10-26)
22-
* `8`: fix pandas_streaming for pandas 0.23.1 (2018-07-31)
23-
* `7`: implement read_json (2018-05-17)
24-
* `6`: add pandas_groupby_nan from pyensae (2018-05-17)
25-
* `5`: add random_state parameter to splitting functions (2018-02-04)
26-
* `2`: add method sample, reservoir sampling (2017-11-05)
27-
* `3`: method train_test_split for out-of-memory datasets (2017-10-21)
28-
* `1`: Excited for your project (2017-10-10)
23+
* #16: Unit tests failing with pandas 1.1.0. (2020-08-06)
24+
* #15: implements parameter lines, flatten for read_json (2018-11-21)
25+
* #14: implements fillna (2018-10-29)
26+
* #13: implement concat for axis=0,1 (2018-10-26)
27+
* #12: add groupby_streaming (2018-10-26)
28+
* #11: add method add_column (2018-10-26)
29+
* #10: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compared to the number of columns (2018-10-26)
30+
* #9: head is very slow (2018-10-26)
31+
* #8: fix pandas_streaming for pandas 0.23.1 (2018-07-31)
32+
* #7: implement read_json (2018-05-17)
33+
* #6: add pandas_groupby_nan from pyensae (2018-05-17)
34+
* #5: add random_state parameter to splitting functions (2018-02-04)
35+
* #2: add method sample, reservoir sampling (2017-11-05)
36+
* #3: method train_test_split for out-of-memory datasets (2017-10-21)
37+
* #1: Excited for your project (2017-10-10)

_unittests/ut_df/data/classic.json

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
[{"name":"cudaGetDeviceCount",
2+
"ph":"X",
3+
"cat":"cuda",
4+
"ts":1634290065724226794,
5+
"dur":800,
6+
"tid":"Thread 2080429824: Runtime API",
7+
"pid":"[89792] Process",
8+
"args":{}},
9+
{"name":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii",
10+
"ph":"X",
11+
"cat":"cuda",
12+
"ts":1634290112071305413,
13+
"dur":1888,
14+
"tid":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii",
15+
"pid":"[0:1] Compute",
16+
"args":{"Grid size":"[ 1, 1, 1 ]",
17+
"Block size":"[ 256, 1, 1 ]"}},
18+
{"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E",
19+
"ph":"X",
20+
"cat":"cuda",
21+
"ts":1634290112071308133,
22+
"dur":1440,
23+
"tid":"Compute",
24+
"pid":"[0:1] Overview",
25+
"args":{"Grid size":"[ 1, 1, 1 ]",
26+
"Block size":"[ 1, 256, 1 ]"}},
27+
{"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E",
28+
"ph":"X",
29+
"cat":"cuda",
30+
"ts":1634290112071308133,
31+
"dur":1440,
32+
"tid":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E",
33+
"pid":"[0:1] Compute",
34+
"args":{"Grid size":"[ 1, 1, 1 ]",
35+
"Block size":"[ 1, 256, 1 ]"}}]

_unittests/ut_df/test_dataframe_io_helpers.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,51 @@ def test_bug_documentation(self):
285285
items.append(item)
286286
self.assertEqual(len(items), 2)
287287

288+
def test_read_json_classic(self):
289+
data = self.abs_path_join(__file__, 'data', 'classic.json')
290+
dfs = pandas.read_json(data, orient='records')
291+
dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9)
292+
self.assertEqual(dfs.shape[1], 9)
293+
self.assertGreater(dfs.shape[0], 2)
294+
it = StreamingDataFrame.read_json(data)
295+
it['ts2'] = it['ts'].apply(lambda t: t / 1e9)
296+
h1 = it.to_df()
297+
h2 = it.to_df()
298+
self.assertNotEmpty(h1)
299+
self.assertNotEmpty(h2)
300+
self.assertEqualDataFrame(h1, h2)
301+
self.assertEqual(h1.shape[1], 9)
302+
303+
def test_read_json_classic_file(self):
304+
data = self.abs_path_join(__file__, 'data', 'classic.json')
305+
dfs = pandas.read_json(data, orient='records')
306+
self.assertEqual(dfs.shape[1], 8)
307+
self.assertGreater(dfs.shape[0], 2)
308+
with open(data, "r", encoding="utf-8") as f:
309+
it = StreamingDataFrame.read_json(f, orient='records')
310+
h1 = it.to_df()
311+
h2 = it.to_df()
312+
self.assertNotEmpty(h1)
313+
self.assertNotEmpty(h2)
314+
self.assertEqualDataFrame(h1, h2)
315+
self.assertEqual(h1.shape[1], 8)
316+
317+
def test_read_json_classic_file_formula(self):
318+
data = self.abs_path_join(__file__, 'data', 'classic.json')
319+
dfs = pandas.read_json(data, orient='records')
320+
dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9)
321+
self.assertEqual(dfs.shape[1], 9)
322+
self.assertGreater(dfs.shape[0], 2)
323+
with open(data, "r", encoding="utf-8") as f:
324+
it = StreamingDataFrame.read_json(f)
325+
it['ts2'] = it['ts'].apply(lambda t: t / 1e9)
326+
h1 = it.to_df()
327+
h2 = it.to_df()
328+
self.assertNotEmpty(h1)
329+
self.assertNotEmpty(h2)
330+
self.assertEqualDataFrame(h1, h2)
331+
self.assertEqual(h1.shape[1], 9)
332+
288333

289334
if __name__ == "__main__":
290335
unittest.main()

pandas_streaming/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
:epkg:`pandas` functionalities.
88
"""
99

10-
__version__ = "0.2.184"
10+
__version__ = "0.3.218"
1111
__author__ = "Xavier Dupré"
1212
__github__ = "https://github.com/sdpython/pandas_streaming"
1313
__url__ = "http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/index.html"

pandas_streaming/df/dataframe_io_helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,8 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fL
256256
encoding=encoding, lines=False, flatten=flatten, fLOG=fLOG):
257257
yield el
258258
else:
259+
if hasattr(filename, 'seek'):
260+
filename.seek(0)
259261
parser = ijson.parse(filename)
260262
current = None
261263
curkey = None

setup.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,8 @@
3131
# data
3232
#######
3333

34-
here = os.path.dirname(__file__)
3534
packages = find_packages()
36-
package_dir = {k: os.path.join(here, k.replace(".", "/")) for k in packages}
35+
package_dir = {k: os.path.join('.', k.replace(".", "/")) for k in packages}
3736
package_data = {}
3837

3938

0 commit comments

Comments
 (0)