From bc1ef72a7040ee9d5966e3c1e1bd3ae0b509db1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Tue, 26 Oct 2021 11:39:55 +0200 Subject: [PATCH] Fixes json parser when input is a stream --- _unittests/ut_df/data/classic.json | 35 +++++++++++++++ _unittests/ut_df/test_dataframe_io_helpers.py | 45 +++++++++++++++++++ pandas_streaming/df/dataframe_io_helpers.py | 2 + 3 files changed, 82 insertions(+) create mode 100644 _unittests/ut_df/data/classic.json diff --git a/_unittests/ut_df/data/classic.json b/_unittests/ut_df/data/classic.json new file mode 100644 index 0000000..3957fbd --- /dev/null +++ b/_unittests/ut_df/data/classic.json @@ -0,0 +1,35 @@ +[{"name":"cudaGetDeviceCount", +"ph":"X", +"cat":"cuda", +"ts":1634290065724226794, +"dur":800, +"tid":"Thread 2080429824: Runtime API", +"pid":"[89792] Process", +"args":{}}, +{"name":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii", +"ph":"X", +"cat":"cuda", +"ts":1634290112071305413, +"dur":1888, +"tid":"_Z25full_reduce_tensor_kernelIfLi256ELi1ELi1ELi256EL21cudnnReduceTensorOp_t0EL21cudnnNanPropagation_t0EEv17cudnnTensorStructPjS3_PT_S5_S4_bii", +"pid":"[0:1] Compute", +"args":{"Grid size":"[ 1, 1, 1 ]", +"Block size":"[ 256, 1, 1 ]"}}, +{"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E", +"ph":"X", +"cat":"cuda", +"ts":1634290112071308133, +"dur":1440, +"tid":"Compute", +"pid":"[0:1] Overview", +"args":{"Grid size":"[ 1, 1, 1 ]", +"Block size":"[ 1, 256, 1 ]"}}, +{"name":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E", +"ph":"X", +"cat":"cuda", +"ts":1634290112071308133, +"dur":1440, +"tid":"_Z28op_tensor_kernel_alpha2_zeroILi3EfffLi1ELi256ELi1ELi1EL17cudnnOpTensorOp_t0EEv16alpha2_zero_argsIT0_T1_T2_E", +"pid":"[0:1] Compute", +"args":{"Grid size":"[ 1, 1, 1 ]", +"Block size":"[ 1, 256, 1 ]"}}] diff --git a/_unittests/ut_df/test_dataframe_io_helpers.py b/_unittests/ut_df/test_dataframe_io_helpers.py index f537c3b..a6ac96d 100644 --- a/_unittests/ut_df/test_dataframe_io_helpers.py +++ b/_unittests/ut_df/test_dataframe_io_helpers.py @@ -285,6 +285,51 @@ def test_bug_documentation(self): items.append(item) self.assertEqual(len(items), 2) + def test_read_json_classic(self): + data = self.abs_path_join(__file__, 'data', 'classic.json') + dfs = pandas.read_json(data, orient='records') + dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9) + self.assertEqual(dfs.shape[1], 9) + self.assertGreater(dfs.shape[0], 2) + it = StreamingDataFrame.read_json(data) + it['ts2'] = it['ts'].apply(lambda t: t / 1e9) + h1 = it.to_df() + h2 = it.to_df() + self.assertNotEmpty(h1) + self.assertNotEmpty(h2) + self.assertEqualDataFrame(h1, h2) + self.assertEqual(h1.shape[1], 9) + + def test_read_json_classic_file(self): + data = self.abs_path_join(__file__, 'data', 'classic.json') + dfs = pandas.read_json(data, orient='records') + self.assertEqual(dfs.shape[1], 8) + self.assertGreater(dfs.shape[0], 2) + with open(data, "r", encoding="utf-8") as f: + it = StreamingDataFrame.read_json(f, orient='records') + h1 = it.to_df() + h2 = it.to_df() + self.assertNotEmpty(h1) + self.assertNotEmpty(h2) + self.assertEqualDataFrame(h1, h2) + self.assertEqual(h1.shape[1], 8) + + def test_read_json_classic_file_formula(self): + data = self.abs_path_join(__file__, 'data', 'classic.json') + dfs = pandas.read_json(data, orient='records') + dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9) + self.assertEqual(dfs.shape[1], 9) + self.assertGreater(dfs.shape[0], 2) + with open(data, "r", encoding="utf-8") as f: + it = StreamingDataFrame.read_json(f) + it['ts2'] = it['ts'].apply(lambda t: t / 1e9) + h1 = it.to_df() + h2 = it.to_df() + self.assertNotEmpty(h1) + self.assertNotEmpty(h2) + self.assertEqualDataFrame(h1, h2) + self.assertEqual(h1.shape[1], 9) + if __name__ == "__main__": unittest.main() diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index e37353a..b778571 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -256,6 +256,8 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fL encoding=encoding, lines=False, flatten=flatten, fLOG=fLOG): yield el else: + if hasattr(filename, 'seek'): + filename.seek(0) parser = ijson.parse(filename) current = None curkey = None