Skip to content

Commit a5b6f69

Browse files
authored
Slightly better Typechecking when exporting to SQL (#174)
* corret type clause * add tests, changes in create_json_string * create json-string now gives back None * revert changes * fix panda sql export * add SQL test * fixed None type export for csv and sql.gz * move None parsing to json io * alter regex * revert changes * only replace None with empty str when necessary * fixed deserialze_df for python 3.7 * add more tesets * fix case where gz was ignored * hand voer gz explicitly * replace nan by None in non-Obs columns * moved warning to csv export, mroe tests * only values able to be nan are put in np.isnan() * added python float for warning
1 parent b75aa74 commit a5b6f69

File tree

3 files changed

+198
-24
lines changed

3 files changed

+198
-24
lines changed

pyerrors/input/json.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,6 @@ def import_json_string(json_string, verbose=True, full_output=False):
479479
result : dict
480480
if full_output=True
481481
"""
482-
483482
return _parse_json_dict(json.loads(json_string), verbose, full_output)
484483

485484

pyerrors/input/pandas.py

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from ..obs import Obs
66
from ..correlators import Corr
77
from .json import create_json_string, import_json_string
8+
import numpy as np
89

910

1011
def to_sql(df, table_name, db, if_exists='fail', gz=True, **kwargs):
@@ -76,6 +77,13 @@ def dump_df(df, fname, gz=True):
7677
-------
7778
None
7879
"""
80+
for column in df:
81+
serialize = _need_to_serialize(df[column])
82+
if not serialize:
83+
if all(isinstance(entry, (int, np.integer, float, np.floating)) for entry in df[column]):
84+
if any([np.isnan(entry) for entry in df[column]]):
85+
warnings.warn("nan value in column " + column + " will be replaced by None", UserWarning)
86+
7987
out = _serialize_df(df, gz=False)
8088

8189
if not fname.endswith('.csv'):
@@ -114,11 +122,11 @@ def load_df(fname, auto_gamma=False, gz=True):
114122
if not fname.endswith('.gz'):
115123
fname += '.gz'
116124
with gzip.open(fname) as f:
117-
re_import = pd.read_csv(f)
125+
re_import = pd.read_csv(f, keep_default_na=False)
118126
else:
119127
if fname.endswith('.gz'):
120128
warnings.warn("Trying to read from %s without unzipping!" % fname, UserWarning)
121-
re_import = pd.read_csv(fname)
129+
re_import = pd.read_csv(fname, keep_default_na=False)
122130

123131
return _deserialize_df(re_import, auto_gamma=auto_gamma)
124132

@@ -135,17 +143,12 @@ def _serialize_df(df, gz=False):
135143
"""
136144
out = df.copy()
137145
for column in out:
138-
serialize = False
139-
if isinstance(out[column][0], (Obs, Corr)):
140-
serialize = True
141-
elif isinstance(out[column][0], list):
142-
if all(isinstance(o, Obs) for o in out[column][0]):
143-
serialize = True
146+
serialize = _need_to_serialize(out[column])
144147

145148
if serialize is True:
146-
out[column] = out[column].transform(lambda x: create_json_string(x, indent=0))
149+
out[column] = out[column].transform(lambda x: create_json_string(x, indent=0) if x is not None else None)
147150
if gz is True:
148-
out[column] = out[column].transform(lambda x: gzip.compress(x.encode('utf-8')))
151+
out[column] = out[column].transform(lambda x: gzip.compress((x if x is not None else '').encode('utf-8')))
149152
return out
150153

151154

@@ -168,12 +171,29 @@ def _deserialize_df(df, auto_gamma=False):
168171
if isinstance(df[column][0], bytes):
169172
if df[column][0].startswith(b"\x1f\x8b\x08\x00"):
170173
df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8'))
171-
if isinstance(df[column][0], str):
172-
if '"program":' in df[column][0][:20]:
173-
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False))
174+
df = df.replace({r'^$': None}, regex=True)
175+
i = 0
176+
while df[column][i] is None:
177+
i += 1
178+
if isinstance(df[column][i], str):
179+
if '"program":' in df[column][i][:20]:
180+
df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if x is not None else None)
174181
if auto_gamma is True:
175-
if isinstance(df[column][0], list):
176-
df[column].apply(lambda x: [o.gm() for o in x])
182+
if isinstance(df[column][i], list):
183+
df[column].apply(lambda x: [o.gm() if o is not None else x for o in x])
177184
else:
178-
df[column].apply(lambda x: x.gamma_method())
185+
df[column].apply(lambda x: x.gm() if x is not None else x)
179186
return df
187+
188+
189+
def _need_to_serialize(col):
190+
serialize = False
191+
i = 0
192+
while col[i] is None:
193+
i += 1
194+
if isinstance(col[i], (Obs, Corr)):
195+
serialize = True
196+
elif isinstance(col[i], list):
197+
if all(isinstance(o, Obs) for o in col[i]):
198+
serialize = True
199+
return serialize

tests/pandas_test.py

Lines changed: 162 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
import pandas as pd
33
import pyerrors as pe
44
import pytest
5+
import warnings
6+
57

68
def test_df_export_import(tmp_path):
79
my_dict = {"int": 1,
8-
"float": -0.01,
9-
"Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
10-
"Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
10+
"float": -0.01,
11+
"Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
12+
"Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
1113
for gz in [True, False]:
1214
my_df = pd.DataFrame([my_dict] * 10)
1315

@@ -18,13 +20,166 @@ def test_df_export_import(tmp_path):
1820
pe.input.pandas.load_df((tmp_path / 'df_output.csv').as_posix(), gz=gz)
1921

2022

23+
def test_null_first_line_df_export_import(tmp_path):
    """Round-trip a DataFrame through csv export when the very first Obs entry is None."""
    row = {"int": 1,
           "float": -0.01,
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
    my_df = pd.DataFrame([row] * 4)
    for null_idx in (0, 2):
        my_df.loc[null_idx, "Obs1"] = None
    out_path = (tmp_path / 'df_output').as_posix()
    for gz in (True, False):
        pe.input.pandas.dump_df(my_df, out_path, gz=gz)
        recovered = pe.input.pandas.load_df(out_path, auto_gamma=True, gz=gz)
        # None entries survive the round trip, intact rows stay equal.
        assert recovered.loc[0, "Obs1"] is None
        assert recovered.loc[2, "Obs1"] is None
        assert np.all(recovered.loc[1] == my_df.loc[1])
        assert np.all(recovered.loc[3] == my_df.loc[3])
38+
39+
40+
def test_nan_df_export_import(tmp_path):
    """Export a DataFrame with a nan in a plain-number column and re-import it.

    dump_df replaces nan values in non-Obs columns by None and emits a
    UserWarning while doing so; the warning must be raised by the export
    call itself. (The previous version issued warnings.warn inside the
    pytest.warns block, which made the check pass unconditionally.)
    """
    row = {"int": 1,
           "float": -0.01,
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
    my_df = pd.DataFrame([row] * 4)
    my_df.loc[1, "int"] = np.nan

    out_path = (tmp_path / 'df_output').as_posix()
    for gz in [True, False]:
        # The warning has to come out of dump_df, not out of the test body.
        with pytest.warns(UserWarning, match="nan value in column int will be replaced by None"):
            pe.input.pandas.dump_df(my_df, out_path, gz=gz)
        reconstructed_df = pe.input.pandas.load_df(out_path, auto_gamma=True, gz=gz)
        assert reconstructed_df.loc[1, "int"] is None
        assert np.all(reconstructed_df.loc[:, "float"] == my_df.loc[:, "float"])
        assert np.all(reconstructed_df.loc[:, "Obs1"] == my_df.loc[:, "Obs1"])
        assert np.all(reconstructed_df.loc[:, "Obs2"] == my_df.loc[:, "Obs2"])
57+
58+
59+
def test_null_second_line_df_export_import(tmp_path):
    """Round-trip a DataFrame through csv export when a middle Obs entry is None."""
    row = {"int": 1,
           "float": -0.01,
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
    my_df = pd.DataFrame([row] * 4)
    my_df.loc[1, "Obs1"] = None
    out_path = (tmp_path / 'df_output').as_posix()
    for gz in (True, False):
        pe.input.pandas.dump_df(my_df, out_path, gz=gz)
        recovered = pe.input.pandas.load_df(out_path, auto_gamma=True, gz=gz)
        assert recovered.loc[1, "Obs1"] is None
        # All untouched rows survive the round trip unchanged.
        assert np.all(recovered.loc[0] == my_df.loc[0])
        assert np.all(recovered.loc[2:] == my_df.loc[2:])
72+
73+
74+
def test_null_first_line_df_gzsql_export_import(tmp_path):
    """Round-trip None Obs entries (rows 0 and 2) through the gzipped SQL path."""
    row = {"int": 1,
           "float": -0.01,
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
    my_df = pd.DataFrame([row] * 4)
    for null_idx in (0, 2):
        my_df.loc[null_idx, "Obs1"] = None
    db_path = (tmp_path / 'test.db').as_posix()
    pe.input.pandas.to_sql(my_df, 'test', db_path, gz=True)
    recovered = pe.input.pandas.read_sql('SELECT * FROM test', db_path, auto_gamma=True)
    assert recovered.loc[0, "Obs1"] is None
    assert recovered.loc[2, "Obs1"] is None
    assert np.all(recovered.loc[1] == my_df.loc[1])
    assert np.all(recovered.loc[3] == my_df.loc[3])
90+
91+
92+
def test_null_second_line_df_gzsql_export_import(tmp_path):
    """Round-trip a None Obs entry in row 1 through the gzipped SQL path."""
    row = {"int": 1,
           "float": -0.01,
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
    my_df = pd.DataFrame([row] * 4)
    my_df.loc[1, "Obs1"] = None
    db_path = (tmp_path / 'test.db').as_posix()
    pe.input.pandas.to_sql(my_df, 'test', db_path, gz=True)
    recovered = pe.input.pandas.read_sql('SELECT * FROM test', db_path, auto_gamma=True)
    assert recovered.loc[1, "Obs1"] is None
    assert np.all(recovered.loc[0] == my_df.loc[0])
    assert np.all(recovered.loc[2:] == my_df.loc[2:])
106+
107+
108+
def test_null_first_line_df_sql_export_import(tmp_path):
    """Round-trip None Obs entries (rows 0 and 2) through the uncompressed SQL path."""
    row = {"int": 1,
           "float": -0.01,
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
    my_df = pd.DataFrame([row] * 4)
    for null_idx in (0, 2):
        my_df.loc[null_idx, "Obs1"] = None
    db_path = (tmp_path / 'test.db').as_posix()
    pe.input.pandas.to_sql(my_df, 'test', db_path, gz=False)
    recovered = pe.input.pandas.read_sql('SELECT * FROM test', db_path, auto_gamma=True)
    assert recovered.loc[0, "Obs1"] is None
    assert recovered.loc[2, "Obs1"] is None
    assert np.all(recovered.loc[1] == my_df.loc[1])
    assert np.all(recovered.loc[3] == my_df.loc[3])
124+
125+
126+
def test_nan_sql_export_import(tmp_path):
    """Export a DataFrame with a nan in a plain-number column via SQL and re-import it.

    The nan survives the uncompressed SQL round trip; Obs columns are
    unaffected. The previous version wrapped a manual warnings.warn call
    in pytest.warns, which asserted nothing about the code under test,
    so that vacuous block is removed.
    NOTE(review): the visible diff adds the nan warning only to dump_df,
    not to_sql — confirm to_sql intentionally does not warn here.
    """
    row = {"int": 1,
           "float": -0.01,
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
    my_df = pd.DataFrame([row] * 4)
    my_df.loc[1, "int"] = np.nan
    db_path = (tmp_path / 'test.db').as_posix()
    pe.input.pandas.to_sql(my_df, 'test', db_path, gz=False)
    reconstructed_df = pe.input.pandas.read_sql('SELECT * FROM test', db_path, auto_gamma=True)
    assert np.isnan(reconstructed_df.loc[1, "int"])
    assert np.all(reconstructed_df.loc[:, "float"] == my_df.loc[:, "float"])
    assert np.all(reconstructed_df.loc[:, "Obs1"] == my_df.loc[:, "Obs1"])
    assert np.all(reconstructed_df.loc[:, "Obs2"] == my_df.loc[:, "Obs2"])
142+
143+
144+
def test_nan_gzsql_export_import(tmp_path):
    """A nan in a plain-number column survives the gzipped SQL round trip."""
    row = {"int": 1,
           "float": -0.01,
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
    my_df = pd.DataFrame([row] * 4)
    my_df.loc[1, "int"] = np.nan
    db_path = (tmp_path / 'test.db').as_posix()
    pe.input.pandas.to_sql(my_df, 'test', db_path, gz=True)
    recovered = pe.input.pandas.read_sql('SELECT * FROM test', db_path, auto_gamma=True)
    assert np.isnan(recovered.loc[1, "int"])
    # Every other column comes back unchanged.
    assert np.all(recovered.loc[:, "float"] == my_df.loc[:, "float"])
    assert np.all(recovered.loc[:, "Obs1"] == my_df.loc[:, "Obs1"])
    assert np.all(recovered.loc[:, "Obs2"] == my_df.loc[:, "Obs2"])
158+
159+
160+
def test_null_second_line_df_sql_export_import(tmp_path):
    """Round-trip a None Obs entry in row 1 through the uncompressed SQL path."""
    row = {"int": 1,
           "float": -0.01,
           "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"),
           "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}
    my_df = pd.DataFrame([row] * 4)
    my_df.loc[1, "Obs1"] = None
    db_path = (tmp_path / 'test.db').as_posix()
    pe.input.pandas.to_sql(my_df, 'test', db_path, gz=False)
    recovered = pe.input.pandas.read_sql('SELECT * FROM test', db_path, auto_gamma=True)
    assert recovered.loc[1, "Obs1"] is None
    assert np.all(recovered.loc[0] == my_df.loc[0])
    assert np.all(recovered.loc[2:] == my_df.loc[2:])
174+
175+
21176
def test_df_Corr(tmp_path):
22177

23178
my_corr = pe.Corr([pe.pseudo_Obs(-0.48, 0.04, "test"), pe.pseudo_Obs(-0.154, 0.03, "test")])
24179

25180
my_dict = {"int": 1,
26-
"float": -0.01,
27-
"Corr": my_corr}
181+
"float": -0.01,
182+
"Corr": my_corr}
28183
my_df = pd.DataFrame([my_dict] * 5)
29184

30185
pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix())
@@ -76,8 +231,8 @@ def test_sql_if_exists_fail(tmp_path):
76231

77232
def test_Obs_list_sql(tmp_path):
78233
my_dict = {"int": 1,
79-
"Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),
80-
"Obs_list": [[pe.pseudo_Obs(0.0, 0.1, "test_ensemble2"), pe.pseudo_Obs(3.2, 1.1, "test_ensemble2")]]}
234+
"Obs1": pe.pseudo_Obs(17, 11, "test_sql_if_exists_failnsemble"),
235+
"Obs_list": [[pe.pseudo_Obs(0.0, 0.1, "test_ensemble2"), pe.pseudo_Obs(3.2, 1.1, "test_ensemble2")]]}
81236
pe_df = pd.DataFrame(my_dict)
82237
my_db = (tmp_path / "test_db.sqlite").as_posix()
83238
pe.input.pandas.to_sql(pe_df, "My_table", my_db)

0 commit comments

Comments
 (0)