Skip to content

Commit 7001a4f

Browse files
committed
Add id_gaps and id_gaps_index functions to times.py.
gh-20
1 parent cc506d1 commit 7001a4f

File tree

7 files changed

+188
-35
lines changed

7 files changed

+188
-35
lines changed

CHANGELOG.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
## 0.1.1 - Unreleased
88
### Added
99
- SeriesProfile now reports gaps in pd.Series with type `datetime64` or for Series with `DatetimeIndex`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
10-
- `times.py` module has been added with public functions `time_diffs` and `time_diffs_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
10+
- `times.py` module has been added with public functions `time_diffs`, `time_diffs_index`, `id_gaps`, and `id_gaps_index`. [gh-20](https://github.com/ray310/Panda-Helper/issues/20)
1111
- [`freq_most_least` default parameter for SeriesProfile has been changed to `(10, 5)`.](https://github.com/ray310/Panda-Helper/commit/9ea7a4108996422eaa433e3b86ed20dbbb3c0bdb)
1212

1313
____

mkdocs.yml

+1
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ plugins:
101101
python:
102102
paths: [src]
103103
options:
104+
members_order: alphabetical
104105
docstring_style: google
105106
docstring_section_style: list
106107
docstring_options:

src/pandahelper/__init__.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from pandahelper.profiles import DataFrameProfile, SeriesProfile
66
from pandahelper.stats import distribution_stats, frequency_table
7-
from pandahelper.times import time_diffs, time_diffs_index
7+
from pandahelper.times import time_diffs, time_diffs_index, id_gaps, id_gaps_index
88

99
__version__ = "0.1.1"
1010
__all__ = [
@@ -14,4 +14,6 @@
1414
"SeriesProfile",
1515
"time_diffs",
1616
"time_diffs_index",
17+
"id_gaps",
18+
"id_gaps_index",
1719
]

src/pandahelper/times.py

+117-5
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77

88
def time_diffs(series: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta):
9-
"""Calculate time diffs (gaps) for Pandas Series or Index of timestamps.
9+
"""Calculate time difference between subsequent observations.
1010
1111
Sorts input by time before calculating diffs.
1212
@@ -19,19 +19,39 @@ def time_diffs(series: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timed
1919
2020
Raises:
2121
TypeError: If input is not Series of type datetime64 or DatetimeIndex.
22+
23+
Examples:
24+
Calculate time differences between observations on Series of timestamps after
25+
it has been randomized:
26+
27+
>>> import pandahelper as ph
28+
>>> import pandas as pd
29+
>>>
30+
>>> start = pd.Timestamp(year=1999, month=1, day=1)
31+
>>> rng = pd.date_range(start, periods=10, freq="D").delete([3, 4, 5, 8])
32+
>>> series = pd.Series(rng).sample(frac=1, random_state=3) # randomize order
33+
34+
>>> ph.time_diffs(series)
35+
1999-01-01 NaT
36+
1999-01-02 1 days
37+
1999-01-03 1 days
38+
1999-01-07 4 days
39+
1999-01-08 1 days
40+
1999-01-10 2 days
41+
Name: diffs, dtype: timedelta64[ns]
2242
"""
2343
if not pat.is_datetime64_any_dtype(series.dtype):
24-
raise TypeError("Should be Series of datetime64 dtype.")
44+
raise TypeError("Should be of datetime64 dtype.")
2545
series = series.sort_values()
2646
diffs = pd.Series(series.diff(), name="diffs")
2747
diffs.index = series
2848
return diffs
2949

3050

31-
def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Timedelta):
32-
"""Calculate time diffs (gaps) for time-indexed Pandas Series or Dataframe.
51+
def time_diffs_index(df: Union[pd.Series, pd.DataFrame]) -> pd.Series(pd.Timedelta):
52+
"""Calculate time difference between subsequent time-indexed observations.
3353
34-
Sorts input by time before calculating diffs.
54+
Sorts input by time index before calculating diffs.
3555
3656
Args:
3757
df (pd.Series or pd.DataFrame): Pandas Series or DataFrame with DatetimeIndex
@@ -42,10 +62,102 @@ def time_diffs_index(df: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series(pd.Tim
4262
4363
Raises:
4464
TypeError: If input does not have a DatetimeIndex.
65+
66+
Examples:
67+
Calculate time differences between observations on time-indexed DataFrame after
68+
it has been randomized:
69+
70+
>>> import pandahelper as ph
71+
>>> import pandas as pd
72+
>>>
73+
>>> start = pd.Timestamp(year=1999, month=1, day=1)
74+
>>> rng = pd.date_range(start, periods=10, freq="D").delete([3, 4, 5, 8])
75+
>>> # index by time then randomize order
76+
>>> df = pd.DataFrame(range(len(rng)), index=rng).sample(frac=1, random_state=3)
77+
78+
>>> ph.time_diffs_index(df)
79+
1999-01-01 NaT
80+
1999-01-02 1 days
81+
1999-01-03 1 days
82+
1999-01-07 4 days
83+
1999-01-08 1 days
84+
1999-01-10 2 days
85+
Name: diffs, dtype: timedelta64[ns]
4586
"""
4687
if isinstance(df.index, pd.DatetimeIndex):
4788
df = df.sort_index()
4889
diffs = pd.Series(df.index.diff(), name="diffs")
4990
diffs.index = df.index
5091
return diffs
5192
raise TypeError(f"Index should be of type {pd.DatetimeIndex}")
93+
94+
95+
def id_gaps(
96+
series: Union[pd.Series, pd.DatetimeIndex], threshold: pd.Timedelta
97+
) -> pd.DataFrame:
98+
"""Identify time gaps above `threshold` in datetime64 Series or DatetimeIndex.
99+
100+
Sorts input by time before calculating gaps.
101+
102+
Args:
103+
series (pd.Series or pd.DatetimeIndex): `datetime64` Series or DatetimeIndex.
104+
threshold (pd.Timedelta): Threshold to identify gaps
105+
(and not expected time differences).
106+
107+
Returns:
108+
One-column Pandas DataFrame of gaps, indexed by the timestamp at which each gap ends and sorted from largest gap to smallest.
109+
110+
Examples:
111+
Identify time gaps on Series of timestamps with a 2 and 4 hour
112+
gap after it has been randomized:
113+
114+
>>> import pandahelper as ph
115+
>>> import pandas as pd
116+
>>>
117+
>>> start = pd.Timestamp(year=1999, month=1, day=1)
118+
>>> rng = pd.date_range(start, periods=24, freq="1h").delete([3, 8, 9, 10])
119+
>>> series = pd.Series(rng).sample(frac=1, random_state=3) # randomize order
120+
121+
>>> ph.id_gaps(series, pd.Timedelta(hours=1))
122+
diffs
123+
1999-01-01 11:00:00 0 days 04:00:00
124+
1999-01-01 04:00:00 0 days 02:00:00
125+
"""
126+
diffs = time_diffs(series)
127+
return diffs[diffs > threshold].sort_values(ascending=False).to_frame()
128+
129+
130+
def id_gaps_index(
131+
df: Union[pd.Series, pd.DataFrame], threshold: pd.Timedelta
132+
) -> pd.DataFrame:
133+
"""Identify time gaps above `threshold` in time-indexed Series or DataFrame.
134+
135+
Sorts input by time index before calculating diffs.
136+
137+
Args:
138+
df (pd.Series or pd.DataFrame): Time-indexed Series or DataFrame.
139+
threshold (pd.Timedelta): Threshold to identify gaps
140+
(and not expected time differences).
141+
142+
Returns:
143+
One-column Pandas DataFrame of gaps, indexed by the timestamp at which each gap ends and sorted from largest gap to smallest.
144+
145+
Examples:
146+
Identify time gaps on an hourly, time-indexed Series with a 2 and 4 hour
147+
gap after it has been randomized:
148+
149+
>>> import pandahelper as ph
150+
>>> import pandas as pd
151+
>>>
152+
>>> start = pd.Timestamp(year=1999, month=1, day=1)
153+
>>> rng = pd.date_range(start, periods=24, freq="1h").delete([3, 8, 9, 10])
154+
>>> # index by time then randomize order
155+
>>> df = pd.DataFrame(range(len(rng)), index=rng).sample(frac=1, random_state=3)
156+
157+
>>> ph.id_gaps_index(df, pd.Timedelta(hours=1))
158+
diffs
159+
1999-01-01 11:00:00 0 days 04:00:00
160+
1999-01-01 04:00:00 0 days 02:00:00
161+
"""
162+
diffs = time_diffs_index(df)
163+
return diffs[diffs > threshold].sort_values(ascending=False).to_frame()

tests/conftest.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
cached value.
55
"""
66

7-
from datetime import datetime
87
import os
98
import numpy as np
109
import pandas as pd
@@ -19,17 +18,28 @@
1918

2019
@pytest.fixture
2120
def cat_df(scope="package"): # pylint: disable=W0613
22-
"""Return test pd.DataFrame."""
23-
start = datetime(year=1999, month=1, day=1, hour=0, minute=0)
21+
"""Return test pd.DataFrame with DatetimeIndex."""
22+
start = pd.Timestamp(year=1999, month=1, day=1)
2423
end = start + pd.Timedelta(hours=10)
2524
df = make_category_data("Springfield", start, end, freq="h")
2625
df = df.sample(frac=1, random_state=2) # index is out of order
2726
return df
2827

2928

29+
@pytest.fixture
30+
def ts_timeindex(scope="package"): # pylint: disable=W0613
31+
"""Return pd.Series of type datetime64 with DatetimeIndex."""
32+
start = pd.Timestamp(year=1999, month=1, day=1)
33+
end = start + pd.Timedelta(hours=40)
34+
time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left"))
35+
index_end = start + pd.Timedelta(hours=10)
36+
time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left")
37+
return time_series
38+
39+
3040
@pytest.fixture
3141
def test_df(scope="package"): # pylint: disable=W0613
32-
"""Return test pd.DataFrame."""
42+
"""Return test pd.DataFrame from sample of NYC collisions dataset."""
3343
return pd.read_csv(os.path.join(TEST_DATA_DIR, TEST_DATA_FILE))
3444

3545

tests/test_profiles.py

-11
Original file line numberDiff line numberDiff line change
@@ -218,17 +218,6 @@ def test_series_profile_time_index_false(cat_df):
218218
assert profile.time_diffs is None
219219

220220

221-
@pytest.fixture
222-
def ts_timeindex(scope="module"): # pylint: disable=W0613
223-
"""Return pd.Series of type datetime64 with DatetimeIndex."""
224-
start = datetime(year=1999, month=1, day=1, hour=0, minute=0)
225-
end = start + pd.Timedelta(hours=40)
226-
time_series = pd.Series(pd.date_range(start, end, freq="4h", inclusive="left"))
227-
index_end = start + pd.Timedelta(hours=10)
228-
time_series.index = pd.date_range(start, index_end, freq="h", inclusive="left")
229-
return time_series
230-
231-
232221
def test_series_profile_ts_range_index_true(ts_timeindex): # pylint: disable=W0621
233222
"""time_index=True does not calculate time diffs for Series with RangeIndex."""
234223
series = ts_timeindex

tests/test_times.py

+52-13
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,23 @@
55
import pandahelper.times as pht
66

77

8+
def test_time_diffs(cat_df):
9+
"""time_diffs should work on shuffled pd.Series or Index of timestamps."""
10+
valid = [cat_df.index, pd.Series(cat_df.index)]
11+
for v in valid:
12+
result = pht.time_diffs(v)
13+
assert result.iloc[0] is pd.NaT
14+
assert all(result[1:] == pd.Timedelta(hours=1))
15+
16+
17+
def test_time_diffs_exception():
18+
"""Non-datetime64 pd.Series raises exception."""
19+
invalid = [pd.Series(list(range(5))), pd.Series([pd.Timedelta(hours=1)] * 2)]
20+
for tipo in invalid:
21+
with pytest.raises(TypeError):
22+
pht.time_diffs(tipo)
23+
24+
825
def test_time_diffs_index(cat_df):
926
"""time_diffs_index should work on shuffled pd.Series or pd.DataFrame."""
1027
# test DF
@@ -27,18 +44,40 @@ def test_time_diffs_index_exception():
2744
assert str(pd.DatetimeIndex) in str(exc)
2845

2946

30-
def test_time_diffs(cat_df):
31-
"""time_diffs should work on shuffled pd.Series or Index of timestamps."""
32-
valid = [cat_df.index, pd.Series(cat_df.index)]
33-
for v in valid:
34-
result = pht.time_diffs(v)
35-
assert result.iloc[0] is pd.NaT
36-
assert all(result[1:] == pd.Timedelta(hours=1))
47+
def test_id_gaps_index(ts_timeindex):
48+
"""id_gaps_index returns expected gaps from time-Series with DatetimeIndex."""
49+
result = pht.id_gaps_index(
50+
ts_timeindex, pd.Timedelta(minutes=59, microseconds=999999)
51+
)
52+
expected = pd.DataFrame(
53+
[pd.Timedelta(hours=1)] * 9,
54+
index=pd.date_range(pd.Timestamp(1999, 1, 1, 1), periods=9, freq="h"),
55+
columns=["diffs"],
56+
)
57+
pd.testing.assert_frame_equal(expected, result, check_index_type=True)
3758

3859

39-
def test_time_diffs_exception():
40-
"""Non-datetime64 pd.Series raises exception."""
41-
invalid = [pd.Series(list(range(5))), pd.Series([pd.Timedelta(hours=1)] * 2)]
42-
for tipo in invalid:
43-
with pytest.raises(TypeError):
44-
pht.time_diffs(tipo)
60+
def test_id_gaps_index_no_gaps(ts_timeindex):
61+
"""id_gaps_index returns empty DataFrame when threshold exceeds diffs."""
62+
result = pht.id_gaps_index(ts_timeindex, pd.Timedelta(minutes=60, microseconds=1))
63+
assert len(result) == 0
64+
65+
66+
def test_id_gaps_(ts_timeindex):
67+
"""id_gaps returns expected gaps from datetime64 Series with DatetimeIndex."""
68+
result = pht.id_gaps(
69+
ts_timeindex, pd.Timedelta(hours=3, minutes=59, microseconds=999999)
70+
)
71+
expected = pd.DataFrame(
72+
[pd.Timedelta(hours=4)] * 9,
73+
index=pd.date_range(pd.Timestamp(1999, 1, 1, 4), periods=9, freq="4h"),
74+
columns=["diffs"],
75+
)
76+
expected.index.freq = None # diffs won't have freq set
77+
pd.testing.assert_frame_equal(expected, result, check_index_type=True)
78+
79+
80+
def test_id_gaps_no_gaps(ts_timeindex):
81+
"""id_gaps returns empty DataFrame when threshold exceeds diffs."""
82+
result = pht.id_gaps(ts_timeindex, pd.Timedelta(hours=4, microseconds=1))
83+
assert len(result) == 0

0 commit comments

Comments
 (0)