6
6
7
7
8
8
def time_diffs(series: Union[pd.Series, pd.DatetimeIndex]) -> pd.Series:
    """Calculate time difference between subsequent observations.

    Sorts input by time before calculating diffs.

    Args:
        series (pd.Series or pd.DatetimeIndex): Series of ``datetime64`` dtype
            or a DatetimeIndex.

    Returns:
        Series named ``"diffs"`` holding the difference between each timestamp
        and the one before it, indexed by the sorted timestamps. The first
        entry is ``NaT`` because it has no predecessor.

    Raises:
        TypeError: If input is not Series of type datetime64 or DatetimeIndex.

    Examples:
        Calculate time differences between observations on Series of timestamps
        after it has been randomized:

        >>> import pandahelper as ph
        >>> import pandas as pd
        >>>
        >>> start = pd.Timestamp(year=1999, month=1, day=1)
        >>> rng = pd.date_range(start, periods=10, freq="D").delete([3, 4, 5, 8])
        >>> series = pd.Series(rng).sample(frac=1, random_state=3)  # randomize order

        >>> ph.time_diffs(series)
        1999-01-01      NaT
        1999-01-02   1 days
        1999-01-03   1 days
        1999-01-07   4 days
        1999-01-08   1 days
        1999-01-10   2 days
        Name: diffs, dtype: timedelta64[ns]
    """
    # getattr keeps objects without a dtype (lists, scalars, None) on the
    # documented TypeError path instead of leaking an AttributeError.
    if not pat.is_datetime64_any_dtype(getattr(series, "dtype", None)):
        raise TypeError("Should be of datetime64 dtype.")
    ordered = series.sort_values()
    # Wrap in a Series before diffing so a DatetimeIndex input does not depend
    # on Index.diff, which only exists in newer pandas releases.
    diffs = pd.Series(ordered).diff().rename("diffs")
    # Label every difference with the timestamp at which it was observed.
    diffs.index = ordered
    return diffs
29
49
30
50
31
def time_diffs_index(df: Union[pd.Series, pd.DataFrame]) -> pd.Series:
    """Calculate time difference between subsequent time-indexed observations.

    Sorts input by time index before calculating diffs.

    Args:
        df (pd.Series or pd.DataFrame): Pandas Series or DataFrame with
            DatetimeIndex.

    Returns:
        Series named ``"diffs"`` holding the difference between each index
        timestamp and the one before it, indexed by the sorted DatetimeIndex.
        The first entry is ``NaT`` because it has no predecessor.

    Raises:
        TypeError: If input does not have a DatetimeIndex.

    Examples:
        Calculate time differences between observations on time-indexed
        DataFrame after it has been randomized:

        >>> import pandahelper as ph
        >>> import pandas as pd
        >>>
        >>> start = pd.Timestamp(year=1999, month=1, day=1)
        >>> rng = pd.date_range(start, periods=10, freq="D").delete([3, 4, 5, 8])
        >>> # index by time then randomize order
        >>> df = pd.DataFrame(range(len(rng)), index=rng).sample(frac=1, random_state=3)

        >>> ph.time_diffs_index(df)
        1999-01-01      NaT
        1999-01-02   1 days
        1999-01-03   1 days
        1999-01-07   4 days
        1999-01-08   1 days
        1999-01-10   2 days
        Name: diffs, dtype: timedelta64[ns]
    """
    # getattr keeps objects without an index on the documented TypeError path
    # instead of leaking an AttributeError.
    index = getattr(df, "index", None)
    if not isinstance(index, pd.DatetimeIndex):
        raise TypeError(f"Index should be of type {pd.DatetimeIndex} ")
    # Only the index is needed, so sort it directly rather than copying and
    # reordering the whole Series/DataFrame.
    ordered = index.sort_values()
    # to_series() yields a Series indexed by the timestamps themselves; diffing
    # it avoids Index.diff, which only exists in newer pandas releases.
    return ordered.to_series().diff().rename("diffs")
93
+
94
+
95
def id_gaps(
    series: Union[pd.Series, pd.DatetimeIndex], threshold: pd.Timedelta
) -> pd.DataFrame:
    """Identify time gaps above `threshold` in datetime64 Series or DatetimeIndex.

    Sorts input by time before calculating gaps.

    Args:
        series (pd.Series or pd.DatetimeIndex): `datetime64` Series or DatetimeIndex.
        threshold (pd.Timedelta): Threshold to identify gaps
            (and not expected time differences). Only differences strictly
            greater than this value are reported.

    Returns:
        One-column Pandas DataFrame (column ``diffs``) of gaps, largest first,
        indexed by the timestamp at which each gap ends.

    Raises:
        TypeError: If `series` is rejected by `time_diffs` (not of
            datetime64 dtype).

    Examples:
        Identify time gaps on Series of timestamps with a 2 and 4 hour
        gap after it has been randomized:

        >>> import pandahelper as ph
        >>> import pandas as pd
        >>>
        >>> start = pd.Timestamp(year=1999, month=1, day=1)
        >>> rng = pd.date_range(start, periods=24, freq="1h").delete([3, 8, 9, 10])
        >>> series = pd.Series(rng).sample(frac=1, random_state=3)  # randomize order

        >>> ph.id_gaps(series, pd.Timedelta(hours=1))
                                      diffs
        1999-01-01 11:00:00 0 days 04:00:00
        1999-01-01 04:00:00 0 days 02:00:00
    """
    diffs = time_diffs(series)
    # The leading NaT never compares greater than the threshold, so the first
    # observation is dropped together with every expected-size difference.
    gaps = diffs[diffs > threshold]
    return gaps.sort_values(ascending=False).to_frame()
128
+
129
+
130
def id_gaps_index(
    df: Union[pd.Series, pd.DataFrame], threshold: pd.Timedelta
) -> pd.DataFrame:
    """Identify time gaps above `threshold` in time-indexed Series or DataFrame.

    Sorts input by time index before calculating diffs.

    Args:
        df (pd.Series or pd.DataFrame): Time-indexed Series or DataFrame.
        threshold (pd.Timedelta): Threshold to identify gaps
            (and not expected time differences). Only differences strictly
            greater than this value are reported.

    Returns:
        One-column Pandas DataFrame (column ``diffs``) of gaps, largest first,
        indexed by the timestamp at which each gap ends.

    Raises:
        TypeError: If `df` is rejected by `time_diffs_index` (index is not a
            DatetimeIndex).

    Examples:
        Identify time gaps on an hourly, time-indexed DataFrame with a 2 and
        4 hour gap after it has been randomized:

        >>> import pandahelper as ph
        >>> import pandas as pd
        >>>
        >>> start = pd.Timestamp(year=1999, month=1, day=1)
        >>> rng = pd.date_range(start, periods=24, freq="1h").delete([3, 8, 9, 10])
        >>> # index by time then randomize order
        >>> df = pd.DataFrame(range(len(rng)), index=rng).sample(frac=1, random_state=3)

        >>> ph.id_gaps_index(df, pd.Timedelta(hours=1))
                                      diffs
        1999-01-01 11:00:00 0 days 04:00:00
        1999-01-01 04:00:00 0 days 02:00:00
    """
    diffs = time_diffs_index(df)
    # The leading NaT never compares greater than the threshold, so the first
    # observation is dropped together with every expected-size difference.
    gaps = diffs[diffs > threshold]
    return gaps.sort_values(ascending=False).to_frame()
0 commit comments