-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
267 lines (216 loc) · 8.04 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# This file is part of the Pattern and Anomaly Detection Library (openclean_pattern).
#
# Copyright (C) 2020 New York University.
#
# openclean_pattern is released under the Revised BSD License. See file LICENSE for
# full license details.
"""A collection of useful utility methods"""
import re
from openclean_pattern.datatypes.resolver import DateResolver
from abc import ABCMeta, abstractmethod
import random
import bisect
from collections import Counter
### Comparators
class Comparator(metaclass=ABCMeta):
    """Abstract base class for objects that decide whether two data items match.
    """

    @abstractmethod
    def compare(self, a, b, meta=None):
        """Check a against b and report whether the two are considered equal.

        What "equal" means is left to the concrete subclass, which may also take
        the optional meta information into account.

        Parameters
        ----------
        a: Any
            the data item to compare
        b: Any
            the data item to compare against
        meta: Any (Optional)
            any extra information used in the comparison

        Returns
        -------
        bool

        Raises
        ------
        NotImplementedError
            always, when the base implementation itself is invoked
        """
        raise NotImplementedError()
class DateComparator(Comparator):
    """Comparator that checks two values with a DateResolver.
    """

    def __init__(self):
        """Create the comparator together with the DateResolver it delegates to.
        """
        self.dt = DateResolver()

    def compare(self, a, b, meta=None):
        """Check b with the date resolver, provided that a passes the same check.

        Parameters
        ----------
        a: Any
            the reference value; b is only inspected if the resolver does not report False for a
        b: Any
            the value to check against the resolver
        meta: Any (Optional)
            accepted so the signature matches Comparator.compare; it is not used

        Returns
        -------
        tuple or None
            (self.dt.is_datetime(b), 0) if self.dt.is_datetime(a) != False, otherwise None.
            Careful: the returned tuple is never empty and is therefore always truthy, so
            callers have to inspect its first element rather than the tuple itself.
        """
        # Deliberately compared with `!= False` rather than a truthiness test: the return
        # type of is_datetime is not visible from this module, so falsy-but-not-False
        # results must keep being treated exactly as before.
        if self.dt.is_datetime(a) != False:  # noqa: E712
            # NOTE(review): the meaning of the constant 0 is not documented here - confirm with callers
            return self.dt.is_datetime(b), 0
        return None
class StringComparator(Comparator):
    """Class of useful string comparison methods.

    The abstract ``compare`` method is not implemented here, so the class cannot be
    instantiated; its helpers are static methods and are called on the class itself.
    """

    @staticmethod
    def compare_strings(s1, s2, ambiguous_char='X'):
        """Merge two strings position by position, masking every position where they differ.

        Positions holding the same character in both strings keep that character, all
        other positions (including the surplus tail of the longer string) are replaced
        with the ambiguous character.

        Parameters
        ----------
        s1 : str
            string 1
        s2 : str
            string 2
        ambiguous_char: str
            replaces the distinct characters with

        Returns
        -------
        str, float
            the merged string and its ambiguity ratio, i.e. the number of occurrences of
            ambiguous_char in the merged string divided by the merged string's length.
            Occurrences of ambiguous_char that are literally present in both inputs are
            counted as well. The ratio is 0.0 if the merged string is empty.
        """
        # zip stops at the shorter input, which is exactly the overlapping part
        overlap = ''.join(c1 if c1 == c2 else ambiguous_char for c1, c2 in zip(s1, s2))
        new_string = overlap + ambiguous_char * abs(len(s1) - len(s2))
        if not new_string:
            # nothing to compare (e.g. two empty strings): avoid dividing by zero
            return new_string, 0.0
        return new_string, new_string.count(ambiguous_char) / len(new_string)

    @staticmethod
    def substring_finder(string1, string2):
        """Collect the runs of characters that two strings share when aligned at each offset.

        For every offset i into string1, string2 is laid over string1[i:] and both are
        compared position by position. Each maximal run of equal characters is a candidate.

        Parameters
        ----------
        string1 : str
            the string that is scanned offset by offset
        string2 : str
            the string that is aligned against string1

        Returns
        -------
        list of str
            the matching runs in the order they were found. Runs that are cut off by a
            mismatch are only kept if they are longer than one character; a run that is
            still open when the end of string2 is reached is kept whatever its length.
        """
        anslist = []
        len1, len2 = len(string1), len(string2)
        for i in range(len1):
            match = ""
            for j in range(len2):
                if i + j < len1 and string1[i + j] == string2[j]:
                    match += string2[j]
                else:
                    # the current run ended: keep it only if it has more than one character
                    if len(match) > 1:
                        anslist.append(match)
                    match = ""
            # run that lasted until the end of string2
            if match != '':
                anslist.append(match)
        return anslist
class PatternComparator(Comparator):
    """Placeholder for a pattern based comparator; the comparison is not implemented yet.
    """

    def compare(self, a, b, meta=None):
        """Not implemented.

        Parameters
        ----------
        a: Any
            the datatype to compare
        b: Any
            the datatype to compare against
        meta: Any (Optional)
            any extra information used in the comparison

        Raises
        ------
        NotImplementedError
            always
        """
        raise NotImplementedError()

    # Draft of an earlier implementation, kept for reference only. It took a pattern, a
    # (min, max) width and a token instead of the (a, b, meta) arguments used above:
    #
    # def compare(self, pattern, width, token):
    #     if len(token) >= width[0] and len(token) <= width[1]:
    #         for i in range(min(len(token), len(pattern))):
    #             if pattern[i].upper() != token[i].upper():  # not case sensitive?
    #                 if pattern[i].upper() == 'X':
    #                     continue
    #                 else:
    #                     return False
    #         return True
    #     return False
def has_numbers(inputString):
    """Tell whether the input string contains at least one digit.

    Parameters
    ----------
    inputString: str
        the string to inspect

    Returns
    -------
    bool
    """
    first_digit = re.search(r'\d', inputString)
    return first_digit is not None
### Samplers
class Sampler(metaclass=ABCMeta):
    """Base class for samplers of an input iterable. It exists because sampling with pandas.sample can be slow."""

    def __init__(self, iterable, n=1):
        """Store the data to sample from together with the requested sample size.

        Parameters
        ----------
        iterable: Iterable
            the iterable class object which has data to be sampled
        n: float
            the proportion or number of records to sample. Values within [0, 1] are
            read as a proportion, so the default of 1 stands for the complete input
        """
        self.iterable = iterable
        self.n = n
        # True if n has to be interpreted as a fraction of the input rather than a count
        self.frac = (0 <= n) and (n <= 1)

    @abstractmethod
    def __call__(self, *args, **kwargs):
        """Draw the sample from the input iterable; to be provided by subclasses.
        """
        raise NotImplementedError()

    def sample(self):
        """Convenience alias that draws the sample by calling the sampler itself.
        """
        return self.__call__()
class WeightedRandomSampler(Sampler):
    """Implements weighted random sampling using the distribution provided collections.Counter object.

    Based on the work: https://eli.thegreenplace.net/2010/01/22/weighted-random-generation-in-python/

    Note: if a Counter or dict of type {value:frequency} is passed in, there is no rowidx information tied to
    the sampled series and this can possibly require an extra lookup during anomaly detection
    """

    def __init__(self, weights, n=1, random_state=None):
        """Initializes the WeightedRandomSampler class

        Parameters
        ----------
        weights: collections.Counter
            the counter object in the format key:frequency
        n: float
            the proportion or number of records to sample
        random_state: int (default: None)
            the seed value for the pseudo random number generator
        """
        super().__init__(weights, n)
        self.random_state = random_state
        # Private generator: sampling must not reseed the process-wide random module,
        # which would silently change the random stream of unrelated code.
        self._rng = random.Random(random_state)
        self.totals = []  # cumulative sum of the weights, in key order
        running_total = 0
        for w in weights.values():
            running_total += w
            self.totals.append(running_total)

    def _has_mass(self):
        """Tell whether there is any positive total weight to sample from."""
        return bool(self.totals) and self.totals[-1] > 0

    def next(self):
        """Selects a new randomly sampled value from the input series based on their weight distribution and
        returns the respective index

        Returns
        -------
        int
            position of the selected key in the weights' key order

        Raises
        ------
        IndexError
            if there are no weights, or the weights sum up to zero
        """
        if not self._has_mass():
            raise IndexError("cannot sample from an empty or all-zero weight distribution")
        # rnd falls into [0, total); bisect_right maps it onto the key whose cumulative
        # interval contains it and thereby skips keys with a weight of zero
        rnd = self._rng.random() * self.totals[-1]
        return bisect.bisect_right(self.totals, rnd)

    def __call__(self):
        """Samples n (or n*total_inputs, if n is a fraction) times and returns the sampled values

        Returns
        -------
        sampled list of rows
            every sampled key repeated as often as it was drawn, grouped by key in the order
            in which the keys were first drawn. Empty if there is nothing to sample from.
        """
        if not self._has_mass():
            return []
        n = int(self.totals[-1] * self.n) if self.frac else int(self.n)
        keys = list(self.iterable)
        # Reseed on every call so that a fixed random_state reproduces the same sample each
        # time; a random_state of None seeds from a fresh system source instead.
        self._rng.seed(self.random_state)
        sample = Counter(keys[self.next()] for _ in range(n))
        return WeightedRandomSampler.counter_to_list(sample)

    @staticmethod
    def counter_to_list(counter):
        """Method to create a series list from a counter object

        Parameters
        ----------
        counter: collections.Counter
            the counter object to convert to a list

        Returns
        -------
        list of values
            every key repeated as often as its count, in the counter's key order
        """
        return [key for key, count in counter.items() for _ in range(count)]
class RandomSampler(Sampler):
    """Class to randomly sample an input iterable. This was necessary because pandas.sample samples a dataframe
    which can be slow.

    Note: if a Counter or dict of type {value:frequency} is passed in, its keys are sampled uniformly (the
    frequencies are ignored; use WeightedRandomSampler to honour them). There is no rowidx information tied
    to the sampled series and this can possibly require an extra lookup during anomaly detection
    """

    def __init__(self, iterable, n=1, random_state=None):
        """Initializes the Random Sampler class

        Parameters
        ----------
        iterable: Iterable
            the iterable class object which has data to be sampled
        n: float
            the proportion or number of records to sample
        random_state: int (default: None)
            the seed value for the pseudo random number generator
        """
        super().__init__(iterable, n)
        self.random_state = random_state

    def __call__(self, *args, **kwargs):
        """Method to sample the input iterable sequence

        Returns
        -------
        sampled list of rows

        Raises
        ------
        ValueError
            if an absolute n larger than the number of input records is requested
        """
        population = self.iterable
        # random.sample only accepts sequences (sets are rejected since Python 3.11, dicts
        # always were), so anything else is materialized first. This happens on every call.
        if not isinstance(population, (list, tuple, range, str)):
            population = list(population)
        n = int(len(population) * self.n) if self.frac else int(self.n)
        # A private generator yields the same sample for a given seed as seeding the random
        # module would, but leaves the process-wide random state untouched.
        rng = random.Random(self.random_state)
        return rng.sample(population, n)