-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdictionary.py
76 lines (59 loc) · 2.39 KB
/
dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
from tableschema import Table
from dateutil.parser import parse
def is_date(input):
    """Return True when ``input`` is blank/NaN or can be parsed as a date.

    A value whose ``str()`` form is empty or (case-insensitively) ``"nan"``
    is accepted without parsing, so missing cells count as date-compatible.
    Everything else is handed to ``dateutil.parser.parse``.

    Args:
        input: The cell value to test (any type).

    Returns:
        bool: True if the value is blank/NaN or ``parse`` accepts it,
        False if ``parse`` rejects it.
    """
    text = str(input)
    if text == "" or text.lower() == "nan":
        return True
    try:
        parse(input)
    except (ValueError, TypeError, OverflowError):
        # ValueError: the text is not a recognisable date.
        # TypeError: the value is not a string/stream (e.g. an int cell).
        # OverflowError: a numeric component is too large for the platform;
        # without this a single oversized number would abort the whole scan.
        return False
    return True
def infer(path, limit=2000):
    """Profile every column of the CSV at ``path`` and return its metadata.

    The column schema (name, type, format) is inferred by ``tableschema``
    from the first ``limit`` rows; the statistics are computed with pandas
    over the whole file.

    Args:
        path: Location of the CSV file, passed to both ``Table`` and
            ``pandas.read_csv``.
        limit: Maximum number of rows used for schema inference and for the
            string-to-date check.

    Returns:
        list: One ``Metadata`` object per schema field, in schema order.
    """
    table = Table(path)
    table.infer(limit=limit, confidence=0.75)
    data = pd.read_csv(path, low_memory=False)

    # Use the public length of the index rather than the private
    # RangeIndex._stop attribute, which is not part of the pandas API.
    num_rows = len(data.index)
    rows_to_scan = min(limit, num_rows)

    metadata_array = []
    for field in table.schema.fields:
        column = data[field.name]

        metadata = Metadata()
        metadata.name = field.name
        metadata.type = field.type
        metadata.format = field.format

        # Describing the column as ``object`` always yields the categorical
        # summary (count / unique / top), whatever the real dtype is.
        object_description = column.astype(object).describe()

        missing_count = num_rows - int(object_description['count'])
        present_count = num_rows - missing_count
        metadata.missing_count = missing_count
        # An empty file has no rows to be missing; avoid dividing by zero.
        metadata.missing_percentage = (
            round(float(missing_count) / num_rows * 100, 2) if num_rows else 0.0
        )

        distinct_count = int(object_description['unique'])
        metadata.distinct_count = distinct_count
        # A column with no present values has no distinct ratio; avoid
        # dividing by zero and report 0.0 instead.
        metadata.distinct_percentage = (
            round(float(distinct_count) / present_count * 100, 2)
            if present_count
            else 0.0
        )
        metadata.most_frequent = object_description['top']

        # Promote a string column to "date" only when every scanned cell is
        # date-like. Requiring at least one scanned row stops the check from
        # passing vacuously on an empty sample.
        if (
            metadata.type == "string"
            and metadata.missing_percentage != 100.0
            and rows_to_scan > 0
        ):
            if column.head(rows_to_scan).apply(is_date).all():
                metadata.type = "date"

        is_numeric_type = metadata.type == "integer" or metadata.type == "number"

        # min/max are only filled in when pandas also parsed the column as a
        # plain numeric dtype; otherwise the Metadata defaults are kept.
        if is_numeric_type and (column.dtype == "int64" or column.dtype == "float64"):
            numeric_description = column.describe()
            metadata.min = numeric_description['min']
            metadata.max = numeric_description['max']

        # ml_type mirrors the schema type except for the two remapped groups.
        metadata.ml_type = metadata.type
        if is_numeric_type:
            metadata.ml_type = "numeric"
        if metadata.type == "string":
            metadata.ml_type = "open_text"

        metadata_array.append(metadata)
    return metadata_array
class Metadata:
    """Plain record holding the profiling results for one column.

    Every field is declared as a class-level default; code that fills in a
    record assigns the same names on the instance, which shadows the default.
    """

    # --- schema information -------------------------------------------
    name: str = ""      # column name
    type: str = ""      # column type label
    format: str = ""    # column format label

    # --- missing / distinct statistics --------------------------------
    missing_count: int = 0
    missing_percentage: float = 0.0
    distinct_count: int = 0
    distinct_percentage: float = 0.0

    # Placeholder default: a single bare ``object()`` shared by every
    # instance until a real value is assigned.
    most_frequent: object = object()

    # --- numeric range ------------------------------------------------
    min: float = 0.0
    max: float = 0.0

    # --- machine-learning oriented type label -------------------------
    ml_type: str = ""