-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathowner.py
108 lines (84 loc) · 3.6 KB
/
owner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import re
# Matches every character that is not a lowercase ASCII letter or a digit;
# used as the separator when splitting lower-cased text into tokens.
p = re.compile(r'[^0-9a-z]')


class Owner:
    """Wrap a record and compare it to other records by name and address.

    ``item`` must support ``item[key]`` for the keys ``name``, ``addr1``,
    ``addr2``, ``city``, ``state`` and ``zip``; those six fields define
    equality, hashing and the similarity scores.
    """

    # Fields that take part in equality and hashing, in a fixed order.
    _FIELDS = ('name', 'addr1', 'addr2', 'city', 'state', 'zip')

    # Address tokens that are ignored when comparing ``addr1`` values.
    _STREET_WORDS = frozenset(
        "st street ave avenue av place pl parkway pkwy".split())

    # A token-overlap score must be strictly greater than this to count.
    _MIN_SCORE = 0.5

    def __init__(self, item):
        """Store the underlying record without copying it."""
        self.item = item

    def __eq__(self, other):
        """Return True when ``other`` is an Owner with identical fields."""
        return (isinstance(other, self.__class__)
                and self.transform() == other.transform())

    def __ne__(self, other):
        """Return the negation of ``__eq__``."""
        return not self.__eq__(other)

    def __getitem__(self, name):
        """Return field ``name`` of the wrapped record."""
        return self.item[name]

    def __repr__(self):
        """Return the string form of the wrapped record."""
        return str(self.item)

    def __str__(self):
        """Return a tab-indented, two-section name/address description."""
        address = ', '.join([
            str(self['addr1']).strip(),
            str(self['addr2']).strip(),
            str(self['city']),
            str(self['state']),
            str(self['zip']),
        ])
        return ('\tName:\n\t\t' + str(self['name'])
                + '\n\tAddress:\n\t\t' + address)

    def __hash__(self):
        """Hash the same six fields that ``__eq__`` compares.

        Hashing the ordered tuple (instead of XOR-ing the field hashes)
        keeps equal owners on equal hashes while avoiding the systematic
        collisions XOR produces when values are swapped or repeated.
        """
        return hash(tuple(self.item[field] for field in self._FIELDS))

    def transform(self):
        """Return a new dict holding only the six compared fields."""
        return {field: self.item[field] for field in self._FIELDS}

    def similarity(self, other):
        """Return a similarity score between 0 and 2.

        Returns 0 when ``other`` is not an Owner or is this very object,
        1 when the two owners are equal, and otherwise the sum of
        ``name_similarity`` and ``addr_similarity``.
        """
        if not isinstance(other, self.__class__):
            return 0
        # Don't need to compare the same entity.
        if self is other:
            return 0
        # Exactly equal records get a fixed score of 1.
        if self == other:
            return 1
        return self.name_similarity(other) + self.addr_similarity(other)

    @staticmethod
    def _tokens(text):
        """Split ``text`` into a set of lower-cased alphanumeric tokens.

        ``None`` is treated as missing text and yields an empty set.
        """
        if text is None:
            return set()
        return set(filter(None, p.split(text.lower())))

    @classmethod
    def _overlap_score(cls, left, right):
        """Return |left & right| / |left | right|, or 0 if not above 0.5.

        Two empty sets also score 0.
        """
        union = left | right
        if not union:
            return 0
        score = float(len(left & right)) / len(union)
        if score > cls._MIN_SCORE:
            return score
        return 0

    def name_similarity(self, other):
        """Return the token-overlap score of the two names.

        The result is 0 unless more than half of the combined tokens are
        shared, in which case the overlap ratio itself is returned.
        """
        return self._overlap_score(self._tokens(self['name']),
                                   self._tokens(other['name']))

    def get_number_from_addr2(self):
        """Return the first all-digit token of ``addr2`` as an int.

        Returns None when ``addr2`` is None or empty, contains no
        whitespace-separated all-digit token, or a digit token cannot be
        converted by ``int`` (``str.isdigit`` accepts some characters that
        ``int`` rejects).
        """
        addr2 = self['addr2']
        if addr2 is None or addr2 == '':
            return None
        try:
            numbers = [int(s) for s in str(addr2).split() if s.isdigit()]
        except ValueError:
            return None
        if not numbers:
            return None
        return numbers[0]

    def addr_filter(self, s):
        """Return True when token ``s`` is not a generic street word."""
        return s not in self._STREET_WORDS

    def addr_similarity(self, other):
        """Return the token-overlap score of the two ``addr1`` values.

        The score is 0 when the states differ (case-insensitively), when
        both zips are present and their first three characters differ or
        are not numeric, when the ``addr1`` overlap is not above 0.5, or
        when the numbers extracted from ``addr2`` differ.
        """
        if str(self['state']).lower() != str(other['state']).lower():
            return 0

        self_zip = self['zip']
        other_zip = other['zip']
        # A zip that is None or '' is unknown, so the zip check is skipped.
        if (self_zip is not None and self_zip != ''
                and other_zip is not None and other_zip != ''):
            try:
                # Only the first three characters of each zip are compared.
                if int(str(self_zip)[0:3]) != int(str(other_zip)[0:3]):
                    return 0
            except ValueError:
                return 0

        # Now only need to consider address matching.
        self_addr = set(filter(self.addr_filter, self._tokens(self['addr1'])))
        other_addr = set(filter(self.addr_filter,
                                self._tokens(other['addr1'])))
        score = self._overlap_score(self_addr, other_addr)
        if score and self.get_number_from_addr2() == other.get_number_from_addr2():
            return score
        return 0