-
Notifications
You must be signed in to change notification settings - Fork 10
/
outliers.py
72 lines (62 loc) · 2.36 KB
/
outliers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import numpy
import sys
import math
MEDIAN_THRESHOLD = 5.0
median_distance_cache = {}
def median_distances(pts, aggregate=numpy.median):
key = tuple(sorted(pts))
if key in median_distance_cache: return median_distance_cache[key]
median = (numpy.median([pt[0] for pt in pts]),
numpy.median([pt[1] for pt in pts]))
distances = []
for pt in pts:
dist = math.sqrt(((median[0]-pt[0])*math.cos(median[1]*math.pi/180.0))**2+(median[1]-pt[1])**2)
distances.append((dist, pt))
median_dist = aggregate([dist for dist, pt in distances])
median_distance_cache[key] = (median_dist, distances)
return (median_dist, distances)
def mean_distances(pts):
return median_distances(pts, numpy.mean)
def load_points(point_file):
places = {}
count = 0
for line in file(point_file):
data = line.strip().split()
place_id, lon, lat = data if len(data) == 3 else data[1:]
place_id = int(place_id)
point = (float(lon), float(lat))
pts = places.setdefault(place_id, set())
pts.add(point)
count += 1
if count % 1000 == 0:
print >>sys.stderr, "\rRead %d points in %d places." % (count, len(places)),
print >>sys.stderr, "\rRead %d points in %d places." % (count, len(places))
return places
def discard_outliers(places, threshold=MEDIAN_THRESHOLD):
count = 0
discarded = 0
result = {}
for place_id, pts in places.items():
count += 1
print >>sys.stderr, "\rComputing outliers for %d of %d places..." % (count, len(places)),
median_dist, distances = median_distances(pts)
keep = [pt for dist, pt in distances if dist < median_dist * threshold]
discarded += len(pts) - len(keep)
result[place_id] = keep
print >>sys.stderr, "%d points discarded." % discarded
return result
def get_bbox_for_points(places):
bbox = [180, 90, -180, -90]
for pid, pts in places.items():
for pt in pts:
for i in range(4):
bbox[i] = min(bbox[i], pt[i%2]) if i<2 else max(bbox[i], pt[i%2])
return bbox
def main(filename):
places = load_points(filename)
places = discard_outliers(places)
bbox = get_bbox_for_points(places)
#print ",".join(map(str, bbox))
print "%s %s, %s %s" % (bbox[0], bbox[1], bbox[2], bbox[3])
if __name__ == "__main__":
main(sys.argv[1])