-
Notifications
You must be signed in to change notification settings - Fork 0
/
post_processing.py
67 lines (50 loc) · 2.32 KB
/
post_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import json
import numpy as np
from typing import List
# Tracker blocklists, loaded once when this module is imported.
# Maps a short list name to a NumPy string array (dtype=str) read from the
# corresponding text file. The paths are relative, so they are resolved
# against the current working directory of the running process.
# NOTE(review): the "-sorted" file names suggest the entries are already
# sorted, which the binary search in is_tracker depends on -- confirm the
# on-disk ordering matches NumPy's string comparison order.
# Hardcoded, ugly, but the files won't change.
ALL_TRACKERS = {'adguarddns': np.loadtxt('3rd-party-trackers/adguarddns-justdomains-sorted.txt', dtype=str),
'easylist': np.loadtxt('3rd-party-trackers/easylist-justdomains-sorted.txt', dtype=str),
'easyprivacy': np.loadtxt('3rd-party-trackers/easyprivacy-justdomains-sorted.txt', dtype=str),
'nocoin': np.loadtxt('3rd-party-trackers/nocoin-justdomains-sorted.txt', dtype=str)}
# Cookie keys that parse_cookie copies unchanged into its output record.
FIELDS = ["name", "domain", "expires", "time_to_live"]
def parse_cookie(cookie, website):
    """Build the reduced, annotated record for a single cookie.

    Args:
        cookie: Mapping that provides at least the keys listed in ``FIELDS``.
        website: Name of the site the cookie was collected from.

    Returns:
        A new dict holding the ``FIELDS`` entries of ``cookie`` plus:
        ``third_party`` -- True when ``website`` is not a substring of the
        cookie's domain; ``trackers_list`` -- the tracker-list names reported
        by ``check_trackers`` for the cookie's domain.

    Raises:
        KeyError: If ``cookie`` lacks one of the required keys.
    """
    # Look the domain up in the tracker lists first.
    matched_lists = check_trackers(cookie["domain"])

    # Keep only the whitelisted fields, in FIELDS order.
    parsed = {}
    for field in FIELDS:
        parsed[field] = cookie[field]

    # Substring test: the cookie is first-party if its domain contains the site name.
    parsed['third_party'] = website not in parsed['domain']
    parsed["trackers_list"] = matched_lists
    return parsed
def post_processing(cookies):
    """Parse every collected cookie, grouped by website and crawl section.

    Args:
        cookies: Mapping of website name to a mapping that has the keys
            ``"frontpage"`` and ``"hopped"``, each an iterable of raw cookies.

    Returns:
        A dict with the same website keys; each value is a dict with
        ``"frontpage"`` and ``"hopped"`` lists containing the result of
        ``parse_cookie`` for every cookie of that section, in input order.

    Raises:
        KeyError: If a website entry lacks ``"frontpage"`` or ``"hopped"``.
    """
    processed = {}
    for website, sections in cookies.items():
        # Both sections get the same treatment: select fields and add trackers.
        processed[website] = {
            section: [parse_cookie(raw, website) for raw in sections[section]]
            for section in ("frontpage", "hopped")
        }
    return processed
def check_trackers(domain_name: str) -> List[str]:
    """Return the names of the tracker lists that contain ``domain_name``.

    Every list in ``ALL_TRACKERS`` is probed with ``is_tracker``; the names of
    the lists reporting a match are returned in ``ALL_TRACKERS`` iteration
    order. The result is empty when no list matches.
    """
    return [
        list_name
        for list_name, entries in ALL_TRACKERS.items()
        if is_tracker(domain_name, entries)
    ]
def is_tracker(domain_name: str, trackers_list: np.ndarray) -> bool:
    """Check whether a domain name is an entry of one sorted tracker list.

    Args:
        domain_name: Domain to look up. One leading ``"."`` and then one
            leading ``"www."`` are stripped before the lookup.
        trackers_list: One-dimensional array of domain strings, sorted in
            ascending order (required by the binary search).

    Returns:
        True if the normalised domain equals an entry of ``trackers_list``,
        False otherwise.
    """
    # Strip the cookie-style leading dot and the "www." prefix, in that order.
    if domain_name.startswith('.'):
        domain_name = domain_name[1:]
    if domain_name.startswith('www.'):
        domain_name = domain_name[len('www.'):]

    # Binary search for the leftmost insertion point.
    index = np.searchsorted(trackers_list, domain_name)

    # searchsorted returns len(trackers_list) when the name sorts after every
    # entry, so the bound must be checked BEFORE indexing; otherwise the
    # lookup raises IndexError instead of reporting "not found".
    return bool(index < len(trackers_list) and trackers_list[index] == domain_name)
if __name__ == "__main__":
    # Manual smoke test: print which tracker lists contain a sample domain.
    sample_domain = "6006206.global.siteimproveanalytics.io"
    print(check_trackers(sample_domain))