Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check domains with PyFunceble #23

Merged
merged 5 commits into from
Sep 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ COPY results.json /home/$USER/old-results.json
COPY privacybadger /home/$USER/privacybadger
ENV OUTPATH=/home/$USER/out
ENV EXTPATH=/home/$USER/privacybadger/src
ENV PYFUNCEBLE_AUTO_CONFIGURATION=True
RUN mkdir -p $OUTPATH

ENTRYPOINT ["./docker-entry.sh"]
Expand Down
27 changes: 27 additions & 0 deletions audit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from collections import Counter
import git
import json
import re

def count_domain_blocks():
    """Count how many past seed-data releases blocked each domain.

    Walks the commits on ``master`` of the git repository in the current
    directory, picks the ones whose message starts with
    ``Update seed data: X.Y.Z``, and reads the ``results.json`` stored in each
    of those commits. Snapshots are keyed by their ``version`` field, so two
    commits carrying the same version count once (the one visited last wins).

    The file is read straight from each commit's tree, so the working tree
    and the current checkout are never modified.

    Returns:
        collections.Counter mapping domain -> number of distinct versions in
        which its ``heuristicAction`` was ``block`` or ``cookieblock``.

    Raises:
        KeyError: if a versioned snapshot has no ``action_map`` or an entry
            has no ``heuristicAction``.
        ValueError: if a committed ``results.json`` is not valid JSON.
    """
    repo = git.Repo('./')
    # Raw string: '\d' and '\.' in a plain literal are invalid escape sequences.
    seed_commit_re = re.compile(r'Update seed data: \d+\.\d+\.\d+')
    old_maps = {}

    # load old map data
    for commit in repo.iter_commits('master'):
        if not seed_commit_re.match(commit.message):
            continue

        # Read the blob from the commit itself instead of checking the commit
        # out: a checkout leaves the repo on a detached HEAD and fails when
        # the working tree has local changes.
        try:
            blob = commit.tree / 'results.json'
        except KeyError:
            # this commit does not contain results.json; nothing to count
            continue

        snapshot = json.loads(blob.data_stream.read().decode('utf-8'))
        if 'version' in snapshot:
            old_maps[snapshot['version']] = snapshot

    # count number of times each domain has been blocked
    ctr = Counter()
    for snapshot in old_maps.values():
        for domain, data in snapshot['action_map'].items():
            if data['heuristicAction'] in ('block', 'cookieblock'):
                ctr[domain] += 1

    return ctr
40 changes: 34 additions & 6 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import time
from urllib.request import urlopen

from PyFunceble import test as PyFunceble
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException,\
NoSuchWindowException,\
Expand Down Expand Up @@ -41,7 +42,7 @@
help='Browser to use for the scan')
ap.add_argument('--n-sites', type=int, default=2000,
help='Number of websites to visit on the crawl')
ap.add_argument('--timeout', type=float, default=10,
ap.add_argument('--timeout', type=float, default=30,
help='Amount of time to allow each site to load, in seconds')
ap.add_argument('--wait-time', type=float, default=5,
help='Amount of time to wait on each site after it loads, in seconds')
Expand Down Expand Up @@ -89,26 +90,53 @@ def get_chrome_extension_id(crx_file):

def get_domain_list(n_sites, out_path):
    """Return the first ``n_sites`` Majestic domains that PyFunceble marks ACTIVE.

    The Majestic CSV is stored under ``out_path`` and downloaded again when it
    is missing or more than ``WEEK_IN_SECONDS`` old. PyFunceble results are
    cached in a JSON file next to it (domain -> status string) so that a
    domain is only tested once; the cache is reset whenever the Majestic file
    is refreshed and written back before returning.

    Args:
        n_sites: maximum number of domains to return. Values <= 0 yield an
            empty list.
        out_path: directory holding the Majestic CSV and the cache file.

    Returns:
        List of domain strings, in the order they appear in the Majestic file.
    """
    top_1m_file = os.path.join(out_path, MAJESTIC_URL.split('/')[-1])
    # The file name keeps its historical spelling so existing caches are found.
    pyfunc_cache_file = os.path.join(out_path, 'pyfunceable_cache.txt')

    # download the file if it doesn't exist or if it's more than a week stale
    if (not os.path.exists(top_1m_file) or
            time.time() - os.path.getmtime(top_1m_file) > WEEK_IN_SECONDS):
        logger.info('Loading new Majestic data and refreshing PyFunceble cache')
        with urlopen(MAJESTIC_URL) as response:
            payload = response.read().decode()
        with open(top_1m_file, 'w') as f:
            f.write(payload)

        # The Majestic file expired, so reset the PyFunceble cache as well.
        # Write an empty JSON object rather than an empty file: a zero-byte
        # file is not valid JSON and would make the load below fail.
        with open(pyfunc_cache_file, 'w') as f:
            json.dump({}, f)

    # load cache
    pyfunc_cache = {}
    if os.path.exists(pyfunc_cache_file):
        try:
            with open(pyfunc_cache_file) as f:
                loaded = json.load(f)
        except ValueError:
            # empty or corrupt cache file (json.JSONDecodeError is a
            # ValueError): start over instead of aborting the crawl
            logger.info('PyFunceble cache is unreadable, starting a new one')
        else:
            if isinstance(loaded, dict):
                pyfunc_cache = loaded

    domains = []
    with open(top_1m_file) as f:
        # first line is CSV header
        next(f, None)

        # collect domains until n_sites active ones have been found
        for line in f:
            # checked before doing any work so n_sites <= 0 returns nothing
            if len(domains) >= n_sites:
                break

            fields = line.split(',')
            if len(fields) < 3:
                # blank or malformed row: no domain column to read
                continue
            domain = fields[2]

            if domain in pyfunc_cache:
                status = pyfunc_cache[domain]
            else:
                status = PyFunceble(domain)
                logger.info('PyFunceble: %s is %s', domain, status)
                pyfunc_cache[domain] = status

            if status == 'ACTIVE':
                domains.append(domain)

    # save pyfunceble cache again
    with open(pyfunc_cache_file, 'w') as f:
        json.dump(pyfunc_cache, f)

    return domains

Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ colorama==0.3.9
selenium==3.12.0
tldextract==2.2.0
xvfbwrapper==0.2.9
GitPython==2.1.11
PyFunceble==0.94.3
Loading