-
Notifications
You must be signed in to change notification settings - Fork 313
/
test_crawl.py
142 lines (124 loc) · 4.3 KB
/
test_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""Runs a short test crawl.
This should be used to test any features that require real crawl data.
This should be avoided if possible, as controlled tests will be easier
to debug.
"""
import json
import os
import tarfile
import domain_utils as du
import pytest
from openwpm.utilities import db_utils
# Popular top-level sites used as targets for the live test crawl.
# NOTE: visiting these requires real network access, which is why the
# test below is gated to run only on CI (see its skipif marker).
TEST_SITES = [
    "http://google.com",
    "http://facebook.com",
    "http://youtube.com",
    "http://yahoo.com",
    "http://baidu.com",
    "http://wikipedia.org",
    "http://qq.com",
    "http://linkedin.com",
    "http://taobao.com",
    "http://twitter.com",
    "http://live.com",
    "http://amazon.com",
    "http://sina.com.cn",
    "http://google.co.in",
    "http://hao123.com",
    "http://blogspot.com",
    "http://weibo.com",
    "http://wordpress.com",
    "http://yandex.ru",
    "http://yahoo.co.jp",
]
@pytest.mark.skipif(
    "CI" not in os.environ or os.environ["CI"] == "false",
    reason="Makes remote connections",
)
@pytest.mark.slow
def test_browser_profile_coverage(default_params, task_manager_creator):
    """Test the coverage of the browser's profile.

    This verifies that Firefox's places.sqlite database contains all
    visited sites (with a few exceptions). If it does not, it is likely
    the profile is lost at some point during the crawl.
    """
    # Run the test crawl with a single browser, archiving its profile.
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    browser_params[0].profile_archive_dir = (
        manager_params.data_directory / "browser_profile"
    )
    browser_params[0].http_instrument = True
    manager, crawl_db = task_manager_creator((manager_params, browser_params[:1]))
    for site in TEST_SITES:
        manager.get(site)
    manager.close()

    # Extract the archived crawl profile. The tarball is produced by the
    # crawl itself, so its contents are trusted.
    ff_db_tar = browser_params[0].profile_archive_dir / "profile.tar.gz"
    with tarfile.open(ff_db_tar) as tar:
        tar.extractall(browser_params[0].profile_archive_dir)

    # Firefox's history database from the extracted profile.
    ff_db = browser_params[0].profile_archive_dir / "places.sqlite"

    # Registered domains (PS+1) seen in the http_requests table.
    req_ps = set()
    rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
    for (url,) in rows:
        req_ps.add(du.get_ps_plus_1(url))

    # Registered domains and the GetCommand status per domain, from the
    # crawl_history table.
    hist_ps = set()
    statuses = {}
    rows = db_utils.query_db(
        crawl_db,
        "SELECT arguments, command_status FROM crawl_history WHERE"
        " command='GetCommand'",
    )
    for arguments, command_status in rows:
        url = json.loads(arguments)["url"]
        ps = du.get_ps_plus_1(url)
        hist_ps.add(ps)
        statuses[ps] = command_status

    # Registered domains recorded in the Firefox profile.
    profile_ps = set()
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    for (host,) in rows:
        try:
            profile_ps.add(du.get_ps_plus_1(host))
        except AttributeError:
            # Some moz_places entries have no extractable PS+1; skip them.
            pass

    # We expect a url to be in the Firefox profile if:
    # 1. We've made requests to it
    # 2. The url is a top_url we entered into the address bar
    # 3. The url successfully loaded (see: Issue #40)
    # 4. The site does not respond to the initial request with a 204
    #    (won't show in FF DB)
    missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
    unexpected_missing_urls = set()
    for url in missing_urls:
        # BUG FIX: previously indexed `command_status` (the leftover loop
        # *string* from the crawl_history loop above), which raises
        # TypeError; the per-domain statuses live in the `statuses` dict.
        if statuses[url] != "ok":
            continue

        # Get the visit id for the url.
        rows = db_utils.query_db(
            crawl_db,
            "SELECT visit_id FROM site_visits WHERE site_url = ?",
            ("http://" + url,),
        )
        # BUG FIX: query_db returns row tuples (see the `for (url,)`
        # unpacking above); take the scalar, not the whole row, so the
        # value can be re-bound as a SQL parameter below.
        visit_id = rows[0][0]

        # Sites with more than one response are expected in the profile;
        # single-response visits may legitimately be missing (204s and
        # blank redirects, checked next).
        rows = db_utils.query_db(
            crawl_db,
            "SELECT COUNT(*) FROM http_responses WHERE visit_id = ?",
            (visit_id,),
        )
        # BUG FIX: compare the count scalar, not the row tuple.
        if rows[0][0] > 1:
            continue

        rows = db_utils.query_db(
            crawl_db,
            "SELECT response_status, location FROM "
            "http_responses WHERE visit_id = ?",
            (visit_id,),
        )
        response_status, location = rows[0]
        if response_status == 204:  # 204s never appear in the FF DB
            continue
        if location == "http://":  # site returned a blank redirect
            continue
        unexpected_missing_urls.add(url)

    assert len(unexpected_missing_urls) == 0