This repository has been archived by the owner on Jan 8, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_weekly.pig
103 lines (81 loc) · 2.49 KB
/
web_weekly.pig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
-- updates all the stats for the week starting on Sunday
-- Expects params LOGDIR, REPORTDIR and SITE
--   LOGDIR    is forwarded to web_load.pig below
--   REPORTDIR is the base path that every STORE in this script writes under
--   SITE      is spliced into the referrer pattern used to drop self-referrals
-- web_load.pig is invoked with the run command so that the aliases it defines stay
-- visible to the statements that follow.
-- NOTE(review): the log relation used throughout this script is not defined here and
-- presumably comes from web_load.pig - confirm its schema there.
run -param LOGDIR=$LOGDIR web_load.pig
-- Hourly traffic profile: bucket every log row by the HH pattern applied to its
-- datetime field, then emit request count and total bytes for each bucket.
-- NOTE(review): the original note says the hour is UTC - that depends on how the
-- loader parses datetime, confirm there.
log_by_hour = GROUP log BY FORMAT_DT('HH',datetime);

by_hour_count =
    FOREACH log_by_hour
    GENERATE
        group,
        COUNT(log) AS num_requests,
        SUM(log.bytes) AS num_bytes;

STORE by_hour_count INTO '$REPORTDIR/total_requests_bytes_per_hour';
-- Top 50 client addresses, ranked two ways from one shared aggregate.
-- The grouping key is FORMAT/EXTRACT applied to remoteAddr with a pattern of four
-- dot-separated digit groups.
log_by_ip = GROUP log BY FORMAT('%s', EXTRACT(remoteAddr, '(\\d+\\.\\d+\\.\\d+\\.\\d+)'));

by_ip_count =
    FOREACH log_by_ip
    GENERATE
        group,
        COUNT(log) AS num_requests,
        SUM(log.bytes) AS num_bytes;

-- ranking 1: addresses that issued the most requests
by_ip_ranked_on_requests = ORDER by_ip_count BY num_requests DESC;
by_ip_count_requests = LIMIT by_ip_ranked_on_requests 50;
STORE by_ip_count_requests into '$REPORTDIR/top_50_ips_by_requests';

-- ranking 2: addresses that were served the most bytes
by_ip_ranked_on_bytes = ORDER by_ip_count BY num_bytes DESC;
by_ip_count_bytes = LIMIT by_ip_ranked_on_bytes 50;
STORE by_ip_count_bytes into '$REPORTDIR/top_50_ips_by_bytes';
-- Top 50 external referrers: count requests per referrer, discard referrers that
-- point back at our own site, keep the 50 most frequent.
log_by_referrer = GROUP log BY referrer;

by_referrer_count =
    FOREACH log_by_referrer
    GENERATE
        FLATTEN(group),
        COUNT(log) AS num_requests;

-- Drop any referrer that starts with http or https followed by the SITE parameter.
-- NOTE(review): SITE is inserted into the regex unescaped, so a dot in the host name
-- matches any character - confirm that looseness is acceptable.
by_referrer_count_filtered =
    FILTER by_referrer_count BY NOT ($0 matches 'http[s]?://$SITE.*');

-- most frequent first, then cut to 50 rows
by_referrer_count_ranked = ORDER by_referrer_count_filtered BY num_requests DESC;
by_referrer_count_sorted = LIMIT by_referrer_count_ranked 50;

STORE by_referrer_count_sorted INTO '$REPORTDIR/top_50_external_referrers';
-- Top 50 pages by hit count: one row per uri with the number of log rows seen for it.
log_by_uri_for_requests = GROUP log BY uri;

by_pages_count =
    FOREACH log_by_uri_for_requests
    GENERATE
        FLATTEN(group),
        COUNT(log) AS num_requests;

-- most requested first, then cut to 50 rows
by_pages_count_ranked = ORDER by_pages_count BY num_requests DESC;
by_pages_count_sorted = LIMIT by_pages_count_ranked 50;

STORE by_pages_count_sorted INTO '$REPORTDIR/top_50_pages_by_requests';
-- Top 50 pages by traffic volume: one row per uri with the sum of its bytes field.
log_by_uri_for_bytes = GROUP log BY uri;

by_pages_bytes =
    FOREACH log_by_uri_for_bytes
    GENERATE
        FLATTEN(group),
        SUM(log.bytes) AS num_bytes;

-- heaviest first, then cut to 50 rows
by_pages_bytes_ranked = ORDER by_pages_bytes BY num_bytes DESC;
by_pages_bytes_sorted = LIMIT by_pages_bytes_ranked 50;

STORE by_pages_bytes_sorted INTO '$REPORTDIR/top_50_pages_by_bytes';
-- top 50 pages by average time taken
-- For each uri, avgTime is the summed timeTaken divided by the number of requests.
-- The sum is cast to double before dividing: SUM and COUNT both yield long for
-- integral input, and dividing two longs truncates the result, which would flatten
-- the averages and create artificial ties in the ranking.
-- NOTE(review): the declared type of timeTaken lives in the loader script - if it is
-- already a floating point type the cast is a no-op.
by_pages_time =
FOREACH
(GROUP log BY uri)
GENERATE
FLATTEN($0),
((double)SUM($1.timeTaken))/COUNT($1) AS avgTime
;
by_pages_time_sorted =
-- slowest average first, keep the top 50 results
LIMIT (ORDER by_pages_time BY avgTime DESC) 50;
STORE by_pages_time_sorted INTO '$REPORTDIR/top_50_pages_by_timetaken';