<!--head
Title: Compare Wikipedia search queries
Description: Compare the daily number of Wikipedia searches for various subjects.
Author: Rapporter Team (@rapporter)
Packages: plyr, lubridate, stringr, ggplot2, RJSONIO, forecast
Data required: FALSE
stringsO | SPSS,SAS,STATA,MATLAB | Subject | Choose from the predefined subjects
stringsF | string[0,500]=cloud computing | Free-text | A comma-separated list of further subjects
lookback | number[1,12]=5 | Months to look back | Months to look back
logplot | FALSE | Logarithmic scale | Use a logarithmic (log10) scale on the y axis
head-->
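<!--
A minimal, untested sketch of how this template might be rendered with the
rapport package (the input names are taken from the header above; the exact
call form is an assumption and not part of the template itself):

    library(rapport)
    report <- rapport('wikipedia.tpl',
                      stringsO = 'SPSS',
                      stringsF = 'cloud computing, big data',
                      lookback = 5,
                      logplot  = FALSE)

The returned rapport object can then be printed or exported as usual.
-->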
<%=
lookback <- round(lookback)
t <- Sys.time()
## months to query (YYYYMM), going back `lookback` months from today
dates <- format(seq(Sys.time() - months(lookback), Sys.time(), by = "month"), '%Y%m')
datesS <- format(parse_date_time(dates, 'YM'), '%Y %b')
## long-format result set: one row per subject per day
df <- data.frame(count = numeric(), date = character(), name = character())
## subjects to query: the predefined options plus the comma-separated free-text entries
stringsA <- unique(c(stringsO, str_trim(strsplit(stringsF, ',')[[1]])))
## fetch the monthly JSON stats for each subject and stack them into df
for (s in stringsA) {
res <- data.frame(count = numeric())
for (d in dates) {
res <- rbind(res, as.data.frame(fromJSON(paste('http://stats.grok.se/json/en/', d, s, sep = '/'))$daily_views))
}
res$date <- as.Date(rownames(res))
res$name <- s
colnames(res) <- c("count", "date", "name")
res <- arrange(res, date)
df <- rbind(df, res)
}
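## For reference only (not executed): an equivalent, loop-free fetch using
## lapply, assuming the same JSON structure with a `daily_views` field that
## maps dates to daily view counts:
##   fetch_one <- function(s) do.call(rbind, lapply(dates, function(d) {
##     views <- fromJSON(paste('http://stats.grok.se/json/en/', d, s, sep = '/'))$daily_views
##     data.frame(count = as.numeric(views), date = as.Date(names(views)), name = s)
##   }))
##   df <- arrange(do.call(rbind, lapply(stringsA, fetch_one)), name, date)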
%>
We have fetched <%=nrow(df)%> daily records from [http://stats.grok.se](http://stats.grok.se) in <%=round(as.numeric(difftime(Sys.time(), t, units = 'secs')), 2)%> seconds with the following descriptive statistics:
<%=
rp.desc('count', 'name', data = df, fn = c('min', 'mean', 'median', 'max', 'sd', 'IQR'))
%>
# Overview
<%=
set.caption(p(stringsA))
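## plot all subjects on a common time axis; switch to a log10 scale when requested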
if (logplot) {
ggplot(df, aes(x = date, y = log10(count), group = name, colour = name)) + geom_line() + ylab("log10(count)") + xlab("")
} else {
ggplot(df, aes(x = date, y = count, group = name, colour = name)) + geom_line() + ylab("") + xlab("")
}
%>
# A basic time-series analysis
Here we apply some standard univariate time-series methods to the fetched data.
<%
for (n in stringsA) {
%>
## <%=n%>
<%=
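## restrict the data to the current subject and drop days with missing counts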
d <- subset(df, name == n)
d <- na.omit(d)
set.caption(paste('Wikipedia searches for', n, 'between', p(c(head(datesS, 1), tail(datesS, 1)))))
plot(x = d$date, y = d$count, type = "l", xlab = "", ylab = "count")
%>
First, let us check the autocorrelation structure of the series:
<%=
fc <- panderOptions('graph.fontcolor')
fbs <- panderOptions('graph.fontsize')
bc <- panderOptions('graph.background')
gc <- panderOptions('graph.grid.color')
cex <- fbs/12
cs <- panderOptions('graph.colors')
if (panderOptions('graph.color.rnd'))
cs <- sample(cs)
cb <- cs[1]
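## apply pander's graph styling options to the base-graphics device; the leading
## '+' on the acf() and grid() lines below joins them to the par() call so that
## all three are evaluated together and rendered as a single image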
par(
family = panderOptions('graph.fontfamily'),
cex = cex, cex.axis = cex * 0.8, cex.lab = cex, cex.main = cex * 1.2, cex.sub = cex,
bg = bc,
las = panderOptions('graph.axis.angle'),
lwd = 2,
pch = panderOptions('graph.symbol'),
col.axis = fc, col.lab = fc, col.main = fc, col.sub = fc)
+acf(d$count, main = paste('Autocorrelation for', n))
+grid(lty = panderOptions('graph.grid.lty'), col = panderOptions('graph.grid.color'), lwd = 0.5)
%>
Next, let us fit an ARIMA model with `auto.arima`:
<%=
fit <- tryCatch(auto.arima(ts(d$count, frequency = 365)), error = function(e) e)
%>
<% if (!inherits(fit, 'error')) { %>
The variance-covariance matrix of the estimated coefficients:
<%= fit$var.coef %>
The AIC of the fitted model is <%=fit$aic%>, the AICc is <%=fit$aicc%> and the BIC is <%=fit$bic%>.
<% } else { %>
Unfortunately, we could not fit a model: `<%=fit$message%>`
<% }} %>
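# Possible next steps

Since the `forecast` package is already loaded for `auto.arima`, a natural follow-up would be to predict ahead from a fitted model. The template does not evaluate this step; below is a minimal sketch, assuming `fit` holds a successfully fitted ARIMA model from the section above:

```r
## Forecast the next 30 days from the fitted ARIMA model and plot the
## point forecasts together with their prediction intervals.
fc <- forecast(fit, h = 30)
plot(fc)
```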