# mineBlog.R
#' Get URLs to the full text of every blog post
#' by scraping them out of each page of the
#' main blog aggregator
#'
# Set the working directory
setwd('/Users/danielpett/githubProjects/blogMine/')
# List of packages needed
list.of.packages <- c(
"RCurl", "XML","jsonlite",
"stringr", "ggplot2", "reshape2",
"igraph", "d3Network"
)
# Install packages if not already
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load the libraries needed to get data from the blog.
library(RCurl)
library(XML)
library(stringr)
# number of additional pages of posts to fetch (the loop below requests
# pages 2 to n+1); determined by inspecting the first page
n <- 74
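# a hedged sketch of deriving n automatically instead of by hand, assuming
# the blog exposes WordPress-style 'page-numbers' pagination links (the class
# name is an assumption about the markup, not something verified here):
# first_page  <- htmlParse(getURI("https://blog.britishmuseum.org/"))
# page_labels <- xpathSApply(first_page, "//a[contains(@class, 'page-numbers')]", xmlValue)
# n           <- max(as.numeric(page_labels[grepl("^[0-9]+$", page_labels)]))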
# pre-allocate a list to fill, one element per page (including the first)
links <- vector("list", length = n + 1)
links[[1]] <- unname(xpathSApply(htmlParse(getURI("https://blog.britishmuseum.org/")),"//h2/a/@href"))
# Now loop through the rest of the pages to get data
for(i in 1:n){
# track progress by showing the iteration we're up to
print(i)
# get all content on the i+1 th page of the main blog list
blogdata <- htmlParse(getURI(paste0("https://blog.britishmuseum.org/page/", i+1,"/")))
# extract links for all posts
links[[i+1]] <- unname(xpathSApply(blogdata,"//h2/a/@href"))
}
#' Start with output from scraping for URLs, now pull full text
#' get text from each post using the URLs we just got
# Flatten the list of links into a single vector of URLs to scrape
linksall <- unlist(links)
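# a quick optional check (a sketch, not part of the original workflow):
# report how many post URLs were collected; duplicates, if any, could be
# dropped here with unique()
print(length(linksall))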
# Create an empty data.frame to hold the full text of each post,
# plus its date, year, time and author
blogtext <- data.frame(text   = character(length(linksall)),
                       date   = character(length(linksall)),
                       year   = character(length(linksall)),
                       time   = character(length(linksall)),
                       author = character(length(linksall)),
                       stringsAsFactors = FALSE
                       )
# make a list to store the names of commenters on each blog post
commenters <- vector("list", length = length(linksall))
# loop over the URLs to pull full text, etc. from each URL
# includes error handling in case a field is empty, etc.
for(i in seq_along(linksall)){
# track progress
print(i)
# get URL
blogdata <- htmlParse(getURI(linksall[[i]]))
# get text from URL
result <- try(
blogtext[i,1] <- gsub('Share this:Like this:Like Loading...:Filed under','', xpathSApply(blogdata, "//*/div[@class='entry']", xmlValue))
); if(class(result) == "try-error") next;
# get date of blog post
# first month and day
result <- try(
blogtext[i,2] <- strsplit(xpathSApply(blogdata, "//*/span[@class='date']", xmlValue), ",")[[1]][1]
); if(class(result) == "try-error") next;
# and then year, and remove excess white space
result <- try(
blogtext[i,3] <- str_sub(gsub("\\s","", strsplit(xpathSApply(blogdata, "//*/span[@class='date']", xmlValue), ",")[[1]][2]),0,4)
); if(class(result) == "try-error") next;
# and then time, and remove excess white space
result <- try(
blogtext[i,4] <- str_sub(gsub("\\s","", strsplit(xpathSApply(blogdata, "//*/span[@class='date']", xmlValue), ",")[[1]][2]),-7)
); if(class(result) == "try-error") next;
# and then author
result <- try(
blogtext[i,5] <- xpathSApply(blogdata, "//*/span[@class='contributor']", xmlValue)
); if(class(result) == "try-error") next;
# and the names of the commenters
result <- try(
commenters[[i]] <- xpathSApply(blogdata, "//*/cite[@class='fn']", xmlValue)
); if(class(result) == "try-error") next;
}
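# optional sanity check (a small sketch, not in the original script):
# count how many posts came back with no text at all
print(sum(!nzchar(blogtext$text)))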
# add a column of URLs to the full-text data.frame
blogtext$url <- linksall
#' clean out non-ASCII characters and formatting
# remove non-ASCII characters
Encoding(blogtext[,1]) <- "latin1"
blogtext[,1] <- iconv(blogtext[,1], "latin1", "ASCII", sub="")
# remove newline character
blogtext[,1] <- gsub("\n","", blogtext[,1])
# save as CSV so others can use it
write.csv(blogtext, 'britishmuseum.csv')
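# hedged usage sketch: in a later session the CSV can be read back in;
# stringsAsFactors = FALSE keeps the text columns as character vectors
blogtext_check <- read.csv('britishmuseum.csv', stringsAsFactors = FALSE)
stopifnot(nrow(blogtext_check) == nrow(blogtext))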