Skip to content

Commit

Permalink
replace httr::GET with httr::RETRY (closing #24)
Browse files Browse the repository at this point in the history
  • Loading branch information
dmi3kno committed Mar 29, 2020
1 parent 0860b68 commit 7d34280
Show file tree
Hide file tree
Showing 15 changed files with 81 additions and 61 deletions.
5 changes: 2 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: polite
Version: 0.1.1.9000
Version: 0.1.1.9010
Title: Be Nice on the Web
Description: Be responsible when scraping data from websites by following polite principles: introduce yourself, ask for permission, take slowly and never ask twice.
Authors@R: person("Dmytro", "Perepolkin", email = "dperepolkin@gmail.com", role = c("aut", "cre"), comment = structure("0000-0001-8558-6183", .Names = "ORCID"))
Expand All @@ -10,9 +10,8 @@ ByteCompile: true
URL: https://github.com/dmi3kno/polite
BugReports: https://github.com/dmi3kno/polite/issues
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1
RoxygenNote: 7.1.0
Imports:
here,
httr,
magrittr,
memoise,
Expand Down
4 changes: 0 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ export(scrape)
export(set_rip_delay)
export(set_scrape_delay)
export(use_manners)
importFrom(here,here)
importFrom(httr,GET)
importFrom(httr,add_headers)
importFrom(httr,config)
Expand All @@ -20,7 +19,6 @@ importFrom(httr,handle)
importFrom(httr,http_error)
importFrom(httr,modify_url)
importFrom(httr,parse_url)
importFrom(httr,warn_for_status)
importFrom(magrittr,"%>%")
importFrom(memoise,forget)
importFrom(ratelimitr,UPDATE_RATE)
Expand All @@ -33,7 +31,5 @@ importFrom(rvest,html_attrs)
importFrom(rvest,html_text)
importFrom(stats,na.omit)
importFrom(stats,setNames)
importFrom(tools,file_ext)
importFrom(tools,file_path_sans_ext)
importFrom(usethis,use_template)
importFrom(utils,download.file)
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# polite (development version)

# polite 0.1.1.9010 (Release date: 2020-03-29)

* Replaced httr::GET with httr::RETRY (closing #24)
* Removed tests based on Wikipedia due to changed routing
* Removed dependency on `here`

# polite 0.1.1 (Release date: 2019-11-22)

* Released on CRAN. Initial release v0.1.1
Expand Down
2 changes: 1 addition & 1 deletion R/bow.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ bow <- function(url,
url_subdomain <- paste0(url_parsed$scheme, "://", url_parsed$hostname)
rt <- robotstxt::robotstxt(domain = url_subdomain,
user_agent = user_agent,
warn=verbose, force = force)
warn = verbose, force = force)

delay_df <- rt$crawl_delay
delay_rt <- as.numeric(delay_df[with(delay_df, useragent==user_agent), "value"]) %||%
Expand Down
2 changes: 0 additions & 2 deletions R/rip.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
#' nod("media/img/sub-page-slide2.jpg") %>%
#' rip()
#' }
#' @importFrom here here
#' @importFrom tools file_path_sans_ext file_ext
rip <- function(bow, destfile=NULL, ..., mode="wb", path=tempdir(), overwrite=FALSE){

url <- bow$url
Expand Down
34 changes: 18 additions & 16 deletions R/scrape.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' @importFrom httr http_error add_headers warn_for_status content modify_url parse_url
#' @importFrom httr http_error content modify_url parse_url
m_scrape <- function(bow, query=NULL, params=NULL, accept="html", content=NULL, verbose=FALSE) { # nolint

if(!inherits(bow, "polite"))
Expand Down Expand Up @@ -29,21 +29,23 @@ m_scrape <- function(bow, query=NULL, params=NULL, accept="html", content=NULL,
accept_type <- httr::accept(accept)
bow$config <- c(bow$config, accept_type)

response <- httr_get_ltd(bow$url, bow$config, bow$handle)
max_attempts <- 3

att_msg <- c(rep("",max_attempts-1),
"This is the last attempt, if it fails will return NULL")

try_number <- 1
while (httr::http_error(response) && try_number < max_attempts) {
try_number <- try_number + 1
if (verbose)
message(paste0("Attempt number ", try_number,".", att_msg[[try_number]]))

Sys.sleep(2^try_number)
response <- httr_get_ltd(bow$url, bow$config, bow$handle)
}
response <- httr_get_ltd(bow$url, bow$config, bow$handle, verbose)
########################## replaced with RETRY
# max_attempts <- 3
#
# att_msg <- c(rep("",max_attempts-1),
# "This is the last attempt, if it fails will return NULL")
#
# try_number <- 1
# while (httr::http_error(response) && try_number < max_attempts) {
# try_number <- try_number + 1
# if (verbose)
# message(paste0("Attempt number ", try_number,".", att_msg[[try_number]]))
#
# Sys.sleep(2^try_number)
# response <- httr_get_ltd(bow$url, bow$config, bow$handle)
# }
############################ end replace with RETRY

if(httr::http_error(response)){
warning(httr::http_status(response)$message, " ", bow$url, call. = FALSE)
Expand Down
9 changes: 6 additions & 3 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,14 @@ is_scrapable <- function(bow){
}

#' @importFrom httr GET
httr_get <- function(url, config, handle){
httr::GET(
# Perform a GET request with automatic retries.
#
# Replaces the previous manual retry loop (httr::GET + while/Sys.sleep) with
# httr::RETRY, which retries on transport errors and error status codes using
# exponential backoff.
#
# NOTE(review): the diff rendering duplicated the old `handle = handle` line
# (no trailing comma) next to the new one, which would be a syntax error —
# reconstructed here with a single `handle` argument.
#
# @param url     Character scalar, the URL to fetch.
# @param config  An httr config object (e.g. user agent, accept header) as
#                assembled by the caller (see bow/scrape).
# @param handle  An httr handle, so the connection is reused across requests.
# @param verbose Logical; when TRUE, RETRY prints its retry messages
#                (quiet = !verbose).
# @return The httr response object from the final attempt.
httr_get <- function(url, config, handle, verbose) {
  httr::RETRY(
    verb = "GET",
    url = url,
    config = config,
    handle = handle,
    times = 3,       # up to 3 attempts in total
    pause_base = 5,  # base (seconds) for exponential backoff between attempts
    quiet = !verbose
  )
}

Expand Down
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ library(polite)
library(rvest)
session <- bow("https://www.cheese.com/by_type", force = TRUE)
result <- scrape(session, query=list(t="semi-soft",per_page=100)) %>%
result <- scrape(session, query=list(t="semi-soft", per_page=100)) %>%
html_node("#main-body") %>%
html_nodes("h3") %>%
html_text()
Expand Down
30 changes: 15 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ library(polite)
library(rvest)

session <- bow("https://www.cheese.com/by_type", force = TRUE)
result <- scrape(session, query=list(t="semi-soft",per_page=100)) %>%
result <- scrape(session, query=list(t="semi-soft", per_page=100)) %>%
html_node("#main-body") %>%
html_nodes("h3") %>%
html_text()
Expand Down Expand Up @@ -144,20 +144,20 @@ tibble(name=lnks %>% html_text(),

df <- pages_df %>% pmap_df(get_cheese_page)
df
#> # A tibble: 515 x 2
#> name link
#> <chr> <chr>
#> 1 Abbaye de Belloc /abbaye-de-belloc/
#> 2 Abbaye de Belval /abbaye-de-belval/
#> 3 Abbaye de Citeaux /abbaye-de-citeaux/
#> 4 Abbaye de Timadeuc /abbaye-de-timadeuc/
#> 5 Abbaye du Mont des Cats /abbaye-du-mont-des-cats/
#> 6 Abbot’s Gold /abbots-gold/
#> 7 Abertam /abertam/
#> 8 Abondance /abondance/
#> 9 Acapella /acapella/
#> 10 "Accasciato " /accasciato/
#> # … with 505 more rows
#> # A tibble: 517 x 2
#> name link
#> <chr> <chr>
#> 1 "Abbaye de Belloc" /abbaye-de-belloc/
#> 2 "Abbaye de Belval" /abbaye-de-belval/
#> 3 "Abbaye de Citeaux" /abbaye-de-citeaux/
#> 4 "Abbaye de Timadeuc" /abbaye-de-timadeuc/
#> 5 "Abbaye du Mont des Cats" /abbaye-du-mont-des-cats/
#> 6 "Abbot’s Gold" /abbots-gold/
#> 7 "Abertam" /abertam/
#> 8 "Abondance" /abondance/
#> 9 "Acapella" /acapella/
#> 10 "Accasciato " /accasciato/
#> # … with 507 more rows
```

## Another example
Expand Down
9 changes: 7 additions & 2 deletions man/bow.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions man/html_attrs_dfr.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions man/rip.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions man/scrape.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 1 addition & 3 deletions tests/testthat/test_bow.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
context("Bow")
library(polite)

# bow() should warn (message matching "scrape") when robots.txt disallows the
# requested path. NOTE(review): hits the live wikipedia.org robots.txt, so this
# test depends on network access and Wikipedia's current routing — confirm it
# is still reliable (the commit notes mention Wikipedia routing changes).
test_that("Bow warns about disallowed url", {
expect_warning(bow("https://www.wikipedia.org/w/", verbose = TRUE), "scrape")
})

6 changes: 1 addition & 5 deletions tests/testthat/test_nod.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
context("Nod")
library(polite)

# nod() should warn (message matching "scrape") when changing the session path
# to one disallowed by robots.txt. NOTE(review): like the bow test, this relies
# on live wikipedia.org robots.txt/routing — verify it still behaves as
# expected offline or after upstream routing changes.
test_that("Nod warns about disallowed url", {
expect_warning(
nod(bow("https://www.wikipedia.org/"),
path="w/", verbose = TRUE), "scrape")
})

0 comments on commit 7d34280

Please sign in to comment.