Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEV-13: Update tool to accommodate Oryx site changes. #19

Merged
merged 3 commits into from
Apr 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 27 additions & 17 deletions R/scrape_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@
#'
#' @return a tibble
#' @export
scrape_data <- function() {
scrape_data <- function(country) {
if (country == "Russia") {
url <-
russia_url
} else {
url <-
ukraine_url
}

materiel <-
get_data(
"https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html",
"article"
) %>%
get_data(url,
"article") %>%
rvest::html_elements("li")

# Retrieve the start position of each country
country_pos <- materiel %>% rvest::html_text2() %>%
# T-64BV is the first row in the tank list and marks the beginning of each country
stringr::str_which("T-64BV")

#' Run Program
data <-
tibble::tibble(
country = character(),
Expand All @@ -31,8 +31,7 @@ scrape_data <- function() {
status <- materiel[[a]] %>% rvest::html_elements("a")
for (b in seq_along(status)) {
counter = counter + 1
data[counter, 1] <-
ifelse(a < country_pos[2], "Russia", "Ukraine")
data[counter, 1] <- country
data[counter, 2] <- extract_origin(materiel, a)
data[counter, 3] <- extract_system(materiel, a)
data[counter, 4] <- extract_status(status, b)
Expand All @@ -45,6 +44,15 @@ scrape_data <- function() {
tidyr::unnest_longer(status) %>%
dplyr::mutate(date_recorded = as.Date(lubridate::today())) %>%
trim_all()
}

create_data <- function() {
russia <- scrape_data("Russia")
ukraine <- scrape_data("Ukraine")

data <- russia %>%
dplyr::bind_rows(ukraine) %>%
dplyr::select(country, origin, system, status, url, date_recorded)

previous <- get_inputfile("totals_by_system") %>%
trim_all() %>%
Expand All @@ -61,12 +69,14 @@ scrape_data <- function() {
)) %>%
dplyr::arrange(country, system, date_recorded)

data <- check %>% dplyr::bind_rows(get_inputfile("totals_by_system")) %>%
dplyr::arrange(country, system, date_recorded)
data <- check %>% dplyr::bind_rows(previous, .id = NULL) %>%
dplyr::arrange(country, system, date_recorded)

previous %>% readr::write_csv("inputfiles/totals_by_system.csv.bak")
previous %>% readr::write_csv("inputfiles/totals_by_system.csv.bak")

data %>% readr::write_csv(glue::glue("inputfiles/totals_by_system{lubridate::today()+1}.csv"))
data %>% readr::write_csv(glue::glue(
"inputfiles/totals_by_system{lubridate::today()+1}.csv"
))

} else {
logr::put("No new data")
Expand Down
38 changes: 28 additions & 10 deletions R/totals_by_type.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,18 @@
#' @description Gets data by system category.
#'
#' @return a tibble
totals_by_type <- function() {
create_by_type <- function(country) {
if (country == "Russia") {
url <-
russia_url
} else {
url <-
ukraine_url
}

heads <-
get_data(
"https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html",
url,
"article div"
) %>%
rvest::html_elements("h3") %>%
Expand All @@ -15,8 +23,7 @@ totals_by_type <- function() {
heads <- heads[nchar(heads) > 0]

# Get the positions of the Russia and Ukraine headers
rus_pos <- heads %>% stringr::str_which("Russia") %>% as.double()
ukr_pos <- heads %>% stringr::str_which("Ukraine") %>% as.double()
pos <- heads %>% stringr::str_which(country) %>% as.double()

totals <- tibble(
country = character(),
Expand Down Expand Up @@ -45,20 +52,31 @@ totals_by_type <- function() {
}


totals_df <- totals %>%
country_df <- totals %>%
dplyr::mutate(
dplyr::across(destroyed:damaged, ~ as.double(tidyr::replace_na(.x, "0"))),
type_total = destroyed + abandoned + captured + damaged,
row_id = 1:n(),
country = dplyr::case_when(row_id < ukr_pos ~ "Russia",
row_id >= ukr_pos ~ "Ukraine")
row_id = 1:n()
) %>%
dplyr::mutate(country = tidyr::replace_na(country, !!!country)) %>%
select(-row_id) %>%
dplyr::mutate(
equipment = replace(equipment, rus_pos, "All Types"),
equipment = replace(equipment, ukr_pos, "All Types")
equipment = replace(equipment, pos, "All Types"),
) %>%
dplyr::rename(equipment_type = equipment)

return(country_df)
}

#' Combine the per-country totals by equipment type
#'
#' Calls `create_by_type()` once for "Russia" and once for "Ukraine" and
#' stacks the two results, Russia rows first.
#'
#' @return the row-bound result of both `create_by_type()` calls
totals_by_type <- function() {
  # list() evaluates its arguments in order, so Russia is scraped first
  per_country <- list(
    create_by_type("Russia"),
    create_by_type("Ukraine")
  )

  dplyr::bind_rows(per_country)
}



4,630 changes: 2,475 additions & 2,155 deletions index.html

Large diffs are not rendered by default.

592 changes: 592 additions & 0 deletions inputfiles/daily_count_baseline2022-04-20.csv

Large diffs are not rendered by default.

2,548 changes: 0 additions & 2,548 deletions inputfiles/totals_by_system.csv

This file was deleted.

2,508 changes: 0 additions & 2,508 deletions inputfiles/totals_by_system2022-03-28.csv

This file was deleted.

2,548 changes: 0 additions & 2,548 deletions inputfiles/totals_by_system2022-03-29.csv

This file was deleted.

2,643 changes: 0 additions & 2,643 deletions inputfiles/totals_by_system2022-03-30.csv

This file was deleted.

2,736 changes: 0 additions & 2,736 deletions inputfiles/totals_by_system2022-03-31.csv

This file was deleted.

2,813 changes: 0 additions & 2,813 deletions inputfiles/totals_by_system2022-04-01.csv

This file was deleted.

2,953 changes: 0 additions & 2,953 deletions inputfiles/totals_by_system2022-04-02.csv

This file was deleted.

2,963 changes: 0 additions & 2,963 deletions inputfiles/totals_by_system2022-04-03.csv

This file was deleted.

2,963 changes: 0 additions & 2,963 deletions inputfiles/totals_by_system2022-04-04.csv

This file was deleted.

3,174 changes: 0 additions & 3,174 deletions inputfiles/totals_by_system2022-04-05.csv

This file was deleted.

30,561 changes: 0 additions & 30,561 deletions inputfiles/totals_by_system2022-04-07.csv

This file was deleted.

3,963 changes: 3,963 additions & 0 deletions inputfiles/totals_by_system2022-04-20.csv

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion scrape_oryx.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ source("R/totals_by_type.R")
source("R/per_event.R")
source("R/daily_count.R")

russia_url <- "https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html"
ukraine_url <- "https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-ukrainian.html"

tmp <-
file.path("outputfiles", sprintf("scrape_oryx_%s.log", format(Sys.time(), "%Y%m%dT%H%M%S")))
lf <- logr::log_open(tmp)
today <- format(Sys.Date(), "%Y-%m-%d")


totals_by_system <- scrape_data() %>%
totals_by_system <- create_data() %>%
readr::write_csv(., file = glue::glue("outputfiles/totals_by_system.csv"))

#' Write Event Tables
Expand Down