Skip to content

Commit

Permalink
Merge pull request #26 from scarnecchia/DEV-16
Browse files Browse the repository at this point in the history
DEV: Modify get_inputfile() to pass the proper `.file` argument to str_detect.
  • Loading branch information
scarnecchia authored May 6, 2022
2 parents 914307f + b797157 commit b5204ef
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 42 deletions.
29 changes: 17 additions & 12 deletions R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ get_data <- function(url, elements) {
}

get_inputfile <- function(.file) {
path <- fs::dir_info("inputfiles", type="file") %>%
path <- fs::dir_info("inputfiles", type = "file") %>%
dplyr::select(path, change_time, birth_time) %>%
dplyr::filter(stringr::str_detect(path, file)) %>%
dplyr::filter(stringr::str_detect(path, .file)) %>%
dplyr::filter(birth_time == max(birth_time)) %>%
dplyr::pull(path)

Expand All @@ -17,21 +17,24 @@ get_inputfile <- function(.file) {
}

#' extract_total
#'
#' Pull the leading count out of the text of one scraped element.
#'
#' @param indsn A list-like collection of HTML nodes (indexed with `[[`).
#' @param x Index of the element of `indsn` to read.
#' @return The first run of digits that is followed by a space and a word
#'   boundary in the element's text, parsed as a double. When the pattern
#'   does not match, `stringr::str_extract()` yields `NA`, which is passed
#'   on to `readr::parse_double()`.
extract_total <- function(indsn, x) {
  # Return the pipeline result directly (rather than ending on an
  # assignment) so the value is visible to the caller.
  indsn[[x]] %>%
    rvest::html_text2() %>%
    stringr::str_extract("\\d+(?= \\b)") %>%
    readr::parse_double()
}

#' extract_origin
#'
#' Derive a country-of-origin label from the flag image of one scraped
#' element.
#'
#' @param indsn A list-like collection of HTML nodes (indexed with `[[`).
#' @param x Index of the element of `indsn` to read.
#' @return A character value: the part of the first `<img>`'s `src` that
#'   follows `Flag_of_the_` / `Flag_of_`, with underscores replaced by
#'   spaces. `NA` is propagated when there is no `<img>` or the `src` does
#'   not match the flag pattern.
extract_origin <- function(indsn, x) {
  indsn[[x]] %>%
    rvest::html_element("img") %>%
    rvest::html_attr("src") %>%
    # Keep only the "Flag_of_[the_]<Name>" portion of the image URL ...
    stringr::str_extract("(Flag_of_the_|Flag_of_)([a-zA-Z_]+|[a-zA-Z]+)") %>%
    # ... drop the fixed prefix ...
    stringr::str_remove("(Flag_of_the_|Flag_of_)") %>%
    # ... and turn the remaining underscores into spaces.
    stringr::str_replace_all("_", " ")
}

extract_counts <- function(indsn, x, condition) {
counts <- indsn[[x]] %>% rvest::html_text2() %>%
counts <- indsn[[x]] %>%
rvest::html_text2() %>%
stringr::str_remove_all(".*(?=:)") %>%
stringr::str_remove_all(": ") %>%
stringr::str_remove_all("\\(") %>%
Expand All @@ -53,7 +56,8 @@ extract_counts <- function(indsn, x, condition) {
}

#' extract_system
#'
#' Pull the system name out of the text of one scraped element.
#'
#' @param indsn A list-like collection of HTML nodes (indexed with `[[`).
#' @param x Index of the element of `indsn` to read.
#' @return A character value: the element's text with a leading
#'   "<digits><space>" prefix removed, truncated just before a colon
#'   (the match is greedy). `NA` when the text contains no colon.
extract_system <- function(indsn, x) {
  indsn[[x]] %>%
    rvest::html_text2() %>%
    # Strip the leading count, e.g. "12 ".
    stringr::str_remove_all("^\\d+ ") %>%
    # Keep everything before the colon that introduces the status list.
    stringr::str_extract(".*(?=:)")
}
Expand All @@ -76,8 +80,9 @@ extract_url <- function(indsn, x) {


#' trim_all
#'
#' Drop any grouping and strip leading/trailing whitespace from every
#' column of a data frame.
#'
#' @param indsn A data frame (may be grouped).
#' @return `indsn`, ungrouped, with `stringr::str_trim()` applied to every
#'   column. NOTE(review): `str_trim()` returns character vectors, so
#'   non-character columns presumably come back as character; confirm that
#'   callers re-type them where needed.
trim_all <- function(indsn) {
  indsn %>%
    dplyr::ungroup() %>%
    dplyr::mutate(dplyr::across(dplyr::everything(), ~ stringr::str_trim(.x)))
}

#' create_keys
Expand All @@ -97,21 +102,21 @@ create_keys <- function(indsn) {
dplyr::mutate(sysID = dplyr::row_number())

indsn <- indsn %>%
dplyr::left_join(sysID, by="system")
dplyr::left_join(sysID, by = "system")

imageID <- indsn %>%
dplyr::distinct(url) %>%
dplyr::mutate(imageID = dplyr::row_number())

indsn <- indsn %>%
dplyr::left_join(imageID, by="url")
dplyr::left_join(imageID, by = "url")

statusID <- indsn %>%
dplyr::distinct(status) %>%
dplyr::mutate(statusID = dplyr::row_number())

indsn <- indsn %>%
dplyr::left_join(statusID, by="status")
dplyr::left_join(statusID, by = "status")

matID <- indsn %>%
dplyr::distinct(country, sysID, imageID, statusID) %>%
Expand All @@ -121,7 +126,7 @@ create_keys <- function(indsn) {
))

indsn <- indsn %>%
dplyr::left_join(matID, by=c("country", "sysID", "imageID", "statusID"))
dplyr::left_join(matID, by = c("country", "sysID", "imageID", "statusID"))

return(indsn)
}
10 changes: 6 additions & 4 deletions R/per_event.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
create_event_tables <- function(indsn, ...) {
  # Splits `indsn` by the grouping columns given in `...` and writes each
  # piece to "outputfiles/event_<key>.csv".
  #
  # indsn: a data frame (may already be grouped).
  # ...:   grouping columns, forwarded to dplyr::group_by().
  # Returns the list produced by purrr::map() over the piece names, i.e. one
  # element per file written.

  # Clear any existing grouping before regrouping. (This result used to be
  # assigned to a misspelled variable and was never used.)
  indsn <- dplyr::ungroup(indsn)

  grouped <- dplyr::group_by(indsn, ...)

  # Name each piece after its key in the FIRST grouping column only.
  # NOTE(review): with more than one grouping column the names may repeat,
  # in which case `x[[name]]` below selects the first match -- confirm that
  # callers only ever pass a single column.
  x <- stats::setNames(
    dplyr::group_split(grouped),
    dplyr::group_keys(grouped)[[1]]
  )

  purrr::map(
    names(x),
    function(event_name) {
      readr::write_csv(
        x[[event_name]],
        glue::glue("outputfiles/event_{event_name}.csv")
      )
    }
  )
}
28 changes: 15 additions & 13 deletions R/scrape_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ scrape_data <- function(country) {
}

materiel <-
get_data(url,
"article") %>%
get_data(
url,
"article"
) %>%
rvest::html_elements("li")

data <-
Expand All @@ -26,11 +28,11 @@ scrape_data <- function(country) {
url = character()
)

counter = 0
counter <- 0
for (a in seq_along(materiel)) {
status <- materiel[[a]] %>% rvest::html_elements("a")
for (b in seq_along(status)) {
counter = counter + 1
counter <- counter + 1
data[counter, 1] <- country
data[counter, 2] <- extract_origin(materiel, a)
data[counter, 3] <- extract_system(materiel, a)
Expand All @@ -55,26 +57,25 @@ create_data <- function() {
dplyr::select(country, origin, system, status, url, date_recorded) %>%
dplyr::distinct()

previous <- get_inputfile("totals_by_system") %>%
previous <- get_inputfile(.file="totals_by_system") %>%
trim_all() %>%
dplyr::mutate(date_recorded = as.Date(date_recorded)) %>%
dplyr::select(country,origin,system,status,url,date_recorded) %>%
dplyr::select(country, origin, system, status, url, date_recorded) %>%
dplyr::distinct()

check <- data %>%
dplyr::anti_join(previous, by = c("url")) %>%
dplyr::mutate(date_recorded = as.Date(date_recorded))

if (nrow(check) > 0) {
data <- check %>% dplyr::bind_rows(previous, .id = NULL) %>%
data <- check %>%
dplyr::bind_rows(previous, .id = NULL) %>%
dplyr::arrange(country, system, date_recorded)

previous %>% readr::write_csv("inputfiles/totals_by_system.csv.bak")

data %>% readr::write_csv(glue::glue(
"inputfiles/totals_by_system{lubridate::today()+1}.csv"
))

"inputfiles/totals_by_system{lubridate::today()+1}.csv"))
} else {
data <- previous
}
Expand All @@ -86,15 +87,16 @@ create_data <- function() {
dplyr::ungroup()

return(data)

}

#' total_by_system_wide
#'
#' Count records per country, system and status, and spread the statuses
#' into columns with a row total.
#'
#' @param indsn A data frame with at least the columns `country`, `system`
#'   and `status`.
#' @return An ungrouped data frame with one row per country/system, one
#'   count column per status found in `indsn` (missing combinations filled
#'   with 0), and `total`, the sum of `destroyed`, `captured`, `damaged`
#'   and `abandoned`.
total_by_system_wide <- function(indsn) {
  status_cols <- c("destroyed", "captured", "damaged", "abandoned")

  wide <- indsn %>%
    dplyr::select(country, system, status) %>%
    dplyr::group_by(country, system, status) %>%
    dplyr::summarise(count = dplyr::n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = status, values_from = count)

  # pivot_wider() only creates columns for statuses present in the data;
  # add any that are absent so that `total` below cannot fail on a missing
  # column. They are filled with NA here and turned into 0 with the rest.
  for (missing_col in setdiff(status_cols, names(wide))) {
    wide[[missing_col]] <- rep(NA_integer_, nrow(wide))
  }

  wide %>%
    dplyr::mutate(
      dplyr::across(where(is.numeric), ~ tidyr::replace_na(.x, 0)),
      total = destroyed + captured + damaged + abandoned
    )
}
21 changes: 12 additions & 9 deletions R/totals_by_type.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ create_by_type <- function(country) {
heads <- heads[nchar(heads) > 0]

# Get the positions of the Russia and Ukraine headers
pos <- heads %>% stringr::str_which(country) %>% as.double()
pos <- heads %>%
stringr::str_which(country) %>%
as.double()

totals <- tibble(
country = character(),
Expand All @@ -38,16 +40,20 @@ create_by_type <- function(country) {
totals[l, "equipment"] <-
heads[l] %>% stringr::str_remove_all(" \\(.*\\)")
totals[l, "destroyed"] <-
heads[l] %>% stringr::str_extract("destroyed: \\d+") %>%
heads[l] %>%
stringr::str_extract("destroyed: \\d+") %>%
stringr::str_remove_all("[:alpha:]|[:punct:]")
totals[l, "abandoned"] <-
heads[l] %>% stringr::str_extract("(abandoned|aboned): \\d+") %>%
heads[l] %>%
stringr::str_extract("(abandoned|aboned): \\d+") %>%
stringr::str_remove_all("[:alpha:]|[:punct:]")
totals[l, "captured"] <-
heads[l] %>% stringr::str_extract("captured: \\d+") %>%
heads[l] %>%
stringr::str_extract("captured: \\d+") %>%
stringr::str_remove_all("[:alpha:]|[:punct:]")
totals[l, "damaged"] <-
heads[l] %>% stringr::str_extract("damaged: \\d+") %>%
heads[l] %>%
stringr::str_extract("damaged: \\d+") %>%
stringr::str_remove_all("[:alpha:]|[:punct:]")
}

Expand All @@ -73,10 +79,7 @@ totals_by_type <- function() {
ukraine <- create_by_type("Ukraine")

totals_df <- russia %>%
dplyr::bind_rows(ukraine, .id=NULL)
dplyr::bind_rows(ukraine, .id = NULL)

return(totals_df)
}



22 changes: 18 additions & 4 deletions index.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -39,24 +39,38 @@ graph_counts(daily_count, "All Types", "type_total")
### Russia

```{r russia-total-system-type, echo=FALSE, warning=FALSE, message=FALSE}
# Render the Russia rows of the per-type totals as a styled table.
# (Single pipeline: the chunk previously also carried the same pipeline in
# its old one-line form, which would build and render the table twice.)
totals_by_type() %>%
  dplyr::filter(country == "Russia") %>%
  kableExtra::kbl(caption = "Russia: Totals by System Type and Status") %>%
  kableExtra::kable_classic(font_size = 16)
```

### Ukraine

```{r total-system-type, echo=FALSE, warning=FALSE, message=FALSE}
# Render the Ukraine rows of the per-type totals as a styled table.
# (Single pipeline: the chunk previously also carried the same pipeline in
# its old one-line form, which would build and render the table twice.)
totals_by_type() %>%
  dplyr::filter(country == "Ukraine") %>%
  kableExtra::kbl(caption = "Ukraine: Totals by System Type and Status") %>%
  kableExtra::kable_classic(font_size = 16)
```

## By System^["Because of how the data is recorded, a weapons system may be counted here more than once—*e.g.*, a system listed as both abandoned and captured is counted in both the abandoned and captured tallies."] {.tabset}

### Russia
```{r russia-total-system, echo=FALSE, warning=FALSE, message=FALSE}
# Render the Russia per-system totals, largest total first, as a styled
# table. (Single pipeline: the chunk previously also carried the same
# pipeline in its old one-line form, which would render the table twice.)
total_by_system_wide(totals_by_system) %>%
  dplyr::filter(country == "Russia") %>%
  dplyr::arrange(desc(total)) %>%
  kableExtra::kbl(caption = "Russia: Totals by System and Status") %>%
  kableExtra::kable_classic(font_size = 16)
```

### Ukraine

```{r total-system, warning=FALSE, echo=FALSE, message=FALSE}
# Render the Ukraine per-system totals, largest total first, as a styled
# table. (Single pipeline: the chunk previously also carried the same
# pipeline in its old one-line form, which would render the table twice.)
total_by_system_wide(totals_by_system) %>%
  dplyr::filter(country == "Ukraine") %>%
  dplyr::arrange(desc(total)) %>%
  kableExtra::kbl(caption = "Ukraine: Totals by System and Status") %>%
  kableExtra::kable_classic(font_size = 16)
```

0 comments on commit b5204ef

Please sign in to comment.