Merge pull request #26 from scarnecchia/DEV-16

DEV: Modify get_inputfile() to pass the correct pattern argument (.file) to str_detect.
scarnecchia · May 6, 2022 · b5204ef · b5204ef
2 parents 914307f + b797157
commit b5204ef
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 42 deletions.
diff --git a/R/functions.R b/R/functions.R
@@ -3,9 +3,9 @@ get_data <- function(url, elements) {
 }
 
 get_inputfile <- function(.file) {
-  path <- fs::dir_info("inputfiles", type="file") %>%
+  path <- fs::dir_info("inputfiles", type = "file") %>%
     dplyr::select(path, change_time, birth_time) %>%
-    dplyr::filter(stringr::str_detect(path, file)) %>%
+    dplyr::filter(stringr::str_detect(path, .file)) %>%
     dplyr::filter(birth_time == max(birth_time)) %>%
     dplyr::pull(path)
 
@@ -17,21 +17,24 @@ get_inputfile <- function(.file) {
 }
 
 #' Extract the leading count from one scraped element
 #'
 #' Converts element `x` of `indsn` to text with `rvest::html_text2()`, takes
 #' the first run of digits that is followed by a space and a word boundary,
 #' and parses it as a double.
 #'
 #' @param indsn Collection of HTML nodes, indexed with `[[`.
 #' @param x Index of the element to read.
 #' @return The parsed count as a double; `NA` when no digits match.
 extract_total <- function(indsn, x) {
   # Return the pipeline result directly: assigning it to a throwaway local
   # made the function return its value invisibly.
   indsn[[x]] %>%
     rvest::html_text2() %>%
     stringr::str_extract("\\d+(?= \\b)") %>%
     readr::parse_double()
 }
 
 #' Derive a country name from an element's flag image
 #'
 #' Reads the `src` attribute of the first `<img>` found under element `x` of
 #' `indsn`, isolates the `Flag_of_<name>` / `Flag_of_the_<name>` portion,
 #' strips that prefix, and turns underscores into spaces.
 #'
 #' @param indsn Collection of HTML nodes, indexed with `[[`.
 #' @param x Index of the element to read.
 #' @return A character string with the name taken from the flag file name;
 #'   `NA` when the pattern is not found.
 extract_origin <- function(indsn, x) {
   flag_src <- rvest::html_attr(rvest::html_element(indsn[[x]], "img"), "src")
   flag_name <- stringr::str_extract(
     flag_src,
     "(Flag_of_the_|Flag_of_)([a-zA-Z_]+|[a-zA-Z]+)"
   )
   bare_name <- stringr::str_remove(flag_name, "(Flag_of_the_|Flag_of_)")
   stringr::str_replace_all(bare_name, "_", " ")
 }
 
 extract_counts <- function(indsn, x, condition) {
-  counts <- indsn[[x]] %>% rvest::html_text2() %>%
+  counts <- indsn[[x]] %>%
+    rvest::html_text2() %>%
     stringr::str_remove_all(".*(?=:)") %>%
     stringr::str_remove_all(": ") %>%
     stringr::str_remove_all("\\(") %>%
@@ -53,7 +56,8 @@ extract_counts <- function(indsn, x, condition) {
 }
 
 #' Extract the system name from one scraped element
 #'
 #' Converts element `x` of `indsn` to text, removes a leading "<digits> "
 #' prefix, and keeps everything that precedes the last colon.
 #'
 #' @param indsn Collection of HTML nodes, indexed with `[[`.
 #' @param x Index of the element to read.
 #' @return The text before the colon; `NA` when the text has no colon.
 extract_system <- function(indsn, x) {
   entry_text <- rvest::html_text2(indsn[[x]])
   without_count <- stringr::str_remove_all(entry_text, "^\\d+ ")
   stringr::str_extract(without_count, ".*(?=:)")
 }
@@ -76,8 +80,9 @@ extract_url <- function(indsn, x) {
 
 
 #' Trim whitespace from every column
 #'
 #' Drops any grouping from `indsn` and applies `stringr::str_trim()` to each
 #' column.
 #'
 #' @param indsn A data frame (grouped or not).
 #' @return The ungrouped data frame with every column passed through
 #'   `stringr::str_trim()`.
 trim_all <- function(indsn) {
   ungrouped <- dplyr::ungroup(indsn)
   dplyr::mutate(
     ungrouped,
     dplyr::across(
       tidyr::everything(),
       function(column) stringr::str_trim(column)
     )
   )
 }
 
 #' create_keys
@@ -97,21 +102,21 @@ create_keys <- function(indsn) {
     dplyr::mutate(sysID = dplyr::row_number())
 
   indsn <- indsn %>%
-    dplyr::left_join(sysID, by="system")
+    dplyr::left_join(sysID, by = "system")
 
   imageID <- indsn %>%
     dplyr::distinct(url) %>%
     dplyr::mutate(imageID = dplyr::row_number())
 
   indsn <- indsn %>%
-    dplyr::left_join(imageID, by="url")
+    dplyr::left_join(imageID, by = "url")
 
   statusID <- indsn %>%
     dplyr::distinct(status) %>%
     dplyr::mutate(statusID = dplyr::row_number())
 
   indsn <- indsn %>%
-     dplyr::left_join(statusID, by="status")
+    dplyr::left_join(statusID, by = "status")
 
   matID <- indsn %>%
     dplyr::distinct(country, sysID, imageID, statusID) %>%
@@ -121,7 +126,7 @@ create_keys <- function(indsn) {
     ))
 
   indsn <- indsn %>%
-    dplyr::left_join(matID, by=c("country", "sysID", "imageID", "statusID"))
+    dplyr::left_join(matID, by = c("country", "sysID", "imageID", "statusID"))
 
   return(indsn)
 }
diff --git a/R/per_event.R b/R/per_event.R
@@ -9,11 +9,13 @@
 #' Write one CSV file per group of `indsn`
 #'
 #' Splits `indsn` by the grouping columns supplied in `...` and writes each
 #' piece to `outputfiles/event_<key>.csv`, where `<key>` is the group's key
 #' value (all key columns joined with "_" when several are given).
 #'
 #' @param indsn Data frame to split.
 #' @param ... Grouping columns, forwarded to `dplyr::group_by()`.
 #' @return The list returned by `purrr::map()`, one element per written file.
 create_event_tables <- function(indsn, ...) {
   # The original assigned the ungrouped data to a misspelled name (`idnsn`),
   # so the result was silently discarded; chain it instead.
   grouped <- indsn %>%
     dplyr::ungroup() %>%
     dplyr::group_by(...)

   # Label each group from every key column, not just the first, so grouping
   # by several columns cannot yield duplicate names (and overwritten files).
   keys <- dplyr::group_keys(grouped)
   labels <- do.call(paste, c(unname(as.list(keys)), sep = "_"))
   x <- setNames(dplyr::group_split(grouped), labels)

   purrr::map(
     names(x),
     function(key) {
       # Namespace-qualified like every other readr call in this project, so
       # the function does not depend on readr being attached.
       readr::write_csv(x[[key]], glue::glue("outputfiles/event_{key}.csv"))
     }
   )
 }
diff --git a/R/scrape_data.R b/R/scrape_data.R
@@ -13,8 +13,10 @@ scrape_data <- function(country) {
   }
 
   materiel <-
-    get_data(url,
-             "article") %>%
+    get_data(
+      url,
+      "article"
+    ) %>%
     rvest::html_elements("li")
 
   data <-
@@ -26,11 +28,11 @@ scrape_data <- function(country) {
       url = character()
     )
 
-  counter = 0
+  counter <- 0
   for (a in seq_along(materiel)) {
     status <- materiel[[a]] %>% rvest::html_elements("a")
     for (b in seq_along(status)) {
-      counter = counter + 1
+      counter <- counter + 1
       data[counter, 1] <- country
       data[counter, 2] <- extract_origin(materiel, a)
       data[counter, 3] <- extract_system(materiel, a)
@@ -55,26 +57,25 @@ create_data <- function() {
     dplyr::select(country, origin, system, status, url, date_recorded) %>%
     dplyr::distinct()
 
-  previous <- get_inputfile("totals_by_system") %>%
+  previous <- get_inputfile(.file="totals_by_system") %>%
     trim_all() %>%
     dplyr::mutate(date_recorded = as.Date(date_recorded)) %>%
-    dplyr::select(country,origin,system,status,url,date_recorded) %>%
+    dplyr::select(country, origin, system, status, url, date_recorded) %>%
     dplyr::distinct()
 
   check <- data %>%
     dplyr::anti_join(previous, by = c("url")) %>%
     dplyr::mutate(date_recorded = as.Date(date_recorded))
 
   if (nrow(check) > 0) {
-    data <- check %>% dplyr::bind_rows(previous, .id = NULL) %>%
+    data <- check %>%
+      dplyr::bind_rows(previous, .id = NULL) %>%
       dplyr::arrange(country, system, date_recorded)
 
     previous %>% readr::write_csv("inputfiles/totals_by_system.csv.bak")
 
     data %>% readr::write_csv(glue::glue(
-      "inputfiles/totals_by_system{lubridate::today()+1}.csv"
-    ))
-
+      "inputfiles/totals_by_system{lubridate::today()+1}.csv"))
   } else {
     data <- previous
   }
@@ -86,15 +87,16 @@ create_data <- function() {
     dplyr::ungroup()
 
   return(data)
-
 }
 
 #' Count records per country and system, one column per status
 #'
 #' Counts the rows of `indsn` for each country/system/status combination,
 #' spreads the statuses into columns, replaces missing counts with 0, and
 #' adds a `total` column.
 #'
 #' @param indsn Data frame with `country`, `system` and `status` columns.
 #' @return An ungrouped data frame with one row per country/system, one count
 #'   column per status value, and `total`.
 total_by_system_wide <- function(indsn) {
   indsn %>%
     dplyr::select(country, system, status) %>%
     dplyr::group_by(country, system, status) %>%
     # `dplyr::n()` is qualified so this works without dplyr being attached;
     # `.groups` spells out the default and silences the regrouping message.
     dplyr::summarise(count = dplyr::n(), .groups = "drop_last") %>%
     tidyr::pivot_wider(names_from = status, values_from = count) %>%
     dplyr::ungroup() %>%
     dplyr::mutate(
       dplyr::across(where(is.numeric), ~ tidyr::replace_na(.x, 0)),
       # NOTE(review): requires all four status columns to exist after the
       # pivot; input lacking one of these statuses will error here.
       total = destroyed + captured + damaged + abandoned
     )
 }
diff --git a/R/totals_by_type.R b/R/totals_by_type.R
@@ -23,7 +23,9 @@ create_by_type <- function(country) {
   heads <- heads[nchar(heads) > 0]
 
   # Get the positions of the Russia and Ukraine headers
-  pos <- heads %>% stringr::str_which(country) %>% as.double()
+  pos <- heads %>%
+    stringr::str_which(country) %>%
+    as.double()
 
   totals <- tibble(
     country = character(),
@@ -38,16 +40,20 @@ create_by_type <- function(country) {
     totals[l, "equipment"] <-
       heads[l] %>% stringr::str_remove_all(" \\(.*\\)")
     totals[l, "destroyed"] <-
-      heads[l] %>% stringr::str_extract("destroyed: \\d+") %>%
+      heads[l] %>%
+      stringr::str_extract("destroyed: \\d+") %>%
       stringr::str_remove_all("[:alpha:]|[:punct:]")
     totals[l, "abandoned"] <-
-      heads[l] %>% stringr::str_extract("(abandoned|aboned): \\d+") %>%
+      heads[l] %>%
+      stringr::str_extract("(abandoned|aboned): \\d+") %>%
       stringr::str_remove_all("[:alpha:]|[:punct:]")
     totals[l, "captured"] <-
-      heads[l] %>% stringr::str_extract("captured: \\d+") %>%
+      heads[l] %>%
+      stringr::str_extract("captured: \\d+") %>%
       stringr::str_remove_all("[:alpha:]|[:punct:]")
     totals[l, "damaged"] <-
-      heads[l] %>% stringr::str_extract("damaged: \\d+") %>%
+      heads[l] %>%
+      stringr::str_extract("damaged: \\d+") %>%
       stringr::str_remove_all("[:alpha:]|[:punct:]")
   }
 
@@ -73,10 +79,7 @@ totals_by_type <- function() {
   ukraine <- create_by_type("Ukraine")
 
   totals_df <- russia %>%
-    dplyr::bind_rows(ukraine, .id=NULL)
+    dplyr::bind_rows(ukraine, .id = NULL)
 
   return(totals_df)
 }
-
-
-
diff --git a/index.Rmd b/index.Rmd
@@ -39,24 +39,38 @@ graph_counts(daily_count, "All Types", "type_total")
 ### Russia
 
 ```{r russia-total-system-type, echo=FALSE, warning=FALSE, message=FALSE}
 # Russia rows of the per-type totals, rendered as a classic-styled table.
 kableExtra::kable_classic(
   kableExtra::kbl(
     dplyr::filter(totals_by_type(), country == "Russia"),
     caption = "Russia: Totals by System Type and Status"
   ),
   font_size = 16
 )
 ```
 
 ### Ukraine
 
 ```{r total-system-type, echo=FALSE, warning=FALSE, message=FALSE}
 # Ukraine rows of the per-type totals, rendered as a classic-styled table.
 kableExtra::kable_classic(
   kableExtra::kbl(
     dplyr::filter(totals_by_type(), country == "Ukraine"),
     caption = "Ukraine: Totals by System Type and Status"
   ),
   font_size = 16
 )
 ```
 
 ## By System^["Due to how the data is recorded, a weapons system may be counted here more than once—*e.g.*, a system listed as both abandoned and captured is counted in both the abandoned and captured tallies."]  {.tabset}
 
 ### Russia
 ```{r russia-total-system, echo=FALSE, warning=FALSE, message=FALSE}
 # Russia rows of the per-system totals, largest total first, as a table.
 kableExtra::kable_classic(
   kableExtra::kbl(
     dplyr::arrange(
       dplyr::filter(total_by_system_wide(totals_by_system), country == "Russia"),
       desc(total)
     ),
     caption = "Russia: Totals by System and Status"
   ),
   font_size = 16
 )
 ```
 
 ### Ukraine
 
 ```{r total-system, warning=FALSE, echo=FALSE, message=FALSE}
 # Ukraine rows of the per-system totals, largest total first, as a table.
 kableExtra::kable_classic(
   kableExtra::kbl(
     dplyr::arrange(
       dplyr::filter(total_by_system_wide(totals_by_system), country == "Ukraine"),
       desc(total)
     ),
     caption = "Ukraine: Totals by System and Status"
   ),
   font_size = 16
 )
 ```