Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace summative content for episode on "read case data" #103

Open
avallecam opened this issue Aug 7, 2024 · 0 comments
Open

replace summative content for episode on "read case data" #103

avallecam opened this issue Aug 7, 2024 · 0 comments
Labels
clean-validation set of issues about the clean-validation episode enhancement New feature or request

Comments

@avallecam
Copy link
Member

avallecam commented Aug 7, 2024

scenario: tables are collected from separate database systems at different moments (report and followup). we can use their primary keys to access, filter, and join them to get a linelist.

# read data ---------------------------------------------------------------

library(outbreaks)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)

# read data
dat <- outbreaks::mers_korea_2015 %>% 
  purrr::pluck("linelist") %>% 
  dplyr::as_tibble()

dat %>% dplyr::glimpse()
#> Rows: 162
#> Columns: 15
#> $ id             <chr> "SK_1", "SK_2", "SK_3", "SK_4", "SK_5", "SK_6", "SK_7",…
#> $ age            <int> 68, 63, 76, 46, 50, 71, 28, 46, 56, 44, 79, 49, 49, 35,…
#> $ age_class      <chr> "60-69", "60-69", "70-79", "40-49", "50-59", "70-79", "…
#> $ sex            <fct> M, F, M, F, M, M, F, F, M, M, F, F, M, M, M, M, M, F, M…
#> $ place_infect   <fct> Middle East, Outside Middle East, Outside Middle East, …
#> $ reporting_ctry <fct> South Korea, South Korea, South Korea, South Korea, Sou…
#> $ loc_hosp       <fct> "Pyeongtaek St. Mary, Hospital, Pyeongtaek, Gyeonggi", …
#> $ dt_onset       <date> 2015-05-11, 2015-05-18, 2015-05-20, 2015-05-25, 2015-0…
#> $ dt_report      <date> 2015-05-19, 2015-05-20, 2015-05-20, 2015-05-26, 2015-0…
#> $ week_report    <fct> 2015_21, 2015_21, 2015_21, 2015_22, 2015_22, 2015_22, 2…
#> $ dt_start_exp   <date> 2015-04-18, 2015-05-15, 2015-05-16, 2015-05-16, 2015-0…
#> $ dt_end_exp     <date> 2015-05-04, 2015-05-20, 2015-05-16, 2015-05-20, 2015-0…
#> $ dt_diag        <date> 2015-05-20, 2015-05-20, 2015-05-21, 2015-05-26, 2015-0…
#> $ outcome        <fct> Alive, Alive, Dead, Alive, Alive, Dead, Alive, Alive, A…
#> $ dt_death       <date> NA, NA, 2015-06-04, NA, NA, 2015-06-01, NA, NA, NA, NA…
# assumption: in this dataset we have two tables: report and followup (death)
table_report <- dat %>% dplyr::select(id:dt_diag)
table_followup <- dat %>% dplyr::select(id,outcome,dt_death)

# database management -----------------------------------------------------

library(DBI)
library(RSQLite)

# Create a temporary SQLite database in memory
database_outbreak <- DBI::dbConnect(
  drv = RSQLite::SQLite(),
  dbname = ":memory:"
)

# Store the dataframes as a table for the database
# in the SQLite database
DBI::dbWriteTable(
  conn = database_outbreak,
  name = "report",
  value = table_report
)

DBI::dbWriteTable(
  conn = database_outbreak,
  name = "followup",
  value = table_followup
)

database_outbreak
#> <SQLiteConnection>
#>   Path: :memory:
#>   Extensions: TRUE
# query data --------------------------------------------------------------

library(dplyr)
# library(dbplyr)

# Query data using dplyr verbs
database_report <- dplyr::tbl(database_outbreak, "report")
database_followup <- dplyr::tbl(database_outbreak, "followup")

database_report_query <- database_report %>%
  dplyr::select(id,age,sex,dt_onset,dt_report) %>% 
  filter(sex == "F")

# Show SQL query
database_report_query %>% 
  dplyr::show_query()
#> <SQL>
#> SELECT `id`, `age`, `sex`, `dt_onset`, `dt_report`
#> FROM `report`
#> WHERE (`sex` = 'F')
# Join tables
database_join_tables <- database_report_query %>%
  dplyr::left_join(database_followup)
#> Joining with `by = join_by(id)`
database_join_tables %>% 
  dplyr::show_query()
#> <SQL>
#> SELECT `LHS`.*, `outcome`, `dt_death`
#> FROM (
#>   SELECT `id`, `age`, `sex`, `dt_onset`, `dt_report`
#>   FROM `report`
#>   WHERE (`sex` = 'F')
#> ) AS `LHS`
#> LEFT JOIN `followup`
#>   ON (`LHS`.`id` = `followup`.`id`)
# Collect query and join
database_collect <- database_join_tables %>%
  dplyr::collect()

database_collect
#> # A tibble: 63 × 7
#>    id      age sex   dt_onset dt_report outcome dt_death
#>    <chr> <int> <chr>    <dbl>     <dbl> <chr>      <dbl>
#>  1 SK_2     63 F        16573     16575 Alive         NA
#>  2 SK_4     46 F        16580     16581 Alive         NA
#>  3 SK_7     28 F        16576     16583 Alive         NA
#>  4 SK_8     46 F        16581     16584 Alive         NA
#>  5 SK_11    79 F        16575     16584 Alive         NA
#>  6 SK_12    49 F        16576     16584 Alive         NA
#>  7 SK_18    77 F        16575     16587 Alive         NA
#>  8 SK_21    59 F        16578     16588 Alive         NA
#>  9 SK_22    39 F        16582     16588 Alive         NA
#> 10 SK_25    57 F           NA     16589 Dead       16587
#> # ℹ 53 more rows
# # Read data from the 'cases' table
# result <- DBI::dbReadTable(
#   conn = database_con,
#   name = "cases"
# )

# close connection --------------------------------------------------------

# Previously created database objects - readable
database_outbreak
#> <SQLiteConnection>
#>   Path: :memory:
#>   Extensions: TRUE
database_report
#> # Source:   table<`report`> [?? x 13]
#> # Database: sqlite 3.46.0 [:memory:]
#>    id      age age_class sex   place_infect     reporting_ctry loc_hosp dt_onset
#>    <chr> <int> <chr>     <chr> <chr>            <chr>          <chr>       <dbl>
#>  1 SK_1     68 60-69     M     Middle East      South Korea    Pyeongt…    16566
#>  2 SK_2     63 60-69     F     Outside Middle … South Korea    Pyeongt…    16573
#>  3 SK_3     76 70-79     M     Outside Middle … South Korea    Pyeongt…    16575
#>  4 SK_4     46 40-49     F     Outside Middle … South Korea    Pyeongt…    16580
#>  5 SK_5     50 50-59     M     Outside Middle … South Korea    365 Yeo…    16580
#>  6 SK_6     71 70-79     M     Outside Middle … South Korea    Pyeongt…    16579
#>  7 SK_7     28 20-29     F     Outside Middle … South Korea    Pyeongt…    16576
#>  8 SK_8     46 40-49     F     Outside Middle … South Korea    Seoul C…    16581
#>  9 SK_9     56 50-59     M     Outside Middle … South Korea    Pyeongt…       NA
#> 10 SK_10    44 40-49     M     Outside Middle … China          Pyeongt…    16576
#> # ℹ more rows
#> # ℹ 5 more variables: dt_report <dbl>, week_report <chr>, dt_start_exp <dbl>,
#> #   dt_end_exp <dbl>, dt_diag <dbl>
database_followup
#> # Source:   table<`followup`> [?? x 3]
#> # Database: sqlite 3.46.0 [:memory:]
#>    id    outcome dt_death
#>    <chr> <chr>      <dbl>
#>  1 SK_1  Alive         NA
#>  2 SK_2  Alive         NA
#>  3 SK_3  Dead       16590
#>  4 SK_4  Alive         NA
#>  5 SK_5  Alive         NA
#>  6 SK_6  Dead       16587
#>  7 SK_7  Alive         NA
#>  8 SK_8  Alive         NA
#>  9 SK_9  Alive         NA
#> 10 SK_10 Alive         NA
#> # ℹ more rows
database_join_tables
#> # Source:   SQL [?? x 7]
#> # Database: sqlite 3.46.0 [:memory:]
#>    id      age sex   dt_onset dt_report outcome dt_death
#>    <chr> <int> <chr>    <dbl>     <dbl> <chr>      <dbl>
#>  1 SK_2     63 F        16573     16575 Alive         NA
#>  2 SK_4     46 F        16580     16581 Alive         NA
#>  3 SK_7     28 F        16576     16583 Alive         NA
#>  4 SK_8     46 F        16581     16584 Alive         NA
#>  5 SK_11    79 F        16575     16584 Alive         NA
#>  6 SK_12    49 F        16576     16584 Alive         NA
#>  7 SK_18    77 F        16575     16587 Alive         NA
#>  8 SK_21    59 F        16578     16588 Alive         NA
#>  9 SK_22    39 F        16582     16588 Alive         NA
#> 10 SK_25    57 F           NA     16589 Dead       16587
#> # ℹ more rows
# Close the database connection
DBI::dbDisconnect(conn = database_outbreak)

# Previously created database objects - not readable
database_outbreak
#> <SQLiteConnection>
#>   DISCONNECTED
database_report
#> Error in `collect()`:
#> ! Failed to collect lazy table.
#> Caused by error:
#> ! Invalid or closed connection
database_followup
#> Error in `collect()`:
#> ! Failed to collect lazy table.
#> Caused by error:
#> ! Invalid or closed connection
database_join_tables
#> Error in `collect()`:
#> ! Failed to collect lazy table.
#> Caused by error:
#> ! Invalid or closed connection

Created on 2024-08-07 with reprex v2.1.0

@avallecam avallecam added the enhancement New feature or request label Sep 26, 2024
@avallecam avallecam moved this to Todo in tutorials-planning Sep 26, 2024
@avallecam avallecam added the clean-validation set of issues about the clean-validation episode label Sep 26, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
clean-validation set of issues about the clean-validation episode enhancement New feature or request
Projects
Status: Todo
Development

No branches or pull requests

1 participant