Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Splitting the dataset definition to build and apply the DM algo separately #10

Merged
merged 7 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 21 additions & 25 deletions analysis/data_process.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
## This script does the following:
# 1. Import/extract feather dataset from OpenSAFELY
# 2. Basic type formatting of variables -> fn_extract_data.R()
# 3. Process some covariates and apply the diabetes algorithm -> fn_diabetes_algorithm()
# 4. Evaluate/apply the quality assurance criteria -> fn_quality_assurance_midpoint6()
# 5. Evaluate/apply the completeness criteria: -> fn_completeness_criteria_midpoint6()
# 6. Evaluate/apply the eligibility criteria: -> fn_elig_criteria_midpoint6()
# (for now: just to double-check: Assign treatment and main outcome)
# 3. Process some covariates
# 4. Import the processed dataset with the DM variables (and ethnicity and qa_num_birth_year) and merge
# 5. Evaluate/apply the quality assurance criteria -> fn_quality_assurance_midpoint6()
# 6. Evaluate/apply the completeness criteria: -> fn_completeness_criteria_midpoint6()
# 7. Evaluate/apply the eligibility criteria: -> fn_elig_criteria_midpoint6()
# (for now to double-check: 8. Assign treatment, various patterns and main outcome)
## Save the output: data_processed and the 1-row tables for the flow chart
################################################################################

Expand All @@ -25,7 +26,6 @@ library('ggplot2')
## Import custom user functions and meta-dates
source(here::here("analysis", "functions", "fn_extract_data.R"))
source(here::here("analysis", "functions", "utility.R"))
source(here::here("analysis", "functions", "fn_diabetes_algorithm.R"))
source(here::here("analysis", "functions", "fn_quality_assurance_midpoint6.R"))
source(here::here("analysis", "functions", "fn_completeness_criteria_midpoint6.R"))
source(here::here("analysis", "functions", "fn_elig_criteria_midpoint6.R"))
Expand Down Expand Up @@ -53,7 +53,7 @@ study_dates <- lapply(study_dates, function(x) as.Date(x))
threshold <- 6

################################################################################
# 1 Import data
# 1 Import the dataset definition
################################################################################
input_filename <- "dataset.arrow"

Expand All @@ -63,9 +63,9 @@ input_filename <- "dataset.arrow"
data_extracted <- fn_extract_data(input_filename)

################################################################################
# 3 Process the data and apply diabetes algorithm
# 3 Process the data
################################################################################
data_extracted <- data_extracted %>%
data_processed <- data_extracted %>%
mutate(
# POPULATION/DEMOGRAPHIC ----
cov_cat_age = cut(
Expand Down Expand Up @@ -97,15 +97,6 @@ data_extracted <- data_extracted %>%

cov_cat_stp = as.factor(cov_cat_stp),

cov_cat_ethnicity = fn_case_when(
cov_cat_ethnicity == "1" ~ "White",
cov_cat_ethnicity == "4" ~ "Black",
cov_cat_ethnicity == "3" ~ "South Asian",
cov_cat_ethnicity == "2" ~ "Mixed",
cov_cat_ethnicity == "5" ~ "Other",
cov_cat_ethnicity == "0" ~ "Unknown",
TRUE ~ NA_character_),

# Finalize smoking status
cov_cat_smoking_status = fn_case_when(
cov_cat_smoking_status == "S" ~ "Smoker",
Expand All @@ -132,26 +123,31 @@ data_extracted <- data_extracted %>%
cov_num_tc_hdl_ratio = replace(cov_num_tc_hdl_ratio, cov_num_tc_hdl_ratio > 50 | cov_num_tc_hdl_ratio < 1, NA_real_),
)

# apply diabetes algorithm and delete all helper variables (tmp & step) at the end
data_processed <- fn_diabetes_algorithm(data_extracted)
################################################################################
# 4 Import the processed DM algo dataset and merge
################################################################################
data_processed_dm_algo <- readRDS(here::here("output", "data", "data_processed_dm_algo.rds"))
data_processed <- merge(data_processed, data_processed_dm_algo,
by = "patient_id",
all.x = TRUE)

################################################################################
# 4 Apply the quality assurance criteria
# 5 Apply the quality assurance criteria
################################################################################
qa <- fn_quality_assurance_midpoint6(data_processed, study_dates, threshold)
n_qa_excluded_midpoint6 <- qa$n_qa_excluded_midpoint6
data_processed <- qa$data_processed

################################################################################
# 5 Apply the completeness criteria
# 6 Apply the completeness criteria
################################################################################
completeness <- fn_completeness_criteria_midpoint6(data_processed, threshold)
n_completeness_excluded <- completeness$n_completeness_excluded
n_completeness_excluded_midpoint6 <- completeness$n_completeness_excluded_midpoint6
data_processed <- completeness$data_processed # CAVE: Being alive and registration based on mid2018, not landmark!

################################################################################
# 6 Apply the eligibility criteria
# 7 Apply the eligibility criteria
################################################################################
# Our primary eligibility window to define incident T2DM is mid2018-mid2019, but we may later want to extend the window back as far as mid2013 => if so, use a function with a loop that can be mapped to the other windows
eligibility <- fn_elig_criteria_midpoint6(data_processed, study_dates, years_in_days = 0)
Expand Down Expand Up @@ -179,7 +175,7 @@ data_processed <- eligibility$data_processed
# names(data_processed_all_windows) <- c("elig_mid2018", "elig_mid2017", "elig_mid2016", "elig_mid2015", "elig_mid2014", "elig_mid2013")

################################################################################
# 7 Double-check feasibility: Assign treatment/exposure and main outcome
# 8 Double-check feasibility: Assign treatment/exposure and main outcome
################################################################################
# assign treatment/exposure and one outcome measure
data_processed <- data_processed %>%
Expand Down Expand Up @@ -604,7 +600,7 @@ n_exp_out_midpoint6 <- data_processed %>%
# names(n_exp_severecovid_midpoint6) <- c("treat_outcome_mid2018_midpoint6", "treat_outcome_mid2017_midpoint6", "treat_outcome_mid2016_midpoint6", "treat_outcome_mid2015_midpoint6", "treat_outcome_mid2014_midpoint6", "treat_outcome_mid2013_midpoint6")

################################################################################
# 8 Save output
# 9 Save output
################################################################################
# the data
write_rds(data_processed, here::here("output", "data", "data_processed.rds"))
Expand Down
82 changes: 82 additions & 0 deletions analysis/data_process_dm_algo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
################################################################################
## This script does the following:
# 1. Import the arrow dataset (dataset_dm_algo.arrow) from OpenSAFELY
# 2. Basic formatting of variables -> fn_extract_data()
# 3. Process the ethnicity variable
# 4. Apply the diabetes algorithm -> fn_diabetes_algorithm()
# 5. Save the output: data_processed_dm_algo.rds
################################################################################

################################################################################
# 0.0 Import libraries + functions
################################################################################
library('arrow')
library('readr')
library('here')
library('lubridate')
library('dplyr')
library('tidyr')

## Import custom user functions
source(here::here("analysis", "functions", "fn_extract_data.R"))
source(here::here("analysis", "functions", "utility.R"))
source(here::here("analysis", "functions", "fn_diabetes_algorithm.R"))

################################################################################
# 0.1 Create directories for output
################################################################################
fs::dir_create(here::here("output", "data"))
# NOTE: output/data_properties is created here, but nothing in this script writes to it
fs::dir_create(here::here("output", "data_properties"))

################################################################################
# 0.2 Import command-line arguments and dates # to be adapted at a later stage
################################################################################
# NOTE: args is not referenced again in this script
args <- commandArgs(trailingOnly=TRUE)
# study_dates <-
# jsonlite::read_json(path = here::here("output", "study_dates.json")) %>%
# map(as.Date)
# metadates.R is expected to define `study_dates` (it is not defined anywhere else in this script)
source(here::here("analysis", "metadates.R"))
# Convert the meta-dates into Date objects
study_dates <- lapply(study_dates, function(x) as.Date(x))

################################################################################
# 0.3 Define redaction threshold
################################################################################
# NOTE(review): threshold is not referenced again in this script - presumably kept for parity with data_process.R; confirm
threshold <- 6

################################################################################
# 1 Import data
################################################################################
input_filename <- "dataset_dm_algo.arrow"

################################################################################
# 2 Reformat the imported data
################################################################################
data_extracted <- fn_extract_data(input_filename)

################################################################################
# 3 Process the ethnicity variable
################################################################################
# Recode the ethnicity codes "0"-"5" into labelled categories; any other value becomes NA
data_extracted <- data_extracted %>%
mutate(
cov_cat_ethnicity = fn_case_when(
cov_cat_ethnicity == "1" ~ "White",
cov_cat_ethnicity == "4" ~ "Black",
cov_cat_ethnicity == "3" ~ "South Asian",
cov_cat_ethnicity == "2" ~ "Mixed",
cov_cat_ethnicity == "5" ~ "Other",
cov_cat_ethnicity == "0" ~ "Unknown",
TRUE ~ NA_character_) # if ethnicity is NA, it remains NA -> will not influence diabetes algo, except that for step 5 only age will be used for these cases
)

################################################################################
# 4 Apply diabetes algorithm
################################################################################
# apply diabetes algorithm and delete all helper variables (tmp & step) at the end
data_processed_dm_algo <- fn_diabetes_algorithm(data_extracted)

################################################################################
# 5 Save output
################################################################################
# the data (read back in by data_process.R and merged on patient_id)
write_rds(data_processed_dm_algo, here::here("output", "data", "data_processed_dm_algo.rds"))
144 changes: 144 additions & 0 deletions analysis/dataset_definition_dm_algo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#######################################################################################
# ehrQL dataset definition for the diabetes algorithm:
# extracts birth date, ethnicity and all diabetes-related dates/counts up to the landmark date
#######################################################################################
# IMPORT
#######################################################################################
## ehrQL functions
from ehrql import (
create_dataset,
minimum_of
)

## TPP tables
from ehrql.tables.tpp import (
clinical_events,
patients
)

## All codelists from codelists.py
from codelists import *

## variable helper functions
from variable_helper_functions import *

## json (for the dates)
import json

# numpy for random seed - and set random seed
#import numpy as np
#np.random.seed(1928374) # random seed

#######################################################################################
# LOAD the study dates and define the landmark date (upper date bound for all variables below)
#######################################################################################
with open("output/study_dates.json") as f:
study_dates = json.load(f)
landmark_date = study_dates["landmark_date"]

#######################################################################################
# INITIALISE the dataset and set the dummy dataset size
#######################################################################################
dataset = create_dataset()
dataset.configure_dummy_data(population_size=10000)
# Population: every patient with a record in the patients table
dataset.define_population(patients.exists_for_patient())

#######################################################################################
# Table 3) ELIGIBILITY criteria
#######################################################################################

## Year of birth
# NOTE(review): this stores the full date_of_birth although the variable name says "birth_year" - presumably the year is extracted downstream; confirm
dataset.qa_num_birth_year = patients.date_of_birth

## Ethnicity in 6 categories (mainly for diabetes algo())
# Most recently recorded ethnicity code, mapped to its codelist category
dataset.cov_cat_ethnicity = (
clinical_events.where(clinical_events.ctv3_code.is_in(ethnicity_codes))
.sort_by(clinical_events.date)
.last_for_patient()
.ctv3_code.to_category(ethnicity_codes)
)

## DIABETES algo variables start ------------------------
## See https://github.com/opensafely/post-covid-diabetes/blob/main/analysis/common_variables.py

## Type 1 Diabetes
# First date from primary+secondary, but also primary care date separately for diabetes algo
dataset.tmp_elig_date_t1dm_ctv3 = first_matching_event_clinical_ctv3_before(diabetes_type1_ctv3_clinical, landmark_date).date
dataset.elig_date_t1dm = minimum_of(
(first_matching_event_clinical_ctv3_before(diabetes_type1_ctv3_clinical, landmark_date).date),
(first_matching_event_apc_before(diabetes_type1_icd10, landmark_date).admission_date)
)
# Count codes (individually and together, for diabetes algo)
dataset.tmp_elig_count_t1dm_ctv3 = count_matching_event_clinical_ctv3_before(diabetes_type1_ctv3_clinical, landmark_date)
dataset.tmp_elig_count_t1dm_hes = count_matching_event_apc_before(diabetes_type1_icd10, landmark_date)
dataset.tmp_elig_count_t1dm = dataset.tmp_elig_count_t1dm_ctv3 + dataset.tmp_elig_count_t1dm_hes

## Type 2 Diabetes
# First date from primary+secondary, but also primary care date separately for diabetes algo
dataset.tmp_elig_date_t2dm_ctv3 = first_matching_event_clinical_ctv3_before(diabetes_type2_ctv3_clinical, landmark_date).date
dataset.elig_date_t2dm = minimum_of(
(first_matching_event_clinical_ctv3_before(diabetes_type2_ctv3_clinical, landmark_date).date),
(first_matching_event_apc_before(diabetes_type2_icd10, landmark_date).admission_date)
)
# Count codes (individually and together, for diabetes algo)
dataset.tmp_elig_count_t2dm_ctv3 = count_matching_event_clinical_ctv3_before(diabetes_type2_ctv3_clinical, landmark_date)
dataset.tmp_elig_count_t2dm_hes = count_matching_event_apc_before(diabetes_type2_icd10, landmark_date)
dataset.tmp_elig_count_t2dm = dataset.tmp_elig_count_t2dm_ctv3 + dataset.tmp_elig_count_t2dm_hes

## Diabetes unspecified/other
# First date
dataset.elig_date_otherdm = first_matching_event_clinical_ctv3_before(diabetes_other_ctv3_clinical, landmark_date).date
# Count codes
dataset.tmp_elig_count_otherdm = count_matching_event_clinical_ctv3_before(diabetes_other_ctv3_clinical, landmark_date)

## Gestational diabetes ## Comment 10/12/2024: Search in both primary and secondary
# First date from primary+secondary
dataset.elig_date_gestationaldm = minimum_of(
(first_matching_event_clinical_ctv3_before(diabetes_gestational_ctv3_clinical, landmark_date).date),
(first_matching_event_apc_before(diabetes_gestational_icd10, landmark_date).admission_date)
)

## Diabetes diagnostic codes
# First date
dataset.tmp_elig_date_poccdm = first_matching_event_clinical_ctv3_before(diabetes_diagnostic_ctv3_clinical, landmark_date).date
# Count codes
dataset.tmp_elig_count_poccdm_ctv3 = count_matching_event_clinical_ctv3_before(diabetes_diagnostic_ctv3_clinical, landmark_date)

### Other variables needed to define diabetes
## HbA1c
# Maximum HbA1c measure recorded on or before the landmark date
# NOTE(review): the *_before helpers are opaque here - confirm they use the same (inclusive) date boundary as is_on_or_before below
dataset.tmp_elig_num_max_hba1c_mmol_mol = (
clinical_events.where(
clinical_events.snomedct_code.is_in(hba1c_snomed))
.where(clinical_events.date.is_on_or_before(landmark_date))
.numeric_value.maximum_for_patient()
)
# Date of the earliest HbA1c measure whose value equals that maximum
dataset.tmp_elig_date_max_hba1c = (
clinical_events.where(
clinical_events.snomedct_code.is_in(hba1c_snomed))
.where(clinical_events.date.is_on_or_before(landmark_date)) # likely redundant: the earliest event carrying the maximum value should already lie on or before landmark_date
.where(clinical_events.numeric_value == dataset.tmp_elig_num_max_hba1c_mmol_mol)
.sort_by(clinical_events.date)
.first_for_patient()
.date
)

## Diabetes drugs
# First dates
dataset.tmp_elig_date_insulin_snomed = first_matching_med_dmd_before(insulin_dmd, landmark_date).date
dataset.tmp_elig_date_antidiabetic_drugs_snomed = first_matching_med_dmd_before(antidiabetic_drugs_snomed_clinical, landmark_date).date
dataset.tmp_elig_date_nonmetform_drugs_snomed = first_matching_med_dmd_before(non_metformin_dmd, landmark_date).date # this extra step makes sense for the diabetes algorithm (otherwise not)

# Identify first date (in same period) that any diabetes medication was prescribed
dataset.tmp_elig_date_diabetes_medication = minimum_of(dataset.tmp_elig_date_insulin_snomed, dataset.tmp_elig_date_antidiabetic_drugs_snomed) # tmp_elig_date_nonmetform_drugs_snomed is not included here; it is kept as a separate variable for the diabetes algorithm

# Identify first date (in same period) that any diabetes diagnosis code or diabetes medication (incl. non-metformin drugs) was recorded
dataset.tmp_elig_date_first_diabetes_diag = minimum_of(
dataset.elig_date_t2dm,
dataset.elig_date_t1dm,
dataset.elig_date_otherdm,
dataset.elig_date_gestationaldm,
dataset.tmp_elig_date_poccdm,
dataset.tmp_elig_date_diabetes_medication,
dataset.tmp_elig_date_nonmetform_drugs_snomed
)

## DIABETES algo variables end ------------------------
Loading
Loading