opensafely · alainamstutz · Jan 8, 2025 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/analysis/data_process.R b/analysis/data_process.R
@@ -2,11 +2,12 @@
 ## This script does the following:
 # 1. Import/extract feather dataset from OpenSAFELY 
 # 2. Basic type formatting of variables -> fn_extract_data.R()
-# 3. Process some covariates and apply the diabetes algorithm -> fn_diabetes_algorithm()
-# 4. Evaluate/apply the quality assurance criteria -> fn_quality_assurance_midpoint6()
-# 5. Evaluate/apply the completeness criteria: -> fn_completeness_criteria_midpoint6()
-# 6. Evaluate/apply the eligibility criteria: -> fn_elig_criteria_midpoint6()
-# (for now: just to double-check: Assign treatment and main outcome)
+# 3. Process some covariates
+# 4. Import the processed dataset with the DM variables (and ethnicity and qa_num_birth_year) and merge
+# 5. Evaluate/apply the quality assurance criteria -> fn_quality_assurance_midpoint6()
+# 6. Evaluate/apply the completeness criteria: -> fn_completeness_criteria_midpoint6()
+# 7. Evaluate/apply the eligibility criteria: -> fn_elig_criteria_midpoint6()
+# (for now to double-check: 8. Assign treatment, various patterns and main outcome)
 ## Save the output: data_processed and the 1-row tables for the flow chart
 ################################################################################
 
@@ -25,7 +26,6 @@ library('ggplot2')
 ## Import custom user functions and meta-dates
 source(here::here("analysis", "functions", "fn_extract_data.R"))
 source(here::here("analysis", "functions", "utility.R"))
-source(here::here("analysis", "functions", "fn_diabetes_algorithm.R"))
 source(here::here("analysis", "functions", "fn_quality_assurance_midpoint6.R"))
 source(here::here("analysis", "functions", "fn_completeness_criteria_midpoint6.R"))
 source(here::here("analysis", "functions", "fn_elig_criteria_midpoint6.R"))
@@ -53,7 +53,7 @@ study_dates <- lapply(study_dates, function(x) as.Date(x))
 threshold <- 6
 
 ################################################################################
-# 1 Import data
+# 1 Import the extracted dataset (dataset.arrow)
 ################################################################################
 input_filename <- "dataset.arrow"
 
@@ -63,9 +63,9 @@ input_filename <- "dataset.arrow"
 data_extracted <- fn_extract_data(input_filename)
 
 ################################################################################
-# 3 Process the data and apply diabetes algorithm
+# 3 Process the data
 ################################################################################
-data_extracted <- data_extracted %>%
+data_processed <- data_extracted %>%
   mutate(
     # POPULATION/DEMOGRAPHIC ----
     cov_cat_age = cut(
@@ -97,15 +97,6 @@ data_extracted <- data_extracted %>%
 
     cov_cat_stp = as.factor(cov_cat_stp),
 
-    cov_cat_ethnicity = fn_case_when(
-      cov_cat_ethnicity == "1" ~ "White",
-      cov_cat_ethnicity == "4" ~ "Black",
-      cov_cat_ethnicity == "3" ~ "South Asian",
-      cov_cat_ethnicity == "2" ~ "Mixed",
-      cov_cat_ethnicity == "5" ~ "Other",
-      cov_cat_ethnicity == "0" ~ "Unknown",
-      TRUE ~ NA_character_),
-
     # Finalize smoking status
     cov_cat_smoking_status = fn_case_when(
       cov_cat_smoking_status == "S" ~ "Smoker",
@@ -132,26 +123,31 @@ data_extracted <- data_extracted %>%
     cov_num_tc_hdl_ratio = replace(cov_num_tc_hdl_ratio, cov_num_tc_hdl_ratio > 50 | cov_num_tc_hdl_ratio < 1, NA_real_),
     )
 
-# apply diabetes algorithm and delete all helper variables (tmp & step) at the end
-data_processed <- fn_diabetes_algorithm(data_extracted)
+################################################################################
+# 4 Import the processed DM algo dataset and merge
+################################################################################
+data_processed_dm_algo <- readRDS(here::here("output", "data", "data_processed_dm_algo.rds"))
+data_processed <- merge(data_processed, data_processed_dm_algo, 
+                        by = "patient_id", 
+                        all.x = TRUE)
 
 ################################################################################
-# 4 Apply the quality assurance criteria
+# 5 Apply the quality assurance criteria
 ################################################################################
 qa <- fn_quality_assurance_midpoint6(data_processed, study_dates, threshold)
 n_qa_excluded_midpoint6 <- qa$n_qa_excluded_midpoint6
 data_processed <- qa$data_processed
 
 ################################################################################
-# 5 Apply the completeness criteria
+# 6 Apply the completeness criteria
 ################################################################################
 completeness <- fn_completeness_criteria_midpoint6(data_processed, threshold)
 n_completeness_excluded <- completeness$n_completeness_excluded
 n_completeness_excluded_midpoint6 <- completeness$n_completeness_excluded_midpoint6
 data_processed <- completeness$data_processed # CAVE: Being alive and registration based on mid2018, not landmark!
 
 ################################################################################
-# 6 Apply the eligibility criteria
+# 7 Apply the eligibility criteria
 ################################################################################
 # Our primary eligibility window to define incident T2DM is mid2018-mid2019, but maybe we may want to extend the window until max. mid2013 later on => if so, use function with loop that can be mapped to other windows
 eligibility <- fn_elig_criteria_midpoint6(data_processed, study_dates, years_in_days = 0)
@@ -179,7 +175,7 @@ data_processed <- eligibility$data_processed
 # names(data_processed_all_windows) <- c("elig_mid2018", "elig_mid2017", "elig_mid2016", "elig_mid2015", "elig_mid2014", "elig_mid2013")
 
 ################################################################################
-# 7 Double-check feasibility: Assign treatment/exposure and main outcome
+# 8 Double-check feasibility: Assign treatment/exposure and main outcome
 ################################################################################
 # assign treatment/exposure and one outcome measure
 data_processed <- data_processed %>% 
@@ -604,7 +600,7 @@ n_exp_out_midpoint6 <- data_processed %>%
 # names(n_exp_severecovid_midpoint6) <- c("treat_outcome_mid2018_midpoint6", "treat_outcome_mid2017_midpoint6", "treat_outcome_mid2016_midpoint6", "treat_outcome_mid2015_midpoint6", "treat_outcome_mid2014_midpoint6", "treat_outcome_mid2013_midpoint6")
 
 ################################################################################
-# 8 Save output
+# 9 Save output
 ################################################################################
 # the data
 write_rds(data_processed, here::here("output", "data", "data_processed.rds"))

diff --git a/analysis/data_process_dm_algo.R b/analysis/data_process_dm_algo.R
@@ -0,0 +1,82 @@
+################################################################################
+## This script does the following:
+# 1. Import feather dataset from OpenSAFELY 
+# 2. Basic formatting of variables -> fn_extract_data()
+# 3. Process the ethnicity variable 
+# 4. Apply the diabetes algorithm -> fn_diabetes_algorithm()
+# 5. Save the output: data_processed_dm_algo
+################################################################################
+
+################################################################################
+# 0.0 Import libraries + functions
+################################################################################
+library('arrow')
+library('readr')
+library('here')
+library('lubridate')
+library('dplyr')
+library('tidyr')
+
+## Import custom user functions
+source(here::here("analysis", "functions", "fn_extract_data.R"))
+source(here::here("analysis", "functions", "utility.R"))
+source(here::here("analysis", "functions", "fn_diabetes_algorithm.R"))
+
+################################################################################
+# 0.1 Create directories for output
+################################################################################
+fs::dir_create(here::here("output", "data"))
+fs::dir_create(here::here("output", "data_properties"))
+
+################################################################################
+# 0.2 Import command-line arguments and dates # to be adapted at a later stage
+################################################################################
+args <- commandArgs(trailingOnly=TRUE)
+# study_dates <-
+#    jsonlite::read_json(path = here::here("output", "study_dates.json")) %>%
+#    map(as.Date)
+source(here::here("analysis", "metadates.R"))
+# Convert the meta-dates into Date objects
+study_dates <- lapply(study_dates, function(x) as.Date(x))
+
+################################################################################
+# 0.3 Define redaction threshold
+################################################################################
+threshold <- 6
+
+################################################################################
+# 1 Import data
+################################################################################
+input_filename <- "dataset_dm_algo.arrow"
+
+################################################################################
+# 2 Reformat the imported data
+################################################################################
+data_extracted <- fn_extract_data(input_filename)
+
+################################################################################
+# 3 Process the ethnicity variable
+################################################################################
+data_extracted <- data_extracted %>%
+  mutate(
+    cov_cat_ethnicity = fn_case_when(
+      cov_cat_ethnicity == "1" ~ "White",
+      cov_cat_ethnicity == "4" ~ "Black",
+      cov_cat_ethnicity == "3" ~ "South Asian",
+      cov_cat_ethnicity == "2" ~ "Mixed",
+      cov_cat_ethnicity == "5" ~ "Other",
+      cov_cat_ethnicity == "0" ~ "Unknown",
+      TRUE ~ NA_character_) # if ethnicity is NA, it remains NA -> will not influence diabetes algo, except that for step 5 only age will be used for these cases
+    )
+
+################################################################################
+# 4 Apply diabetes algorithm
+################################################################################
+# apply diabetes algorithm and delete all helper variables (tmp & step) at the end
+data_processed_dm_algo <- fn_diabetes_algorithm(data_extracted)
+
+################################################################################
+# 5 Save output
+################################################################################
+# the data
+write_rds(data_processed_dm_algo, here::here("output", "data", "data_processed_dm_algo.rds"))
diff --git a/analysis/dataset_definition_dm_algo.py b/analysis/dataset_definition_dm_algo.py
@@ -0,0 +1,144 @@
+#######################################################################################
+# IMPORT
+#######################################################################################
+## ehrQL functions
+from ehrql import (
+    create_dataset,
+    minimum_of
+)
+
+## TPP tables
+from ehrql.tables.tpp import (
+    clinical_events,
+    patients
+)
+
+## All codelists from codelists.py
+from codelists import *
+
+## variable helper functions 
+from variable_helper_functions import *
+
+## json (for the dates)
+import json
+
+# numpy for random seed - and set random seed
+#import numpy as np 
+#np.random.seed(1928374) # random seed
+
+#######################################################################################
+# DEFINE the feasibility study end date
+#######################################################################################
+with open("output/study_dates.json") as f:
+  study_dates = json.load(f)
+landmark_date = study_dates["landmark_date"]
+
+#######################################################################################
+# INITIALISE the dataset and set the dummy dataset size
+#######################################################################################
+dataset = create_dataset()
+dataset.configure_dummy_data(population_size=10000)
+dataset.define_population(patients.exists_for_patient())
+
+#######################################################################################
+# Table 3) ELIGIBILITY criteria
+#######################################################################################
+
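+## Year of birth -- NOTE(review): assigned from patients.date_of_birth (a date, not a year); confirm the year is extracted downstream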
+dataset.qa_num_birth_year = patients.date_of_birth
+
+## Ethnicity in 6 categories (mainly for diabetes algo())
+dataset.cov_cat_ethnicity = (
+    clinical_events.where(clinical_events.ctv3_code.is_in(ethnicity_codes))
+    .sort_by(clinical_events.date)
+    .last_for_patient()
+    .ctv3_code.to_category(ethnicity_codes)
+)
+
+## DIABETES algo variables start ------------------------
+## See https://github.com/opensafely/post-covid-diabetes/blob/main/analysis/common_variables.py 
+
+## Type 1 Diabetes 
+# First date from primary+secondary, but also primary care date separately for diabetes algo
+dataset.tmp_elig_date_t1dm_ctv3 = first_matching_event_clinical_ctv3_before(diabetes_type1_ctv3_clinical, landmark_date).date
+dataset.elig_date_t1dm = minimum_of(
+    (first_matching_event_clinical_ctv3_before(diabetes_type1_ctv3_clinical, landmark_date).date),
+    (first_matching_event_apc_before(diabetes_type1_icd10, landmark_date).admission_date)
+)
+# Count codes (individually and together, for diabetes algo)
+dataset.tmp_elig_count_t1dm_ctv3 = count_matching_event_clinical_ctv3_before(diabetes_type1_ctv3_clinical, landmark_date)
+dataset.tmp_elig_count_t1dm_hes = count_matching_event_apc_before(diabetes_type1_icd10, landmark_date)
+dataset.tmp_elig_count_t1dm = dataset.tmp_elig_count_t1dm_ctv3 + dataset.tmp_elig_count_t1dm_hes
+
+## Type 2 Diabetes
+# First date from primary+secondary, but also primary care date separately for diabetes algo
+dataset.tmp_elig_date_t2dm_ctv3 = first_matching_event_clinical_ctv3_before(diabetes_type2_ctv3_clinical, landmark_date).date
+dataset.elig_date_t2dm = minimum_of(
+    (first_matching_event_clinical_ctv3_before(diabetes_type2_ctv3_clinical, landmark_date).date),
+    (first_matching_event_apc_before(diabetes_type2_icd10, landmark_date).admission_date)
+)
+# Count codes (individually and together, for diabetes algo)
+dataset.tmp_elig_count_t2dm_ctv3 = count_matching_event_clinical_ctv3_before(diabetes_type2_ctv3_clinical, landmark_date)
+dataset.tmp_elig_count_t2dm_hes = count_matching_event_apc_before(diabetes_type2_icd10, landmark_date)
+dataset.tmp_elig_count_t2dm = dataset.tmp_elig_count_t2dm_ctv3 + dataset.tmp_elig_count_t2dm_hes
+
+## Diabetes unspecified/other
+# First date
+dataset.elig_date_otherdm = first_matching_event_clinical_ctv3_before(diabetes_other_ctv3_clinical, landmark_date).date
+# Count codes
+dataset.tmp_elig_count_otherdm = count_matching_event_clinical_ctv3_before(diabetes_other_ctv3_clinical, landmark_date)
+
+## Gestational diabetes ## Comment 10/12/2024: Search in both primary and secondary
+# First date from primary+secondary
+dataset.elig_date_gestationaldm = minimum_of(
+    (first_matching_event_clinical_ctv3_before(diabetes_gestational_ctv3_clinical, landmark_date).date),
+    (first_matching_event_apc_before(diabetes_gestational_icd10, landmark_date).admission_date)
+)
+
+## Diabetes diagnostic codes
+# First date
+dataset.tmp_elig_date_poccdm = first_matching_event_clinical_ctv3_before(diabetes_diagnostic_ctv3_clinical, landmark_date).date
+# Count codes
+dataset.tmp_elig_count_poccdm_ctv3 = count_matching_event_clinical_ctv3_before(diabetes_diagnostic_ctv3_clinical, landmark_date)
+
+### Other variables needed to define diabetes
+## HbA1c
+# Maximum HbA1c measure (in the same period)
+dataset.tmp_elig_num_max_hba1c_mmol_mol = (
+  clinical_events.where(
+    clinical_events.snomedct_code.is_in(hba1c_snomed))
+    .where(clinical_events.date.is_on_or_before(landmark_date))
+    .numeric_value.maximum_for_patient()
+)
+# Date of first maximum HbA1c measure
+dataset.tmp_elig_date_max_hba1c = ( 
+  clinical_events.where(
+    clinical_events.snomedct_code.is_in(hba1c_snomed))
+    .where(clinical_events.date.is_on_or_before(landmark_date)) # this line of code probably not needed again
+    .where(clinical_events.numeric_value == dataset.tmp_elig_num_max_hba1c_mmol_mol)
+    .sort_by(clinical_events.date)
+    .first_for_patient() 
+    .date
+)
+
+## Diabetes drugs
+# First dates
+dataset.tmp_elig_date_insulin_snomed = first_matching_med_dmd_before(insulin_dmd, landmark_date).date
+dataset.tmp_elig_date_antidiabetic_drugs_snomed = first_matching_med_dmd_before(antidiabetic_drugs_snomed_clinical, landmark_date).date
+dataset.tmp_elig_date_nonmetform_drugs_snomed = first_matching_med_dmd_before(non_metformin_dmd, landmark_date).date # this extra step makes sense for the diabetes algorithm (otherwise not)
+
+# Identify first date (in same period) that any diabetes medication was prescribed
+dataset.tmp_elig_date_diabetes_medication = minimum_of(dataset.tmp_elig_date_insulin_snomed, dataset.tmp_elig_date_antidiabetic_drugs_snomed) # why excluding tmp_elig_date_nonmetform_drugs_snomed? -> this extra step makes sense for the diabetes algorithm (otherwise not)
+
+# Identify first date (in same period) that any diabetes diagnosis code was recorded or any diabetes medication was prescribed
+dataset.tmp_elig_date_first_diabetes_diag = minimum_of(
+  dataset.elig_date_t2dm, 
+  dataset.elig_date_t1dm,
+  dataset.elig_date_otherdm,
+  dataset.elig_date_gestationaldm,
+  dataset.tmp_elig_date_poccdm,
+  dataset.tmp_elig_date_diabetes_medication,
+  dataset.tmp_elig_date_nonmetform_drugs_snomed
+)
+
+## DIABETES algo variables end ------------------------