Minor fix help with Unit test Mem usage

Al-Murphy · Feb 5, 2024 · 0d94161 · 0d94161
1 parent 8011525
commit 0d94161
Show file tree

Hide file tree

Showing 9 changed files with 108 additions and 33 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MungeSumstats
 Type: Package
 Title: Standardise summary statistics from GWAS
-Version: 1.11.3
+Version: 1.11.4
 Authors@R:
     c(person(given = "Alan",
            family = "Murphy",

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+## CHANGES IN VERSION 1.11.4
+
+### Bug fix
+* Minor fix to `get_genome_builds()` to help with RAM & CPU usage during unit 
+tests. No change in functionality for end user.
+
 ## CHANGES IN VERSION 1.11.3
 
 ### Bug fix

diff --git a/R/get_genome_build.R b/R/get_genome_build.R
@@ -23,7 +23,8 @@
 #' from disk each time.
 #' @param allele_match_ref Instead of returning the genome_build this will 
 #' return the propotion of matches to each genome build for each allele (A1,A2).
-#' @inheritParams format_sumstats 
+#' @inheritParams format_sumstats
+#' @inheritParams get_genome_builds 
 #' 
 #' @return ref_genome the genome build of the data
 #' @importFrom data.table setDT :=
@@ -36,7 +37,8 @@ get_genome_build <- function(sumstats,
                              dbSNP=155,
                              header_only = FALSE,
                              allele_match_ref = FALSE,
-                             ref_genome = NULL) {
+                             ref_genome = NULL,
+                             chr_filt = NULL) {
     ### Add this to avoid confusing BiocCheck
     seqnames <- CHR <- SNP <- BP <- alt_alleles <- NULL 
     if(isFALSE(allele_match_ref)){
@@ -116,13 +118,20 @@ get_genome_build <- function(sumstats,
     #also deal with common misformatting of CHR
     # if chromosome col has chr prefix remove it
     sumstats[, CHR := gsub("chr", "", CHR)]
+
+    #for internal testing - filter to specified chromosomes
+    if(!is.null(chr_filt)){
+      sumstats <- sumstats[CHR %in% chr_filt]
+      print(nrow(sumstats))
+    }
+
     # if removing erroneous cases leads to <min(10k,50% org dataset) will fail -
     # NOT ENOUGH DATA TO INFER
     nrow_clean <- nrow(sumstats)
     size_okay <- FALSE
-    if (nrow_clean > sampled_snps || (nrow(sumstats) != 0 && 
-                                      nrow_clean / nrow_org > .5)) {
-        size_okay <- TRUE
+    if (nrow_clean > sampled_snps || (nrow_clean != 0 && 
+                                      (nrow_clean / nrow_org) > .5)) {
+        print("size_okay")
     }
     if (!size_okay) {
       #want it returned rather than throwing an error
@@ -230,12 +239,14 @@ get_genome_build <- function(sumstats,
       snp_loc_data_37 <- load_ref_genome_data(
         snps = snps,
         ref_genome = "GRCH37",
-        dbSNP = dbSNP
+        dbSNP = dbSNP,
+        chr_filt = chr_filt
       )
       snp_loc_data_38 <- load_ref_genome_data(
         snps = snps,
         ref_genome = "GRCH38",
-        dbSNP = dbSNP
+        dbSNP = dbSNP,
+        chr_filt = chr_filt
       )
       # convert CHR filed in ref genomes to character not factor
       snp_loc_data_37[, seqnames := as.character(seqnames)]

diff --git a/R/get_genome_builds.R b/R/get_genome_builds.R
@@ -18,6 +18,9 @@
 #' Only works if \code{sumstats_list} is a list of paths.
 #' @param dbSNP version of dbSNP to be used (144 or 155). Default is 155.
 #' @param nThread Number of threads to use for parallel processes.
+#' @param chr_filt Internal for testing - filter reference genomes and sumstats
+#' to specific chromosomes for testing. Pass a list of chroms in format: 
+#' c("1","2"). Default is NULL i.e. no filtering
 #'
 #' @return ref_genome the genome build of the data
 #'
@@ -52,7 +55,8 @@ get_genome_builds <- function(sumstats_list,
                               sampled_snps = 10000,
                               names_from_paths = FALSE,
                               dbSNP=155,
-                              nThread = 1) {
+                              nThread = 1,
+                              chr_filt = NULL) {
     start <- Sys.time()
     #### Convert to list if it isn't already ####
     if (!methods::is(sumstats_list, "list")) {
@@ -82,24 +86,37 @@ get_genome_builds <- function(sumstats_list,
         names(sumstats_list) <- paste0("ss", seq(1, length(sumstats_list)))
     }
     #### Infer builds ####
-    builds <- parallel::mclapply(names(sumstats_list),
-        function(x,
-                 .sampled_snps = sampled_snps,
-                 .dbSNP=dbSNP,
-                 .header_only = header_only) {
-            message_parallel(x)
-            get_genome_build(
-                sumstats = sumstats_list[[x]],
-                sampled_snps = .sampled_snps,
-                dbSNP = .dbSNP, 
-                header_only = .header_only,
-                nThread = 1
-            )
-        },
-        mc.cores = nThread
-    ) %>%
-        `names<-`(names(sumstats_list))
-
+    #Weirdly more efficient to not use parallel::mcapply() if only one
+    if(length(names(sumstats_list))==1){
+      build_ <- get_genome_build(
+        sumstats = sumstats_list[[1]],
+        sampled_snps = sampled_snps,
+        dbSNP = dbSNP, 
+        header_only = header_only,
+        nThread = nThread,
+        chr_filt = chr_filt
+      )
+      builds <- list(build_)
+      names(builds) <- c(names(sumstats_list))
+    }else{
+      builds <- parallel::mclapply(names(sumstats_list),
+                                   function(x,
+                                            .sampled_snps = sampled_snps,
+                                            .dbSNP=dbSNP,
+                                            .header_only = header_only) {
+                                     message_parallel(x)
+                                     get_genome_build(
+                                       sumstats = sumstats_list[[x]],
+                                       sampled_snps = .sampled_snps,
+                                       dbSNP = .dbSNP, 
+                                       header_only = .header_only,
+                                       nThread = 1,
+                                       chr_filt = chr_filt
+                                     )
+                                   },
+                                   mc.cores = nThread) %>%
+        `names<-`(names(sumstats_list)) 
+    }
     #### Report time ####
     message(utils::capture.output(difftime(Sys.time(), start)))
     #### Report build counts ####

diff --git a/R/load_ref_genome_data.R b/R/load_ref_genome_data.R
@@ -6,6 +6,9 @@
 #' @param dbSNP version of dbSNP to be used (144 or 155)
 #' @param msg Optional name of the column missing from the dataset in question.
 #'  Default is NULL
+#' @param chr_filt Internal for testing - filter reference genomes and sumstats
+#' to specific chromosomes for testing. Pass a list of chroms in format: 
+#' c("1","2"). Default is NULL i.e. no filtering.
 #' @return data table of snpsById, filtered to SNPs of interest.
 #' @importFrom data.table setDT setkey := setnames setorder
 #' @importFrom BSgenome snpsById
@@ -20,7 +23,8 @@
 load_ref_genome_data <- function(snps, 
                                  ref_genome,
                                  dbSNP=c(144,155),
-                                 msg = NULL) {
+                                 msg = NULL,
+                                 chr_filt = NULL) {
 
     SNP <- NULL
     SNP_LOC_DATA <- load_snp_loc_data(ref_genome, dbSNP, msg = msg)
@@ -47,6 +51,22 @@ load_ref_genome_data <- function(snps,
 
     messager("Validating RSIDs of",formatC(length(snps),big.mark = ","),
              "SNPs using BSgenome::snpsById...")
+    #this is memory intensive
+    #if user specifies for testing, filt to a specific chr 
+    #chr should be in format "1", "2" etc
+    if(!is.null(chr_filt)){
+      #from here: https://support.bioconductor.org/p/83588/
+      keepBSgenomeSequences <- function(genome, seqnames)
+      {
+        stopifnot(all(seqnames %in% seqnames(genome)))
+        genome@user_seqnames <- setNames(seqnames, seqnames)
+        genome@seqinfo <- genome@seqinfo[seqnames]
+        genome
+      }
+      genome <- keepBSgenomeSequences(genome, chr_filt)
+      #can't find a method to filter ODLT_SNPlocs object ....
+      #SNP_LOC_DATA <- SNP_LOC_DATA[[chr_filt]]
+    }
     tm <- system.time({
         gr_rsids <- BSgenome::snpsById(
             x = SNP_LOC_DATA,

diff --git a/man/get_genome_build.Rd b/man/get_genome_build.Rd
diff --git a/man/get_genome_builds.Rd b/man/get_genome_builds.Rd
diff --git a/man/load_ref_genome_data.Rd b/man/load_ref_genome_data.Rd
diff --git a/tests/testthat/test-on_ref_genome.R b/tests/testthat/test-on_ref_genome.R
@@ -48,8 +48,9 @@ test_that("SNPs not on reference genome are removed", {
     sumstats_list <- list(ss1 = eduAttainOkbayPth)
     ref_genomes <- get_genome_builds(
       sumstats_list = sumstats_list,
-      sampled_snps = 50,
-      dbSNP = 144
+      sampled_snps = 19,
+      dbSNP = 144,
+      chr_filt = c("1","2","3","4") #filtering to reduce computational burden
     )
     expect_equal(all.equal(ref_genomes, list("ss1" = "GRCH37")), TRUE)
   } else {