Skip to content

Commit

Permalink
Minor fix help with Unit test Mem usage
Browse files Browse the repository at this point in the history
  • Loading branch information
Al-Murphy committed Feb 5, 2024
1 parent 8011525 commit 0d94161
Show file tree
Hide file tree
Showing 9 changed files with 108 additions and 33 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MungeSumstats
Type: Package
Title: Standardise summary statistics from GWAS
Version: 1.11.3
Version: 1.11.4
Authors@R:
c(person(given = "Alan",
family = "Murphy",
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## CHANGES IN VERSION 1.11.4

### Bug fix
* Minor fix to `get_genome_builds()` to help with RAM & CPU usage during unit
tests. No change in functionality for end user.

## CHANGES IN VERSION 1.11.3

### Bug fix
Expand Down
25 changes: 18 additions & 7 deletions R/get_genome_build.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
#' from disk each time.
#' @param allele_match_ref Instead of returning the genome_build this will
#' return the propotion of matches to each genome build for each allele (A1,A2).
#' @inheritParams format_sumstats
#' @inheritParams format_sumstats
#' @inheritParams get_genome_builds
#'
#' @return ref_genome the genome build of the data
#' @importFrom data.table setDT :=
Expand All @@ -36,7 +37,8 @@ get_genome_build <- function(sumstats,
dbSNP=155,
header_only = FALSE,
allele_match_ref = FALSE,
ref_genome = NULL) {
ref_genome = NULL,
chr_filt = NULL) {
### Add this to avoid confusing BiocCheck
seqnames <- CHR <- SNP <- BP <- alt_alleles <- NULL
if(isFALSE(allele_match_ref)){
Expand Down Expand Up @@ -116,13 +118,20 @@ get_genome_build <- function(sumstats,
#also deal with common misformatting of CHR
# if chromosome col has chr prefix remove it
sumstats[, CHR := gsub("chr", "", CHR)]

#for internal testing - filter to specified chromosomes
if(!is.null(chr_filt)){
sumstats <- sumstats[CHR %in% chr_filt]
print(nrow(sumstats))
}

# if removing erroneous cases leads to <min(10k,50% org dataset) will fail -
# NOT ENOUGH DATA TO INFER
nrow_clean <- nrow(sumstats)
size_okay <- FALSE
if (nrow_clean > sampled_snps || (nrow(sumstats) != 0 &&
nrow_clean / nrow_org > .5)) {
size_okay <- TRUE
if (nrow_clean > sampled_snps || (nrow_clean != 0 &&
(nrow_clean / nrow_org) > .5)) {
print("size_okay")
}
if (!size_okay) {
#want it returned rather than throwing an error
Expand Down Expand Up @@ -230,12 +239,14 @@ get_genome_build <- function(sumstats,
snp_loc_data_37 <- load_ref_genome_data(
snps = snps,
ref_genome = "GRCH37",
dbSNP = dbSNP
dbSNP = dbSNP,
chr_filt = chr_filt
)
snp_loc_data_38 <- load_ref_genome_data(
snps = snps,
ref_genome = "GRCH38",
dbSNP = dbSNP
dbSNP = dbSNP,
chr_filt = chr_filt
)
# convert CHR filed in ref genomes to character not factor
snp_loc_data_37[, seqnames := as.character(seqnames)]
Expand Down
55 changes: 36 additions & 19 deletions R/get_genome_builds.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
#' Only works if \code{sumstats_list} is a list of paths.
#' @param dbSNP version of dbSNP to be used (144 or 155). Default is 155.
#' @param nThread Number of threads to use for parallel processes.
#' @param chr_filt Internal for testing - filter reference genomes and sumstats
#' to specific chromosomes for testing. Pass a list of chroms in format:
#' c("1","2"). Default is NULL i.e. no filtering
#'
#' @return ref_genome the genome build of the data
#'
Expand Down Expand Up @@ -52,7 +55,8 @@ get_genome_builds <- function(sumstats_list,
sampled_snps = 10000,
names_from_paths = FALSE,
dbSNP=155,
nThread = 1) {
nThread = 1,
chr_filt = NULL) {
start <- Sys.time()
#### Convert to list if it isn't already ####
if (!methods::is(sumstats_list, "list")) {
Expand Down Expand Up @@ -82,24 +86,37 @@ get_genome_builds <- function(sumstats_list,
names(sumstats_list) <- paste0("ss", seq(1, length(sumstats_list)))
}
#### Infer builds ####
builds <- parallel::mclapply(names(sumstats_list),
function(x,
.sampled_snps = sampled_snps,
.dbSNP=dbSNP,
.header_only = header_only) {
message_parallel(x)
get_genome_build(
sumstats = sumstats_list[[x]],
sampled_snps = .sampled_snps,
dbSNP = .dbSNP,
header_only = .header_only,
nThread = 1
)
},
mc.cores = nThread
) %>%
`names<-`(names(sumstats_list))

#Weirdly more efficient to not use parallel::mcapply() if only one
if(length(names(sumstats_list))==1){
build_ <- get_genome_build(
sumstats = sumstats_list[[1]],
sampled_snps = sampled_snps,
dbSNP = dbSNP,
header_only = header_only,
nThread = nThread,
chr_filt = chr_filt
)
builds <- list(build_)
names(builds) <- c(names(sumstats_list))
}else{
builds <- parallel::mclapply(names(sumstats_list),
function(x,
.sampled_snps = sampled_snps,
.dbSNP=dbSNP,
.header_only = header_only) {
message_parallel(x)
get_genome_build(
sumstats = sumstats_list[[x]],
sampled_snps = .sampled_snps,
dbSNP = .dbSNP,
header_only = .header_only,
nThread = 1,
chr_filt = chr_filt
)
},
mc.cores = nThread) %>%
`names<-`(names(sumstats_list))
}
#### Report time ####
message(utils::capture.output(difftime(Sys.time(), start)))
#### Report build counts ####
Expand Down
22 changes: 21 additions & 1 deletion R/load_ref_genome_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#' @param dbSNP version of dbSNP to be used (144 or 155)
#' @param msg Optional name of the column missing from the dataset in question.
#' Default is NULL
#' @param chr_filt Internal for testing - filter reference genomes and sumstats
#' to specific chromosomes for testing. Pass a list of chroms in format:
#' c("1","2"). Default is NULL i.e. no filtering.
#' @return data table of snpsById, filtered to SNPs of interest.
#' @importFrom data.table setDT setkey := setnames setorder
#' @importFrom BSgenome snpsById
Expand All @@ -20,7 +23,8 @@
load_ref_genome_data <- function(snps,
ref_genome,
dbSNP=c(144,155),
msg = NULL) {
msg = NULL,
chr_filt = NULL) {

SNP <- NULL
SNP_LOC_DATA <- load_snp_loc_data(ref_genome, dbSNP, msg = msg)
Expand All @@ -47,6 +51,22 @@ load_ref_genome_data <- function(snps,

messager("Validating RSIDs of",formatC(length(snps),big.mark = ","),
"SNPs using BSgenome::snpsById...")
#this is memory intensive
#if user specifies for testing, filt to a specific chr
#chr should be in format "1", "2" etc
if(!is.null(chr_filt)){
#from here: https://support.bioconductor.org/p/83588/
keepBSgenomeSequences <- function(genome, seqnames)
{
stopifnot(all(seqnames %in% seqnames(genome)))
genome@user_seqnames <- setNames(seqnames, seqnames)
genome@seqinfo <- genome@seqinfo[seqnames]
genome
}
genome <- keepBSgenomeSequences(genome, chr_filt)
#can't find a method to filter ODLT_SNPlocs object ....
#SNP_LOC_DATA <- SNP_LOC_DATA[[chr_filt]]
}
tm <- system.time({
gr_rsids <- BSgenome::snpsById(
x = SNP_LOC_DATA,
Expand Down
7 changes: 6 additions & 1 deletion man/get_genome_build.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion man/get_genome_builds.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 11 additions & 1 deletion man/load_ref_genome_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions tests/testthat/test-on_ref_genome.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ test_that("SNPs not on reference genome are removed", {
sumstats_list <- list(ss1 = eduAttainOkbayPth)
ref_genomes <- get_genome_builds(
sumstats_list = sumstats_list,
sampled_snps = 50,
dbSNP = 144
sampled_snps = 19,
dbSNP = 144,
chr_filt = c("1","2","3","4") #filtering to reduce computational burden
)
expect_equal(all.equal(ref_genomes, list("ss1" = "GRCH37")), TRUE)
} else {
Expand Down

0 comments on commit 0d94161

Please sign in to comment.