From cae73bf61d0dd7af05839add09f1ce2867dfb1cc Mon Sep 17 00:00:00 2001 From: NING-L Date: Wed, 9 Sep 2020 16:19:57 +0200 Subject: [PATCH 01/14] convert vcf to target assembly --- R/convert_assembly.R | 112 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 R/convert_assembly.R diff --git a/R/convert_assembly.R b/R/convert_assembly.R new file mode 100644 index 0000000..4f3a6a5 --- /dev/null +++ b/R/convert_assembly.R @@ -0,0 +1,112 @@ +#' Convert VCF to different assembly with CrossMap +#' +#' @param ref_fasta A `character`. A path to the reference fasta file. +#' @param chain_file A `character`. A path to the chain file. +#' @param input_directory A `character`. The path to the VCFs. +#' @param output_directory A `character`. The path to the output directory. +#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, `tabix` and `bgzip` +#' @param nb_cores An `integer`. The number of CPUs to use. +#' +#' @return NULL +#' @export +#' +#' @examples +convert_assembly <- function( + ref_fasta, + chain_file, + input_directory , + output_directory, + bin_path = list( + CrossMap = "/usr/local/bin/CrossMap.py", + bcftools = "/usr/bin/bcftools", + tabix = "/usr/bin/tabix", + bgzip = "/usr/bin/bgzip" + ), + nb_cores = 1 +) { + + if (all(file_test("-f", input_directory ))) { + vcf_files <- input_directory + } else if(file_test("-d", input_directory )) { + vcf_files <- list.files(input_directory , pattern = "(.vcf.gz|.vcf)$", full.names = TRUE) + } else { + stop('"input_directory " contains file or directory which does not exist, please check!') + } + + if (!dir.exists(output_directory)) { + system(paste("sudo mkdir", output_directory)) + system(paste("sudo chmod g+w -R", output_directory)) + } + + ## convert to target assembly + invisible(parallel::mclapply(vcf_files, mc.cores = min(length(vcf_files), nb_cores), function(file_i) { + out_name <- file.path(output_directory, basename(file_i)) + system(paste( + bin_path[["CrossMap"]], + "vcf", + chain_file, + file_i, + ref_fasta, + out_name + )) + system(paste( + bin_path[["bcftools"]], + "sort", + "-Oz -o", out_name, + out_name + )) + system(paste(bin_path[["tabix"]], "-f -p vcf", out_name)) + })) + + ## resolve changed chromosomes + chrs <- c(1:25, "X", "Y", "M", "MT") + chrs <- c(chrs, paste0("chr", chrs)) + list_vcfs <- list.files(output_directory, pattern = "vcf.gz$", full.names = TRUE) + invisible(lapply(list_vcfs, function(vcf_i) { + chr_i_orig <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i)) + chr_list <- system(paste( + bin_path[["tabix"]], + "--list-chroms", + vcf_i + ), intern = TRUE) + chr_list <- intersect(chr_list, chrs) + + parallel::mclapply(chr_list, mc.cores = min(length(chr_list), nb_cores), function(chr_i) { + out_name <- file.path( + dirname(vcf_i), + gsub(chr_i_orig, paste0(chr_i, "_from_", chr_i_orig), basename(vcf_i)) + ) + system(paste( + bin_path[["bcftools"]], + "view", + "--regions", chr_i, + vcf_i, + "-Oz -o", out_name + )) + system(paste(bin_path[["tabix"]], "-f -p vcf", out_name)) + }) + })) + + list_vcfs_split <- list.files(output_directory, pattern = "from.*vcf.gz$", full.names = TRUE) + invisible(parallel::mclapply(list_vcfs, mc.cores = min(length(list_vcfs), nb_cores), function(vcf_i) { + chr_i <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i)) + vcf_to_bind <- list_vcfs_split[grep(paste0("^", chr_i, "_from"), basename(list_vcfs_split))] + vcf_tmp <- file.path(dirname(vcf_i), paste0("tempo_", basename(vcf_i))) + system(paste( + bin_path[["bcftools"]], + "concat -a", paste(vcf_to_bind, collapse = " "), + "-Oz -o", vcf_tmp + )) + unlink(paste0(vcf_i, ".tbi")) + system(paste( + bin_path[["bcftools"]], + "sort", + "-Oz -o", vcf_i, + vcf_tmp + )) + unlink(vcf_tmp) + system(paste(bin_path[["tabix"]], "-p vcf", vcf_i)) + })) + + unlink(list.files(output_directory, pattern = "_from_", full.names = TRUE)) +} From 3e18ceb765562ab574161c6b8bf55387031581ce Mon Sep 17 00:00:00 2001 From: NING-L Date: Wed, 9 Sep 2020 16:26:47 +0200 Subject: [PATCH 02/14] update namespace and docs --- DESCRIPTION | 3 ++- NAMESPACE | 1 + man/convert_assembly.Rd | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 man/convert_assembly.Rd diff --git a/DESCRIPTION b/DESCRIPTION index c02ab66..852e58d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,4 +46,5 @@ SystemRequirements: gcta64 (>= 1.93.2), perl (>= 5.28.1), plink1.9 (>= 1.90), - vcftools (>= 0.1.16) + vcftools (>= 0.1.16), + CrossMap (>= 0.4.3) - https://crossmap.readthedocs.io/en/latest/ diff --git a/NAMESPACE b/NAMESPACE index 9a9b6d8..096d4be 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,5 @@ # Generated by roxygen2: do not edit by hand +export(convert_assembly) export(qc_plink) export(qc_vcf) diff --git a/man/convert_assembly.Rd b/man/convert_assembly.Rd new file mode 100644 index 0000000..b13a7b3 --- /dev/null +++ b/man/convert_assembly.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convert_assembly.R +\name{convert_assembly} +\alias{convert_assembly} +\title{Convert VCF to different assembly with CrossMap} +\usage{ +convert_assembly( + ref_fasta, + chain_file, + input_directory, + output_directory, + bin_path = list(CrossMap = "/usr/local/bin/CrossMap.py", bcftools = + "/usr/bin/bcftools", tabix = "/usr/bin/tabix", bgzip = "/usr/bin/bgzip"), + nb_cores = 1 +) +} +\arguments{ +\item{ref_fasta}{A \code{character}. A path to the reference fasta file.} + +\item{chain_file}{A \code{character}. A path to the chain file.} + +\item{input_directory}{A \code{character}. The path to the VCFs.} + +\item{output_directory}{A \code{character}. The path to the output directory.} + +\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools}, \code{tabix} and \code{bgzip}} + +\item{nb_cores}{An \code{integer}. The number of CPUs to use.} +} +\description{ +Convert VCF to different assembly with CrossMap +} From 4dc7948b0af447f20a164d03afeae4ccceae565b Mon Sep 17 00:00:00 2001 From: lning Date: Wed, 9 Sep 2020 17:39:52 +0200 Subject: [PATCH 03/14] add prefix, modify doc, reindent --- R/convert_assembly.R | 58 +++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/R/convert_assembly.R b/R/convert_assembly.R index 4f3a6a5..1064437 100644 --- a/R/convert_assembly.R +++ b/R/convert_assembly.R @@ -2,15 +2,16 @@ #' #' @param ref_fasta A `character`. A path to the reference fasta file. #' @param chain_file A `character`. A path to the chain file. -#' @param input_directory A `character`. The path to the VCFs. +#' @param input_directory A `character`. The path to the VCFs, files should be prefixed by +#' chromosome name like: 1, 2, ..., 23 (X), 24 (Y), 25 (M or MT). #' @param output_directory A `character`. The path to the output directory. -#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, `tabix` and `bgzip` +#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, +#' `tabix` and `bgzip` #' @param nb_cores An `integer`. The number of CPUs to use. #' #' @return NULL -#' @export #' -#' @examples +#' @export convert_assembly <- function( ref_fasta, chain_file, @@ -25,17 +26,16 @@ convert_assembly <- function( nb_cores = 1 ) { - if (all(file_test("-f", input_directory ))) { + if (all(utils::file_test("-f", input_directory ))) { vcf_files <- input_directory - } else if(file_test("-d", input_directory )) { + } else if(utils::file_test("-d", input_directory )) { vcf_files <- list.files(input_directory , pattern = "(.vcf.gz|.vcf)$", full.names = TRUE) } else { - stop('"input_directory " contains file or directory which does not exist, please check!') + stop('"input_directory" contains file or directory which does not exist, please check!') } if (!dir.exists(output_directory)) { - system(paste("sudo mkdir", output_directory)) - system(paste("sudo chmod g+w -R", output_directory)) + dir.create(output_directory, showWarnings = FALSE, recursive = TRUE, mode = "0777") } ## convert to target assembly @@ -88,25 +88,27 @@ convert_assembly <- function( })) list_vcfs_split <- list.files(output_directory, pattern = "from.*vcf.gz$", full.names = TRUE) - invisible(parallel::mclapply(list_vcfs, mc.cores = min(length(list_vcfs), nb_cores), function(vcf_i) { - chr_i <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i)) - vcf_to_bind <- list_vcfs_split[grep(paste0("^", chr_i, "_from"), basename(list_vcfs_split))] - vcf_tmp <- file.path(dirname(vcf_i), paste0("tempo_", basename(vcf_i))) - system(paste( - bin_path[["bcftools"]], - "concat -a", paste(vcf_to_bind, collapse = " "), - "-Oz -o", vcf_tmp - )) - unlink(paste0(vcf_i, ".tbi")) - system(paste( - bin_path[["bcftools"]], - "sort", - "-Oz -o", vcf_i, - vcf_tmp - )) - unlink(vcf_tmp) - system(paste(bin_path[["tabix"]], "-p vcf", vcf_i)) - })) + invisible(parallel::mclapply( + list_vcfs, mc.cores = min(length(list_vcfs), nb_cores), mc.preschedule = FALSE, function(vcf_i) { + chr_i <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i)) + vcf_to_bind <- list_vcfs_split[grep(paste0("^", chr_i, "_from"), basename(list_vcfs_split))] + vcf_tmp <- file.path(dirname(vcf_i), paste0("tempo_", basename(vcf_i))) + system(paste( + bin_path[["bcftools"]], + "concat -a", paste(vcf_to_bind, collapse = " "), + "-Oz -o", vcf_tmp + )) + unlink(paste0(vcf_i, ".tbi")) + system(paste( + bin_path[["bcftools"]], + "sort", + "-Oz -o", vcf_i, + vcf_tmp + )) + unlink(vcf_tmp) + system(paste(bin_path[["tabix"]], "-p vcf", vcf_i)) + } + )) unlink(list.files(output_directory, pattern = "_from_", full.names = TRUE)) } From 5fe9fcd34e285a8e709e0945ee69760332edb487 Mon Sep 17 00:00:00 2001 From: lning Date: Wed, 9 Sep 2020 17:43:13 +0200 Subject: [PATCH 04/14] update doc --- man/convert_assembly.Rd | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/man/convert_assembly.Rd b/man/convert_assembly.Rd index b13a7b3..7d953c6 100644 --- a/man/convert_assembly.Rd +++ b/man/convert_assembly.Rd @@ -19,11 +19,13 @@ convert_assembly( \item{chain_file}{A \code{character}. A path to the chain file.} -\item{input_directory}{A \code{character}. The path to the VCFs.} +\item{input_directory}{A \code{character}. The path to the VCFs, files should be prefixed by +chromosome name like: 1, 2, ..., 23 (X), 24 (Y), 25 (M or MT).} \item{output_directory}{A \code{character}. The path to the output directory.} -\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools}, \code{tabix} and \code{bgzip}} +\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools}, +\code{tabix} and \code{bgzip}} \item{nb_cores}{An \code{integer}. The number of CPUs to use.} } From 02590cb6a29e614cead56dbbee60c13bbb7657c6 Mon Sep 17 00:00:00 2001 From: lning Date: Wed, 9 Sep 2020 18:01:01 +0200 Subject: [PATCH 05/14] update readme --- README.Rmd | 2 ++ README.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.Rmd b/README.Rmd index 3e9cc29..d9b814e 100644 --- a/README.Rmd +++ b/README.Rmd @@ -38,6 +38,8 @@ remotes::install_github("mcanouil/dgapaq") using a [rmarkdown template](inst/rmarkdown/templates/qc_plink/skeleton/skeleton.Rmd). * `qc_vcf()` allows to compute post-imputation quality-control report using a default [rmarkdown template](inst/rmarkdown/templates/qc_vcf/skeleton/skeleton.Rmd). +* `convert_assembly()` allows to convert VCFs to target genome assembly using the software + [CrossMap](https://crossmap.readthedocs.io/en/latest/). ## Getting help diff --git a/README.md b/README.md index 16f26df..6972cb1 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,8 @@ remotes::install_github("mcanouil/dgapaq") - `qc_vcf()` allows to compute post-imputation quality-control report using a default [rmarkdown template](inst/rmarkdown/templates/qc_vcf/skeleton/skeleton.Rmd). + - `convert_assembly()` allows to convert VCFs to target genome assembly + using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/). ## Getting help From f6c88f3f3cacf38079c8e1278c1921a8463fb1a4 Mon Sep 17 00:00:00 2001 From: lning Date: Wed, 9 Sep 2020 18:14:51 +0200 Subject: [PATCH 06/14] add tabix --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 852e58d..459a246 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,6 +42,7 @@ RoxygenNote: 7.1.0 SystemRequirements: pandoc (>= 1.12.3) - http://pandoc.org, bcftools (>= 1.10.2), + tabix (>= 1.10.2), bgzip (>= 1.10.2), gcta64 (>= 1.93.2), perl (>= 5.28.1), From 24a4c97b2fbf016cf32d359ce986fe33ebe59d61 Mon Sep 17 00:00:00 2001 From: lning Date: Wed, 9 Sep 2020 18:25:05 +0200 Subject: [PATCH 07/14] remove extra space --- R/convert_assembly.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/convert_assembly.R b/R/convert_assembly.R index 1064437..e3905cf 100644 --- a/R/convert_assembly.R +++ b/R/convert_assembly.R @@ -26,9 +26,9 @@ convert_assembly <- function( nb_cores = 1 ) { - if (all(utils::file_test("-f", input_directory ))) { + if (all(utils::file_test("-f", input_directory))) { vcf_files <- input_directory - } else if(utils::file_test("-d", input_directory )) { + } else if (utils::file_test("-d", input_directory)) { vcf_files <- list.files(input_directory , pattern = "(.vcf.gz|.vcf)$", full.names = TRUE) } else { stop('"input_directory" contains file or directory which does not exist, please check!') From 58018b1528e3045dbef4c218d924478959bf7f19 Mon Sep 17 00:00:00 2001 From: NING-L Date: Thu, 10 Sep 2020 13:45:34 +0200 Subject: [PATCH 08/14] remove temp files right after merged --- R/convert_assembly.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/convert_assembly.R b/R/convert_assembly.R index e3905cf..2c03240 100644 --- a/R/convert_assembly.R +++ b/R/convert_assembly.R @@ -98,7 +98,7 @@ convert_assembly <- function( "concat -a", paste(vcf_to_bind, collapse = " "), "-Oz -o", vcf_tmp )) - unlink(paste0(vcf_i, ".tbi")) + unlink(c(paste0(vcf_i, ".tbi"), vcf_to_bind)) system(paste( bin_path[["bcftools"]], "sort", @@ -110,5 +110,4 @@ convert_assembly <- function( } )) - unlink(list.files(output_directory, pattern = "_from_", full.names = TRUE)) } From cfe2d6b38bbc12479330985c1dba0b3b6161b5bd Mon Sep 17 00:00:00 2001 From: NING-L Date: Thu, 10 Sep 2020 13:50:09 +0200 Subject: [PATCH 09/14] update new feature --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index 997b12d..e9708a4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# dgapaq 0.6.0 + +## New feature + +* `convert_assembly()` allows to convert VCFs to target genome assembly + using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/). + # dgapaq 0.5.1 * In [rmarkdown templates](inst/rmarkdown/templates), From 5a295c4c6e1dabddfb4586d30349e99be2120270 Mon Sep 17 00:00:00 2001 From: NING-L Date: Thu, 10 Sep 2020 14:01:05 +0200 Subject: [PATCH 10/14] update new feature --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 459a246..83c8e46 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: dgapaq Title: DNA Genotyping Arrays Processing And Quality-Control -Version: 0.5.1 +Version: 0.6.0 Authors@R: c(person(given = "Mickaël", family = "Canouil", @@ -38,7 +38,7 @@ Suggests: Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 SystemRequirements: pandoc (>= 1.12.3) - http://pandoc.org, bcftools (>= 1.10.2), From 534125c1f01f40666cb5e33d51d2639423b7ef23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= Date: Tue, 20 Oct 2020 13:03:16 +0200 Subject: [PATCH 11/14] dev version --- DESCRIPTION | 2 +- NEWS.md | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 83c8e46..5fbb2d1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: dgapaq Title: DNA Genotyping Arrays Processing And Quality-Control -Version: 0.6.0 +Version: 0.5.2.9000 Authors@R: c(person(given = "Mickaël", family = "Canouil", diff --git a/NEWS.md b/NEWS.md index e9708a4..af7f131 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,17 @@ -# dgapaq 0.6.0 +# dgapaq (development version) ## New feature -* `convert_assembly()` allows to convert VCFs to target genome assembly +* In `R/convert_assembly.R`, + + `convert_assembly()` allows to convert VCFs to target genome assembly using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/). +# dgapaq 0.5.2 + +* In [rmarkdown templates](inst/rmarkdown/templates), + + Small refactoring. + + Ensure FID and IID are of type character. + # dgapaq 0.5.1 * In [rmarkdown templates](inst/rmarkdown/templates), From 8842985ed592be2541444c3404785b52b354f4fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= Date: Tue, 20 Oct 2020 13:24:41 +0200 Subject: [PATCH 12/14] rewrite doc and stop condition --- R/convert_assembly.R | 78 ++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/R/convert_assembly.R b/R/convert_assembly.R index 2c03240..bf87638 100644 --- a/R/convert_assembly.R +++ b/R/convert_assembly.R @@ -1,22 +1,21 @@ #' Convert VCF to different assembly with CrossMap #' -#' @param ref_fasta A `character`. A path to the reference fasta file. -#' @param chain_file A `character`. A path to the chain file. -#' @param input_directory A `character`. The path to the VCFs, files should be prefixed by -#' chromosome name like: 1, 2, ..., 23 (X), 24 (Y), 25 (M or MT). -#' @param output_directory A `character`. The path to the output directory. -#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, -#' `tabix` and `bgzip` +#' @param input_directory A `character`. The path to the VCFs files or directory. Default is `NULL`. +#' @param output_directory A `character`. The path to the output directory. Default is `NULL`. +#' @param ref_fasta A `character`. A path to the reference fasta file. Default is `NULL`. +#' @param chain_file A `character`. A path to the chain file. Default is `NULL`. +#' @param bin_path A `list(character)`. A list giving the binary path of +#' `CrossMap`, `bcftools`, `tabix` and `bgzip`. #' @param nb_cores An `integer`. The number of CPUs to use. #' #' @return NULL #' #' @export convert_assembly <- function( - ref_fasta, - chain_file, - input_directory , - output_directory, + input_directory = NULL, + output_directory = NULL, + ref_fasta = NULL, + chain_file = NULL, bin_path = list( CrossMap = "/usr/local/bin/CrossMap.py", bcftools = "/usr/bin/bcftools", @@ -26,37 +25,44 @@ convert_assembly <- function( nb_cores = 1 ) { - if (all(utils::file_test("-f", input_directory))) { + if (length(input_directory) == 1 & length(list.files(input_directory, pattern = "(.vcf.gz|.vcf)$")) <= 1) { + stop('"input_directory" must be a directory of VCF files splitted by chromosome!') + } + + if ( + length(input_directory) > 1 & + !all(file.exists(input_directory)) & + !all(grepl("(.vcf.gz|.vcf)$", input_directory)) + ) { + stop('"input_directory" must be a vector of VCF files splitted by chromosome!') + } + + if (length(input_directory) > 1) { vcf_files <- input_directory - } else if (utils::file_test("-d", input_directory)) { - vcf_files <- list.files(input_directory , pattern = "(.vcf.gz|.vcf)$", full.names = TRUE) } else { - stop('"input_directory" contains file or directory which does not exist, please check!') + vcf_files <- list.files(input_directory, pattern = "(.vcf.gz|.vcf)$", full.names = TRUE) } - if (!dir.exists(output_directory)) { - dir.create(output_directory, showWarnings = FALSE, recursive = TRUE, mode = "0777") - } + temp_directory <- file.path(tempdir(), "convert_assembly") + dir.create(path = temp_directory, showWarnings = FALSE) ## convert to target assembly - invisible(parallel::mclapply(vcf_files, mc.cores = min(length(vcf_files), nb_cores), function(file_i) { - out_name <- file.path(output_directory, basename(file_i)) - system(paste( - bin_path[["CrossMap"]], - "vcf", - chain_file, - file_i, - ref_fasta, - out_name - )) - system(paste( - bin_path[["bcftools"]], - "sort", - "-Oz -o", out_name, - out_name - )) - system(paste(bin_path[["tabix"]], "-f -p vcf", out_name)) - })) + invisible(parallel::mclapply( + X = vcf_files, + mc.cores = min(length(vcf_files), nb_cores), + FUN = function(file_i) { + output_file <- file.path(temp_directory, basename(file_i)) + system( + intern = TRUE, wait = TRUE, + command = paste(bin_path[["CrossMap"]], "vcf", chain_file, file_i, ref_fasta, output_file) + ) + system( + intern = TRUE, wait = TRUE, + command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", output_file, output_file) + ) + system(intern = TRUE, wait = TRUE, command = paste(bin_path[["tabix"]], "-f -p vcf", output_file)) + } + )) ## resolve changed chromosomes chrs <- c(1:25, "X", "Y", "M", "MT") From aa8e816681783ccd0be52bc788759881db6af527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= Date: Tue, 20 Oct 2020 14:19:32 +0200 Subject: [PATCH 13/14] small refactoring and error condition --- R/convert_assembly.R | 127 ++++++++++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 51 deletions(-) diff --git a/R/convert_assembly.R b/R/convert_assembly.R index bf87638..976197f 100644 --- a/R/convert_assembly.R +++ b/R/convert_assembly.R @@ -4,8 +4,7 @@ #' @param output_directory A `character`. The path to the output directory. Default is `NULL`. #' @param ref_fasta A `character`. A path to the reference fasta file. Default is `NULL`. #' @param chain_file A `character`. A path to the chain file. Default is `NULL`. -#' @param bin_path A `list(character)`. A list giving the binary path of -#' `CrossMap`, `bcftools`, `tabix` and `bgzip`. +#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, `tabix` and `bgzip`. #' @param nb_cores An `integer`. The number of CPUs to use. #' #' @return NULL @@ -17,7 +16,7 @@ convert_assembly <- function( ref_fasta = NULL, chain_file = NULL, bin_path = list( - CrossMap = "/usr/local/bin/CrossMap.py", + crossmap = "/usr/local/bin/CrossMap.py", bcftools = "/usr/bin/bcftools", tabix = "/usr/bin/tabix", bgzip = "/usr/bin/bgzip" @@ -25,7 +24,19 @@ convert_assembly <- function( nb_cores = 1 ) { - if (length(input_directory) == 1 & length(list.files(input_directory, pattern = "(.vcf.gz|.vcf)$")) <= 1) { + names(bin_path) <- tolower(names(bin_path)) + + if ( + length(intersect(names(bin_path), c("crossmap", "bcftools", "tabix", "bgzip"))) != 4 | + !all(sapply(bin_path, file.exists)) + ) { + stop('"bin_path" must contains valid named path to "crossmap", "bcftools", "tabix" and "bgzip"!') + } + + if ( + length(input_directory) == 1 & + length(list.files(input_directory, pattern = "(.vcf.gz|.vcf)$")) <= 1 + ) { stop('"input_directory" must be a directory of VCF files splitted by chromosome!') } @@ -53,67 +64,81 @@ convert_assembly <- function( FUN = function(file_i) { output_file <- file.path(temp_directory, basename(file_i)) system( - intern = TRUE, wait = TRUE, - command = paste(bin_path[["CrossMap"]], "vcf", chain_file, file_i, ref_fasta, output_file) + command = paste(bin_path[["crossmap"]], "vcf", chain_file, file_i, ref_fasta, output_file), + wait = TRUE ) system( - intern = TRUE, wait = TRUE, - command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", output_file, output_file) + command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", output_file, output_file), + wait = TRUE + ) + system( + command = paste(bin_path[["tabix"]], "-f -p vcf", output_file), + wait = TRUE ) - system(intern = TRUE, wait = TRUE, command = paste(bin_path[["tabix"]], "-f -p vcf", output_file)) } )) ## resolve changed chromosomes - chrs <- c(1:25, "X", "Y", "M", "MT") - chrs <- c(chrs, paste0("chr", chrs)) - list_vcfs <- list.files(output_directory, pattern = "vcf.gz$", full.names = TRUE) + chrs <- paste0(rep(c("", "chr"), each = 29), c(1:25, "X", "Y", "M", "MT")) + list_vcfs <- list.files(temp_directory, pattern = "vcf.gz$", full.names = TRUE) invisible(lapply(list_vcfs, function(vcf_i) { - chr_i_orig <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i)) - chr_list <- system(paste( - bin_path[["tabix"]], - "--list-chroms", - vcf_i - ), intern = TRUE) + chr_i_orig <- gsub(paste0("^(", paste(chrs, collapse = "|"), ").*"), "\\1", basename(vcf_i)) + chr_list <- system( + command = paste(bin_path[["tabix"]], "--list-chroms", vcf_i), + intern = TRUE, + wait = TRUE + ) chr_list <- intersect(chr_list, chrs) - parallel::mclapply(chr_list, mc.cores = min(length(chr_list), nb_cores), function(chr_i) { - out_name <- file.path( - dirname(vcf_i), - gsub(chr_i_orig, paste0(chr_i, "_from_", chr_i_orig), basename(vcf_i)) - ) - system(paste( - bin_path[["bcftools"]], - "view", - "--regions", chr_i, - vcf_i, - "-Oz -o", out_name - )) - system(paste(bin_path[["tabix"]], "-f -p vcf", out_name)) - }) + parallel::mclapply( + X = chr_list, + mc.cores = min(length(chr_list), nb_cores), + mc.preschedule = FALSE, + FUN = function(chr_i) { + output_file <- file.path( + temp_directory, + gsub(chr_i_orig, paste0(chr_i, "_from_", chr_i_orig), basename(vcf_i)) + ) + system( + command = paste(bin_path[["bcftools"]], "view", "--regions", chr_i, vcf_i, "-Oz -o", output_file), + wait = TRUE + ) + system( + command = paste(bin_path[["tabix"]], "-f -p vcf", output_file), + wait = TRUE + ) + } + ) })) - list_vcfs_split <- list.files(output_directory, pattern = "from.*vcf.gz$", full.names = TRUE) invisible(parallel::mclapply( - list_vcfs, mc.cores = min(length(list_vcfs), nb_cores), mc.preschedule = FALSE, function(vcf_i) { - chr_i <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i)) - vcf_to_bind <- list_vcfs_split[grep(paste0("^", chr_i, "_from"), basename(list_vcfs_split))] - vcf_tmp <- file.path(dirname(vcf_i), paste0("tempo_", basename(vcf_i))) - system(paste( - bin_path[["bcftools"]], - "concat -a", paste(vcf_to_bind, collapse = " "), - "-Oz -o", vcf_tmp - )) - unlink(c(paste0(vcf_i, ".tbi"), vcf_to_bind)) - system(paste( - bin_path[["bcftools"]], - "sort", - "-Oz -o", vcf_i, - vcf_tmp - )) - unlink(vcf_tmp) - system(paste(bin_path[["tabix"]], "-p vcf", vcf_i)) + X = list_vcfs, + mc.cores = min(length(list_vcfs), nb_cores), + mc.preschedule = FALSE, + FUN = function(vcf_i) { + chr_i <- gsub(paste0("^(", paste(chrs, collapse = "|"), ").*"), "\\1", basename(vcf_i)) + vcf_to_bind <- paste( + list.files(temp_directory, pattern = paste0(chr_i, "_from_.*vcf.gz$"), full.names = TRUE), + collapse = " " + ) + + vcf_tmp <- file.path(temp_directory, paste0("temp_", basename(vcf_i))) + vcf_out <- file.path(output_directory, basename(vcf_i)) + + system( + command = paste(bin_path[["bcftools"]], "concat -a", vcf_to_bind, "-Oz -o", vcf_tmp), + wait = TRUE + ) + system( + command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", vcf_out, vcf_tmp), + wait = TRUE + ) + system( + command = paste(bin_path[["tabix"]], "-f -p vcf", vcf_out), + wait = TRUE + ) } )) + invisible(unlink(temp_directory, recursive = TRUE, force = TRUE)) } From 3ac1330719ce62a53497ad7282ce5723c42faed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= Date: Tue, 20 Oct 2020 14:20:01 +0200 Subject: [PATCH 14/14] update doc --- man/convert_assembly.Rd | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/man/convert_assembly.Rd b/man/convert_assembly.Rd index 7d953c6..8f10a31 100644 --- a/man/convert_assembly.Rd +++ b/man/convert_assembly.Rd @@ -5,27 +5,25 @@ \title{Convert VCF to different assembly with CrossMap} \usage{ convert_assembly( - ref_fasta, - chain_file, - input_directory, - output_directory, - bin_path = list(CrossMap = "/usr/local/bin/CrossMap.py", bcftools = + input_directory = NULL, + output_directory = NULL, + ref_fasta = NULL, + chain_file = NULL, + bin_path = list(crossmap = "/usr/local/bin/CrossMap.py", bcftools = "/usr/bin/bcftools", tabix = "/usr/bin/tabix", bgzip = "/usr/bin/bgzip"), nb_cores = 1 ) } \arguments{ -\item{ref_fasta}{A \code{character}. A path to the reference fasta file.} +\item{input_directory}{A \code{character}. The path to the VCFs files or directory. Default is \code{NULL}.} -\item{chain_file}{A \code{character}. A path to the chain file.} +\item{output_directory}{A \code{character}. The path to the output directory. Default is \code{NULL}.} -\item{input_directory}{A \code{character}. The path to the VCFs, files should be prefixed by -chromosome name like: 1, 2, ..., 23 (X), 24 (Y), 25 (M or MT).} +\item{ref_fasta}{A \code{character}. A path to the reference fasta file. Default is \code{NULL}.} -\item{output_directory}{A \code{character}. The path to the output directory.} +\item{chain_file}{A \code{character}. A path to the chain file. Default is \code{NULL}.} -\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools}, -\code{tabix} and \code{bgzip}} +\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools}, \code{tabix} and \code{bgzip}.} \item{nb_cores}{An \code{integer}. The number of CPUs to use.} }