Merge pull request #1 from Ning-L/vcf_assembly_convert

new feature to convert VCF between genome assemblies
umr1283 · Oct 20, 2020 · 75eb107 · 75eb107
2 parents ca9b271 + 085a64f
commit 75eb107
Show file tree

Hide file tree

Showing 7 changed files with 191 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -38,12 +38,14 @@ Suggests:
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
 SystemRequirements: 
     pandoc (>= 1.12.3) - http://pandoc.org, 
     bcftools (>= 1.10.2), 
+    tabix (>= 1.10.2), 
     bgzip (>= 1.10.2), 
     gcta64 (>= 1.93.2), 
     perl (>= 5.28.1), 
     plink1.9 (>= 1.90),
-    vcftools (>= 0.1.16)
+    vcftools (>= 0.1.16),
+    CrossMap (>= 0.4.3) - https://crossmap.readthedocs.io/en/latest/
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,4 +1,5 @@
 # Generated by roxygen2: do not edit by hand
 
+export(convert_assembly)
 export(qc_plink)
 export(qc_vcf)
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,11 @@
 # dgapaq (development version)
 
+## New feature
+
+* In `R/convert_assembly.R`, 
+    + `convert_assembly()` allows to convert VCFs to target genome assembly 
+    using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/).
+
 # dgapaq 0.5.2
 
 * In [rmarkdown templates](inst/rmarkdown/templates),

diff --git a/R/convert_assembly.R b/R/convert_assembly.R
@@ -0,0 +1,144 @@
+#' Convert VCF to different assembly with CrossMap
+#'
+#' @param input_directory  A `character`. The path to the VCFs files or directory. Default is `NULL`.
+#' @param output_directory A `character`. The path to the output directory. Default is `NULL`.
+#' @param ref_fasta A `character`. A path to the reference fasta file. Default is `NULL`.
+#' @param chain_file A `character`. A path to the chain file. Default is `NULL`.
+#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, `tabix` and `bgzip`.
+#' @param nb_cores An `integer`. The number of CPUs to use.
+#'
+#' @return NULL
+#'
+#' @export
+convert_assembly <- function(
+  input_directory = NULL,
+  output_directory = NULL,
+  ref_fasta = NULL,
+  chain_file = NULL,
+  bin_path = list(
+    crossmap = "/usr/local/bin/CrossMap.py",
+    bcftools = "/usr/bin/bcftools",
+    tabix = "/usr/bin/tabix",
+    bgzip = "/usr/bin/bgzip"
+  ),
+  nb_cores = 1
+) {
+
+  names(bin_path) <- tolower(names(bin_path))
+
+  if (
+    length(intersect(names(bin_path), c("crossmap", "bcftools", "tabix", "bgzip"))) != 4 |
+      !all(sapply(bin_path, file.exists))
+  ) {
+    stop('"bin_path" must contains valid named path to "crossmap", "bcftools", "tabix" and "bgzip"!')
+  }
+
+  if (
+    length(input_directory) == 1 &
+      length(list.files(input_directory, pattern = "(.vcf.gz|.vcf)$")) <= 1
+  ) {
+    stop('"input_directory" must be a directory of VCF files splitted by chromosome!')
+  }
+
+  if (
+    length(input_directory) > 1 &
+      !all(file.exists(input_directory)) &
+      !all(grepl("(.vcf.gz|.vcf)$", input_directory))
+  ) {
+    stop('"input_directory" must be a vector of VCF files splitted by chromosome!')
+  }
+
+  if (length(input_directory) > 1) {
+    vcf_files <- input_directory
+  } else {
+    vcf_files <- list.files(input_directory, pattern = "(.vcf.gz|.vcf)$", full.names = TRUE)
+  }
+
+  temp_directory <- file.path(tempdir(), "convert_assembly")
+  dir.create(path = temp_directory, showWarnings = FALSE)
+
+  ## convert to target assembly
+  invisible(parallel::mclapply(
+    X = vcf_files,
+    mc.cores = min(length(vcf_files), nb_cores),
+    FUN = function(file_i) {
+      output_file <- file.path(temp_directory, basename(file_i))
+      system(
+        command = paste(bin_path[["crossmap"]], "vcf", chain_file, file_i, ref_fasta, output_file),
+        wait = TRUE
+      )
+      system(
+        command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", output_file, output_file),
+        wait = TRUE
+      )
+      system(
+        command = paste(bin_path[["tabix"]], "-f -p vcf", output_file),
+        wait = TRUE
+      )
+    }
+  ))
+
+  ## resolve changed chromosomes
+  chrs <- paste0(rep(c("", "chr"), each = 29), c(1:25, "X", "Y", "M", "MT"))
+  list_vcfs <- list.files(temp_directory, pattern = "vcf.gz$", full.names = TRUE)
+  invisible(lapply(list_vcfs, function(vcf_i) {
+    chr_i_orig <- gsub(paste0("^(", paste(chrs, collapse = "|"), ").*"), "\\1", basename(vcf_i))
+    chr_list <- system(
+      command = paste(bin_path[["tabix"]], "--list-chroms", vcf_i),
+      intern = TRUE,
+      wait = TRUE
+    )
+    chr_list <- intersect(chr_list, chrs)
+
+    parallel::mclapply(
+      X = chr_list,
+      mc.cores = min(length(chr_list), nb_cores),
+      mc.preschedule = FALSE,
+      FUN = function(chr_i) {
+        output_file <- file.path(
+          temp_directory,
+          gsub(chr_i_orig, paste0(chr_i, "_from_", chr_i_orig), basename(vcf_i))
+        )
+        system(
+          command = paste(bin_path[["bcftools"]], "view", "--regions", chr_i, vcf_i, "-Oz -o", output_file),
+          wait = TRUE
+        )
+        system(
+          command = paste(bin_path[["tabix"]], "-f -p vcf", output_file),
+          wait = TRUE
+        )
+      }
+    )
+  }))
+
+  invisible(parallel::mclapply(
+    X = list_vcfs,
+    mc.cores = min(length(list_vcfs), nb_cores),
+    mc.preschedule = FALSE,
+    FUN = function(vcf_i) {
+      chr_i <- gsub(paste0("^(", paste(chrs, collapse = "|"), ").*"), "\\1", basename(vcf_i))
+      vcf_to_bind <- paste(
+        list.files(temp_directory, pattern = paste0(chr_i, "_from_.*vcf.gz$"), full.names = TRUE),
+        collapse = " "
+      )
+
+      vcf_tmp <- file.path(temp_directory, paste0("temp_", basename(vcf_i)))
+      vcf_out <- file.path(output_directory, basename(vcf_i))
+
+      system(
+        command = paste(bin_path[["bcftools"]], "concat -a", vcf_to_bind, "-Oz -o", vcf_tmp),
+        wait = TRUE
+      )
+      system(
+        command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", vcf_out, vcf_tmp),
+        wait = TRUE
+      )
+      system(
+        command = paste(bin_path[["tabix"]], "-f -p  vcf", vcf_out),
+        wait = TRUE
+      )
+    }
+  ))
+
+  invisible(unlink(temp_directory, recursive = TRUE, force = TRUE))
+}
diff --git a/README.Rmd b/README.Rmd
@@ -38,6 +38,8 @@ remotes::install_github("mcanouil/dgapaq")
     using a [rmarkdown template](inst/rmarkdown/templates/qc_plink/skeleton/skeleton.Rmd).
 * `qc_vcf()` allows to compute post-imputation quality-control report 
     using a default [rmarkdown template](inst/rmarkdown/templates/qc_vcf/skeleton/skeleton.Rmd).
+* `convert_assembly()` allows to convert VCFs to target genome assembly using the software 
+    [CrossMap](https://crossmap.readthedocs.io/en/latest/).
 
 ## Getting help
 

diff --git a/README.md b/README.md
@@ -30,6 +30,8 @@ remotes::install_github("mcanouil/dgapaq")
   - `qc_vcf()` allows to compute post-imputation quality-control report
     using a default [rmarkdown
     template](inst/rmarkdown/templates/qc_vcf/skeleton/skeleton.Rmd).
+  - `convert_assembly()` allows to convert VCFs to target genome assembly 
+    using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/).
 
 ## Getting help
 

diff --git a/man/convert_assembly.Rd b/man/convert_assembly.Rd