Skip to content
This repository has been archived by the owner on Jun 12, 2022. It is now read-only.

Commit

Permalink
Merge pull request #1 from Ning-L/vcf_assembly_convert
Browse files Browse the repository at this point in the history
new feature to convert VCF between genome assemblies
  • Loading branch information
Mickaël Canouil authored Oct 20, 2020
2 parents ca9b271 + 085a64f commit 75eb107
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 2 deletions.
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ Suggests:
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.0
RoxygenNote: 7.1.1
SystemRequirements:
pandoc (>= 1.12.3) - http://pandoc.org,
bcftools (>= 1.10.2),
tabix (>= 1.10.2),
bgzip (>= 1.10.2),
gcta64 (>= 1.93.2),
perl (>= 5.28.1),
plink1.9 (>= 1.90),
vcftools (>= 0.1.16)
vcftools (>= 0.1.16),
CrossMap (>= 0.4.3) - https://crossmap.readthedocs.io/en/latest/
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Generated by roxygen2: do not edit by hand

export(convert_assembly)
export(qc_plink)
export(qc_vcf)
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# dgapaq (development version)

## New feature

* In `R/convert_assembly.R`,
+ `convert_assembly()` allows to convert VCFs to target genome assembly
using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/).

# dgapaq 0.5.2

* In [rmarkdown templates](inst/rmarkdown/templates),
Expand Down
144 changes: 144 additions & 0 deletions R/convert_assembly.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#' Convert VCF to different assembly with CrossMap
#'
#' @param input_directory A `character`. The path to the VCFs files or directory. Default is `NULL`.
#' @param output_directory A `character`. The path to the output directory. Default is `NULL`.
#' @param ref_fasta A `character`. A path to the reference fasta file. Default is `NULL`.
#' @param chain_file A `character`. A path to the chain file. Default is `NULL`.
#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, `tabix` and `bgzip`.
#' @param nb_cores An `integer`. The number of CPUs to use.
#'
#' @return NULL
#'
#' @export
convert_assembly <- function(
input_directory = NULL,
output_directory = NULL,
ref_fasta = NULL,
chain_file = NULL,
bin_path = list(
crossmap = "/usr/local/bin/CrossMap.py",
bcftools = "/usr/bin/bcftools",
tabix = "/usr/bin/tabix",
bgzip = "/usr/bin/bgzip"
),
nb_cores = 1
) {

names(bin_path) <- tolower(names(bin_path))

if (
length(intersect(names(bin_path), c("crossmap", "bcftools", "tabix", "bgzip"))) != 4 |
!all(sapply(bin_path, file.exists))
) {
stop('"bin_path" must contains valid named path to "crossmap", "bcftools", "tabix" and "bgzip"!')
}

if (
length(input_directory) == 1 &
length(list.files(input_directory, pattern = "(.vcf.gz|.vcf)$")) <= 1
) {
stop('"input_directory" must be a directory of VCF files splitted by chromosome!')
}

if (
length(input_directory) > 1 &
!all(file.exists(input_directory)) &
!all(grepl("(.vcf.gz|.vcf)$", input_directory))
) {
stop('"input_directory" must be a vector of VCF files splitted by chromosome!')
}

if (length(input_directory) > 1) {
vcf_files <- input_directory
} else {
vcf_files <- list.files(input_directory, pattern = "(.vcf.gz|.vcf)$", full.names = TRUE)
}

temp_directory <- file.path(tempdir(), "convert_assembly")
dir.create(path = temp_directory, showWarnings = FALSE)

## convert to target assembly
invisible(parallel::mclapply(
X = vcf_files,
mc.cores = min(length(vcf_files), nb_cores),
FUN = function(file_i) {
output_file <- file.path(temp_directory, basename(file_i))
system(
command = paste(bin_path[["crossmap"]], "vcf", chain_file, file_i, ref_fasta, output_file),
wait = TRUE
)
system(
command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", output_file, output_file),
wait = TRUE
)
system(
command = paste(bin_path[["tabix"]], "-f -p vcf", output_file),
wait = TRUE
)
}
))

## resolve changed chromosomes
chrs <- paste0(rep(c("", "chr"), each = 29), c(1:25, "X", "Y", "M", "MT"))
list_vcfs <- list.files(temp_directory, pattern = "vcf.gz$", full.names = TRUE)
invisible(lapply(list_vcfs, function(vcf_i) {
chr_i_orig <- gsub(paste0("^(", paste(chrs, collapse = "|"), ").*"), "\\1", basename(vcf_i))
chr_list <- system(
command = paste(bin_path[["tabix"]], "--list-chroms", vcf_i),
intern = TRUE,
wait = TRUE
)
chr_list <- intersect(chr_list, chrs)

parallel::mclapply(
X = chr_list,
mc.cores = min(length(chr_list), nb_cores),
mc.preschedule = FALSE,
FUN = function(chr_i) {
output_file <- file.path(
temp_directory,
gsub(chr_i_orig, paste0(chr_i, "_from_", chr_i_orig), basename(vcf_i))
)
system(
command = paste(bin_path[["bcftools"]], "view", "--regions", chr_i, vcf_i, "-Oz -o", output_file),
wait = TRUE
)
system(
command = paste(bin_path[["tabix"]], "-f -p vcf", output_file),
wait = TRUE
)
}
)
}))

invisible(parallel::mclapply(
X = list_vcfs,
mc.cores = min(length(list_vcfs), nb_cores),
mc.preschedule = FALSE,
FUN = function(vcf_i) {
chr_i <- gsub(paste0("^(", paste(chrs, collapse = "|"), ").*"), "\\1", basename(vcf_i))
vcf_to_bind <- paste(
list.files(temp_directory, pattern = paste0(chr_i, "_from_.*vcf.gz$"), full.names = TRUE),
collapse = " "
)

vcf_tmp <- file.path(temp_directory, paste0("temp_", basename(vcf_i)))
vcf_out <- file.path(output_directory, basename(vcf_i))

system(
command = paste(bin_path[["bcftools"]], "concat -a", vcf_to_bind, "-Oz -o", vcf_tmp),
wait = TRUE
)
system(
command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", vcf_out, vcf_tmp),
wait = TRUE
)
system(
command = paste(bin_path[["tabix"]], "-f -p vcf", vcf_out),
wait = TRUE
)
}
))

invisible(unlink(temp_directory, recursive = TRUE, force = TRUE))
}
2 changes: 2 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ remotes::install_github("mcanouil/dgapaq")
using a [rmarkdown template](inst/rmarkdown/templates/qc_plink/skeleton/skeleton.Rmd).
* `qc_vcf()` allows to compute post-imputation quality-control report
using a default [rmarkdown template](inst/rmarkdown/templates/qc_vcf/skeleton/skeleton.Rmd).
* `convert_assembly()` allows to convert VCFs to target genome assembly using the software
[CrossMap](https://crossmap.readthedocs.io/en/latest/).

## Getting help

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ remotes::install_github("mcanouil/dgapaq")
- `qc_vcf()` allows to compute post-imputation quality-control report
using a default [rmarkdown
template](inst/rmarkdown/templates/qc_vcf/skeleton/skeleton.Rmd).
- `convert_assembly()` allows to convert VCFs to target genome assembly
using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/).

## Getting help

Expand Down
32 changes: 32 additions & 0 deletions man/convert_assembly.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 75eb107

Please sign in to comment.