From cae73bf61d0dd7af05839add09f1ce2867dfb1cc Mon Sep 17 00:00:00 2001
From: NING-L <lijiao.ning91@gmail.com>
Date: Wed, 9 Sep 2020 16:19:57 +0200
Subject: [PATCH 01/14] convert vcf to target assembly

---
 R/convert_assembly.R | 112 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 R/convert_assembly.R

diff --git a/R/convert_assembly.R b/R/convert_assembly.R
new file mode 100644
index 0000000..4f3a6a5
--- /dev/null
+++ b/R/convert_assembly.R
@@ -0,0 +1,112 @@
+#' Convert VCF to different assembly with CrossMap
+#'
+#' @param ref_fasta A `character`. A path to the reference fasta file.
+#' @param chain_file A `character`. A path to the chain file.
+#' @param input_directory  A `character`. The path to the VCFs.
+#' @param output_directory A `character`. The path to the output directory.
+#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, `tabix` and `bgzip`
+#' @param nb_cores An `integer`. The number of CPUs to use.
+#'
+#' @return NULL
+#' @export
+#'
+#' @examples
+convert_assembly <- function(
+  ref_fasta,
+  chain_file,
+  input_directory ,
+  output_directory,
+  bin_path = list(
+    CrossMap = "/usr/local/bin/CrossMap.py",
+    bcftools = "/usr/bin/bcftools",
+    tabix = "/usr/bin/tabix",
+    bgzip = "/usr/bin/bgzip"
+  ),
+  nb_cores = 1
+) {
+
+  if (all(file_test("-f", input_directory ))) {
+    vcf_files <- input_directory
+  } else if(file_test("-d", input_directory )) {
+    vcf_files <- list.files(input_directory , pattern = "(.vcf.gz|.vcf)$", full.names = TRUE)
+  } else {
+    stop('"input_directory " contains file or directory which does not exist, please check!')
+  }
+
+  if (!dir.exists(output_directory)) {
+    system(paste("sudo mkdir", output_directory))
+    system(paste("sudo chmod g+w -R", output_directory))
+  }
+
+  ## convert to target assembly
+  invisible(parallel::mclapply(vcf_files, mc.cores = min(length(vcf_files), nb_cores), function(file_i) {
+    out_name <- file.path(output_directory, basename(file_i))
+    system(paste(
+      bin_path[["CrossMap"]],
+      "vcf",
+      chain_file,
+      file_i,
+      ref_fasta,
+      out_name
+    ))
+    system(paste(
+      bin_path[["bcftools"]],
+      "sort",
+      "-Oz -o", out_name,
+      out_name
+    ))
+    system(paste(bin_path[["tabix"]], "-f -p vcf", out_name))
+  }))
+
+  ## resolve changed chromosomes
+  chrs <- c(1:25, "X", "Y", "M", "MT")
+  chrs <- c(chrs, paste0("chr", chrs))
+  list_vcfs <- list.files(output_directory, pattern = "vcf.gz$", full.names = TRUE)
+  invisible(lapply(list_vcfs, function(vcf_i) {
+    chr_i_orig <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i))
+    chr_list <- system(paste(
+      bin_path[["tabix"]],
+      "--list-chroms",
+      vcf_i
+    ), intern = TRUE)
+    chr_list <- intersect(chr_list, chrs)
+
+    parallel::mclapply(chr_list, mc.cores = min(length(chr_list), nb_cores), function(chr_i) {
+      out_name <- file.path(
+        dirname(vcf_i),
+        gsub(chr_i_orig, paste0(chr_i, "_from_", chr_i_orig), basename(vcf_i))
+      )
+      system(paste(
+        bin_path[["bcftools"]],
+        "view",
+        "--regions", chr_i,
+        vcf_i,
+        "-Oz -o", out_name
+      ))
+      system(paste(bin_path[["tabix"]], "-f -p vcf", out_name))
+    })
+  }))
+
+  list_vcfs_split <- list.files(output_directory, pattern = "from.*vcf.gz$", full.names = TRUE)
+  invisible(parallel::mclapply(list_vcfs, mc.cores = min(length(list_vcfs), nb_cores), function(vcf_i) {
+    chr_i <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i))
+    vcf_to_bind <- list_vcfs_split[grep(paste0("^", chr_i, "_from"), basename(list_vcfs_split))]
+    vcf_tmp <- file.path(dirname(vcf_i), paste0("tempo_", basename(vcf_i)))
+    system(paste(
+      bin_path[["bcftools"]],
+      "concat -a", paste(vcf_to_bind, collapse = " "),
+      "-Oz -o", vcf_tmp
+    ))
+    unlink(paste0(vcf_i, ".tbi"))
+    system(paste(
+      bin_path[["bcftools"]],
+      "sort",
+      "-Oz -o", vcf_i,
+      vcf_tmp
+    ))
+    unlink(vcf_tmp)
+    system(paste(bin_path[["tabix"]], "-p vcf", vcf_i))
+  }))
+
+  unlink(list.files(output_directory, pattern = "_from_", full.names = TRUE))
+}

From 3e18ceb765562ab574161c6b8bf55387031581ce Mon Sep 17 00:00:00 2001
From: NING-L <lijiao.ning91@gmail.com>
Date: Wed, 9 Sep 2020 16:26:47 +0200
Subject: [PATCH 02/14] update namespace and docs

---
 DESCRIPTION             |  3 ++-
 NAMESPACE               |  1 +
 man/convert_assembly.Rd | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 man/convert_assembly.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index c02ab66..852e58d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -46,4 +46,5 @@ SystemRequirements:
     gcta64 (>= 1.93.2), 
     perl (>= 5.28.1), 
     plink1.9 (>= 1.90),
-    vcftools (>= 0.1.16)
+    vcftools (>= 0.1.16),
+    CrossMap (>= 0.4.3) - https://crossmap.readthedocs.io/en/latest/
diff --git a/NAMESPACE b/NAMESPACE
index 9a9b6d8..096d4be 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,4 +1,5 @@
 # Generated by roxygen2: do not edit by hand
 
+export(convert_assembly)
 export(qc_plink)
 export(qc_vcf)
diff --git a/man/convert_assembly.Rd b/man/convert_assembly.Rd
new file mode 100644
index 0000000..b13a7b3
--- /dev/null
+++ b/man/convert_assembly.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/convert_assembly.R
+\name{convert_assembly}
+\alias{convert_assembly}
+\title{Convert VCF to different assembly with CrossMap}
+\usage{
+convert_assembly(
+  ref_fasta,
+  chain_file,
+  input_directory,
+  output_directory,
+  bin_path = list(CrossMap = "/usr/local/bin/CrossMap.py", bcftools =
+    "/usr/bin/bcftools", tabix = "/usr/bin/tabix", bgzip = "/usr/bin/bgzip"),
+  nb_cores = 1
+)
+}
+\arguments{
+\item{ref_fasta}{A \code{character}. A path to the reference fasta file.}
+
+\item{chain_file}{A \code{character}. A path to the chain file.}
+
+\item{input_directory}{A \code{character}. The path to the VCFs.}
+
+\item{output_directory}{A \code{character}. The path to the output directory.}
+
+\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools}, \code{tabix} and \code{bgzip}}
+
+\item{nb_cores}{An \code{integer}. The number of CPUs to use.}
+}
+\description{
+Convert VCF to different assembly with CrossMap
+}

From 4dc7948b0af447f20a164d03afeae4ccceae565b Mon Sep 17 00:00:00 2001
From: lning <lijiao.ning@cnrs.fr>
Date: Wed, 9 Sep 2020 17:39:52 +0200
Subject: [PATCH 03/14] add prefix, modify doc, reindent

---
 R/convert_assembly.R | 58 +++++++++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/R/convert_assembly.R b/R/convert_assembly.R
index 4f3a6a5..1064437 100644
--- a/R/convert_assembly.R
+++ b/R/convert_assembly.R
@@ -2,15 +2,16 @@
 #'
 #' @param ref_fasta A `character`. A path to the reference fasta file.
 #' @param chain_file A `character`. A path to the chain file.
-#' @param input_directory  A `character`. The path to the VCFs.
+#' @param input_directory  A `character`. The path to the VCFs, files should be prefixed by
+#'     chromosome name like: 1, 2, ..., 23 (X), 24 (Y), 25 (M or MT).
 #' @param output_directory A `character`. The path to the output directory.
-#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, `tabix` and `bgzip`
+#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`,
+#'     `tabix` and `bgzip`
 #' @param nb_cores An `integer`. The number of CPUs to use.
 #'
 #' @return NULL
-#' @export
 #'
-#' @examples
+#' @export
 convert_assembly <- function(
   ref_fasta,
   chain_file,
@@ -25,17 +26,16 @@ convert_assembly <- function(
   nb_cores = 1
 ) {
 
-  if (all(file_test("-f", input_directory ))) {
+  if (all(utils::file_test("-f", input_directory ))) {
     vcf_files <- input_directory
-  } else if(file_test("-d", input_directory )) {
+  } else if(utils::file_test("-d", input_directory )) {
     vcf_files <- list.files(input_directory , pattern = "(.vcf.gz|.vcf)$", full.names = TRUE)
   } else {
-    stop('"input_directory " contains file or directory which does not exist, please check!')
+    stop('"input_directory" contains file or directory which does not exist, please check!')
   }
 
   if (!dir.exists(output_directory)) {
-    system(paste("sudo mkdir", output_directory))
-    system(paste("sudo chmod g+w -R", output_directory))
+    dir.create(output_directory, showWarnings = FALSE, recursive = TRUE, mode = "0777")
   }
 
   ## convert to target assembly
@@ -88,25 +88,27 @@ convert_assembly <- function(
   }))
 
   list_vcfs_split <- list.files(output_directory, pattern = "from.*vcf.gz$", full.names = TRUE)
-  invisible(parallel::mclapply(list_vcfs, mc.cores = min(length(list_vcfs), nb_cores), function(vcf_i) {
-    chr_i <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i))
-    vcf_to_bind <- list_vcfs_split[grep(paste0("^", chr_i, "_from"), basename(list_vcfs_split))]
-    vcf_tmp <- file.path(dirname(vcf_i), paste0("tempo_", basename(vcf_i)))
-    system(paste(
-      bin_path[["bcftools"]],
-      "concat -a", paste(vcf_to_bind, collapse = " "),
-      "-Oz -o", vcf_tmp
-    ))
-    unlink(paste0(vcf_i, ".tbi"))
-    system(paste(
-      bin_path[["bcftools"]],
-      "sort",
-      "-Oz -o", vcf_i,
-      vcf_tmp
-    ))
-    unlink(vcf_tmp)
-    system(paste(bin_path[["tabix"]], "-p vcf", vcf_i))
-  }))
+  invisible(parallel::mclapply(
+    list_vcfs, mc.cores = min(length(list_vcfs), nb_cores), mc.preschedule = FALSE, function(vcf_i) {
+      chr_i <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i))
+      vcf_to_bind <- list_vcfs_split[grep(paste0("^", chr_i, "_from"), basename(list_vcfs_split))]
+      vcf_tmp <- file.path(dirname(vcf_i), paste0("tempo_", basename(vcf_i)))
+      system(paste(
+        bin_path[["bcftools"]],
+        "concat -a", paste(vcf_to_bind, collapse = " "),
+        "-Oz -o", vcf_tmp
+      ))
+      unlink(paste0(vcf_i, ".tbi"))
+      system(paste(
+        bin_path[["bcftools"]],
+        "sort",
+        "-Oz -o", vcf_i,
+        vcf_tmp
+      ))
+      unlink(vcf_tmp)
+      system(paste(bin_path[["tabix"]], "-p vcf", vcf_i))
+    }
+  ))
 
   unlink(list.files(output_directory, pattern = "_from_", full.names = TRUE))
 }

From 5fe9fcd34e285a8e709e0945ee69760332edb487 Mon Sep 17 00:00:00 2001
From: lning <lijiao.ning@cnrs.fr>
Date: Wed, 9 Sep 2020 17:43:13 +0200
Subject: [PATCH 04/14] update doc

---
 man/convert_assembly.Rd | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/man/convert_assembly.Rd b/man/convert_assembly.Rd
index b13a7b3..7d953c6 100644
--- a/man/convert_assembly.Rd
+++ b/man/convert_assembly.Rd
@@ -19,11 +19,13 @@ convert_assembly(
 
 \item{chain_file}{A \code{character}. A path to the chain file.}
 
-\item{input_directory}{A \code{character}. The path to the VCFs.}
+\item{input_directory}{A \code{character}. The path to the VCFs, files should be prefixed by
+chromosome name like: 1, 2, ..., 23 (X), 24 (Y), 25 (M or MT).}
 
 \item{output_directory}{A \code{character}. The path to the output directory.}
 
-\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools}, \code{tabix} and \code{bgzip}}
+\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools},
+\code{tabix} and \code{bgzip}}
 
 \item{nb_cores}{An \code{integer}. The number of CPUs to use.}
 }

From 02590cb6a29e614cead56dbbee60c13bbb7657c6 Mon Sep 17 00:00:00 2001
From: lning <lijiao.ning@cnrs.fr>
Date: Wed, 9 Sep 2020 18:01:01 +0200
Subject: [PATCH 05/14] update readme

---
 README.Rmd | 2 ++
 README.md  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/README.Rmd b/README.Rmd
index 3e9cc29..d9b814e 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -38,6 +38,8 @@ remotes::install_github("mcanouil/dgapaq")
     using a [rmarkdown template](inst/rmarkdown/templates/qc_plink/skeleton/skeleton.Rmd).
 * `qc_vcf()` allows to compute post-imputation quality-control report 
     using a default [rmarkdown template](inst/rmarkdown/templates/qc_vcf/skeleton/skeleton.Rmd).
+* `convert_assembly()` allows to convert VCFs to target genome assembly using the software 
+    [CrossMap](https://crossmap.readthedocs.io/en/latest/).
 
 ## Getting help
 
diff --git a/README.md b/README.md
index 16f26df..6972cb1 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,8 @@ remotes::install_github("mcanouil/dgapaq")
   - `qc_vcf()` allows to compute post-imputation quality-control report
     using a default [rmarkdown
     template](inst/rmarkdown/templates/qc_vcf/skeleton/skeleton.Rmd).
+  - `convert_assembly()` allows to convert VCFs to target genome assembly 
+    using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/).
 
 ## Getting help
 

From f6c88f3f3cacf38079c8e1278c1921a8463fb1a4 Mon Sep 17 00:00:00 2001
From: lning <lijiao.ning@cnrs.fr>
Date: Wed, 9 Sep 2020 18:14:51 +0200
Subject: [PATCH 06/14] add tabix

---
 DESCRIPTION | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DESCRIPTION b/DESCRIPTION
index 852e58d..459a246 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -42,6 +42,7 @@ RoxygenNote: 7.1.0
 SystemRequirements: 
     pandoc (>= 1.12.3) - http://pandoc.org, 
     bcftools (>= 1.10.2), 
+    tabix (>= 1.10.2), 
     bgzip (>= 1.10.2), 
     gcta64 (>= 1.93.2), 
     perl (>= 5.28.1), 

From 24a4c97b2fbf016cf32d359ce986fe33ebe59d61 Mon Sep 17 00:00:00 2001
From: lning <lijiao.ning@cnrs.fr>
Date: Wed, 9 Sep 2020 18:25:05 +0200
Subject: [PATCH 07/14] remove extra space

---
 R/convert_assembly.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/convert_assembly.R b/R/convert_assembly.R
index 1064437..e3905cf 100644
--- a/R/convert_assembly.R
+++ b/R/convert_assembly.R
@@ -26,9 +26,9 @@ convert_assembly <- function(
   nb_cores = 1
 ) {
 
-  if (all(utils::file_test("-f", input_directory ))) {
+  if (all(utils::file_test("-f", input_directory))) {
     vcf_files <- input_directory
-  } else if(utils::file_test("-d", input_directory )) {
+  } else if (utils::file_test("-d", input_directory)) {
     vcf_files <- list.files(input_directory , pattern = "(.vcf.gz|.vcf)$", full.names = TRUE)
   } else {
     stop('"input_directory" contains file or directory which does not exist, please check!')

From 58018b1528e3045dbef4c218d924478959bf7f19 Mon Sep 17 00:00:00 2001
From: NING-L <lijiao.ning91@gmail.com>
Date: Thu, 10 Sep 2020 13:45:34 +0200
Subject: [PATCH 08/14] remove temp files right after merged

---
 R/convert_assembly.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/convert_assembly.R b/R/convert_assembly.R
index e3905cf..2c03240 100644
--- a/R/convert_assembly.R
+++ b/R/convert_assembly.R
@@ -98,7 +98,7 @@ convert_assembly <- function(
         "concat -a", paste(vcf_to_bind, collapse = " "),
         "-Oz -o", vcf_tmp
       ))
-      unlink(paste0(vcf_i, ".tbi"))
+      unlink(c(paste0(vcf_i, ".tbi"), vcf_to_bind))
       system(paste(
         bin_path[["bcftools"]],
         "sort",
@@ -110,5 +110,4 @@ convert_assembly <- function(
     }
   ))
 
-  unlink(list.files(output_directory, pattern = "_from_", full.names = TRUE))
 }

From cfe2d6b38bbc12479330985c1dba0b3b6161b5bd Mon Sep 17 00:00:00 2001
From: NING-L <lijiao.ning91@gmail.com>
Date: Thu, 10 Sep 2020 13:50:09 +0200
Subject: [PATCH 09/14] update new feature

---
 NEWS.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 997b12d..e9708a4 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# dgapaq 0.6.0
+
+## New feature
+
+* `convert_assembly()` allows to convert VCFs to target genome assembly 
+    using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/).
+
 # dgapaq 0.5.1
 
 * In [rmarkdown templates](inst/rmarkdown/templates),

From 5a295c4c6e1dabddfb4586d30349e99be2120270 Mon Sep 17 00:00:00 2001
From: NING-L <lijiao.ning91@gmail.com>
Date: Thu, 10 Sep 2020 14:01:05 +0200
Subject: [PATCH 10/14] update new feature

---
 DESCRIPTION | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 459a246..83c8e46 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: dgapaq
 Title: DNA Genotyping Arrays Processing And Quality-Control
-Version: 0.5.1
+Version: 0.6.0
 Authors@R: 
     c(person(given = "Mickaël",
              family = "Canouil",
@@ -38,7 +38,7 @@ Suggests:
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
 SystemRequirements: 
     pandoc (>= 1.12.3) - http://pandoc.org, 
     bcftools (>= 1.10.2), 

From 534125c1f01f40666cb5e33d51d2639423b7ef23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= <mickael.canouil@cnrs.fr>
Date: Tue, 20 Oct 2020 13:03:16 +0200
Subject: [PATCH 11/14] dev version

---
 DESCRIPTION |  2 +-
 NEWS.md     | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 83c8e46..5fbb2d1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: dgapaq
 Title: DNA Genotyping Arrays Processing And Quality-Control
-Version: 0.6.0
+Version: 0.5.2.9000
 Authors@R: 
     c(person(given = "Mickaël",
              family = "Canouil",
diff --git a/NEWS.md b/NEWS.md
index e9708a4..af7f131 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,10 +1,17 @@
-# dgapaq 0.6.0
+# dgapaq (development version)
 
 ## New feature
 
-* `convert_assembly()` allows to convert VCFs to target genome assembly 
+* In `R/convert_assembly.R`, 
+    + `convert_assembly()` allows to convert VCFs to target genome assembly 
     using the software [CrossMap](https://crossmap.readthedocs.io/en/latest/).
 
+# dgapaq 0.5.2
+
+* In [rmarkdown templates](inst/rmarkdown/templates),
+    + Small refactoring.
+    + Ensure FID and IID are of type character.
+    
 # dgapaq 0.5.1
 
 * In [rmarkdown templates](inst/rmarkdown/templates),

From 8842985ed592be2541444c3404785b52b354f4fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= <mickael.canouil@cnrs.fr>
Date: Tue, 20 Oct 2020 13:24:41 +0200
Subject: [PATCH 12/14] rewrite doc and stop condition

---
 R/convert_assembly.R | 78 ++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 36 deletions(-)

diff --git a/R/convert_assembly.R b/R/convert_assembly.R
index 2c03240..bf87638 100644
--- a/R/convert_assembly.R
+++ b/R/convert_assembly.R
@@ -1,22 +1,21 @@
 #' Convert VCF to different assembly with CrossMap
 #'
-#' @param ref_fasta A `character`. A path to the reference fasta file.
-#' @param chain_file A `character`. A path to the chain file.
-#' @param input_directory  A `character`. The path to the VCFs, files should be prefixed by
-#'     chromosome name like: 1, 2, ..., 23 (X), 24 (Y), 25 (M or MT).
-#' @param output_directory A `character`. The path to the output directory.
-#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`,
-#'     `tabix` and `bgzip`
+#' @param input_directory  A `character`. The path to the VCFs files or directory. Default is `NULL`.
+#' @param output_directory A `character`. The path to the output directory. Default is `NULL`.
+#' @param ref_fasta A `character`. A path to the reference fasta file. Default is `NULL`.
+#' @param chain_file A `character`. A path to the chain file. Default is `NULL`.
+#' @param bin_path A `list(character)`. A list giving the binary path of
+#'     `CrossMap`, `bcftools`, `tabix` and `bgzip`.
 #' @param nb_cores An `integer`. The number of CPUs to use.
 #'
 #' @return NULL
 #'
 #' @export
 convert_assembly <- function(
-  ref_fasta,
-  chain_file,
-  input_directory ,
-  output_directory,
+  input_directory = NULL,
+  output_directory = NULL,
+  ref_fasta = NULL,
+  chain_file = NULL,
   bin_path = list(
     CrossMap = "/usr/local/bin/CrossMap.py",
     bcftools = "/usr/bin/bcftools",
@@ -26,37 +25,44 @@ convert_assembly <- function(
   nb_cores = 1
 ) {
 
-  if (all(utils::file_test("-f", input_directory))) {
+  if (length(input_directory) == 1 & length(list.files(input_directory, pattern = "(.vcf.gz|.vcf)$")) <= 1) {
+    stop('"input_directory" must be a directory of VCF files splitted by chromosome!')
+  }
+
+  if (
+    length(input_directory) > 1 &
+      !all(file.exists(input_directory)) &
+      !all(grepl("(.vcf.gz|.vcf)$", input_directory))
+  ) {
+    stop('"input_directory" must be a vector of VCF files splitted by chromosome!')
+  }
+
+  if (length(input_directory) > 1) {
     vcf_files <- input_directory
-  } else if (utils::file_test("-d", input_directory)) {
-    vcf_files <- list.files(input_directory , pattern = "(.vcf.gz|.vcf)$", full.names = TRUE)
   } else {
-    stop('"input_directory" contains file or directory which does not exist, please check!')
+    vcf_files <- list.files(input_directory, pattern = "(.vcf.gz|.vcf)$", full.names = TRUE)
   }
 
-  if (!dir.exists(output_directory)) {
-    dir.create(output_directory, showWarnings = FALSE, recursive = TRUE, mode = "0777")
-  }
+  temp_directory <- file.path(tempdir(), "convert_assembly")
+  dir.create(path = temp_directory, showWarnings = FALSE)
 
   ## convert to target assembly
-  invisible(parallel::mclapply(vcf_files, mc.cores = min(length(vcf_files), nb_cores), function(file_i) {
-    out_name <- file.path(output_directory, basename(file_i))
-    system(paste(
-      bin_path[["CrossMap"]],
-      "vcf",
-      chain_file,
-      file_i,
-      ref_fasta,
-      out_name
-    ))
-    system(paste(
-      bin_path[["bcftools"]],
-      "sort",
-      "-Oz -o", out_name,
-      out_name
-    ))
-    system(paste(bin_path[["tabix"]], "-f -p vcf", out_name))
-  }))
+  invisible(parallel::mclapply(
+    X = vcf_files,
+    mc.cores = min(length(vcf_files), nb_cores),
+    FUN = function(file_i) {
+      output_file <- file.path(temp_directory, basename(file_i))
+      system(
+        intern = TRUE, wait = TRUE,
+        command = paste(bin_path[["CrossMap"]], "vcf", chain_file, file_i, ref_fasta, output_file)
+      )
+      system(
+        intern = TRUE, wait = TRUE,
+        command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", output_file, output_file)
+      )
+      system(intern = TRUE, wait = TRUE, command = paste(bin_path[["tabix"]], "-f -p vcf", output_file))
+    }
+  ))
 
   ## resolve changed chromosomes
   chrs <- c(1:25, "X", "Y", "M", "MT")

From aa8e816681783ccd0be52bc788759881db6af527 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= <mickael.canouil@cnrs.fr>
Date: Tue, 20 Oct 2020 14:19:32 +0200
Subject: [PATCH 13/14] small refactoring and error condition

---
 R/convert_assembly.R | 127 ++++++++++++++++++++++++++-----------------
 1 file changed, 76 insertions(+), 51 deletions(-)

diff --git a/R/convert_assembly.R b/R/convert_assembly.R
index bf87638..976197f 100644
--- a/R/convert_assembly.R
+++ b/R/convert_assembly.R
@@ -4,8 +4,7 @@
 #' @param output_directory A `character`. The path to the output directory. Default is `NULL`.
 #' @param ref_fasta A `character`. A path to the reference fasta file. Default is `NULL`.
 #' @param chain_file A `character`. A path to the chain file. Default is `NULL`.
-#' @param bin_path A `list(character)`. A list giving the binary path of
-#'     `CrossMap`, `bcftools`, `tabix` and `bgzip`.
+#' @param bin_path A `list(character)`. A list giving the binary path of `CrossMap`, `bcftools`, `tabix` and `bgzip`.
 #' @param nb_cores An `integer`. The number of CPUs to use.
 #'
 #' @return NULL
@@ -17,7 +16,7 @@ convert_assembly <- function(
   ref_fasta = NULL,
   chain_file = NULL,
   bin_path = list(
-    CrossMap = "/usr/local/bin/CrossMap.py",
+    crossmap = "/usr/local/bin/CrossMap.py",
     bcftools = "/usr/bin/bcftools",
     tabix = "/usr/bin/tabix",
     bgzip = "/usr/bin/bgzip"
@@ -25,7 +24,19 @@ convert_assembly <- function(
   nb_cores = 1
 ) {
 
-  if (length(input_directory) == 1 & length(list.files(input_directory, pattern = "(.vcf.gz|.vcf)$")) <= 1) {
+  names(bin_path) <- tolower(names(bin_path))
+
+  if (
+    length(intersect(names(bin_path), c("crossmap", "bcftools", "tabix", "bgzip"))) != 4 |
+      !all(sapply(bin_path, file.exists))
+  ) {
+    stop('"bin_path" must contains valid named path to "crossmap", "bcftools", "tabix" and "bgzip"!')
+  }
+
+  if (
+    length(input_directory) == 1 &
+      length(list.files(input_directory, pattern = "(.vcf.gz|.vcf)$")) <= 1
+  ) {
     stop('"input_directory" must be a directory of VCF files splitted by chromosome!')
   }
 
@@ -53,67 +64,81 @@ convert_assembly <- function(
     FUN = function(file_i) {
       output_file <- file.path(temp_directory, basename(file_i))
       system(
-        intern = TRUE, wait = TRUE,
-        command = paste(bin_path[["CrossMap"]], "vcf", chain_file, file_i, ref_fasta, output_file)
+        command = paste(bin_path[["crossmap"]], "vcf", chain_file, file_i, ref_fasta, output_file),
+        wait = TRUE
       )
       system(
-        intern = TRUE, wait = TRUE,
-        command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", output_file, output_file)
+        command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", output_file, output_file),
+        wait = TRUE
+      )
+      system(
+        command = paste(bin_path[["tabix"]], "-f -p vcf", output_file),
+        wait = TRUE
       )
-      system(intern = TRUE, wait = TRUE, command = paste(bin_path[["tabix"]], "-f -p vcf", output_file))
     }
   ))
 
   ## resolve changed chromosomes
-  chrs <- c(1:25, "X", "Y", "M", "MT")
-  chrs <- c(chrs, paste0("chr", chrs))
-  list_vcfs <- list.files(output_directory, pattern = "vcf.gz$", full.names = TRUE)
+  chrs <- paste0(rep(c("", "chr"), each = 29), c(1:25, "X", "Y", "M", "MT"))
+  list_vcfs <- list.files(temp_directory, pattern = "vcf.gz$", full.names = TRUE)
   invisible(lapply(list_vcfs, function(vcf_i) {
-    chr_i_orig <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i))
-    chr_list <- system(paste(
-      bin_path[["tabix"]],
-      "--list-chroms",
-      vcf_i
-    ), intern = TRUE)
+    chr_i_orig <- gsub(paste0("^(", paste(chrs, collapse = "|"), ").*"), "\\1", basename(vcf_i))
+    chr_list <- system(
+      command = paste(bin_path[["tabix"]], "--list-chroms", vcf_i),
+      intern = TRUE,
+      wait = TRUE
+    )
     chr_list <- intersect(chr_list, chrs)
 
-    parallel::mclapply(chr_list, mc.cores = min(length(chr_list), nb_cores), function(chr_i) {
-      out_name <- file.path(
-        dirname(vcf_i),
-        gsub(chr_i_orig, paste0(chr_i, "_from_", chr_i_orig), basename(vcf_i))
-      )
-      system(paste(
-        bin_path[["bcftools"]],
-        "view",
-        "--regions", chr_i,
-        vcf_i,
-        "-Oz -o", out_name
-      ))
-      system(paste(bin_path[["tabix"]], "-f -p vcf", out_name))
-    })
+    parallel::mclapply(
+      X = chr_list,
+      mc.cores = min(length(chr_list), nb_cores),
+      mc.preschedule = FALSE,
+      FUN = function(chr_i) {
+        output_file <- file.path(
+          temp_directory,
+          gsub(chr_i_orig, paste0(chr_i, "_from_", chr_i_orig), basename(vcf_i))
+        )
+        system(
+          command = paste(bin_path[["bcftools"]], "view", "--regions", chr_i, vcf_i, "-Oz -o", output_file),
+          wait = TRUE
+        )
+        system(
+          command = paste(bin_path[["tabix"]], "-f -p vcf", output_file),
+          wait = TRUE
+        )
+      }
+    )
   }))
 
-  list_vcfs_split <- list.files(output_directory, pattern = "from.*vcf.gz$", full.names = TRUE)
   invisible(parallel::mclapply(
-    list_vcfs, mc.cores = min(length(list_vcfs), nb_cores), mc.preschedule = FALSE, function(vcf_i) {
-      chr_i <- gsub("([0-9]+|X|Y|M|MT).*", "\\1", basename(vcf_i))
-      vcf_to_bind <- list_vcfs_split[grep(paste0("^", chr_i, "_from"), basename(list_vcfs_split))]
-      vcf_tmp <- file.path(dirname(vcf_i), paste0("tempo_", basename(vcf_i)))
-      system(paste(
-        bin_path[["bcftools"]],
-        "concat -a", paste(vcf_to_bind, collapse = " "),
-        "-Oz -o", vcf_tmp
-      ))
-      unlink(c(paste0(vcf_i, ".tbi"), vcf_to_bind))
-      system(paste(
-        bin_path[["bcftools"]],
-        "sort",
-        "-Oz -o", vcf_i,
-        vcf_tmp
-      ))
-      unlink(vcf_tmp)
-      system(paste(bin_path[["tabix"]], "-p vcf", vcf_i))
+    X = list_vcfs,
+    mc.cores = min(length(list_vcfs), nb_cores),
+    mc.preschedule = FALSE,
+    FUN = function(vcf_i) {
+      chr_i <- gsub(paste0("^(", paste(chrs, collapse = "|"), ").*"), "\\1", basename(vcf_i))
+      vcf_to_bind <- paste(
+        list.files(temp_directory, pattern = paste0(chr_i, "_from_.*vcf.gz$"), full.names = TRUE),
+        collapse = " "
+      )
+
+      vcf_tmp <- file.path(temp_directory, paste0("temp_", basename(vcf_i)))
+      vcf_out <- file.path(output_directory, basename(vcf_i))
+
+      system(
+        command = paste(bin_path[["bcftools"]], "concat -a", vcf_to_bind, "-Oz -o", vcf_tmp),
+        wait = TRUE
+      )
+      system(
+        command = paste(bin_path[["bcftools"]], "sort", "-Oz -o", vcf_out, vcf_tmp),
+        wait = TRUE
+      )
+      system(
+        command = paste(bin_path[["tabix"]], "-f -p  vcf", vcf_out),
+        wait = TRUE
+      )
     }
   ))
 
+  invisible(unlink(temp_directory, recursive = TRUE, force = TRUE))
 }

From 3ac1330719ce62a53497ad7282ce5723c42faed8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= <mickael.canouil@cnrs.fr>
Date: Tue, 20 Oct 2020 14:20:01 +0200
Subject: [PATCH 14/14] update doc

---
 man/convert_assembly.Rd | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/man/convert_assembly.Rd b/man/convert_assembly.Rd
index 7d953c6..8f10a31 100644
--- a/man/convert_assembly.Rd
+++ b/man/convert_assembly.Rd
@@ -5,27 +5,25 @@
 \title{Convert VCF to different assembly with CrossMap}
 \usage{
 convert_assembly(
-  ref_fasta,
-  chain_file,
-  input_directory,
-  output_directory,
-  bin_path = list(CrossMap = "/usr/local/bin/CrossMap.py", bcftools =
+  input_directory = NULL,
+  output_directory = NULL,
+  ref_fasta = NULL,
+  chain_file = NULL,
+  bin_path = list(crossmap = "/usr/local/bin/CrossMap.py", bcftools =
     "/usr/bin/bcftools", tabix = "/usr/bin/tabix", bgzip = "/usr/bin/bgzip"),
   nb_cores = 1
 )
 }
 \arguments{
-\item{ref_fasta}{A \code{character}. A path to the reference fasta file.}
+\item{input_directory}{A \code{character}. The path to the VCFs files or directory. Default is \code{NULL}.}
 
-\item{chain_file}{A \code{character}. A path to the chain file.}
+\item{output_directory}{A \code{character}. The path to the output directory. Default is \code{NULL}.}
 
-\item{input_directory}{A \code{character}. The path to the VCFs, files should be prefixed by
-chromosome name like: 1, 2, ..., 23 (X), 24 (Y), 25 (M or MT).}
+\item{ref_fasta}{A \code{character}. A path to the reference fasta file. Default is \code{NULL}.}
 
-\item{output_directory}{A \code{character}. The path to the output directory.}
+\item{chain_file}{A \code{character}. A path to the chain file. Default is \code{NULL}.}
 
-\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools},
-\code{tabix} and \code{bgzip}}
+\item{bin_path}{A \code{list(character)}. A list giving the binary path of \code{CrossMap}, \code{bcftools}, \code{tabix} and \code{bgzip}.}
 
 \item{nb_cores}{An \code{integer}. The number of CPUs to use.}
 }