better handling of gff files with ##FASTA section, fixes #35

thackl · Feb 2, 2021 · 15ed1e0 · 15ed1e0
1 parent 1bfbdfe
commit 15ed1e0
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 13 deletions.
diff --git a/R/aaa.R b/R/aaa.R
@@ -50,6 +50,18 @@ shared_names <- function(x, ...){
   names[names %in% names2(x)]
 }
 
+# split a vector of tuples c(a1,b1,a2,b2) into a list of two vectors
+# list(a=c(a1,a2), b=c(b1,b2))
+vec_unzip <- function(x, names=NULL, ignore_odd=FALSE){
+  if(!ignore_odd && length(x)%%2){
+    abort(str_glue("Won't unzip a vector of uneven length {(length(x))}\n",
+                   "Disable error with `ignore_odd=TRUE`"))
+  }
+
+  i <- c(TRUE, FALSE)
+  set_names(list(x[i], x[!i]), names)
+}
+
 #' @export
 #' @importFrom magrittr %>%
 magrittr::`%>%`

diff --git a/R/geom_gene.R b/R/geom_gene.R
@@ -256,11 +256,6 @@ makeContent.genetree <- function(x){
   grid::setChildren(x, grobs)
 }
 
-unzip <- function(x, names=NULL){
-  i <- c(TRUE, FALSE)
-  set_names(list(x[i], x[!i]), names)
-}
-
 exon_spans <- function(x, xend, introns, ...){
   n <- length(introns)
   if(n<2){
@@ -270,7 +265,7 @@ exon_spans <- function(x, xend, introns, ...){
   introns <- if(x<xend) x + introns else xend + rev(introns)
   exons <- c(x, introns, xend)
 
-  as_tibble(unzip(exons, c("x", "xend")))
+  as_tibble(vec_unzip(exons, c("x", "xend")))
 }
 
 exon_polys <- function(x, xend, y, height, arrow_width, arrow_height){

diff --git a/R/read_gff3.R b/R/read_gff3.R
@@ -22,16 +22,21 @@
 #' @return tibble
 read_gff3 <- function(file, sources=NULL, types=NULL, infer_cds_parents=FALSE,
     col_names = def_names("gff3"), col_types = def_types("gff3")){
-  x <- read_tsv(file, col_names = col_names, col_types = col_types, na=".", comment="#")
+
+  x <- read_tsv(file, col_names = col_names, col_types = col_types, na=".",
+                comment = "#")
 
   # ignore FASTA block - dirty fix because all seqs are read into x first and
-  # also create parsing warnings
-  i <- which(x[[1]] == "##FASTA")
-  if(length(i) > 0)
-    x <- slice_head(i-1)
+  # create parsing warnings
+  i <- str_which(x[[1]], "^>")[1]
+  if(!is.na(i)){
+    x <- slice_head(x, n=i-1)
+    warn(str_glue("Note: File contains ##FASTA section starting at line {i}.\n",
+        "You can ignore any parsing failures starting from that row."))
+  }
 
   reserved_names <- c(col_names[1:8], c("name", "feat_id", "parent_ids", "introns"))
-  x_attrs <- tidy_attributes(x[["attributes"]], reserved_names)
+  x_attrs <- tidy_attributes(x[[9]], reserved_names)
 
   x <- bind_cols(x[,1:8], x_attrs)
 
@@ -108,7 +113,12 @@ infer_cds_parent <- function(x){
 
 tidy_attributes <- function(x, reserved_names){
   d <- map_df(str_split(x, ";"), function(r){
-    r <- r[r!=""] # ignore empty elements
+    # handle missing attributes (empty or NA attribute string)
+    if(!length(r) || is.na(r))
+      return(tibble())
+
+    # ignore empty elements caused by trailing or duplicated ";"
+    r <- r[r!=""]
     z <- str_split(r, "=")
     z <- as_tibble(set_names(map(z,2), map(z,1)))
     return(z)