Skip to content

Commit

Permalink
better handling of gff files with ##FASTA section, fixes #35
Browse files Browse the repository at this point in the history
  • Loading branch information
thackl committed Feb 2, 2021
1 parent 1bfbdfe commit 15ed1e0
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 13 deletions.
12 changes: 12 additions & 0 deletions R/aaa.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@ shared_names <- function(x, ...){
names[names %in% names2(x)]
}

# Split a vector of interleaved pairs c(a1, b1, a2, b2, ...) into a list of
# two vectors list(c(a1, a2, ...), c(b1, b2, ...)).
#
# x: vector to split; elements at odd positions go to the first list element,
#   elements at even positions to the second.
# names: optional names for the two list elements, applied with set_names().
# ignore_odd: if FALSE (default), abort when length(x) is odd; if TRUE, split
#   anyway, leaving the first element one item longer than the second.
vec_unzip <- function(x, names=NULL, ignore_odd=FALSE){
  if(!ignore_odd && length(x) %% 2 != 0){
    abort(str_glue("Won't unzip a vector of uneven length {(length(x))}\n",
      "Disable error with `ignore_odd=TRUE`"))
  }

  # Index by position parity rather than recycling c(TRUE, FALSE): a logical
  # index longer than x pads the result with NA, which produced spurious NA
  # entries for empty and length-1 input.
  is_odd <- seq_along(x) %% 2 == 1
  set_names(list(x[is_odd], x[!is_odd]), names)
}

#' @export
#' @importFrom magrittr %>%
magrittr::`%>%`
Expand Down
7 changes: 1 addition & 6 deletions R/geom_gene.R
Original file line number Diff line number Diff line change
Expand Up @@ -256,11 +256,6 @@ makeContent.genetree <- function(x){
grid::setChildren(x, grobs)
}

# Split a vector of interleaved pairs c(a1, b1, a2, b2, ...) into a list of
# two vectors: odd-position elements first, even-position elements second.
# `names`, if given, is applied to the two list elements with set_names().
unzip <- function(x, names=NULL){
  # Index by position parity rather than recycling c(TRUE, FALSE): a logical
  # index longer than x pads the result with NA (e.g. for empty input).
  is_odd <- seq_along(x) %% 2 == 1
  set_names(list(x[is_odd], x[!is_odd]), names)
}

exon_spans <- function(x, xend, introns, ...){
n <- length(introns)
if(n<2){
Expand All @@ -270,7 +265,7 @@ exon_spans <- function(x, xend, introns, ...){
introns <- if(x<xend) x + introns else xend + rev(introns)
exons <- c(x, introns, xend)

as_tibble(unzip(exons, c("x", "xend")))
as_tibble(vec_unzip(exons, c("x", "xend")))
}

exon_polys <- function(x, xend, y, height, arrow_width, arrow_height){
Expand Down
24 changes: 17 additions & 7 deletions R/read_gff3.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,21 @@
#' @return tibble
read_gff3 <- function(file, sources=NULL, types=NULL, infer_cds_parents=FALSE,
col_names = def_names("gff3"), col_types = def_types("gff3")){
x <- read_tsv(file, col_names = col_names, col_types = col_types, na=".", comment="#")

x <- read_tsv(file, col_names = col_names, col_types = col_types, na=".",
comment = "#")

# ignore FASTA block - dirty fix because all seqs are read into x first and
# also create parsing warnings
i <- which(x[[1]] == "##FASTA")
if(length(i) > 0)
x <- slice_head(i-1)
# create parsing warnings
i <- str_which(x[[1]], "^>")[1]
if(!is.na(i)){
x <- slice_head(x, n=i-1)
warn(str_glue("Note: File contains ##FASTA section starting at line {i}.\n",
"You can ignore any parsing failures starting from that row."))
}

reserved_names <- c(col_names[1:8], c("name", "feat_id", "parent_ids", "introns"))
x_attrs <- tidy_attributes(x[["attributes"]], reserved_names)
x_attrs <- tidy_attributes(x[[9]], reserved_names)

x <- bind_cols(x[,1:8], x_attrs)

Expand Down Expand Up @@ -108,7 +113,12 @@ infer_cds_parent <- function(x){

tidy_attributes <- function(x, reserved_names){
d <- map_df(str_split(x, ";"), function(r){
r <- r[r!=""] # ignore empty elements
# handle missing comments
if(!length(r) || is.na(r))
return(tibble())

# ignore empty elements caused by trailing or duplicated ";"
r <- r[r!=""]
z <- str_split(r, "=")
z <- as_tibble(set_names(map(z,2), map(z,1)))
return(z)
Expand Down

0 comments on commit 15ed1e0

Please sign in to comment.