From bdbc183ffe69ca19b0f37b568eaceeba194fabf5 Mon Sep 17 00:00:00 2001
From: Thomas Hackl <thackl@lim4.de>
Date: Sun, 7 Feb 2021 00:52:57 +0100
Subject: [PATCH 1/4] rewrite of smart-read-backend; cleaner mapping of (ext >
 format) + context > parser

---
 NAMESPACE            |   2 +-
 R/global.R           |  41 +++--
 R/read.R             | 353 +++++++++++++++++++++----------------------
 R/read_alitv.R       |  55 +++++++
 R/read_feats.R       | 104 ++++++-------
 R/read_seqs.R        |  49 ++----
 _pkgdown.yml         |   6 +-
 man/def_formats.Rd   |  66 ++++++++
 man/ext_to_format.Rd |  22 ---
 man/file_exts.Rd     |  21 ---
 man/file_formats.Rd  |  47 ------
 man/read_alitv.Rd    |   2 +-
 man/read_context.Rd  |  46 ++++++
 man/read_feats.Rd    |  68 ---------
 man/read_seq_len.Rd  |  26 ++++
 man/read_seqs.Rd     |  47 ------
 man/read_tracks.Rd   |  96 ++++++++++++
 man/swap_query.Rd    |   2 +-
 18 files changed, 548 insertions(+), 505 deletions(-)
 create mode 100644 R/read_alitv.R
 create mode 100644 man/def_formats.Rd
 delete mode 100644 man/ext_to_format.Rd
 delete mode 100644 man/file_exts.Rd
 delete mode 100644 man/file_formats.Rd
 create mode 100644 man/read_context.Rd
 delete mode 100644 man/read_feats.Rd
 create mode 100644 man/read_seq_len.Rd
 delete mode 100644 man/read_seqs.Rd
 create mode 100644 man/read_tracks.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 98bb3671..0b0c4b20 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -94,6 +94,7 @@ export(as_sublinks)
 export(bins)
 export(check_strand)
 export(combine_strands)
+export(def_formats)
 export(def_names)
 export(def_types)
 export(drop_feat_layout)
@@ -103,7 +104,6 @@ export(drop_seq_layout)
 export(ex)
 export(feats)
 export(feats0)
-export(file_formats)
 export(flip)
 export(flip_nicely)
 export(flip_seqs)
diff --git a/R/global.R b/R/global.R
index 64a38094..9ff7ac24 100644
--- a/R/global.R
+++ b/R/global.R
@@ -3,28 +3,25 @@
 # manipulated by other packages.
 gggenomes_global  <- new.env(parent = emptyenv())
 
-# list of contexts of dictionaryish vectors mapping suffixes to file formats
-gggenomes_global$file_formats <- map(list(
-  feats = list(
-    gff3 = qc(gff, gff3),
-    gbk = qc(gbk, gb, gbff),
-    bed = qc(bed),
-    fasta = qc(fa, fas, fasta, ffn, fna, faa),
-    blast = qc(m8, o6, o7),
-    paf = qc(paf),
-    ambigious = qc(txt, tsv, csv)
-  ),
-  seqs = list(
-    fai = qc(fai),
-    seq_len = qc(fa, fas, fasta, ffn, fna, faa, gff, gbk),
-    ambigious = qc(txt, tsv, csv)
-  ),
-  zips = list(
-    bz2 = qc(bz, bz2),
-    gz = qc(gz),
-    xz = qc(xz),
-    zip = qc(zip))
-), ~deframe(stack(.x) %>% mutate(ind=as.character(ind))))
+# Mapping of file formats to extensions, contexts, and parsers
+#
+# ext, context and parser can hold several values per format; context and
+# parser are parallel: the n-th parser (a function name, like "read_tsv") is
+# used for the n-th context. context=NA defines a fallback parser that is
+# used if no parser is defined for the specific context
+gggenomes_global$def_formats <- tribble(
+  ~format, ~ext, ~context, ~parser,
+  "ambigious", qc(txt,tsv,csv), NA, "read_ambigious",
+  "fasta", qc(fa,fas,fasta,ffn,fna,faa), qc(seqs), qc(read_seq_len),
+  "fai", qc(fai), qc(seqs), qc(read_fai),
+  "gff3", qc(gff,gff3), qc(feats, seqs), qc(read_gff3, read_seq_len),
+  "gbk", qc(gbk,gb,gbff,gpff), qc(feats, seqs), qc(read_gbk, read_seq_len),
+  "bed", qc(bed), "feats", "read_bed",
+  "blast", qc(m8,o6,o7), "feats", "read_blast",
+  "paf", qc(paf), "feats", "read_paf",
+  "alitv", qc(json), qc(feats, seqs, links),
+     qc(read_alitv_genes, read_alitv_seqs, read_alitv_links)
+)
 
 # Default column names for different formats
 gggenomes_global$def_names <- list(
diff --git a/R/read.R b/R/read.R
index 5046d0f2..32171456 100644
--- a/R/read.R
+++ b/R/read.R
@@ -1,33 +1,135 @@
-#' Swap query and subject in blast-like feature tables
+#' Read files in various formats into track tables
 #'
-#' Swap query and subject columns in a table read with [read_feats()] or
-#' [read_links()], for example, from blast searches. Swaps columns with
-#' name/name2, such as 'seq_id/seq_id2', 'start/start2', ...
+#' Convenience functions to read sequences, features or links from various
+#' bioinformatics file formats, such as FASTA, GFF3, Genbank, BLAST tabular
+#' output, etc. See [def_formats()] for full list. File formats and the
+#' corresponding read-functions are automatically determined based on file
+#' extensions. All these functions can read multiple files in the same format at
+#' once, and combine them into a single table - useful, for example, to read a
+#' folder of gff-files with each file containing genes of a different genome.
 #'
-#' @param x tibble with query and subject columns
+#' @name read_tracks
+#' @inheritParams read_context
+#' @return A gggenomes-compatible sequence, feature or link tibble
+NULL
+
+
+#' Read files in different contexts
+#'
+#' Powers [read_seqs()], [read_feats()], [read_links()]
+#' @param files files to reads. Should all be of same format. In many cases,
+#'   compressed files (`.gz`, `.bz2`, `.xz`, or `.zip`) are supported.
+#'   Similarly, automatic download of remote files starting with `http(s)://` or
+#'   `ftp(s)://` works in most cases.
+#' @param .id the column with the name of the file a record was read from.
+#'   Defaults to "file_id". Set to "bin_id" if every file represents a different
+#'   bin.
+#' @param format specify a format known to gggenomes, such as `gff3`, `gbk`, ...
+#'   to overwrite automatic determination based on the file extension (see
+#'   [def_formats()] for full list).
+#' @param parser specify the name of an R function to overwrite automatic
+#'   determination based on format, e.g. `parser="read_tsv"`.
+#' @param ... additional arguments passed on to the format-specific read
+#'   function called down the line.
+#' @param context the context ("seqs", "feats", "links") in which a given format
+#'   should be read.
+#' @describeIn read_context bla keywords internal
+read_context <- function(files, context, .id="file_id", format=NULL, parser=NULL, ...){
+  if(is_connection(files))
+    files <- list(files) # weird things happen to pipes in vectors
+
+  # for unnamed files, infer name from filename (used as file_id/bin_id)
+  files <- file_label(files)
+
+  parser <- parser %||% file_parser(files, context=context, format=format, require_unique=TRUE)
+  # read every file with the same parser; file labels end up in column `.id`
+  inform(str_glue("Reading '{names(parser)}' with `{parser}()`:"))
+  x <- map2_df(files, names(files), .id=.id, parser=parser, ...,
+               .f=function(file, name, parser, ...){
+                 inform(str_glue("* {.id}: {name} [{file}]"))
+                 exec(parser, file, ...)
+               })
+
+  x
+}
+
+read_ambigious <- function(file, ...){
+  abort(c("Ambigious file extension, please specify format or parser explicitly", x=file))
+}
+
+# file: vec of files; context: vec of contexts (a single file/context is
+# recycled to match multiple contexts/files if given)
+# format: force this format regardless of file extension
+# require_unique: abort unless all files resolve to the same parser
+file_parser <- function(file, context=NULL, format=NULL, require_unique=FALSE){
+  format <- format %||% def_formats(file, context=context)
+  parser <- def_parser(format, context=context) %>% set_names(format)
+
+  if(require_unique){
+    p <- unique(parser)
+    if(length(p) > 1)
+      abort(c("All files need the same format/parser.", i="Got mix of:", unname(p)))
+    parser <- parser[1] # keep format as name; unique(parser) strips names
+  }
+  parser
+}
+
+#' Defined file formats and extensions
+#'
+#' For seamless reading of different file formats, gggenomes uses a mapping of
+#' known formats to associated file extensions and contexts in which the
+#' different formats can be read. The notion of context allows one to read
+#' different information from the same format/extension. For example, a gbk file
+#' holds both feature and sequence information. If read in "feats" context
+#' `read_feats("*.gbk")` it will return a feature table, if read in "seqs"
+#' context `read_seqs("*.gbk")`, a sequence index.
+#'
+#' @param file a vector of file names
+#' @param ext a vector of file extensions
+#' @param context a vector of file contexts defined in
+#'   `gggenomes_global$def_formats`
+#' @param parser a vector of file parsers defined in
+#'   `gggenomes_global$def_formats`
+#' @return dictionarish vector of file formats with recognized extensions as
+#'   names
 #' @export
-#' @return tibble with swapped query/subject columns
 #' @examples
-#' feats <- tribble(
-#'  ~seq_id, ~seq_id2, ~start, ~end, ~strand, ~start2, ~end2, ~evalue,
-#'  "A", "B", 100, 200, "+", 10000, 10200, 1e-5
-#' )
-#' # make B the query
-#' swap_query(feats)
-swap_query <- function(x){
-  # for every pair seq_id/seq_id2, name/name2 > name2/name
-  n <- names(x)
-  m <- str_subset(n, "\\D2") %>% str_remove("2$") %>% intersect(n)
-  if(!length(m))
-    return(x)
+#' # format of a file, inferred from its extension; returns a named vector
+#' # with the extension as name and the format as value
+#' def_formats("foo.fa")
+#'
+#' # formats associated with each extension
+#' def_formats(ext=qc(fa, gff))
+#'
+#' # all formats/extensions that can be read in seqs context; includes formats
+#' # that are defined for context=NA, i.e. that can be read in any context.
+#' def_formats(context="seqs")
+#' @eval def_formats_rd()
+def_formats <- function(file=NULL, ext=NULL, context=NULL, parser=NULL, allow_na=FALSE){
+  if(!is.null(file)){
+    ext <- c(file_ext(file), ext)
+  }
 
-  m2 <- paste0(m, "2")
-  i <- which(n %in% m)
-  i2 <- which(n %in% m2)
-  inform(c("Swapping query/subject-associated columns",
-           comma(m, collapse='  '), comma(m2, collapse=' ')))
-  x[c(i, i2)] <- x[c(i2, i)]
-  x
+  ff <- filter_def_formats(context=context, parser=parser) %>% unchop(ext)
+
+  format <- deframe(select(ff, ext, format))
+  if(!is.null(ext))
+    format <- format[ext]
+
+  if(!allow_na && any(is.na(format))){
+    bad <- ext[is.na(format)]
+    names(bad) <- rep("x", length(bad))
+    good <- def_formats(context=context, parser=parser) %>%
+      enframe(name = "ext", value = "format") %>%
+      chop(ext) %>% mutate(ext = map_chr(ext, comma)) %>% format()
+    abort(c(str_glue('Unknown extention(s):'),
+            i=str_glue("in context: {context}"),
+            i=str_glue("with parser: {parser}"),
+            bad,
+            i="Recognized formats/extensions for given context/parser:",
+            good[-(1:3)]))
+  }
+  format
 }
 
 #' Default column names and types for defined formats
@@ -53,7 +155,6 @@ def_names <- function(format){
   ff[[format]]
 }
 
-
 #' @describeIn def_names default column types for defined formats
 #' @export
 #' @return a vector with default column types for the given format
@@ -68,62 +169,23 @@ def_types <- function(format){
   ff[[format]]
 }
 
-#' Defined file formats and extensions
-#'
-#' For seamless reading of different file formats, gggenomes uses a mapping of
-#' known formats to associated file extensions and contexts in which the
-#' different formats can be read. The notion of context allows one to read
-#' different information from the same format/extension. For example, a gbk file
-#' holds both feature and sequence information. If read in "feats" context
-#' `read_feats("*.gbk")` it will return a feature table, if read in "seqs"
-#' context `read_seqs("*.gbk")`, a sequence index.
-#'
-#'
-#' @param context a file format context defined in `gggenomes_global$file_formats`
-#' @return dictionarish vector of file formats with recognized extensions as names
-#' @export
-#' @examples
-#' # vector of defined zip formats and recognized extensions as names
-#' file_formats("zips")
-#' @eval file_formats_rd()
-file_formats <- function(context){
-  ff <- gggenomes_global$file_formats
-  if(!context %in% names(ff)){
-    abort(c(
-      str_glue("Unknown file format context '{context}'.\nDefined families are:"),
-      names(ff)
-    ))
-  }
-  ff[[context]]
-}
+def_parser <- function(format, context=NULL){
+  context <- context %||% NA
 
-#' Defined file extensions and associated formats
-#'
-#' @inheritParams file_formats
-#' @return vector of file extensions with formats as names
-#' @examples
-#' # vector of zip-context file extensions and format names
-#' gggenomes:::file_exts("zips")
-file_exts <- function(context){
-  f <- file_formats(context)
-  set_names(names(f), f)
-}
+  # recycle format & context to same length
+  x <- tibble(format=format, context=context)
 
-#' File format from suffix
-#' @param x a vector of file extensions
-#' @param context a file format context defined in [file_formats()]
-#' @return a vector of formats with extensions as names
-#' @examples
-#' gggenomes:::ext_to_format(c("gff", "txt", "FASTA"), "feats")
-ext_to_format <- function(x, context){
-  x <- str_to_lower(x)
-  if(is_dictionaryish(context))
-    context[x]
-  else
-    file_formats(context)[x]
+  # for each format/context combo, get parser
+  pp <- pmap_chr(x, function(format, context){
+    r <- filter_def_formats(format=format, context=context) %>% pull(parser)
+    if(!length(r) || is.na(r))
+      abort(str_glue("No predefined parser for: `format={format}, context={context}`"))
+    r
+  })
+  pp
 }
 
-file_strip_zip <- function(file, ext = names(file_formats("zips"))){
+file_strip_zip <- function(file, ext = qc(bz2,gz,xz,zip)){
   ext <- paste0("\\.", ext, "$", collapse="|")
   str_remove(file, ext)
 }
@@ -140,32 +202,10 @@ file_name <- function(file, pattern = "\\.[^.]+$", ignore_zip = TRUE){
   str_remove(basename(file), pattern)
 }
 
-file_format <- function(file, context, allow_na = FALSE){
-  ext <- file_ext(file)
-  format <- ext_to_format(ext, context)
-  if(!allow_na && any(is.na(format))){
-    bad <- file[is.na(format)]
-    names(bad) <- rep("x", length(bad))
-    good <- file_formats("feats") %>%
-      enframe(name = "ext", value = "format") %>%
-      chop(ext) %>% mutate(ext = map_chr(ext, comma)) %>% format()
-    abort(c(str_glue('Bad extention for file format context "{context}"'), bad,
-      i="Recognized formats/extensions:", good[-(1:3)]))
-  }
-  set_names(format, file)
-}
-
 file_id <- function(file){
   vctrs::vec_as_names(file_name(file), repair="unique")
 }
 
-file_format_unique <- function(files, context, allow_duplicates = FALSE){
-  fmt <- unique(file_format(files, context))
-  if(!allow_duplicates && length(fmt) > 1)
-    abort(c("All files need the same format.", i="Got mix of:", unname(fmt)))
-  fmt
-}
-
 #' Add a unique name to files
 #'
 #' Given a vector of file paths, add a unique labels based on the filename as
@@ -176,100 +216,55 @@ file_label <- function(file){
   file
 }
 
-
-file_is_zip <- function(file, ext = names(file_formats("zips"))){
+file_is_zip <- function(file, ext = qc(bz2,gz,xz,zip)){
   pattern <- paste0("\\.", ext, "$", collapse="|")
   str_detect(file, pattern)
 }
 
-
 file_is_url <- function(file){
   str_detect(file, "^((http|ftp)s?|sftp)://")
 }
 
-file_formats_rd <- function(){
-  ff <- gggenomes_global$file_formats %>%
-    map_df(.id="context", function(x){
-      enframe(x, "extension", "format") %>% group_by(format) %>%
-        summarize(extension = comma(extension), .groups="drop")
-    })
-  ff <- mutate(ff, context = ifelse(duplicated(context), "", context))
+is_connection <- function(x) inherits(x, "connection")
+
 
-  ff <- str_c(sep = "\n",
-      "@section Defined contexts, formats and extensions:",
-      "\\preformatted{",
-      #sprintf("%-9s %-12s  %s", "Context", "Format", "Extensions"),
-      str_c(collapse = "\n",
-            str_glue_data(ff, '{sprintf("%-8s", context)} ',
-                    '{sprintf("%-7s", format)}  [{extension}]')),
-      "}"
-      )
+# filter but keep fallback parser for context=NA (the `ff` arg is overwritten)
+filter_def_formats <- function(ff, format=NULL, context=NULL, parser=NULL){
+  ff <- gggenomes_global$def_formats
+  if(!is.null(format)){
+    ff <- filter(ff, format %in% !!format)
+  }
+
+  if(!is.null(context) || !is.null(parser)){
+    ff <- unchop(ff, c(context, parser))
+    if(!is.null(context)){
+      # context=NA (fallback) sorts last; keep one row per format, ungrouped
+      ff <- ff %>% group_by(format) %>%
+        filter(context %in% !!context | is.na(context)) %>%
+        arrange(context, .by_group = TRUE) %>% slice_head(n=1) %>% ungroup()
+    }
+    if(!is.null(parser))
+      ff <- filter(ff, parser %in% !!parser)
+  }
   ff
 }
 
+def_formats_rd <- function(){
+  str_c(collapse = "\n", c(
+    "@section Defined formats, extensions, contexts, and parsers:",
+    "\\preformatted{",
+    capture_output(as.data.frame(gggenomes_global$def_formats), print=TRUE, width=120),
+    "}"))
+}
+
 def_names_rd <- function(){
   ns <- gggenomes_global$def_names
   ts <- gggenomes_global$def_types
   str_c(sep = "\n",
-    "@section Defined formats, column types and names:",
-    "\\preformatted{",
-      paste0(map(names(ns),
-          ~sprintf("  %-10s %-15s %s", .x, ts[[.x]], comma(ns[[.x]]))), collapse="\n"),
-    "}"
+        "@section Defined formats, column types and names:",
+        "\\preformatted{",
+        paste0(map(names(ns),
+                   ~sprintf("  %-10s %-15s %s", .x, ts[[.x]], comma(ns[[.x]]))), collapse="\n"),
+        "}"
   )
 }
-
-is_connection <- function(x) inherits(x, "connection")
-
-#' Read AliTV .json file
-#'
-#' this file contains sequences, links and (optionally) genes
-#'
-#' @importFrom tidyr unnest_wider
-#' @importFrom tidyr unnest
-#' @importFrom jsonlite fromJSON
-#' @param file path to json
-#' @export
-#' @return list with seqs, genes, and links
-#' @examples
-#' ali <- read_alitv("https://alitvteam.github.io/AliTV/d3/data/chloroplasts.json")
-#' gggenomes(ali$seqs, ali$genes, links=ali$links) +
-#'   geom_seq() +
-#'   geom_bin_label() +
-#'   geom_gene(aes(fill=class)) +
-#'   geom_link()
-#' p <- gggenomes(ali$seqs, ali$genes, links=ali$links) +
-#'   geom_seq() +
-#'   geom_bin_label() +
-#'   geom_gene(aes(color=class)) +
-#'   geom_link(aes(fill=identity)) +
-#'   scale_fill_distiller(palette="RdYlGn", direction = 1)
-#' p %>% flip_seq("Same_gi") %>% pick(1,3,2,4,5,6,7,8)
-read_alitv <- function(file){
-  ali <- jsonlite::fromJSON(file, simplifyDataFrame=TRUE)
-  seqs <- tibble(seq = ali$data$karyo$chromosome) %>%
-    mutate(seq_id = names(seq)) %>%
-    unnest_wider(seq) %>%
-    rename(bin_id = genome_id)
-  genes <- tibble(feature = ali$data$feature) %>%
-    mutate(class = names(feature)) %>%
-    filter(class != "link") %>%
-    unnest(feature) %>%
-    rename(seq_id=karyo)
-  links <- tibble(links=ali$data$links) %>% unnest(links) %>% unnest(links) %>% unnest_wider(links)
-  link_pos <- tibble(link=ali$data$features$link) %>% mutate(id=names(link)) %>% unnest_wider(link)
-  links <- links %>%
-    left_join(link_pos, by=c("source"="id")) %>%
-    left_join(link_pos, by=c("target"="id")) %>%
-    transmute(
-        seq_id1=karyo.x,
-        start1=start.x,
-        end1=end.x,
-        seq_id2=karyo.y,
-        start2=start.y,
-        end2=end.y,
-        identity=identity
-    )
-  return(list(seqs=seqs,genes=genes,links=links))
-}
-
diff --git a/R/read_alitv.R b/R/read_alitv.R
new file mode 100644
index 00000000..d456134e
--- /dev/null
+++ b/R/read_alitv.R
@@ -0,0 +1,55 @@
+#' Read AliTV .json file
+#'
+#' this file contains sequences, links and (optionally) genes
+#'
+#' @importFrom tidyr unnest_wider
+#' @importFrom tidyr unnest
+#' @importFrom jsonlite fromJSON
+#' @param file path to json
+#' @export
+#' @return list with seqs, genes, and links
+#' @examples
+#' ali <- read_alitv("https://alitvteam.github.io/AliTV/d3/data/chloroplasts.json")
+#' gggenomes(ali$seqs, ali$genes, links=ali$links) +
+#'   geom_seq() +
+#'   geom_bin_label() +
+#'   geom_gene(aes(fill=class)) +
+#'   geom_link()
+#' p <- gggenomes(ali$seqs, ali$genes, links=ali$links) +
+#'   geom_seq() +
+#'   geom_bin_label() +
+#'   geom_gene(aes(color=class)) +
+#'   geom_link(aes(fill=identity)) +
+#'   scale_fill_distiller(palette="RdYlGn", direction = 1)
+#' p %>% flip_seq("Same_gi") %>% pick(1,3,2,4,5,6,7,8)
+read_alitv <- function(file){
+  ali <- jsonlite::fromJSON(file, simplifyDataFrame=TRUE)
+  seqs <- tibble(seq = ali$data$karyo$chromosome) %>%
+    mutate(seq_id = names(seq)) %>%
+    unnest_wider(seq) %>%
+    rename(bin_id = genome_id)
+  genes <- tibble(feature = ali$data$features) %>%
+    mutate(class = names(feature)) %>%
+    filter(class != "link") %>%
+    unnest(feature) %>%
+    rename(seq_id=karyo)
+  links <- tibble(links=ali$data$links) %>% unnest(links) %>% unnest(links) %>% unnest_wider(links)
+  link_pos <- tibble(link=ali$data$features$link) %>% mutate(id=names(link)) %>% unnest_wider(link)
+  links <- links %>%
+    left_join(link_pos, by=c("source"="id")) %>%
+    left_join(link_pos, by=c("target"="id")) %>%
+    transmute(
+      seq_id1=karyo.x,
+      start1=start.x,
+      end1=end.x,
+      seq_id2=karyo.y,
+      start2=start.y,
+      end2=end.y,
+      identity=identity
+    )
+  return(list(seqs=seqs,genes=genes,links=links))
+}
+
+read_alitv_seqs <- function(...) read_alitv(...)$seqs
+read_alitv_genes <- function(...) read_alitv(...)$genes
+read_alitv_links <- function(...) read_alitv(...)$links
diff --git a/R/read_feats.R b/R/read_feats.R
index acacc49f..2a637e25 100644
--- a/R/read_feats.R
+++ b/R/read_feats.R
@@ -1,83 +1,77 @@
-#' Read features and links from common file formats
-#'
-#' Read features or links from common formats, such as GFF3, Genbank, BED, BLAST
-#' tabular output or PAF files. File formats and the format-specific `read_*()`
-#' function are automatically determined based in file extensions, if possible.
-#' Can read multiple files in the same format into a single table: useful, for
-#' example, to read a folder of gff-files with each containing genes of a
-#' different genome.
-#'
-#' @param files files to reads. Should all be of same format.
-#' @param format If NULL, guess from file extension. Else, any format known to
-#'   gggenomes (gff3, gbk, ... see [file_formats()] for full list) or any suffix
-#'   of a known `read_<suffix>` function, e.g. tsv for `readr::read_tsv()`.
-#' @param .id the name of the column storing the file name each record came
-#'   from. Defaults to "file_id". Set to "bin_id" if every file represents a
-#'   different bin.
-#' @param ... additional arguments passed on to the format-specific read
-#'   function called down the line.
-#'
-#' @return A gggenomes-compatible feature or link tibble
 #' @export
+#' @describeIn read_tracks read files as features mapping onto
+#'   sequences.
 #' @examples
-#' # read a file
+#' # read genes/features from a gff file
 #' read_feats(ex("eden-utr.gff"))
 #'
+#'
 #' # read all gffs from a directory
 #' read_feats(list.files(ex("emales/"), "*.gff$", full.names=TRUE))
 #'
-#' \dontrun{
+#'
 #' # read remote files
+#' \dontrun{
 #' gbk_phages <- c(
 #'   PSSP7 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/858/745/GCF_000858745.1_ViralProj15134/GCF_000858745.1_ViralProj15134_genomic.gff.gz",
 #'   PSSP3 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/904/555/GCF_000904555.1_ViralProj195517/GCF_000904555.1_ViralProj195517_genomic.gff.gz")
 #' read_feats(gbk_phages)
 #' }
-#' @describeIn read_feats read files as features mapping onto sequences
-read_feats <- function(files, format=NULL, .id="file_id", ...){
-  if(is_connection(files))
-    files <- list(files) # weird things happen to pipes in vectors
-
-  # infer file format from suffix
-  format <- (format %||% file_format_unique(files, "feats"))
-
-  if(format == 'ambigious'){
-    abort(str_glue('Ambigious file extension(s): "', comma(unique(file_ext(files))),
-                   '".\nPlease specify `format` explicitly'))
-  }
-
-  # for unnamed files, infer name from filename (used as file_id/bin_id)
-  files <- file_label(files)
-
-  # map_df .id = bin_id
-  inform(str_glue("Reading as {format}:"))
-  feats <- map2_df(files, names(files), read_format, .id=.id, format, ...)
-
-  feats
+#'
+#'
+read_feats <- function(files, .id="file_id", format=NULL, parser=NULL, ...){
+  read_context(files, "feats", .id=.id, format=format, parser=parser, ...)
 }
 
 #' @export
-#' @describeIn read_feats read files as subfeatures mapping onto other features
-read_subfeats <- function(files, format=NULL, .id="file_id", ...){
-  feats <- read_feats(files=files, format=format, ...)
+#' @describeIn read_tracks read files as subfeatures mapping onto other features
+read_subfeats <- function(files, .id="file_id", format=NULL, parser=NULL, ...){
+  feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...)
   rename(feats, feat_id=seq_id, feat_id2=seq_id2)
 }
 
 #' @export
-#' @describeIn read_feats read files as links connecting sequences
-read_links <- function(files, format=NULL, .id="file_id", ...){
-  feats <- read_feats(files=files, format=format, ...)
+#' @describeIn read_tracks read files as links connecting sequences
+read_links <- function(files, .id="file_id", format=NULL, parser=NULL, ...){
+  feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...)
   rename(feats, seq_id=seq_id, start=start, end=end)
 }
 
 #' @export
-#' @describeIn read_feats read files as sublinks connecting features
-read_sublinks <- function(files, format=NULL, .id="file_id", ...){
-  feats <- read_feats(files=files, format=format, ...)
+#' @describeIn read_tracks read files as sublinks connecting features
+read_sublinks <- function(files, .id="file_id", format=NULL, parser=NULL, ...){
+  feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...)
   rename(feats, feat_id=seq_id, start=start, end=end, feat_id2=seq_id2)
 }
 
-read_format <- function(file, name, format, ...){
-  inform(str_glue("* {name} [{file}]"))
-  exec(paste0("read_", format), file, ...)
+#' Swap query and subject in blast-like feature tables
+#'
+#' Swap query and subject columns in a table read with [read_feats()] or
+#' [read_links()], for example, from blast searches. Swaps columns with
+#' name/name2, such as 'seq_id/seq_id2', 'start/start2', ...
+#'
+#' @param x tibble with query and subject columns
+#' @export
+#' @return tibble with swapped query/subject columns
+#' @examples
+#' feats <- tribble(
+#'  ~seq_id, ~seq_id2, ~start, ~end, ~strand, ~start2, ~end2, ~evalue,
+#'  "A", "B", 100, 200, "+", 10000, 10200, 1e-5
+#' )
+#' # make B the query
+#' swap_query(feats)
+swap_query <- function(x){
+  # for every pair seq_id/seq_id2, name/name2 > name2/name
+  n <- names(x)
+  m <- str_subset(n, "\\D2$") %>% str_remove("2$") %>% intersect(n)
+  if(!length(m))
+    return(x)
+
+  m2 <- paste0(m, "2")
+  i <- match(m, n) # match() keeps name/name2 paired regardless of column order
+  i2 <- match(m2, n)
+  inform(c("Swapping query/subject-associated columns",
+           comma(m, collapse='  '), comma(m2, collapse=' ')))
+  x[c(i, i2)] <- x[c(i2, i)]
+  x
 }
diff --git a/R/read_seqs.R b/R/read_seqs.R
index 47d53591..ad3ae941 100644
--- a/R/read_seqs.R
+++ b/R/read_seqs.R
@@ -1,47 +1,24 @@
-#' Read a sequence index
-#'
-#' Read ID, description and length for each sequence from common formats
-#' including FASTA, samtools/seqkit FASTA index files, and GFF3. Default columns
-#' are seq_id, seq_desc and length.
-#'
-#' @importFrom readr read_tsv
-#' @param file fasta or .fai/.seqkit.fai fasta index
 #' @export
-#' @return A gggenomes-compatible sequence tibble
-#' @describeIn read_seqs read seqs from files with automatic format detection
+#' @describeIn read_tracks read sequence ID, description and length.
 #' @examples
-#' # from a fasta file
+#' # reads sequence index from a fasta file
 #' read_seqs(ex("emales/emales.fna"))
+#'
+#'
 #' # from samtools/seqkit style index
 #' read_seqs(ex("emales/emales.fna.seqkit.fai"))
+#'
+#'
 #' # from multiple gff file
 #' read_seqs(c(ex("emales/emales.gff"), ex("emales/emales-tirs.gff")))
-read_seqs <- function(files, format=NULL, .id="file_id", ...){
-  if(any(map_lgl(files, is_connection))){
-    warn("Using connections instead of paths to files can lead to unexpected behaviour")
-    is_connection(files)
-      files <- list(files) # weird things happen to pipes in vectors
-  }
-
-  # infer file format from suffix
-  format <- (format %||% file_format_unique(files, "seqs"))
-
-  if(format == 'ambigious'){
-    abort(str_glue('Ambigious file extension(s): "', comma(unique(file_ext(files))),
-                   '".\nPlease specify `format` explicitly'))
-  }
-
-  # for unnamed files, infer name from filename (used as file_id/bin_id)
-  files <- file_label(files)
-
-  # map_df .id = bin_id
-  inform(str_glue("Reading as {format}:"))
-  seqs <- map2_df(files, names(files), read_format, .id=.id, format, ...)
-
-  seqs
+read_seqs <- function(files, .id="file_id", format=NULL, parser=NULL, ...){
+  read_context(files, "seqs", .id=.id, format=format, parser=parser, ...)
 }
 
-#' @describeIn read_seqs read seqs from a single file in fasta, gbk or gff3 format.
+
+#' Read sequence index
+#'
+#' @describeIn read_seq_len read seqs from a single file in fasta, gbk or gff3 format.
 #' @export
 read_seq_len <- function(file, col_names = def_names("seq_len"),
     col_types = def_types("seq_len"), ...){
@@ -53,7 +30,7 @@ read_seq_len <- function(file, col_names = def_names("seq_len"),
 
 }
 
-#' @describeIn read_seqs read seqs from a single file in seqkit/samtools fai format.
+#' @describeIn read_seq_len read seqs from a single file in seqkit/samtools fai format.
 #' @export
 read_fai <- function(file, col_names=def_names("fai"),
     col_types=def_types("fai"), ...){
diff --git a/_pkgdown.yml b/_pkgdown.yml
index dfa22a09..6d11f0be 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -34,7 +34,7 @@ reference:
     - starts_with("read_")
     - swap_query
     - ex
-    - file_formats
+    - def_formats
     - def_names
     - def_types
     - starts_with("write")
@@ -51,10 +51,6 @@ reference:
   - in_range
   - width
   - introduce
-- title: "Handle files"
-- contents:
-  - ext_to_format
-  - file_exts
   - file_label
 - title: "Data sets"
 - contents:
diff --git a/man/def_formats.Rd b/man/def_formats.Rd
new file mode 100644
index 00000000..0b3e7ebe
--- /dev/null
+++ b/man/def_formats.Rd
@@ -0,0 +1,66 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read.R
+\name{def_formats}
+\alias{def_formats}
+\title{Defined file formats and extensions}
+\usage{
+def_formats(
+  file = NULL,
+  ext = NULL,
+  context = NULL,
+  parser = NULL,
+  allow_na = FALSE
+)
+}
+\arguments{
+\item{file}{a vector of file names}
+
+\item{ext}{a vector of file extensions}
+
+\item{context}{a vector of file contexts defined in
+\code{gggenomes_global$def_formats}}
+
+\item{parser}{a vector of file parsers defined in
+\code{gggenomes_global$def_formats}}
+}
+\value{
+dictionarish vector of file formats with recognized extensions as
+names
+}
+\description{
+For seamless reading of different file formats, gggenomes uses a mapping of
+known formats to associated file extensions and contexts in which the
+different formats can be read. The notion of context allows one to read
+different information from the same format/extension. For example, a gbk file
+holds both feature and sequence information. If read in "feats" context
+\code{read_feats("*.gbk")} it will return a feature table, if read in "seqs"
+context \code{read_seqs("*.gbk")}, a sequence index.
+}
+\section{Defined formats, extensions, contexts, and parsers}{
+
+\preformatted{
+     format                           ext            context                                              parser
+1 ambigious                 txt, tsv, csv                 NA                                      read_ambigious
+2     fasta fa, fas, fasta, ffn, fna, faa               seqs                                        read_seq_len
+3       fai                           fai               seqs                                            read_fai
+4      gff3                     gff, gff3        feats, seqs                             read_gff3, read_seq_len
+5       gbk           gbk, gb, gbff, gpff        feats, seqs                              read_gbk, read_seq_len
+6       bed                           bed              feats                                            read_bed
+7     blast                    m8, o6, o7              feats                                          read_blast
+8       paf                           paf              feats                                            read_paf
+9     alitv                          json feats, seqs, links read_alitv_genes, read_alitv_seqs, read_alitv_links
+}
+}
+
+\examples{
+# format of a file, inferred from its extension; returns a named vector
+# with the extension as name and the format as value
+def_formats("foo.fa")
+
+# formats associated with each extension
+def_formats(ext=qc(fa, gff))
+
+# all formats/extensions that can be read in seqs context; includes formats
+# that are defined for context=NA, i.e. that can be read in any context.
+def_formats(context="seqs")
+}
diff --git a/man/ext_to_format.Rd b/man/ext_to_format.Rd
deleted file mode 100644
index 9f2dd8e8..00000000
--- a/man/ext_to_format.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/read.R
-\name{ext_to_format}
-\alias{ext_to_format}
-\title{File format from suffix}
-\usage{
-ext_to_format(x, context)
-}
-\arguments{
-\item{x}{a vector of file extensions}
-
-\item{context}{a file format context defined in \code{\link[=file_formats]{file_formats()}}}
-}
-\value{
-a vector of formats with extensions as names
-}
-\description{
-File format from suffix
-}
-\examples{
-gggenomes:::ext_to_format(c("gff", "txt", "FASTA"), "feats")
-}
diff --git a/man/file_exts.Rd b/man/file_exts.Rd
deleted file mode 100644
index bf4ff1ec..00000000
--- a/man/file_exts.Rd
+++ /dev/null
@@ -1,21 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/read.R
-\name{file_exts}
-\alias{file_exts}
-\title{Defined file extensions and associated formats}
-\usage{
-file_exts(context)
-}
-\arguments{
-\item{context}{a file format context defined in \code{gggenomes_global$file_formats}}
-}
-\value{
-vector of file extensions with formats as names
-}
-\description{
-Defined file extensions and associated formats
-}
-\examples{
-# vector of zip-context file extensions and format names
-gggenomes:::file_exts("zips")
-}
diff --git a/man/file_formats.Rd b/man/file_formats.Rd
deleted file mode 100644
index 3d6b8d39..00000000
--- a/man/file_formats.Rd
+++ /dev/null
@@ -1,47 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/read.R
-\name{file_formats}
-\alias{file_formats}
-\title{Defined file formats and extensions}
-\usage{
-file_formats(context)
-}
-\arguments{
-\item{context}{a file format context defined in \code{gggenomes_global$file_formats}}
-}
-\value{
-dictionarish vector of file formats with recognized extensions as names
-}
-\description{
-For seamless reading of different file formats, gggenomes uses a mapping of
-known formats to associated file extensions and contexts in which the
-different formats can be read. The notion of context allows one to read
-different information from the same format/extension. For example, a gbk file
-holds both feature and sequence information. If read in "feats" context
-\code{read_feats("*.gbk")} it will return a feature table, if read in "seqs"
-context \code{read_seqs("*.gbk")}, a sequence index.
-}
-\section{Defined contexts, formats and extensions}{
-
-\preformatted{
-feats    ambigious  [txt,tsv,csv]
-         bed      [bed]
-         blast    [m8,o6,o7]
-         fasta    [fa,fas,fasta,ffn,fna,faa]
-         gbk      [gbk,gb,gbff]
-         gff3     [gff,gff3]
-         paf      [paf]
-seqs     ambigious  [txt,tsv,csv]
-         fai      [fai]
-         seq_len  [fa,fas,fasta,ffn,fna,faa,gff,gbk]
-zips     bz2      [bz,bz2]
-         gz       [gz]
-         xz       [xz]
-         zip      [zip]
-}
-}
-
-\examples{
-# vector of defined zip formats and recognized extensions as names
-file_formats("zips")
-}
diff --git a/man/read_alitv.Rd b/man/read_alitv.Rd
index e4609196..14bc79e9 100644
--- a/man/read_alitv.Rd
+++ b/man/read_alitv.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/read.R
+% Please edit documentation in R/read_alitv.R
 \name{read_alitv}
 \alias{read_alitv}
 \title{Read AliTV .json file}
diff --git a/man/read_context.Rd b/man/read_context.Rd
new file mode 100644
index 00000000..34e2d823
--- /dev/null
+++ b/man/read_context.Rd
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read.R
+\name{read_context}
+\alias{read_context}
+\title{Read files in different contexts}
+\usage{
+read_context(
+  files,
+  context,
+  .id = "file_id",
+  format = NULL,
+  parser = NULL,
+  ...
+)
+}
+\arguments{
+\item{files}{files to read. Should all be of the same format. In many cases,
+compressed files (\code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip}) are supported.
+Similarly, automatic download of remote files starting with \verb{http(s)://} or
+\verb{ftp(s)://} works in most cases.}
+
+\item{context}{the context ("seqs", "feats", "links") in which a given format
+should be read.}
+
+\item{.id}{the column with the name of the file a record was read from.
+Defaults to "file_id". Set to "bin_id" if every file represents a different
+bin.}
+
+\item{format}{specify a format known to gggenomes, such as \code{gff3}, \code{gbk}, ...
+to override automatic determination based on the file extension (see
+\code{\link[=def_formats]{def_formats()}} for full list).}
+
+\item{parser}{specify the name of an R function to override automatic
+determination based on format, e.g. \code{parser="read_tsv"}.}
+
+\item{...}{additional arguments passed on to the format-specific read
+function called down the line.}
+}
+\description{
+Powers \code{\link[=read_seqs]{read_seqs()}}, \code{\link[=read_feats]{read_feats()}}, \code{\link[=read_links]{read_links()}}
+}
+\section{Functions}{
+\itemize{
+\item \code{read_context}: internal backend that reads \code{files} in the given \code{context}
+}}
+
diff --git a/man/read_feats.Rd b/man/read_feats.Rd
deleted file mode 100644
index 5faeab4b..00000000
--- a/man/read_feats.Rd
+++ /dev/null
@@ -1,68 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/read_feats.R
-\name{read_feats}
-\alias{read_feats}
-\alias{read_subfeats}
-\alias{read_links}
-\alias{read_sublinks}
-\title{Read features and links from common file formats}
-\usage{
-read_feats(files, format = NULL, .id = "file_id", ...)
-
-read_subfeats(files, format = NULL, .id = "file_id", ...)
-
-read_links(files, format = NULL, .id = "file_id", ...)
-
-read_sublinks(files, format = NULL, .id = "file_id", ...)
-}
-\arguments{
-\item{files}{files to reads. Should all be of same format.}
-
-\item{format}{If NULL, guess from file extension. Else, any format known to
-gggenomes (gff3, gbk, ... see \code{\link[=file_formats]{file_formats()}} for full list) or any suffix
-of a known \verb{read_<suffix>} function, e.g. tsv for \code{readr::read_tsv()}.}
-
-\item{.id}{the name of the column storing the file name each record came
-from. Defaults to "file_id". Set to "bin_id" if every file represents a
-different bin.}
-
-\item{...}{additional arguments passed on to the format-specific read
-function called down the line.}
-}
-\value{
-A gggenomes-compatible feature or link tibble
-}
-\description{
-Read features or links from common formats, such as GFF3, Genbank, BED, BLAST
-tabular output or PAF files. File formats and the format-specific \verb{read_*()}
-function are automatically determined based in file extensions, if possible.
-Can read multiple files in the same format into a single table: useful, for
-example, to read a folder of gff-files with each containing genes of a
-different genome.
-}
-\section{Functions}{
-\itemize{
-\item \code{read_feats}: read files as features mapping onto sequences
-
-\item \code{read_subfeats}: read files as subfeatures mapping onto other features
-
-\item \code{read_links}: read files as links connecting sequences
-
-\item \code{read_sublinks}: read files as sublinks connecting features
-}}
-
-\examples{
-# read a file
-read_feats(ex("eden-utr.gff"))
-
-# read all gffs from a directory
-read_feats(list.files(ex("emales/"), "*.gff$", full.names=TRUE))
-
-\dontrun{
-# read remote files
-gbk_phages <- c(
-  PSSP7 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/858/745/GCF_000858745.1_ViralProj15134/GCF_000858745.1_ViralProj15134_genomic.gff.gz",
-  PSSP3 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/904/555/GCF_000904555.1_ViralProj195517/GCF_000904555.1_ViralProj195517_genomic.gff.gz")
-read_feats(gbk_phages)
-}
-}
diff --git a/man/read_seq_len.Rd b/man/read_seq_len.Rd
new file mode 100644
index 00000000..b5273081
--- /dev/null
+++ b/man/read_seq_len.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read_seqs.R
+\name{read_seq_len}
+\alias{read_seq_len}
+\alias{read_fai}
+\title{Read sequence index}
+\usage{
+read_seq_len(
+  file,
+  col_names = def_names("seq_len"),
+  col_types = def_types("seq_len"),
+  ...
+)
+
+read_fai(file, col_names = def_names("fai"), col_types = def_types("fai"), ...)
+}
+\description{
+Read sequence index
+}
+\section{Functions}{
+\itemize{
+\item \code{read_seq_len}: read seqs from a single file in fasta, gbk or gff3 format.
+
+\item \code{read_fai}: read seqs from a single file in seqkit/samtools fai format.
+}}
+
diff --git a/man/read_seqs.Rd b/man/read_seqs.Rd
deleted file mode 100644
index 666203db..00000000
--- a/man/read_seqs.Rd
+++ /dev/null
@@ -1,47 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/read_seqs.R
-\name{read_seqs}
-\alias{read_seqs}
-\alias{read_seq_len}
-\alias{read_fai}
-\title{Read a sequence index}
-\usage{
-read_seqs(files, format = NULL, .id = "file_id", ...)
-
-read_seq_len(
-  file,
-  col_names = def_names("seq_len"),
-  col_types = def_types("seq_len"),
-  ...
-)
-
-read_fai(file, col_names = def_names("fai"), col_types = def_types("fai"), ...)
-}
-\arguments{
-\item{file}{fasta or .fai/.seqkit.fai fasta index}
-}
-\value{
-A gggenomes-compatible sequence tibble
-}
-\description{
-Read ID, description and length for each sequence from common formats
-including FASTA, samtools/seqkit FASTA index files, and GFF3. Default columns
-are seq_id, seq_desc and length.
-}
-\section{Functions}{
-\itemize{
-\item \code{read_seqs}: read seqs from files with automatic format detection
-
-\item \code{read_seq_len}: read seqs from a single file in fasta, gbk or gff3 format.
-
-\item \code{read_fai}: read seqs from a single file in seqkit/samtools fai format.
-}}
-
-\examples{
-# from a fasta file
-read_seqs(ex("emales/emales.fna"))
-# from samtools/seqkit style index
-read_seqs(ex("emales/emales.fna.seqkit.fai"))
-# from multiple gff file
-read_seqs(c(ex("emales/emales.gff"), ex("emales/emales-tirs.gff")))
-}
diff --git a/man/read_tracks.Rd b/man/read_tracks.Rd
new file mode 100644
index 00000000..afb9e0a7
--- /dev/null
+++ b/man/read_tracks.Rd
@@ -0,0 +1,96 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read.R, R/read_feats.R, R/read_seqs.R
+\name{read_tracks}
+\alias{read_tracks}
+\alias{read_feats}
+\alias{read_subfeats}
+\alias{read_links}
+\alias{read_sublinks}
+\alias{read_seqs}
+\title{Read files in various formats into track tables}
+\usage{
+read_feats(files, .id = "file_id", format = NULL, parser = NULL, ...)
+
+read_subfeats(files, .id = "file_id", format = NULL, parser = NULL, ...)
+
+read_links(files, .id = "file_id", format = NULL, parser = NULL, ...)
+
+read_sublinks(files, .id = "file_id", format = NULL, parser = NULL, ...)
+
+read_seqs(files, .id = "file_id", format = NULL, parser = NULL, ...)
+}
+\arguments{
+\item{files}{files to read. Should all be of the same format. In many cases,
+compressed files (\code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip}) are supported.
+Similarly, automatic download of remote files starting with \verb{http(s)://} or
+\verb{ftp(s)://} works in most cases.}
+
+\item{.id}{the column with the name of the file a record was read from.
+Defaults to "file_id". Set to "bin_id" if every file represents a different
+bin.}
+
+\item{format}{specify a format known to gggenomes, such as \code{gff3}, \code{gbk}, ...
+to override automatic determination based on the file extension (see
+\code{\link[=def_formats]{def_formats()}} for full list).}
+
+\item{parser}{specify the name of an R function to override automatic
+determination based on format, e.g. \code{parser="read_tsv"}.}
+
+\item{...}{additional arguments passed on to the format-specific read
+function called down the line.}
+}
+\value{
+A gggenomes-compatible sequence, feature or link tibble
+}
+\description{
+Convenience functions to read sequences, features or links from various
+bioinformatics file formats, such as FASTA, GFF3, Genbank, BLAST tabular
+output, etc. See \code{\link[=def_formats]{def_formats()}} for full list. File formats and the
+corresponding read-functions are automatically determined based on file
+extensions. All these functions can read multiple files in the same format at
+once, and combine them into a single table - useful, for example, to read a
+folder of gff-files with each file containing genes of a different genome.
+}
+\section{Functions}{
+\itemize{
+\item \code{read_feats}: read files as features mapping onto
+sequences.
+
+\item \code{read_subfeats}: read files as subfeatures mapping onto other features
+
+\item \code{read_links}: read files as links connecting sequences
+
+\item \code{read_sublinks}: read files as sublinks connecting features
+
+\item \code{read_seqs}: read sequence ID, description and length.
+}}
+
+\examples{
+# read genes/features from a gff file
+read_feats(ex("eden-utr.gff"))
+
+
+# read all gffs from a directory
+read_feats(list.files(ex("emales/"), "*.gff$", full.names=TRUE))
+
+
+# read remote files
+\dontrun{
+gbk_phages <- c(
+  PSSP7 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/858/745/GCF_000858745.1_ViralProj15134/GCF_000858745.1_ViralProj15134_genomic.gff.gz",
+  PSSP3 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/904/555/GCF_000904555.1_ViralProj195517/GCF_000904555.1_ViralProj195517_genomic.gff.gz")
+read_feats(gbk_phages)
+}
+
+
+# read sequence index from a fasta file
+read_seqs(ex("emales/emales.fna"))
+
+
+# from samtools/seqkit style index
+read_seqs(ex("emales/emales.fna.seqkit.fai"))
+
+
+# from multiple gff files
+read_seqs(c(ex("emales/emales.gff"), ex("emales/emales-tirs.gff")))
+}
diff --git a/man/swap_query.Rd b/man/swap_query.Rd
index ed3d0873..f61acbce 100644
--- a/man/swap_query.Rd
+++ b/man/swap_query.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/read.R
+% Please edit documentation in R/read_feats.R
 \name{swap_query}
 \alias{swap_query}
 \title{Swap query and subject in blast-like feature tables}

From 9f4ca1ac0160f96db6b3d02383ea7c8b358bf719 Mon Sep 17 00:00:00 2001
From: Markus Ankenbrand <markus@ankenbrand.me>
Date: Mon, 8 Feb 2021 16:03:42 +0100
Subject: [PATCH 2/4] Fix context for links

---
 R/read_feats.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/read_feats.R b/R/read_feats.R
index 2a637e25..0fb43de9 100644
--- a/R/read_feats.R
+++ b/R/read_feats.R
@@ -33,14 +33,14 @@ read_subfeats <- function(files, .id="file_id", format=NULL, parser=NULL, ...){
 #' @export
 #' @describeIn read_tracks read files as links connecting sequences
 read_links <- function(files, .id="file_id", format=NULL, parser=NULL, ...){
-  feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...)
+  feats <- read_context(files, "links", .id=.id, format=format, parser=parser, ...)
   rename(feats, seq_id=seq_id, start=start, end=end)
 }
 
 #' @export
 #' @describeIn read_tracks read files as sublinks connecting features
 read_sublinks <- function(files, .id="file_id", format=NULL, parser=NULL, ...){
-  feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...)
+  feats <- read_context(files, "links", .id=.id, format=format, parser=parser, ...)
   rename(feats, feat_id=seq_id, start=start, end=end, feat_id2=seq_id2)
 }
 

From f6b965d0ae700e4743dfef2ac37d7adea3959f39 Mon Sep 17 00:00:00 2001
From: Markus Ankenbrand <markus@ankenbrand.me>
Date: Mon, 8 Feb 2021 16:04:04 +0100
Subject: [PATCH 3/4] Fix column names for links from alitv

---
 R/read_alitv.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/read_alitv.R b/R/read_alitv.R
index d456134e..a8418e1b 100644
--- a/R/read_alitv.R
+++ b/R/read_alitv.R
@@ -39,9 +39,9 @@ read_alitv <- function(file){
     left_join(link_pos, by=c("source"="id")) %>%
     left_join(link_pos, by=c("target"="id")) %>%
     transmute(
-      seq_id1=karyo.x,
-      start1=start.x,
-      end1=end.x,
+      seq_id=karyo.x,
+      start=start.x,
+      end=end.x,
       seq_id2=karyo.y,
       start2=start.y,
       end2=end.y,

From 415c501e0abf4832c8aa05379c8689d41d8d1fc0 Mon Sep 17 00:00:00 2001
From: Markus Ankenbrand <markus@ankenbrand.me>
Date: Mon, 8 Feb 2021 17:03:47 +0100
Subject: [PATCH 4/4] Fix alitv example

---
 R/read_alitv.R    | 2 +-
 man/read_alitv.Rd | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/read_alitv.R b/R/read_alitv.R
index a8418e1b..4e10464f 100644
--- a/R/read_alitv.R
+++ b/R/read_alitv.R
@@ -21,7 +21,7 @@
 #'   geom_gene(aes(color=class)) +
 #'   geom_link(aes(fill=identity)) +
 #'   scale_fill_distiller(palette="RdYlGn", direction = 1)
-#' p %>% flip_seq("Same_gi") %>% pick(1,3,2,4,5,6,7,8)
+#' p %>% flip_seqs(5) %>% pick_seqs(1,3,2,4,5,6,7,8)
 read_alitv <- function(file){
   ali <- jsonlite::fromJSON(file, simplifyDataFrame=TRUE)
   seqs <- tibble(seq = ali$data$karyo$chromosome) %>%
diff --git a/man/read_alitv.Rd b/man/read_alitv.Rd
index 14bc79e9..5a0865a5 100644
--- a/man/read_alitv.Rd
+++ b/man/read_alitv.Rd
@@ -28,5 +28,5 @@ p <- gggenomes(ali$seqs, ali$genes, links=ali$links) +
   geom_gene(aes(color=class)) +
   geom_link(aes(fill=identity)) +
   scale_fill_distiller(palette="RdYlGn", direction = 1)
-p \%>\% flip_seq("Same_gi") \%>\% pick(1,3,2,4,5,6,7,8)
+p \%>\% flip_seqs(5) \%>\% pick_seqs(1,3,2,4,5,6,7,8)
 }