From bdbc183ffe69ca19b0f37b568eaceeba194fabf5 Mon Sep 17 00:00:00 2001 From: Thomas Hackl Date: Sun, 7 Feb 2021 00:52:57 +0100 Subject: [PATCH 1/4] rewrite of smart-read-backend; cleaner mapping of (ext > format) + context > parser --- NAMESPACE | 2 +- R/global.R | 41 +++-- R/read.R | 353 +++++++++++++++++++++---------------------- R/read_alitv.R | 55 +++++++ R/read_feats.R | 104 ++++++------- R/read_seqs.R | 49 ++---- _pkgdown.yml | 6 +- man/def_formats.Rd | 66 ++++++++ man/ext_to_format.Rd | 22 --- man/file_exts.Rd | 21 --- man/file_formats.Rd | 47 ------ man/read_alitv.Rd | 2 +- man/read_context.Rd | 46 ++++++ man/read_feats.Rd | 68 --------- man/read_seq_len.Rd | 26 ++++ man/read_seqs.Rd | 47 ------ man/read_tracks.Rd | 96 ++++++++++++ man/swap_query.Rd | 2 +- 18 files changed, 548 insertions(+), 505 deletions(-) create mode 100644 R/read_alitv.R create mode 100644 man/def_formats.Rd delete mode 100644 man/ext_to_format.Rd delete mode 100644 man/file_exts.Rd delete mode 100644 man/file_formats.Rd create mode 100644 man/read_context.Rd delete mode 100644 man/read_feats.Rd create mode 100644 man/read_seq_len.Rd delete mode 100644 man/read_seqs.Rd create mode 100644 man/read_tracks.Rd diff --git a/NAMESPACE b/NAMESPACE index 98bb3671..0b0c4b20 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -94,6 +94,7 @@ export(as_sublinks) export(bins) export(check_strand) export(combine_strands) +export(def_formats) export(def_names) export(def_types) export(drop_feat_layout) @@ -103,7 +104,6 @@ export(drop_seq_layout) export(ex) export(feats) export(feats0) -export(file_formats) export(flip) export(flip_nicely) export(flip_seqs) diff --git a/R/global.R b/R/global.R index 64a38094..9ff7ac24 100644 --- a/R/global.R +++ b/R/global.R @@ -3,28 +3,25 @@ # manipulated by other packages. 
gggenomes_global <- new.env(parent = emptyenv()) -# list of contexts of dictionaryish vectors mapping suffixes to file formats -gggenomes_global$file_formats <- map(list( - feats = list( - gff3 = qc(gff, gff3), - gbk = qc(gbk, gb, gbff), - bed = qc(bed), - fasta = qc(fa, fas, fasta, ffn, fna, faa), - blast = qc(m8, o6, o7), - paf = qc(paf), - ambigious = qc(txt, tsv, csv) - ), - seqs = list( - fai = qc(fai), - seq_len = qc(fa, fas, fasta, ffn, fna, faa, gff, gbk), - ambigious = qc(txt, tsv, csv) - ), - zips = list( - bz2 = qc(bz, bz2), - gz = qc(gz), - xz = qc(xz), - zip = qc(zip)) -), ~deframe(stack(.x) %>% mutate(ind=as.character(ind)))) +# Mapping of file formats, extensions, contexts, and parsers +# +# parser is a function name, like "read_tsv", ... +# +# context=NA defines fallback parser that is used if no parser is defined for +# the specific context +gggenomes_global$def_formats <- tribble( + ~format, ~ext, ~context, ~parser, + "ambigious", qc(txt,tsv,csv), NA, "read_ambigious", + "fasta", qc(fa,fas,fasta,ffn,fna,faa), qc(seqs), qc(read_seq_len), + "fai", qc(fai), qc(seqs), qc(read_fai), + "gff3", qc(gff,gff3), qc(feats, seqs), qc(read_gff3, read_seq_len), + "gbk", qc(gbk,gb,gbff,gpff), qc(feats, seqs), qc(read_gbk, read_seq_len), + "bed", qc(bed), "feats", "read_bed", + "blast", qc(m8,o6,o7), "feats", "read_blast", + "paf", qc(paf), "feats", "read_paf", + "alitv", qc(json), qc(feats, seqs, links), + qc(read_alitv_genes, read_alitv_seqs, read_alitv_links) +) # Default column names for different formats gggenomes_global$def_names <- list( diff --git a/R/read.R b/R/read.R index 5046d0f2..32171456 100644 --- a/R/read.R +++ b/R/read.R @@ -1,33 +1,135 @@ -#' Swap query and subject in blast-like feature tables +#' Read files in various formats into track tables #' -#' Swap query and subject columns in a table read with [read_feats()] or -#' [read_links()], for example, from blast searches. 
Swaps columns with -#' name/name2, such as 'seq_id/seq_id2', 'start/start2', ... +#' Convenience functions to read sequences, features or links from various +#' bioinformatics file formats, such as FASTA, GFF3, Genbank, BLAST tabular +#' output, etc. See [def_formats()] for full list. File formats and the +#' corresponding read-functions are automatically determined based on file +#' extensions. All these functions can read multiple files in the same format at +#' once, and combine them into a single table - useful, for example, to read a +#' folder of gff-files with each file containing genes of a different genome. #' -#' @param x tibble with query and subject columns +#' @name read_tracks +#' @inheritParams read_context +#' @return A gggenomes-compatible sequence, feature or link tibble +NULL + + +#' Read files in different contexts +#' +#' Powers [read_seqs()], [read_feats()], [read_links()] +#' @param files files to reads. Should all be of same format. In many cases, +#' compressed files (`.gz`, `.bz2`, `.xz`, or `.zip`) are supported. +#' Similarly, automatic download of remote files starting with `http(s)://` or +#' `ftp(s)://` works in most cases. +#' @param .id the column with the name of the file a record was read from. +#' Defaults to "file_id". Set to "bin_id" if every file represents a different +#' bin. +#' @param format specify a format known to gggenomes, such as `gff3`, `gbk`, ... +#' to overwrite automatic determination based on the file extension (see +#' [def_formats()] for full list). +#' @param parser specify the name of an R function to overwrite automatic +#' determination based on format, e.g. `parser="read_tsv"`. +#' @param ... additional arguments passed on to the format-specific read +#' function called down the line. +#' @param context the context ("seqs", "feats", "links") in which a given format +#' should be read. 
+#' @describeIn read_context bla keywords internal +read_context <- function(files, context, .id="file_id", format=NULL, parser=NULL, ...){ + if(is_connection(files)) + files <- list(files) # weird things happen to pipes in vectors + + # for unnamed files, infer name from filename (used as file_id/bin_id) + files <- file_label(files) + + parser <- parser %||% file_parser(files, context=context, format=format, require_unique=T) + # map_df .id = bin_id + inform(str_glue("Reading '{names(parser)}' with `{parser}()`:")) + x <- map2_df(files, names(files), .id=.id, parser=parser, ..., + .f=function(file, name, parser, ...){ + inform(str_glue("* {.id}: {name} [{file}]")) + exec(parser, file, ...) + }) + + x +} + +read_ambigious <- function(file, ...){ + abort(c("Ambigious file extension, please specify format or parser explicitly"), file) +} + +# file: vec of files +# context: vec of context +# single file/context is recycled to match multiple context/files if given +# format: force this format regardless of file extension +file_parser <- function(file, context=NULL, format=NULL, require_unique=FALSE){ + format <- format %||% def_formats(file, context=context) + parser <- def_parser(format, context=context) %>% set_names(format) + + if(require_unique){ + p <- unique(parser) + if(length(p) > 1) + abort(c("All files need the same format/parser.", i="Got mix of:", unname(p))) + parser <- parser[1] # unique(parser) strips names + } + parser +} + +#' Defined file formats and extensions +#' +#' For seamless reading of different file formats, gggenomes uses a mapping of +#' known formats to associated file extensions and contexts in which the +#' different formats can be read. The notion of context allows one to read +#' different information from the same format/extension. For example, a gbk file +#' holds both feature and sequence information. 
If read in "feats" context +#' `read_feats("*.gbk")` it will return a feature table, if read in "seqs" +#' context `read_seqs("*.gbk")`, a sequence index. +#' +#' @param file a vector of file names +#' @param ext a vector of file extensions +#' @param context a vector of file contexts defined in +#' `gggenomes_global$def_formats` +#' @param parser a vector of file parsers defined in +#' `gggenomes_global$def_formats` +#' @return dictionarish vector of file formats with recognized extensions as +#' names #' @export -#' @return tibble with swapped query/subject columns #' @examples -#' feats <- tribble( -#' ~seq_id, ~seq_id2, ~start, ~end, ~strand, ~start2, ~end2, ~evalue, -#' "A", "B", 100, 200, "+", 10000, 10200, 1e-5 -#' ) -#' # make B the query -#' swap_query(feats) -swap_query <- function(x){ - # for every pair seq_id/seq_id2, name/name2 > name2/name - n <- names(x) - m <- str_subset(n, "\\D2") %>% str_remove("2$") %>% intersect(n) - if(!length(m)) - return(x) +#' # vector of defined zip formats and recognized extensions as names +#' # format of file +#' def_formats("foo.fa") +#' +#' # formats associated with each extension +#' def_formats(ext=qc(fa, gff)) +#' +#' # all formats/extensions that can be read in seqs context; includes formats +#' # that are defined for context=NA, i.e. that can be read in any context. 
+#' def_formats(context="seqs") +#' @eval def_formats_rd() +def_formats <- function(file=NULL, ext=NULL, context=NULL, parser=NULL, allow_na=FALSE){ + if(!is.null(file)){ + ext <- c(file_ext(file), ext) + } - m2 <- paste0(m, "2") - i <- which(n %in% m) - i2 <- which(n %in% m2) - inform(c("Swapping query/subject-associated columns", - comma(m, collapse=' '), comma(m2, collapse=' '))) - x[c(i, i2)] <- x[c(i2, i)] - x + ff <- filter_def_formats(context=context, parser=parser) %>% unchop(ext) + + format <- deframe(select(ff, ext, format)) + if(!is.null(ext)) + format <- format[ext] + + if(!allow_na && any(is.na(format))){ + bad <- ext[is.na(format)] + names(bad) <- rep("x", length(bad)) + good <- def_formats(context=context, parser=parser) %>% + enframe(name = "ext", value = "format") %>% + chop(ext) %>% mutate(ext = map_chr(ext, comma)) %>% format() + abort(c(str_glue('Unknown extention(s):'), + i=str_glue("in context: {context}"), + i=str_glue("with parser: {parser}"), + bad, + i="Recognized formats/extensions for given context/parser:", + good[-(1:3)])) + } + format } #' Default column names and types for defined formats @@ -53,7 +155,6 @@ def_names <- function(format){ ff[[format]] } - #' @describeIn def_names default column types for defined formats #' @export #' @return a vector with default column types for the given format @@ -68,62 +169,23 @@ def_types <- function(format){ ff[[format]] } -#' Defined file formats and extensions -#' -#' For seamless reading of different file formats, gggenomes uses a mapping of -#' known formats to associated file extensions and contexts in which the -#' different formats can be read. The notion of context allows one to read -#' different information from the same format/extension. For example, a gbk file -#' holds both feature and sequence information. If read in "feats" context -#' `read_feats("*.gbk")` it will return a feature table, if read in "seqs" -#' context `read_seqs("*.gbk")`, a sequence index. 
-#' -#' -#' @param context a file format context defined in `gggenomes_global$file_formats` -#' @return dictionarish vector of file formats with recognized extensions as names -#' @export -#' @examples -#' # vector of defined zip formats and recognized extensions as names -#' file_formats("zips") -#' @eval file_formats_rd() -file_formats <- function(context){ - ff <- gggenomes_global$file_formats - if(!context %in% names(ff)){ - abort(c( - str_glue("Unknown file format context '{context}'.\nDefined families are:"), - names(ff) - )) - } - ff[[context]] -} +def_parser <- function(format, context=NULL){ + context <- context %||% NA -#' Defined file extensions and associated formats -#' -#' @inheritParams file_formats -#' @return vector of file extensions with formats as names -#' @examples -#' # vector of zip-context file extensions and format names -#' gggenomes:::file_exts("zips") -file_exts <- function(context){ - f <- file_formats(context) - set_names(names(f), f) -} + # recycle format & context to same length + x <- tibble(format=format, context=context) -#' File format from suffix -#' @param x a vector of file extensions -#' @param context a file format context defined in [file_formats()] -#' @return a vector of formats with extensions as names -#' @examples -#' gggenomes:::ext_to_format(c("gff", "txt", "FASTA"), "feats") -ext_to_format <- function(x, context){ - x <- str_to_lower(x) - if(is_dictionaryish(context)) - context[x] - else - file_formats(context)[x] + # for each format/context combo, get parser + pp <- pmap_chr(x, function(format, context){ + r <- filter_def_formats(format=format, context=context) %>% pull(parser) + if(!length(r) || is.na(r)) + abort(str_glue("No predefined parser for: `format={format}, context={context}`")) + r + }) + pp } -file_strip_zip <- function(file, ext = names(file_formats("zips"))){ +file_strip_zip <- function(file, ext = qc(bz2,gz,xz,zip)){ ext <- paste0("\\.", ext, "$", collapse="|") str_remove(file, ext) } @@ -140,32 
+202,10 @@ file_name <- function(file, pattern = "\\.[^.]+$", ignore_zip = TRUE){ str_remove(basename(file), pattern) } -file_format <- function(file, context, allow_na = FALSE){ - ext <- file_ext(file) - format <- ext_to_format(ext, context) - if(!allow_na && any(is.na(format))){ - bad <- file[is.na(format)] - names(bad) <- rep("x", length(bad)) - good <- file_formats("feats") %>% - enframe(name = "ext", value = "format") %>% - chop(ext) %>% mutate(ext = map_chr(ext, comma)) %>% format() - abort(c(str_glue('Bad extention for file format context "{context}"'), bad, - i="Recognized formats/extensions:", good[-(1:3)])) - } - set_names(format, file) -} - file_id <- function(file){ vctrs::vec_as_names(file_name(file), repair="unique") } -file_format_unique <- function(files, context, allow_duplicates = FALSE){ - fmt <- unique(file_format(files, context)) - if(!allow_duplicates && length(fmt) > 1) - abort(c("All files need the same format.", i="Got mix of:", unname(fmt))) - fmt -} - #' Add a unique name to files #' #' Given a vector of file paths, add a unique labels based on the filename as @@ -176,100 +216,55 @@ file_label <- function(file){ file } - -file_is_zip <- function(file, ext = names(file_formats("zips"))){ +file_is_zip <- function(file, ext = qc(bz2,gz,xz,zip)){ pattern <- paste0("\\.", ext, "$", collapse="|") str_detect(file, pattern) } - file_is_url <- function(file){ str_detect(file, "^((http|ftp)s?|sftp)://") } -file_formats_rd <- function(){ - ff <- gggenomes_global$file_formats %>% - map_df(.id="context", function(x){ - enframe(x, "extension", "format") %>% group_by(format) %>% - summarize(extension = comma(extension), .groups="drop") - }) - ff <- mutate(ff, context = ifelse(duplicated(context), "", context)) +is_connection <- function(x) inherits(x, "connection") + - ff <- str_c(sep = "\n", - "@section Defined contexts, formats and extensions:", - "\\preformatted{", - #sprintf("%-9s %-12s %s", "Context", "Format", "Extensions"), - str_c(collapse = 
"\n", - str_glue_data(ff, '{sprintf("%-8s", context)} ', - '{sprintf("%-7s", format)} [{extension}]')), - "}" - ) +# filter but keep fallback parser for context=NA +filter_def_formats <- function(ff, format=NULL, context=NULL, parser=NULL){ + ff <- gggenomes_global$def_formats + if(!is.null(format)){ + ff <- filter(ff, format %in% !!format) + } + + if(!is.null(context) || !is.null(parser)){ + ff <- unchop(ff, c(context, parser)) + if(!is.null(context)){ + # context=NA defines fallback parser which is always last in arrange + ff <- ff %>% group_by(format) %>% + filter(context %in% !!context | is.na(context)) %>% + arrange(context, .by_group = TRUE) %>% slice_head(n=1) + } + if(!is.null(parser)) + ff <- filter(ff, parser %in% !!parser) + } ff } +def_formats_rd <- function(){ + str_c(collapse = "\n", c( + "@section Defined formats, extensions, contexts, and parsers:", + "\\preformatted{", + capture_output(as.data.frame(gggenomes_global$def_formats), print=TRUE, width=120), + "}")) +} + def_names_rd <- function(){ ns <- gggenomes_global$def_names ts <- gggenomes_global$def_types str_c(sep = "\n", - "@section Defined formats, column types and names:", - "\\preformatted{", - paste0(map(names(ns), - ~sprintf(" %-10s %-15s %s", .x, ts[[.x]], comma(ns[[.x]]))), collapse="\n"), - "}" + "@section Defined formats, column types and names:", + "\\preformatted{", + paste0(map(names(ns), + ~sprintf(" %-10s %-15s %s", .x, ts[[.x]], comma(ns[[.x]]))), collapse="\n"), + "}" ) } - -is_connection <- function(x) inherits(x, "connection") - -#' Read AliTV .json file -#' -#' this file contains sequences, links and (optionally) genes -#' -#' @importFrom tidyr unnest_wider -#' @importFrom tidyr unnest -#' @importFrom jsonlite fromJSON -#' @param file path to json -#' @export -#' @return list with seqs, genes, and links -#' @examples -#' ali <- read_alitv("https://alitvteam.github.io/AliTV/d3/data/chloroplasts.json") -#' gggenomes(ali$seqs, ali$genes, links=ali$links) + -#' geom_seq() + -#' 
geom_bin_label() + -#' geom_gene(aes(fill=class)) + -#' geom_link() -#' p <- gggenomes(ali$seqs, ali$genes, links=ali$links) + -#' geom_seq() + -#' geom_bin_label() + -#' geom_gene(aes(color=class)) + -#' geom_link(aes(fill=identity)) + -#' scale_fill_distiller(palette="RdYlGn", direction = 1) -#' p %>% flip_seq("Same_gi") %>% pick(1,3,2,4,5,6,7,8) -read_alitv <- function(file){ - ali <- jsonlite::fromJSON(file, simplifyDataFrame=TRUE) - seqs <- tibble(seq = ali$data$karyo$chromosome) %>% - mutate(seq_id = names(seq)) %>% - unnest_wider(seq) %>% - rename(bin_id = genome_id) - genes <- tibble(feature = ali$data$feature) %>% - mutate(class = names(feature)) %>% - filter(class != "link") %>% - unnest(feature) %>% - rename(seq_id=karyo) - links <- tibble(links=ali$data$links) %>% unnest(links) %>% unnest(links) %>% unnest_wider(links) - link_pos <- tibble(link=ali$data$features$link) %>% mutate(id=names(link)) %>% unnest_wider(link) - links <- links %>% - left_join(link_pos, by=c("source"="id")) %>% - left_join(link_pos, by=c("target"="id")) %>% - transmute( - seq_id1=karyo.x, - start1=start.x, - end1=end.x, - seq_id2=karyo.y, - start2=start.y, - end2=end.y, - identity=identity - ) - return(list(seqs=seqs,genes=genes,links=links)) -} - diff --git a/R/read_alitv.R b/R/read_alitv.R new file mode 100644 index 00000000..d456134e --- /dev/null +++ b/R/read_alitv.R @@ -0,0 +1,55 @@ +#' Read AliTV .json file +#' +#' this file contains sequences, links and (optionally) genes +#' +#' @importFrom tidyr unnest_wider +#' @importFrom tidyr unnest +#' @importFrom jsonlite fromJSON +#' @param file path to json +#' @export +#' @return list with seqs, genes, and links +#' @examples +#' ali <- read_alitv("https://alitvteam.github.io/AliTV/d3/data/chloroplasts.json") +#' gggenomes(ali$seqs, ali$genes, links=ali$links) + +#' geom_seq() + +#' geom_bin_label() + +#' geom_gene(aes(fill=class)) + +#' geom_link() +#' p <- gggenomes(ali$seqs, ali$genes, links=ali$links) + +#' geom_seq() + +#' 
geom_bin_label() + +#' geom_gene(aes(color=class)) + +#' geom_link(aes(fill=identity)) + +#' scale_fill_distiller(palette="RdYlGn", direction = 1) +#' p %>% flip_seq("Same_gi") %>% pick(1,3,2,4,5,6,7,8) +read_alitv <- function(file){ + ali <- jsonlite::fromJSON(file, simplifyDataFrame=TRUE) + seqs <- tibble(seq = ali$data$karyo$chromosome) %>% + mutate(seq_id = names(seq)) %>% + unnest_wider(seq) %>% + rename(bin_id = genome_id) + genes <- tibble(feature = ali$data$feature) %>% + mutate(class = names(feature)) %>% + filter(class != "link") %>% + unnest(feature) %>% + rename(seq_id=karyo) + links <- tibble(links=ali$data$links) %>% unnest(links) %>% unnest(links) %>% unnest_wider(links) + link_pos <- tibble(link=ali$data$features$link) %>% mutate(id=names(link)) %>% unnest_wider(link) + links <- links %>% + left_join(link_pos, by=c("source"="id")) %>% + left_join(link_pos, by=c("target"="id")) %>% + transmute( + seq_id1=karyo.x, + start1=start.x, + end1=end.x, + seq_id2=karyo.y, + start2=start.y, + end2=end.y, + identity=identity + ) + return(list(seqs=seqs,genes=genes,links=links)) +} + +read_alitv_seqs <- function(...) read_alitv(...)$seqs +read_alitv_genes <- function(...) read_alitv(...)$genes +read_alitv_links <- function(...) read_alitv(...)$links diff --git a/R/read_feats.R b/R/read_feats.R index acacc49f..2a637e25 100644 --- a/R/read_feats.R +++ b/R/read_feats.R @@ -1,83 +1,77 @@ -#' Read features and links from common file formats -#' -#' Read features or links from common formats, such as GFF3, Genbank, BED, BLAST -#' tabular output or PAF files. File formats and the format-specific `read_*()` -#' function are automatically determined based in file extensions, if possible. -#' Can read multiple files in the same format into a single table: useful, for -#' example, to read a folder of gff-files with each containing genes of a -#' different genome. -#' -#' @param files files to reads. Should all be of same format. 
-#' @param format If NULL, guess from file extension. Else, any format known to -#' gggenomes (gff3, gbk, ... see [file_formats()] for full list) or any suffix -#' of a known `read_` function, e.g. tsv for `readr::read_tsv()`. -#' @param .id the name of the column storing the file name each record came -#' from. Defaults to "file_id". Set to "bin_id" if every file represents a -#' different bin. -#' @param ... additional arguments passed on to the format-specific read -#' function called down the line. -#' -#' @return A gggenomes-compatible feature or link tibble #' @export +#' @describeIn read_tracks read files as features mapping onto +#' sequences. #' @examples -#' # read a file +#' # read genes/features from a gff file #' read_feats(ex("eden-utr.gff")) #' +#' #' # read all gffs from a directory #' read_feats(list.files(ex("emales/"), "*.gff$", full.names=TRUE)) #' -#' \dontrun{ +#' #' # read remote files +#' \dontrun{ #' gbk_phages <- c( #' PSSP7 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/858/745/GCF_000858745.1_ViralProj15134/GCF_000858745.1_ViralProj15134_genomic.gff.gz", #' PSSP3 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/904/555/GCF_000904555.1_ViralProj195517/GCF_000904555.1_ViralProj195517_genomic.gff.gz") #' read_feats(gbk_phages) #' } -#' @describeIn read_feats read files as features mapping onto sequences -read_feats <- function(files, format=NULL, .id="file_id", ...){ - if(is_connection(files)) - files <- list(files) # weird things happen to pipes in vectors - - # infer file format from suffix - format <- (format %||% file_format_unique(files, "feats")) - - if(format == 'ambigious'){ - abort(str_glue('Ambigious file extension(s): "', comma(unique(file_ext(files))), - '".\nPlease specify `format` explicitly')) - } - - # for unnamed files, infer name from filename (used as file_id/bin_id) - files <- file_label(files) - - # map_df .id = bin_id - inform(str_glue("Reading as {format}:")) - feats <- map2_df(files, names(files), read_format, 
.id=.id, format, ...) - - feats +#' +#' +read_feats <- function(files, .id="file_id", format=NULL, parser=NULL, ...){ + read_context(files, "feats", .id=.id, format=format, parser=parser, ...) } #' @export -#' @describeIn read_feats read files as subfeatures mapping onto other features -read_subfeats <- function(files, format=NULL, .id="file_id", ...){ - feats <- read_feats(files=files, format=format, ...) +#' @describeIn read_tracks read files as subfeatures mapping onto other features +read_subfeats <- function(files, .id="file_id", format=NULL, parser=NULL, ...){ + feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...) rename(feats, feat_id=seq_id, feat_id2=seq_id2) } #' @export -#' @describeIn read_feats read files as links connecting sequences -read_links <- function(files, format=NULL, .id="file_id", ...){ - feats <- read_feats(files=files, format=format, ...) +#' @describeIn read_tracks read files as links connecting sequences +read_links <- function(files, .id="file_id", format=NULL, parser=NULL, ...){ + feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...) rename(feats, seq_id=seq_id, start=start, end=end) } #' @export -#' @describeIn read_feats read files as sublinks connecting features -read_sublinks <- function(files, format=NULL, .id="file_id", ...){ - feats <- read_feats(files=files, format=format, ...) +#' @describeIn read_tracks read files as sublinks connecting features +read_sublinks <- function(files, .id="file_id", format=NULL, parser=NULL, ...){ + feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...) rename(feats, feat_id=seq_id, start=start, end=end, feat_id2=seq_id2) } -read_format <- function(file, name, format, ...){ - inform(str_glue("* {name} [{file}]")) - exec(paste0("read_", format), file, ...) 
+#' Swap query and subject in blast-like feature tables +#' +#' Swap query and subject columns in a table read with [read_feats()] or +#' [read_links()], for example, from blast searches. Swaps columns with +#' name/name2, such as 'seq_id/seq_id2', 'start/start2', ... +#' +#' @param x tibble with query and subject columns +#' @export +#' @return tibble with swapped query/subject columns +#' @examples +#' feats <- tribble( +#' ~seq_id, ~seq_id2, ~start, ~end, ~strand, ~start2, ~end2, ~evalue, +#' "A", "B", 100, 200, "+", 10000, 10200, 1e-5 +#' ) +#' # make B the query +#' swap_query(feats) +swap_query <- function(x){ + # for every pair seq_id/seq_id2, name/name2 > name2/name + n <- names(x) + m <- str_subset(n, "\\D2") %>% str_remove("2$") %>% intersect(n) + if(!length(m)) + return(x) + + m2 <- paste0(m, "2") + i <- which(n %in% m) + i2 <- which(n %in% m2) + inform(c("Swapping query/subject-associated columns", + comma(m, collapse=' '), comma(m2, collapse=' '))) + x[c(i, i2)] <- x[c(i2, i)] + x } diff --git a/R/read_seqs.R b/R/read_seqs.R index 47d53591..ad3ae941 100644 --- a/R/read_seqs.R +++ b/R/read_seqs.R @@ -1,47 +1,24 @@ -#' Read a sequence index -#' -#' Read ID, description and length for each sequence from common formats -#' including FASTA, samtools/seqkit FASTA index files, and GFF3. Default columns -#' are seq_id, seq_desc and length. -#' -#' @importFrom readr read_tsv -#' @param file fasta or .fai/.seqkit.fai fasta index #' @export -#' @return A gggenomes-compatible sequence tibble -#' @describeIn read_seqs read seqs from files with automatic format detection +#' @describeIn read_tracks read sequence ID, description and length. 
#' @examples -#' # from a fasta file +#' # reads sequence index from a fasta file #' read_seqs(ex("emales/emales.fna")) +#' +#' #' # from samtools/seqkit style index #' read_seqs(ex("emales/emales.fna.seqkit.fai")) +#' +#' #' # from multiple gff file #' read_seqs(c(ex("emales/emales.gff"), ex("emales/emales-tirs.gff"))) -read_seqs <- function(files, format=NULL, .id="file_id", ...){ - if(any(map_lgl(files, is_connection))){ - warn("Using connections instead of paths to files can lead to unexpected behaviour") - is_connection(files) - files <- list(files) # weird things happen to pipes in vectors - } - - # infer file format from suffix - format <- (format %||% file_format_unique(files, "seqs")) - - if(format == 'ambigious'){ - abort(str_glue('Ambigious file extension(s): "', comma(unique(file_ext(files))), - '".\nPlease specify `format` explicitly')) - } - - # for unnamed files, infer name from filename (used as file_id/bin_id) - files <- file_label(files) - - # map_df .id = bin_id - inform(str_glue("Reading as {format}:")) - seqs <- map2_df(files, names(files), read_format, .id=.id, format, ...) - - seqs +read_seqs <- function(files, .id="file_id", format=NULL, parser=NULL, ...){ + read_context(files, "seqs", .id=.id, format=format, parser=parser, ...) } -#' @describeIn read_seqs read seqs from a single file in fasta, gbk or gff3 format. + +#' Read sequence index +#' +#' @describeIn read_seq_len read seqs from a single file in fasta, gbk or gff3 format. #' @export read_seq_len <- function(file, col_names = def_names("seq_len"), col_types = def_types("seq_len"), ...){ @@ -53,7 +30,7 @@ read_seq_len <- function(file, col_names = def_names("seq_len"), } -#' @describeIn read_seqs read seqs from a single file in seqkit/samtools fai format. +#' @describeIn read_seq_len read seqs from a single file in seqkit/samtools fai format. 
#' @export read_fai <- function(file, col_names=def_names("fai"), col_types=def_types("fai"), ...){ diff --git a/_pkgdown.yml b/_pkgdown.yml index dfa22a09..6d11f0be 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -34,7 +34,7 @@ reference: - starts_with("read_") - swap_query - ex - - file_formats + - def_formats - def_names - def_types - starts_with("write") @@ -51,10 +51,6 @@ reference: - in_range - width - introduce -- title: "Handle files" -- contents: - - ext_to_format - - file_exts - file_label - title: "Data sets" - contents: diff --git a/man/def_formats.Rd b/man/def_formats.Rd new file mode 100644 index 00000000..0b3e7ebe --- /dev/null +++ b/man/def_formats.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read.R +\name{def_formats} +\alias{def_formats} +\title{Defined file formats and extensions} +\usage{ +def_formats( + file = NULL, + ext = NULL, + context = NULL, + parser = NULL, + allow_na = FALSE +) +} +\arguments{ +\item{file}{a vector of file names} + +\item{ext}{a vector of file extensions} + +\item{context}{a vector of file contexts defined in +\code{gggenomes_global$def_formats}} + +\item{parser}{a vector of file parsers defined in +\code{gggenomes_global$def_formats}} +} +\value{ +dictionarish vector of file formats with recognized extensions as +names +} +\description{ +For seamless reading of different file formats, gggenomes uses a mapping of +known formats to associated file extensions and contexts in which the +different formats can be read. The notion of context allows one to read +different information from the same format/extension. For example, a gbk file +holds both feature and sequence information. If read in "feats" context +\code{read_feats("*.gbk")} it will return a feature table, if read in "seqs" +context \code{read_seqs("*.gbk")}, a sequence index. 
+} +\section{Defined formats, extensions, contexts, and parsers}{ + +\preformatted{ + format ext context parser +1 ambigious txt, tsv, csv NA read_ambigious +2 fasta fa, fas, fasta, ffn, fna, faa seqs read_seq_len +3 fai fai seqs read_fai +4 gff3 gff, gff3 feats, seqs read_gff3, read_seq_len +5 gbk gbk, gb, gbff, gpff feats, seqs read_gbk, read_seq_len +6 bed bed feats read_bed +7 blast m8, o6, o7 feats read_blast +8 paf paf feats read_paf +9 alitv json feats, seqs, links read_alitv_genes, read_alitv_seqs, read_alitv_links +} +} + +\examples{ +# vector of defined zip formats and recognized extensions as names +# format of file +def_formats("foo.fa") + +# formats associated with each extension +def_formats(ext=qc(fa, gff)) + +# all formats/extensions that can be read in seqs context; includes formats +# that are defined for context=NA, i.e. that can be read in any context. +def_formats(context="seqs") +} diff --git a/man/ext_to_format.Rd b/man/ext_to_format.Rd deleted file mode 100644 index 9f2dd8e8..00000000 --- a/man/ext_to_format.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read.R -\name{ext_to_format} -\alias{ext_to_format} -\title{File format from suffix} -\usage{ -ext_to_format(x, context) -} -\arguments{ -\item{x}{a vector of file extensions} - -\item{context}{a file format context defined in \code{\link[=file_formats]{file_formats()}}} -} -\value{ -a vector of formats with extensions as names -} -\description{ -File format from suffix -} -\examples{ -gggenomes:::ext_to_format(c("gff", "txt", "FASTA"), "feats") -} diff --git a/man/file_exts.Rd b/man/file_exts.Rd deleted file mode 100644 index bf4ff1ec..00000000 --- a/man/file_exts.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read.R -\name{file_exts} -\alias{file_exts} -\title{Defined file extensions and associated formats} -\usage{ -file_exts(context) -} -\arguments{ 
-\item{context}{a file format context defined in \code{gggenomes_global$file_formats}} -} -\value{ -vector of file extensions with formats as names -} -\description{ -Defined file extensions and associated formats -} -\examples{ -# vector of zip-context file extensions and format names -gggenomes:::file_exts("zips") -} diff --git a/man/file_formats.Rd b/man/file_formats.Rd deleted file mode 100644 index 3d6b8d39..00000000 --- a/man/file_formats.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read.R -\name{file_formats} -\alias{file_formats} -\title{Defined file formats and extensions} -\usage{ -file_formats(context) -} -\arguments{ -\item{context}{a file format context defined in \code{gggenomes_global$file_formats}} -} -\value{ -dictionarish vector of file formats with recognized extensions as names -} -\description{ -For seamless reading of different file formats, gggenomes uses a mapping of -known formats to associated file extensions and contexts in which the -different formats can be read. The notion of context allows one to read -different information from the same format/extension. For example, a gbk file -holds both feature and sequence information. If read in "feats" context -\code{read_feats("*.gbk")} it will return a feature table, if read in "seqs" -context \code{read_seqs("*.gbk")}, a sequence index. 
-} -\section{Defined contexts, formats and extensions}{ - -\preformatted{ -feats ambigious [txt,tsv,csv] - bed [bed] - blast [m8,o6,o7] - fasta [fa,fas,fasta,ffn,fna,faa] - gbk [gbk,gb,gbff] - gff3 [gff,gff3] - paf [paf] -seqs ambigious [txt,tsv,csv] - fai [fai] - seq_len [fa,fas,fasta,ffn,fna,faa,gff,gbk] -zips bz2 [bz,bz2] - gz [gz] - xz [xz] - zip [zip] -} -} - -\examples{ -# vector of defined zip formats and recognized extensions as names -file_formats("zips") -} diff --git a/man/read_alitv.Rd b/man/read_alitv.Rd index e4609196..14bc79e9 100644 --- a/man/read_alitv.Rd +++ b/man/read_alitv.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read.R +% Please edit documentation in R/read_alitv.R \name{read_alitv} \alias{read_alitv} \title{Read AliTV .json file} diff --git a/man/read_context.Rd b/man/read_context.Rd new file mode 100644 index 00000000..34e2d823 --- /dev/null +++ b/man/read_context.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read.R +\name{read_context} +\alias{read_context} +\title{Read files in different contexts} +\usage{ +read_context( + files, + context, + .id = "file_id", + format = NULL, + parser = NULL, + ... +) +} +\arguments{ +\item{files}{files to read. Should all be of the same format. In many cases, +compressed files (\code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip}) are supported. +Similarly, automatic download of remote files starting with \verb{http(s)://} or +\verb{ftp(s)://} works in most cases.} + +\item{context}{the context ("seqs", "feats", "links") in which a given format +should be read.} + +\item{.id}{the column with the name of the file a record was read from. +Defaults to "file_id". Set to "bin_id" if every file represents a different +bin.} + +\item{format}{specify a format known to gggenomes, such as \code{gff3}, \code{gbk}, ... 
+to override automatic determination based on the file extension (see +\code{\link[=def_formats]{def_formats()}} for full list).} + +\item{parser}{specify the name of an R function to override automatic +determination based on format, e.g. \code{parser="read_tsv"}.} + +\item{...}{additional arguments passed on to the format-specific read +function called down the line.} +} +\description{ +Powers \code{\link[=read_seqs]{read_seqs()}}, \code{\link[=read_feats]{read_feats()}}, \code{\link[=read_links]{read_links()}} +} +\section{Functions}{ +\itemize{ +\item \code{read_context}: internal backend that reads files in a given context +}} + diff --git a/man/read_feats.Rd b/man/read_feats.Rd deleted file mode 100644 index 5faeab4b..00000000 --- a/man/read_feats.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read_feats.R -\name{read_feats} -\alias{read_feats} -\alias{read_subfeats} -\alias{read_links} -\alias{read_sublinks} -\title{Read features and links from common file formats} -\usage{ -read_feats(files, format = NULL, .id = "file_id", ...) - -read_subfeats(files, format = NULL, .id = "file_id", ...) - -read_links(files, format = NULL, .id = "file_id", ...) - -read_sublinks(files, format = NULL, .id = "file_id", ...) -} -\arguments{ -\item{files}{files to reads. Should all be of same format.} - -\item{format}{If NULL, guess from file extension. Else, any format known to -gggenomes (gff3, gbk, ... see \code{\link[=file_formats]{file_formats()}} for full list) or any suffix -of a known \verb{read_} function, e.g. tsv for \code{readr::read_tsv()}.} - -\item{.id}{the name of the column storing the file name each record came -from. Defaults to "file_id". 
Set to "bin_id" if every file represents a -different bin.} - -\item{...}{additional arguments passed on to the format-specific read -function called down the line.} -} -\value{ -A gggenomes-compatible feature or link tibble -} -\description{ -Read features or links from common formats, such as GFF3, Genbank, BED, BLAST -tabular output or PAF files. File formats and the format-specific \verb{read_*()} -function are automatically determined based in file extensions, if possible. -Can read multiple files in the same format into a single table: useful, for -example, to read a folder of gff-files with each containing genes of a -different genome. -} -\section{Functions}{ -\itemize{ -\item \code{read_feats}: read files as features mapping onto sequences - -\item \code{read_subfeats}: read files as subfeatures mapping onto other features - -\item \code{read_links}: read files as links connecting sequences - -\item \code{read_sublinks}: read files as sublinks connecting features -}} - -\examples{ -# read a file -read_feats(ex("eden-utr.gff")) - -# read all gffs from a directory -read_feats(list.files(ex("emales/"), "*.gff$", full.names=TRUE)) - -\dontrun{ -# read remote files -gbk_phages <- c( - PSSP7 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/858/745/GCF_000858745.1_ViralProj15134/GCF_000858745.1_ViralProj15134_genomic.gff.gz", - PSSP3 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/904/555/GCF_000904555.1_ViralProj195517/GCF_000904555.1_ViralProj195517_genomic.gff.gz") -read_feats(gbk_phages) -} -} diff --git a/man/read_seq_len.Rd b/man/read_seq_len.Rd new file mode 100644 index 00000000..b5273081 --- /dev/null +++ b/man/read_seq_len.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_seqs.R +\name{read_seq_len} +\alias{read_seq_len} +\alias{read_fai} +\title{Read sequence index} +\usage{ +read_seq_len( + file, + col_names = def_names("seq_len"), + col_types = def_types("seq_len"), + ... 
+) + +read_fai(file, col_names = def_names("fai"), col_types = def_types("fai"), ...) +} +\description{ +Read sequence index +} +\section{Functions}{ +\itemize{ +\item \code{read_seq_len}: read seqs from a single file in fasta, gbk or gff3 format. + +\item \code{read_fai}: read seqs from a single file in seqkit/samtools fai format. +}} + diff --git a/man/read_seqs.Rd b/man/read_seqs.Rd deleted file mode 100644 index 666203db..00000000 --- a/man/read_seqs.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read_seqs.R -\name{read_seqs} -\alias{read_seqs} -\alias{read_seq_len} -\alias{read_fai} -\title{Read a sequence index} -\usage{ -read_seqs(files, format = NULL, .id = "file_id", ...) - -read_seq_len( - file, - col_names = def_names("seq_len"), - col_types = def_types("seq_len"), - ... -) - -read_fai(file, col_names = def_names("fai"), col_types = def_types("fai"), ...) -} -\arguments{ -\item{file}{fasta or .fai/.seqkit.fai fasta index} -} -\value{ -A gggenomes-compatible sequence tibble -} -\description{ -Read ID, description and length for each sequence from common formats -including FASTA, samtools/seqkit FASTA index files, and GFF3. Default columns -are seq_id, seq_desc and length. -} -\section{Functions}{ -\itemize{ -\item \code{read_seqs}: read seqs from files with automatic format detection - -\item \code{read_seq_len}: read seqs from a single file in fasta, gbk or gff3 format. - -\item \code{read_fai}: read seqs from a single file in seqkit/samtools fai format. 
-}} - -\examples{ -# from a fasta file -read_seqs(ex("emales/emales.fna")) -# from samtools/seqkit style index -read_seqs(ex("emales/emales.fna.seqkit.fai")) -# from multiple gff file -read_seqs(c(ex("emales/emales.gff"), ex("emales/emales-tirs.gff"))) -} diff --git a/man/read_tracks.Rd b/man/read_tracks.Rd new file mode 100644 index 00000000..afb9e0a7 --- /dev/null +++ b/man/read_tracks.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read.R, R/read_feats.R, R/read_seqs.R +\name{read_tracks} +\alias{read_tracks} +\alias{read_feats} +\alias{read_subfeats} +\alias{read_links} +\alias{read_sublinks} +\alias{read_seqs} +\title{Read files in various formats into track tables} +\usage{ +read_feats(files, .id = "file_id", format = NULL, parser = NULL, ...) + +read_subfeats(files, .id = "file_id", format = NULL, parser = NULL, ...) + +read_links(files, .id = "file_id", format = NULL, parser = NULL, ...) + +read_sublinks(files, .id = "file_id", format = NULL, parser = NULL, ...) + +read_seqs(files, .id = "file_id", format = NULL, parser = NULL, ...) +} +\arguments{ +\item{files}{files to read. Should all be of the same format. In many cases, +compressed files (\code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip}) are supported. +Similarly, automatic download of remote files starting with \verb{http(s)://} or +\verb{ftp(s)://} works in most cases.} + +\item{.id}{the column with the name of the file a record was read from. +Defaults to "file_id". Set to "bin_id" if every file represents a different +bin.} + +\item{format}{specify a format known to gggenomes, such as \code{gff3}, \code{gbk}, ... +to override automatic determination based on the file extension (see +\code{\link[=def_formats]{def_formats()}} for full list).} + +\item{parser}{specify the name of an R function to override automatic +determination based on format, e.g. 
\code{parser="read_tsv"}.} + +\item{...}{additional arguments passed on to the format-specific read +function called down the line.} +} +\value{ +A gggenomes-compatible sequence, feature or link tibble +} +\description{ +Convenience functions to read sequences, features or links from various +bioinformatics file formats, such as FASTA, GFF3, Genbank, BLAST tabular +output, etc. See \code{\link[=def_formats]{def_formats()}} for full list. File formats and the +corresponding read-functions are automatically determined based on file +extensions. All these functions can read multiple files in the same format at +once, and combine them into a single table - useful, for example, to read a +folder of gff-files with each file containing genes of a different genome. +} +\section{Functions}{ +\itemize{ +\item \code{read_feats}: read files as features mapping onto +sequences. + +\item \code{read_subfeats}: read files as subfeatures mapping onto other features + +\item \code{read_links}: read files as links connecting sequences + +\item \code{read_sublinks}: read files as sublinks connecting features + +\item \code{read_seqs}: read sequence ID, description and length. 
+}} + +\examples{ +# read genes/features from a gff file +read_feats(ex("eden-utr.gff")) + + +# read all gffs from a directory +read_feats(list.files(ex("emales/"), "*.gff$", full.names=TRUE)) + + +# read remote files +\dontrun{ +gbk_phages <- c( + PSSP7 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/858/745/GCF_000858745.1_ViralProj15134/GCF_000858745.1_ViralProj15134_genomic.gff.gz", + PSSP3 = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/904/555/GCF_000904555.1_ViralProj195517/GCF_000904555.1_ViralProj195517_genomic.gff.gz") +read_feats(gbk_phages) +} + + +# read sequence index from a fasta file +read_seqs(ex("emales/emales.fna")) + + +# from samtools/seqkit style index +read_seqs(ex("emales/emales.fna.seqkit.fai")) + + +# from multiple gff files +read_seqs(c(ex("emales/emales.gff"), ex("emales/emales-tirs.gff"))) +} diff --git a/man/swap_query.Rd b/man/swap_query.Rd index ed3d0873..f61acbce 100644 --- a/man/swap_query.Rd +++ b/man/swap_query.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read.R +% Please edit documentation in R/read_feats.R \name{swap_query} \alias{swap_query} \title{Swap query and subject in blast-like feature tables} From 9f4ca1ac0160f96db6b3d02383ea7c8b358bf719 Mon Sep 17 00:00:00 2001 From: Markus Ankenbrand Date: Mon, 8 Feb 2021 16:03:42 +0100 Subject: [PATCH 2/4] Fix context for links --- R/read_feats.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/read_feats.R b/R/read_feats.R index 2a637e25..0fb43de9 100644 --- a/R/read_feats.R +++ b/R/read_feats.R @@ -33,14 +33,14 @@ read_subfeats <- function(files, .id="file_id", format=NULL, parser=NULL, ...){ #' @export #' @describeIn read_tracks read files as links connecting sequences read_links <- function(files, .id="file_id", format=NULL, parser=NULL, ...){ - feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...) 
+ feats <- read_context(files, "links", .id=.id, format=format, parser=parser, ...) rename(feats, seq_id=seq_id, start=start, end=end) } #' @export #' @describeIn read_tracks read files as sublinks connecting features read_sublinks <- function(files, .id="file_id", format=NULL, parser=NULL, ...){ - feats <- read_context(files, "feats", .id=.id, format=format, parser=parser, ...) + feats <- read_context(files, "links", .id=.id, format=format, parser=parser, ...) rename(feats, feat_id=seq_id, start=start, end=end, feat_id2=seq_id2) } From f6b965d0ae700e4743dfef2ac37d7adea3959f39 Mon Sep 17 00:00:00 2001 From: Markus Ankenbrand Date: Mon, 8 Feb 2021 16:04:04 +0100 Subject: [PATCH 3/4] Fix column names for links from alitv --- R/read_alitv.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/read_alitv.R b/R/read_alitv.R index d456134e..a8418e1b 100644 --- a/R/read_alitv.R +++ b/R/read_alitv.R @@ -39,9 +39,9 @@ read_alitv <- function(file){ left_join(link_pos, by=c("source"="id")) %>% left_join(link_pos, by=c("target"="id")) %>% transmute( - seq_id1=karyo.x, - start1=start.x, - end1=end.x, + seq_id=karyo.x, + start=start.x, + end=end.x, seq_id2=karyo.y, start2=start.y, end2=end.y, From 415c501e0abf4832c8aa05379c8689d41d8d1fc0 Mon Sep 17 00:00:00 2001 From: Markus Ankenbrand Date: Mon, 8 Feb 2021 17:03:47 +0100 Subject: [PATCH 4/4] Fix alitv example --- R/read_alitv.R | 2 +- man/read_alitv.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/read_alitv.R b/R/read_alitv.R index a8418e1b..4e10464f 100644 --- a/R/read_alitv.R +++ b/R/read_alitv.R @@ -21,7 +21,7 @@ #' geom_gene(aes(color=class)) + #' geom_link(aes(fill=identity)) + #' scale_fill_distiller(palette="RdYlGn", direction = 1) -#' p %>% flip_seq("Same_gi") %>% pick(1,3,2,4,5,6,7,8) +#' p %>% flip_seqs(5) %>% pick_seqs(1,3,2,4,5,6,7,8) read_alitv <- function(file){ ali <- jsonlite::fromJSON(file, simplifyDataFrame=TRUE) seqs <- tibble(seq = ali$data$karyo$chromosome) %>% 
diff --git a/man/read_alitv.Rd b/man/read_alitv.Rd index 14bc79e9..5a0865a5 100644 --- a/man/read_alitv.Rd +++ b/man/read_alitv.Rd @@ -28,5 +28,5 @@ p <- gggenomes(ali$seqs, ali$genes, links=ali$links) + geom_gene(aes(color=class)) + geom_link(aes(fill=identity)) + scale_fill_distiller(palette="RdYlGn", direction = 1) -p \%>\% flip_seq("Same_gi") \%>\% pick(1,3,2,4,5,6,7,8) +p \%>\% flip_seqs(5) \%>\% pick_seqs(1,3,2,4,5,6,7,8) }