R/ncbi_searcher.R

#' Search for gene sequences available for taxa from NCBI.
#'
#' @export
#' @template ncbi
#' @param id (\code{character}) Taxonomic id to search for. Not compatible with
#'    argument \code{taxa}.
#' @param limit (\code{numeric}) Number of sequences to search for and return.
#' Max of 10,000. If you search for 6000 records, and only 5000 are found,
#' you will of course only get 5000 back.
#' @param entrez_query (\code{character}; length 1) An Entrez-format query to
#' filter results with. This is useful to search for sequences with specific
#' characteristics. The format is the same as the one used to seach genbank.
#' (\url{https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Entrez_Searching_Options})
#' @param fuzzy (logical) Whether to do fuzzy taxonomic ID search or exact
#' search. If \code{TRUE}, we use \code{xXarbitraryXx[porgn:__txid<ID>]},
#' but if \code{FALSE}, we use \code{txid<ID>}. Default: \code{FALSE}
#' @param hypothetical (\code{logical}; length 1) If \code{FALSE}, an attempt
#' will be made to not return hypothetical or predicted sequences judging from
#' accession number prefixs (XM and XR). This can result in less than the
#' \code{limit} being returned even if there are more sequences available,
#' since this filtering is done after searching NCBI.
#' @param sleep (integer) number of seconds to sleep before each HTTP request.
#' use if running to 429 Too Many Requests errors from NCBI. default: 0 
#' (no sleep)
#' @return \code{data.frame} of results if a single input is given. A list of
#'  \code{data.frame}s if multiple inputs are given.
#' @seealso \code{\link{ncbi_byid}}, \code{\link{ncbi_byname}}
#' @author Scott Chamberlain, Zachary Foster \email{zacharyfoster1989@@gmail.com}
#' @section Authentication:
#' NCBI rate limits requests. If you set an API key you have a higher rate limit. 
#' Set your API key like `Sys.setenv(ENTREZ_KEY="yourkey")` or you can use 
#' `?rentrez::set_entrez_key`. set verbose curl output (`crul::set_verbose()`) to
#' make sure your api key is being sent in the requests
#' @examples \dontrun{
#' # A single species
#' out <- ncbi_searcher(taxa="Umbra limi", seqrange = "1:2000")
#' # Get the same species information using a taxonomy id
#' out <- ncbi_searcher(id = "75935", seqrange = "1:2000")
#' # If the taxon name is unique, using the taxon name and id are equivalent
#' all(ncbi_searcher(id = "75935") ==  ncbi_searcher(taxa="Umbra limi"))
#' # If the taxon name is not unique, use taxon id
#' #  "266948" is the uid for the butterfly genus, but there is also a genus
#' #  of orchids with the
#' #  same name
#' nrow(ncbi_searcher(id = "266948")) ==  nrow(ncbi_searcher(taxa="Satyrium"))
#' # get list of genes available, removing non-unique
#' unique(out$gene_desc)
#' # does the string 'RAG1' exist in any of the gene names
#' out[grep("RAG1", out$gene_desc, ignore.case=TRUE),]
#'
#' # A single species without records in NCBI
#' out <- ncbi_searcher(taxa="Sequoia wellingtonia", seqrange="1:2000",
#'   getrelated=TRUE)
#'
#' # Many species, can run in parallel or not using plyr
#' species <- c("Salvelinus alpinus","Ictalurus nebulosus","Carassius auratus")
#' out2 <- ncbi_searcher(taxa=species, seqrange = "1:2000")
#' lapply(out2, head)
#' library("plyr")
#' out2df <- ldply(out2) # make data.frame of all
#' unique(out2df$gene_desc) # get list of genes available, removing non-unique
#' out2df[grep("12S", out2df$gene_desc, ignore.case=TRUE), ]
#'
#' # Using the getrelated and entrez_query options
#' ncbi_searcher(taxa = "Olpidiopsidales", limit = 5, getrelated = TRUE,
#'             entrez_query = "18S[title] AND 28S[title]")
#'
#' # get refseqs
#' one <- ncbi_searcher(taxa = "Salmonella enterica",
#'   entrez_query="srcdb_refseq[PROP]")
#' two <- ncbi_searcher(taxa = "Salmonella enterica")
#' }
ncbi_searcher <- function(taxa = NULL, id = NULL, seqrange="1:3000",
  getrelated=FALSE, fuzzy=FALSE, limit = 500, entrez_query = NULL,
  hypothetical = FALSE, verbose=TRUE, sleep=0L) {

  cat(paste0("using sleep: ", sleep), sep="\n")

  # Argument validation ----------------------------------------------------------------------------
  if (sum(c(is.null(taxa), is.null(id))) != 1) {
    stop("Either taxa or id must be specified, but not both")
  }

  # Convert 'taxa' to 'id' if 'taxa' is supplied ---------------------------------------------------
  if (is.null(id)) {
    id <- get_uid(taxa, messages = verbose)
    names(id) <- taxa
  } else {
    id <- as.character(id)
    class(id) <- "uid"
    names(id) <- id
  }

  if (getrelated) fuzzy <- TRUE
  # look up sequences for taxa ids -----------------------------------------------------------------
  if (length(id) == 1) {
    ncbi_searcher_foo(id, getrelated = getrelated, verbose = verbose,
      seqrange = seqrange, entrez_query = entrez_query, fuzzy = fuzzy,
      limit = limit, hypothetical = hypothetical, sleep=sleep)
  } else {
    lapply(id, ncbi_searcher_foo, getrelated = getrelated, verbose = verbose,
      seqrange = seqrange, entrez_query = entrez_query, fuzzy = fuzzy, limit = limit,
      hypothetical = hypothetical, sleep=sleep)
  }
}

# Constants --------------------------------------------------------------------------------------
url_esearch <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
url_esummary <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

# Function to process queries one at a time ------------------------------------------------------
ncbi_searcher_foo <- function(xx, getrelated, verbose, seqrange, entrez_query,
  fuzzy, limit, hypothetical, sleep, ...) {

  # Search for sequence IDs for the given taxon  - - - - - - - - - - - - - - - - - - - - - - - - -
  mssg(verbose, paste("Working on ", names(xx), "...", sep = ""))
  mssg(verbose, "...retrieving sequence IDs...")
  seq_ids <- search_for_sequences(xx, seqrange, entrez_query, fuzzy, limit, sleep, ...)
  # Search for sequences of the taxons parent if necessary and possible  - - - - - - - - - - - - -
  if (is.null(seq_ids) && getrelated) {
    mssg(verbose, paste("no sequences for ", names(xx), " - getting other related taxa", sep = ""))
    parent_id <- get_parent(xx, verbose, sleep)
    if (is.na(parent_id)) {
      mssg(verbose, paste0("no related taxa found"))
    } else {
      mssg(verbose, paste0("...retrieving sequence IDs for ", names(xx), "..."))
      seq_ids <- search_for_sequences(parent_id, seqrange, entrez_query, fuzzy,
        limit, sleep, ...)
    }
  }
  # Retrieve sequence information  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (is.null(seq_ids)) {
    mssg(verbose, "no sequences found")
    df <- data.frame(character(0), numeric(0), character(0), character(0), numeric(0), stringsAsFactors = FALSE)
  } else {
    mssg(verbose, "...retrieving available genes and their lengths...")
    df <- download_summary(seq_ids, hypothetical=hypothetical, sleep=sleep, ...)
    mssg(verbose, "...done.")
  }
  # Format output  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  stats::setNames(df, c("taxon", "length", "gene_desc", "acc_no", "gi_no"))
}

# Function to search for sequences with esearch --------------------------------------------------
search_for_sequences <- function(id, seqrange, entrez_query, fuzzy, limit, sleep=0L, ...) {
  if (is.na(id)) return(NULL)
  # Construct search query  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ## fuzzy or not fuzzy
  query_term <- if (fuzzy) {
    sprintf("xXarbitraryXx[porgn:__txid%s] AND %s[SLEN]", id, seqrange)
  } else {
    sprintf("txid%s AND %s[SLEN]", id, seqrange)
  }
  if (!is.null(entrez_query)) query_term <- paste(query_term, entrez_query, sep = " AND ")
  query <- list(db = "nuccore", retmax = limit, term = query_term, api_key = ncbi_key())
  # Submit query to NCBI - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  con <- crul::HttpClient$new(url_esearch, opts = list(...))
  Sys.sleep(sleep)
  res <- con$get(query = traitsc(query))
  res$raise_for_status()
  # Parse result - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  query_content <- xml2::read_xml(res$parse("UTF-8"))
  esearch_result <- xml2::xml_find_all(query_content, "//eSearchResult")[[1]]
  if (as.numeric(xml2::xml_text(xml2::xml_find_all(esearch_result, "//Count")[[1]])) == 0) {
    return(NULL)
  } else {
    return(xml2::xml_text(xml2::xml_find_all(esearch_result, "//IdList//Id"))) # a list of sequence ids
  }
}

# Function to download and parse sequence summary information using esummary ---------------------
download_summary <- function(seq_id, hypothetical, sleep=0L, ...) {
  actualnum <- length(seq_id)
  if (actualnum > 10000) {
    q <- list(db = "nucleotide")
    getstart <- seq(from = 1, to = actualnum, by = 10000)
    getnum <- c(rep(10000, length(getstart) - 1), actualnum - sum(rep(10000, length(getstart) - 1)))
    iterlist = list()
    for (i in seq_along(getstart)) {
      q$id = paste(seq_id[getstart[i]:(getstart[i] + (getnum[i] - 1))], collapse = " ")
      q$retstart <- getstart[i]
      q$retmax <- getnum[i]
      con <- crul::HttpClient$new(url_esummary, opts = list(...))
      Sys.sleep(sleep)
      res <- con$post(body = traitsc(q))
      res$raise_for_status()
      iterlist[[i]] <- parseres(res, hypothetical, sleep)
    }
    data.frame(rbindlist(iterlist), stringsAsFactors = FALSE)
  } else {
    body <- list(db = "nucleotide", api_key = ncbi_key(), id = paste(seq_id, collapse = " "))
    con <- crul::HttpClient$new(url_esummary, opts = list(...))
    Sys.sleep(sleep)
    res <- con$post(body = traitsc(body))
    res$raise_for_status()
    parseres(res, hypothetical, sleep)
  }
}

# Function to get a taxon's parent ---------------------------------------------------------------
get_parent <- function(id, verbose, sleep=0L) {
  if (!is.na(id)) {
    Sys.sleep(sleep)
    ancestry <- classification(id = id, db = "ncbi")[[1]]
    if (nrow(ancestry) > 1) {
      parent_name <- ancestry$name[nrow(ancestry) - 1]
      Sys.sleep(sleep)
      return(get_uid(parent_name, messages = verbose))
    }
  }
  if (!is.null(names(id)) && grepl(" ", names(id))) { #if a name is given and looks like a species
    parent_name <- strsplit(names(id), " ")[[1]][[1]]
    Sys.sleep(sleep)
    return(get_uid(parent_name, messages = verbose))
  }
  return(NA)
}

# Function to parse results from http query ------------------------------------------------------
parseres <- function(x, hypothetical, sleep){
  outsum <- xml2::xml_find_all(xml2::read_xml(x$parse("UTF-8")), "//eSummaryResult")[[1]]
  names <- xml2::xml_attr(xml2::xml_find_all(outsum, "//Item"), attr = "Name") # gets names of values in summary
  predicted <- as.character(xml2::xml_text(xml2::xml_find_all(outsum, "//Item"))[grepl("Caption", names)]) #  get access numbers
  has_access_prefix <- grepl("_", predicted)
  access_prefix <- unlist(Map(function(x, y) ifelse(x, strsplit(y, "_")[[1]][[1]], NA),
                              has_access_prefix, predicted))
  length_ <- as.numeric(xml2::xml_text(xml2::xml_find_all(outsum, "//Item"))[grepl("Length", names)]) # gets seq lengths
  gis <- as.numeric(xml2::xml_text(xml2::xml_find_all(outsum, "//Item"))[grepl("Gi", names)]) # gets GI numbers
  spused <- taxonomy(outsum, sleep)
  desc <- xml2::xml_text(xml2::xml_find_all(outsum, "//Item"))[grepl("Title", names)] # gets seq lengths # get spp names
  df <- data.frame(spused = spused, length = length_, genesavail = desc, access_num = predicted, ids = gis, stringsAsFactors = FALSE)
  if (!hypothetical) df <- df[!(access_prefix %in% c("XM","XR")), ]
  return(df)
}

taxonomy <- function(zz, sleep=0L) {
  taxids <- xml2::xml_text(xml2::xml_find_all(zz, '//Item[@Name="TaxId"]'))
  uids <- unique(taxids)
  out <- list()
  for (i in seq_along(uids)) {
    con <- crul::HttpClient$new(url_esummary)
    Sys.sleep(sleep)
    res <- con$get(query = traitsc(list(db="taxonomy", id=uids[i], api_key=ncbi_key())))
    res$raise_for_status()
    xml <- xml2::read_xml(res$parse("UTF-8"))
    out[[ uids[i] ]] <- xml2::xml_text(xml2::xml_find_all(xml, '//Item[@Name="ScientificName"]'))
  }
  for (i in seq_along(out)) {
    if (length(out[[i]]) == 0) {
      taxids[grepl(names(out)[i], taxids)] <- NA_character_
    } else {
      taxids[grepl(names(out)[i], taxids)] <- out[[i]]
    }
  }
  return(taxids)
}

ncbi_key <- function() Sys.getenv("ENTREZ_KEY", "") %||% NULL