refactor: rename columns (issue #62)

- Rename columns `compound_name` to `name`, `inchi_key` to `inchikey` and `mass` to `exactmass`.
- Rename `CompoundNameFilter` to `NameFilter` (filter field `compound_name` becomes `name`).
rformassspectrometry · Sep 25, 2020 · 326e3a2 · 326e3a2
1 parent c1b8ff9
commit 326e3a2
Show file tree

Hide file tree

Showing 21 changed files with 164 additions and 156 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: CompoundDb
 Type: Package
 Title: Creating and using (Chemical) Compound Annotation Databases
-Version: 0.5.0
+Version: 0.6.0
 Authors@R: c(person(given = "Jan", family = "Stanstrup",
  email = "stanstrup@gmail.com",
  role = c("aut"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,10 +2,10 @@
 
 export(CompDb)
 export(CompoundIdFilter)
-export(CompoundNameFilter)
 export(MsBackendCompDb)
 export(MsmsMzRangeMaxFilter)
 export(MsmsMzRangeMinFilter)
+export(NameFilter)
 export(adducts)
 export(compound_tbl_lipidblast)
 export(compound_tbl_sdf)
@@ -25,10 +25,10 @@ export(src_compdb)
 export(tables)
 exportClasses(CompDb)
 exportClasses(CompoundIdFilter)
-exportClasses(CompoundNameFilter)
 exportClasses(MsBackendCompDb)
 exportClasses(MsmsMzRangeMaxFilter)
 exportClasses(MsmsMzRangeMinFilter)
+exportClasses(NameFilter)
 exportMethods("$<-")
 exportMethods("intensity<-")
 exportMethods("mz<-")

diff --git a/R/AnnotationFilters.R b/R/AnnotationFilters.R
@@ -9,7 +9,7 @@
 #'
 #' The supported filters are:
 #' - `CompoundIdFilter`: filter based on the compound ID.
-#' - `CompoundNameFilter`: filter based on the compound name.
+#' - `NameFilter`: filter based on the compound name.
 #' - `MsmsMzRangeMinFilter`: retrieve entries based on the smallest m/z of all
 #' peaks of their MS/MS spectra. Requires that MS/MS spectra data are present
 #' (i.e. `hasMsMsSpectra(cmp_db)` returns `TRUE`).
@@ -43,10 +43,10 @@
 #' AnnotationFilter(~ compound_id == "comp_b")
 #'
 #' ## Combine filters
-#' AnnotationFilterList(CompoundIdFilter("a"), CompoundNameFilter("b"))
+#' AnnotationFilterList(CompoundIdFilter("a"), NameFilter("b"))
 #'
 #' ## Using a formula expression
-#' AnnotationFilter(~ compound_id == "a" | compound_name != "b")
+#' AnnotationFilter(~ compound_id == "a" | name != "b")
 NULL
 
 #' @importClassesFrom AnnotationFilter CharacterFilter AnnotationFilter
@@ -67,20 +67,20 @@ CompoundIdFilter <- function(value, condition = "==") {
  new("CompoundIdFilter", value = as.character(value), condition = condition)
 }
 
-#' @exportClass CompoundNameFilter
+#' @exportClass NameFilter
 #'
 #' @rdname Filter-classes
-setClass("CompoundNameFilter", contains = "CharacterFilter",
+setClass("NameFilter", contains = "CharacterFilter",
  prototype = list(
  condition = "==",
  value = "",
- field = "compound_name"
+ field = "name"
  ))
-#' @export CompoundNameFilter
+#' @export NameFilter
 #'
 #' @rdname Filter-classes
-CompoundNameFilter <- function(value, condition = "==") {
- new("CompoundNameFilter", value = as.character(value),
+NameFilter <- function(value, condition = "==") {
+ new("NameFilter", value = as.character(value),
  condition = condition)
 }
 
@@ -300,9 +300,9 @@ MsmsMzRangeMaxFilter <- function(value, condition = "<=") {
 #' @noRd
 .supported_filters <- function(x) {
  df <- data.frame(filter = c("CompoundIdFilter",
- "CompoundNameFilter"),
+ "NameFilter"),
  field = c("compound_id",
- "compound_name"),
+ "name"),
  stringsAsFactors = FALSE)
  if (!missing(x) && .has_msms_spectra(x)) {
  df <- rbind(df,

diff --git a/R/CompDb.R b/R/CompDb.R
@@ -59,11 +59,11 @@
 #' to the database with parameter `x`.
 #'
 #' For all other methods: a `CompDb` object.
-#' 
+#'
 #' @param flags flags passed to the SQLite database connection.
 #' See [SQLite()]. Defaults to read-only, i.e. RSQLite::SQLITE_RO.
-#' 
-#' 
+#'
+#'
 #' @author Johannes Rainer
 #'
 #' @md
@@ -100,12 +100,12 @@
 #' tables(cmp_db)
 #'
 #' ## Extract a data.frame with the id, name and inchi of all compounds
-#' compounds(cmp_db, columns = c("compound_id", "compound_name", "inchi"))
+#' compounds(cmp_db, columns = c("compound_id", "name", "inchi"))
 #'
 #' ## Add also the synonyms (aliases) for the compounds. This will cause the
 #' ## tables compound and synonym to be joined. The elements of the compound_id
-#' ## and compound_name are now no longer unique
-#' res <- compounds(cmp_db, columns = c("compound_id", "compound_name", "synonym"))
+#' ## and name are now no longer unique
+#' res <- compounds(cmp_db, columns = c("compound_id", "name", "synonym"))
 #' head(res)
 #'
 #' ## Extract spectra for a specific HMDB compound.
@@ -124,7 +124,7 @@
 #' cmp_tbl <- tbl(src_cmp, "compound")
 #'
 #' ## Extract the id, name and inchi
-#' cmp_tbl %>% select(compound_id, compound_name, inchi) %>% collect()
+#' cmp_tbl %>% select(compound_id, name, inchi) %>% collect()
 NULL
 
 #' @importFrom methods new

diff --git a/R/createCompDbPackage.R b/R/createCompDbPackage.R
@@ -12,7 +12,7 @@
 #'
 #' @details
 #'
-#' Column `"compound_name"` reports for HMDB files the `"GENERIC_NAME"`, for
+#' Column `"name"` reports for HMDB files the `"GENERIC_NAME"`, for
 #' ChEBI the `"ChEBI Name"`, for PubChem the `"PUBCHEM_IUPAC_TRADITIONAL_NAME"`,
 #' and for Lipid Maps the `"COMMON_NAME"`, if that is
 #' not available, the first of the compounds synonyms and, if that is also not
@@ -27,7 +27,7 @@
 #' defined (i.e. not all entries have an InChI ID or other means to uniquely
 #' identify compounds). Thus, the function returns a highly redundant compound
 #' table. Feedback on how to reduce this redundancy would be highly welcome!
-#' 
+#'
 #' LIPID MAPS was tested August 2020. Older SDF files might not work as the field names were changed.
 #'
 #' @param file `character(1)` with the name of the SDF file.
@@ -38,11 +38,11 @@
 #' @return A [tibble::tibble] with general compound information (one row per
 #' compound):
 #' + `compound_id`: the ID of the compound.
-#' + `compound_name`: the compound's name.
+#' + `name`: the compound's name.
 #' + `inchi`: the InChI of the compound.
 #' + `inchikey`: the InChI key.
 #' + `formula`: the chemical formula.
-#' + `mass`: the compound's mass.
+#' + `exactmass`: the compound's (monoisotopic exact) mass.
 #' + `synonyms`: the compound's synonyms (aliases). The type of this column is
 #' by default a `list` to support multiple aliases per compound, unless
 #' argument `collapse` is provided, in which case multiple synonyms are pasted
@@ -106,11 +106,11 @@ compound_tbl_sdf <- function(file, collapse) {
 #' @return A [tibble::tibble] with general compound information (one row per
 #' compound):
 #' + `compound_id`: the ID of the compound.
-#' + `compound_name`: the compound's name.
+#' + `name`: the compound's name.
 #' + `inchi`: the InChI of the compound.
 #' + `inchikey`: the InChI key.
 #' + `formula`: the chemical formula.
-#' + `mass`: the compound's mass.
+#' + `exactmass`: the compound's (exact) mass.
 #' + `synonyms`: the compound's synonyms (aliases). The type of this column is
 #' by default a `list` to support multiple aliases per compound, unless
 #' argument `collapse` is provided, in which case multiple synonyms are pasted
@@ -145,11 +145,11 @@ compound_tbl_lipidblast <- function(file, collapse) {
 #' @description
 #'
 #' Internal function to extract compound information from a file in SDF format.
-#' 
+#'
 #' @param x what is returned by datablock2ma(datablock(read.SDFset)).
 #'
-#' @return A [tibble::tibble] with columns `"compound_id"`, `"compound_name"`,
-#' `"inchi"`, `"formula"`, `"mass"`.
+#' @return A [tibble::tibble] with columns `"compound_id"`, `"name"`,
+#' `"inchi"`, `"formula"`, `"exactmass"`.
 #'
 #' @note
 #' LIPID MAPS was tested August 2020. Older SDF files might not work as the field names were changed.
@@ -180,11 +180,11 @@ compound_tbl_lipidblast <- function(file, collapse) {
  nms[nas] <- x[nas, "SYSTEMATIC_NAME"]
  }
  res <- data_frame(compound_id = x[, colmap["id"]],
- compound_name = nms,
+ name = nms,
  inchi = x[, colmap["inchi"]],
- inchi_key = x[, colmap["inchi_key"]],
+ inchikey = x[, colmap["inchikey"]],
  formula = x[, colmap["formula"]],
- mass = as.numeric(x[, colmap["mass"]]),
+ exactmass = as.numeric(x[, colmap["exactmass"]]),
  synonyms = syns
  )
  if (is.na(colmap["smiles"])) {
@@ -209,7 +209,7 @@ compound_tbl_lipidblast <- function(file, collapse) {
 #'
 #' Based on the provided `colnames` guess whether the file is from HMDB,
 #' ChEBI, LIPID MAPS, PubChem or LipidBlast.
-#' 
+#'
 #'
 #' @param x `character` with the column names of the data table.
 #'
@@ -243,39 +243,39 @@ compound_tbl_lipidblast <- function(file, collapse) {
 .hmdb_colmap <- c(id = "HMDB_ID",
  name = "GENERIC_NAME",
  inchi = "INCHI_IDENTIFIER",
- inchi_key = "INCHI_KEY",
+ inchikey = "INCHI_KEY",
  formula = "FORMULA",
- mass = "EXACT_MASS",
+ exactmass = "EXACT_MASS",
  synonyms = "SYNONYMS",
  smiles = "SMILES"
  )
 .hmdb_separator <- "; "
 .chebi_colmap <- c(id = "ChEBI ID",
  name = "ChEBI Name",
  inchi = "InChI",
- inchi_key = "InChIKey",
+ inchikey = "InChIKey",
  formula = "Formulae",
- mass = "Monoisotopic Mass",
+ exactmass = "Monoisotopic Mass",
  synonyms = "Synonyms",
  smiles = "SMILES"
  )
 .chebi_separator <- " __ "
 .lipidmaps_colmap <- c(id = "LM_ID",
  name = "NAME",
  inchi = "INCHI",
- inchi_key = "INCHI_KEY",
+ inchikey = "INCHI_KEY",
  formula = "FORMULA",
- mass = "EXACT_MASS",
+ exactmass = "EXACT_MASS",
  synonyms = "SYNONYMS",
  smiles = NA
  )
 .lipidmaps_separator <- "; "
 .pubchem_colmap <- c(id = "PUBCHEM_COMPOUND_CID",
  name = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
  inchi = "PUBCHEM_IUPAC_INCHI",
- inchi_key = "PUBCHEM_IUPAC_INCHIKEY",
+ inchikey = "PUBCHEM_IUPAC_INCHIKEY",
  formula = "PUBCHEM_MOLECULAR_FORMULA",
- mass = "PUBCHEM_EXACT_MASS",
+ exactmass = "PUBCHEM_EXACT_MASS",
  synonyms = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
  smiles = "PUBCHEM_OPENEYE_CAN_SMILES"
  # Others:
@@ -288,9 +288,9 @@ compound_tbl_lipidblast <- function(file, collapse) {
 .mona_colmap <- c(id = "ID",
  name = "NAME",
  inchi = "INCHIKEY",
- inchi_key = "INCHIKEY",
+ inchikey = "INCHIKEY",
  formula = "FORMULA",
- mass = "EXACT MASS",
+ exactmass = "EXACT MASS",
  synonyms = "SYNONYMS",
  smiles = NA
  )
@@ -332,11 +332,11 @@ compound_tbl_lipidblast <- function(file, collapse) {
  mass <- NA_character_
  list(
  compound_id = x$id,
- compound_name = nms[1],
+ name = nms[1],
  inchi = cmp$inchi,
- inchi_key = NA_character_,
+ inchikey = NA_character_,
  formula = frml,
- mass = mass,
+ exactmass = mass,
  synonyms = nms[-1]
  )
  }
@@ -369,11 +369,11 @@ compound_tbl_lipidblast <- function(file, collapse) {
 #' Required columns for the `data.frame` providing the compound information (
 #' parameter `x`) are:
 #' + `"compound_id"`: the ID of the compound.
-#' + `"compound_name"`: the compound's name.
+#' + `"name"`: the compound's name.
 #' + `"inchi"`: the InChI of the compound.
 #' + `"inchikey"`: the InChI key.
 #' + `"formula"`: the chemical formula.
-#' + `"mass"`: the compound's mass.
+#' + `"exactmass"`: the compound's (exact) mass.
 #' + `"synonyms"`: additional synonyms/aliases for the compound. Should be
 #' either a single character or a list of values for each compound.
 #'
@@ -587,7 +587,7 @@ createCompDb <- function(x, metadata, msms_spectra, path = ".") {
  }
  ## Creating indices
  dbExecute(con, "create index compound_id_idx on compound (compound_id)")
- dbExecute(con, "create index compound_name_idx on compound (compound_name)")
+ dbExecute(con, "create index compound_name_idx on compound (name)")
  ## Process spectra.
  if (!missing(msms_spectra) && is.data.frame(msms_spectra)) {
  comp_ids <- unique(x$compound_id)
@@ -667,8 +667,8 @@ createCompDb <- function(x, metadata, msms_spectra, path = ".") {
 
 .required_metadata_keys <- c("source", "url", "source_version", "source_date",
  "organism")
-.required_compound_db_columns <- c("compound_id", "compound_name", "inchi",
- "inchi_key", "formula", "mass")
+.required_compound_db_columns <- c("compound_id", "name", "inchi",
+ "inchikey", "formula", "exactmass")
 .required_compound_columns <- c(.required_compound_db_columns, "synonyms")
 
 .required_msms_spectrum_columns <- c(spectrum_id = "integer",
@@ -783,8 +783,8 @@ createCompDb <- function(x, metadata, msms_spectra, path = ".") {
  paste0(.required_compound_columns[!got_it],
  collapse = ", ")))
  } else {
- if (!is.numeric(x$mass))
- txt <- c(txt, "Column 'mass' should be numeric")
+ if (!is.numeric(x$exactmass))
+ txt <- c(txt, "Column 'exactmass' should be numeric")
  }
  ## if (db) {
  ## ## Do not allow more columns than expected!
@@ -930,7 +930,7 @@ make_metadata <- function(source, url, source_version, source_date, organism) {
 #' MoNa SDF files organize the data by individual spectra (i.e. each element
 #' is one spectrum) and individual compounds cannot easily and consistently be
 #' defined (i.e. not all entries have an InChI ID or other means to uniquely
-#' identify compounds). Thus, the function returns a highly redundant compount
+#' identify compounds). Thus, the function returns a highly redundant compound
 #' table. Feedback on how to reduce this redundancy would be highly welcome!
 #'
 #' @param x `character(1)` being the SDF file name.

diff --git a/R/mass-utility-functions.R b/R/mass-utility-functions.R
@@ -153,7 +153,7 @@ adducts <- function(pattern, polarity, name, set, ...) {
 #' @param x `numeric` with m/z values.
 #'
 #' @param cmps `data.frame` containing monoisotopic masses in a
-#' column named `"mass"`.
+#' column named `"exactmass"`.
 #'
 #' @param adduct Adduct definition. Either a `data.frame` as returned by
 #' [adducts()] or a `character` with the names of the adducts in that
@@ -202,15 +202,15 @@ adducts <- function(pattern, polarity, name, set, ...) {
  not_found$ppm <- NA_real_
  lapply(x, function(z) {
  mss <- (adduct$charge * z - adduct$massdiff) / adduct$nmol
- idx <- matchWithPpm(mss, cmps$mass, ppm = ppm)
+ idx <- matchWithPpm(mss, cmps$exactmass, ppm = ppm)
  hits <- lengths(idx)
  idx <- unlist(idx, use.names = FALSE)
  if (length(idx)) {
  res <- cmps[idx, , drop = FALSE]
  rownames(res) <- NULL
  res$adduct <- rep(adduct$name, hits)
  rep_mss <- rep(mss, hits)
- res$ppm <- abs(res$mass - rep_mss) * 1e6 / res$mass
+ res$ppm <- abs(res$exactmass - rep_mss) * 1e6 / res$exactmass
  res[order(res$ppm), ]
  } else not_found
  })
@@ -241,7 +241,7 @@ adducts <- function(pattern, polarity, name, set, ...) {
 #' equivalent) with a column containing the m/z values.
 #'
 #' @param compounds either a `data.frame` (or equivalent) or a [CompDb()] with
-#' the reference annotations. A column named `"mass"` is mandatory if
+#' the reference annotations. A column named `"exactmass"` is mandatory if
 #' `compounds` is a `data.frame`.
 #'
 #' @param adduct adduct definition. Either a `data.frame` as returned by
@@ -297,8 +297,8 @@ setMethod(
  "annotateMz", signature(object = "numeric",
  compounds = "DataFrameOrEquivalent"),
  function(object, compounds, adduct = adducts(), ppm = 10, ...) {
- if (!any(colnames(compounds) == "mass"))
- stop("Required column \"mass\" not found in 'compounds'",
+ if (!any(colnames(compounds) == "exactmass"))
+ stop("Required column \"exactmass\" not found in 'compounds'",
  call. = FALSE)
  .annotate_adduct_mz(object, compounds, adduct = adduct, ppm = ppm)
  })