Skip to content

Commit

Permalink
refactor: rename columns (issue #62)
Browse files Browse the repository at this point in the history
- Rename `compound_name` to `compound`, `inchi_key` to `inchikey` and `mass` to
  `exactmass`.
  • Loading branch information
jorainer committed Sep 25, 2020
1 parent c1b8ff9 commit 326e3a2
Show file tree
Hide file tree
Showing 21 changed files with 164 additions and 156 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: CompoundDb
Type: Package
Title: Creating and using (Chemical) Compound Annotation Databases
Version: 0.5.0
Version: 0.6.0
Authors@R: c(person(given = "Jan", family = "Stanstrup",
email = "stanstrup@gmail.com",
role = c("aut"),
Expand Down
4 changes: 2 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

export(CompDb)
export(CompoundIdFilter)
export(CompoundNameFilter)
export(MsBackendCompDb)
export(MsmsMzRangeMaxFilter)
export(MsmsMzRangeMinFilter)
export(NameFilter)
export(adducts)
export(compound_tbl_lipidblast)
export(compound_tbl_sdf)
Expand All @@ -25,10 +25,10 @@ export(src_compdb)
export(tables)
exportClasses(CompDb)
exportClasses(CompoundIdFilter)
exportClasses(CompoundNameFilter)
exportClasses(MsBackendCompDb)
exportClasses(MsmsMzRangeMaxFilter)
exportClasses(MsmsMzRangeMinFilter)
exportClasses(NameFilter)
exportMethods("$<-")
exportMethods("intensity<-")
exportMethods("mz<-")
Expand Down
22 changes: 11 additions & 11 deletions R/AnnotationFilters.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#'
#' The supported filters are:
#' - `CompoundIdFilter`: filter based on the compound ID.
#' - `CompoundNameFilter`: filter based on the compound name.
#' - `NameFilter`: filter based on the compound name.
#' - `MsmsMzRangeMinFilter`: retrieve entries based on the smallest m/z of all
#' peaks of their MS/MS spectra. Requires that MS/MS spectra data are present
#' (i.e. `hasMsMsSpectra(cmp_db)` returns `TRUE`).
Expand Down Expand Up @@ -43,10 +43,10 @@
#' AnnotationFilter(~ compound_id == "comp_b")
#'
#' ## Combine filters
#' AnnotationFilterList(CompoundIdFilter("a"), CompoundNameFilter("b"))
#' AnnotationFilterList(CompoundIdFilter("a"), NameFilter("b"))
#'
#' ## Using a formula expression
#' AnnotationFilter(~ compound_id == "a" | compound_name != "b")
#' AnnotationFilter(~ compound_id == "a" | name != "b")
NULL

#' @importClassesFrom AnnotationFilter CharacterFilter AnnotationFilter
Expand All @@ -67,20 +67,20 @@ CompoundIdFilter <- function(value, condition = "==") {
new("CompoundIdFilter", value = as.character(value), condition = condition)
}

#' @exportClass CompoundNameFilter
#' @exportClass NameFilter
#'
#' @rdname Filter-classes
setClass("CompoundNameFilter", contains = "CharacterFilter",
setClass("NameFilter", contains = "CharacterFilter",
prototype = list(
condition = "==",
value = "",
field = "compound_name"
field = "name"
))
#' @export CompoundNameFilter
#' @export NameFilter
#'
#' @rdname Filter-classes
CompoundNameFilter <- function(value, condition = "==") {
new("CompoundNameFilter", value = as.character(value),
NameFilter <- function(value, condition = "==") {
new("NameFilter", value = as.character(value),
condition = condition)
}

Expand Down Expand Up @@ -300,9 +300,9 @@ MsmsMzRangeMaxFilter <- function(value, condition = "<=") {
#' @noRd
.supported_filters <- function(x) {
df <- data.frame(filter = c("CompoundIdFilter",
"CompoundNameFilter"),
"NameFilter"),
field = c("compound_id",
"compound_name"),
"name"),
stringsAsFactors = FALSE)
if (!missing(x) && .has_msms_spectra(x)) {
df <- rbind(df,
Expand Down
14 changes: 7 additions & 7 deletions R/CompDb.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,11 @@
#' to the database with parameter `x`.
#'
#' For all other methods: a `CompDb` object.
#'
#'
#' @param flags flags passed to the SQLite database connection.
#' See [SQLite()]. Defaults to read-only, i.e. RSQLite::SQLITE_RO.
#'
#'
#'
#'
#' @author Johannes Rainer
#'
#' @md
Expand Down Expand Up @@ -100,12 +100,12 @@
#' tables(cmp_db)
#'
#' ## Extract a data.frame with the id, name and inchi of all compounds
#' compounds(cmp_db, columns = c("compound_id", "compound_name", "inchi"))
#' compounds(cmp_db, columns = c("compound_id", "name", "inchi"))
#'
#' ## Add also the synonyms (aliases) for the compounds. This will cause the
#' ## tables compound and synonym to be joined. The elements of the compound_id
#' ## and compound_name are now no longer unique
#' res <- compounds(cmp_db, columns = c("compound_id", "compound_name", "synonym"))
#' ## and name are now no longer unique
#' res <- compounds(cmp_db, columns = c("compound_id", "name", "synonym"))
#' head(res)
#'
#' ## Extract spectra for a specific HMDB compound.
Expand All @@ -124,7 +124,7 @@
#' cmp_tbl <- tbl(src_cmp, "compound")
#'
#' ## Extract the id, name and inchi
#' cmp_tbl %>% select(compound_id, compound_name, inchi) %>% collect()
#' cmp_tbl %>% select(compound_id, name, inchi) %>% collect()
NULL

#' @importFrom methods new
Expand Down
68 changes: 34 additions & 34 deletions R/createCompDbPackage.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#'
#' @details
#'
#' Column `"compound_name"` reports for HMDB files the `"GENERIC_NAME"`, for
#' Column `"name"` reports for HMDB files the `"GENERIC_NAME"`, for
#' ChEBI the `"ChEBI Name"`, for PubChem the `"PUBCHEM_IUPAC_TRADITIONAL_NAME"`,
#' and for Lipid Maps the `"COMMON_NAME"`, if that is
#' not available, the first of the compound's synonyms and, if that is also not
Expand All @@ -27,7 +27,7 @@
#' defined (i.e. not all entries have an InChI ID or other means to uniquely
#' identify compounds). Thus, the function returns a highly redundant compound
#' table. Feedback on how to reduce this redundancy would be highly welcome!
#'
#'
#' LIPID MAPS was tested August 2020. Older SDF files might not work as the field names were changed.
#'
#' @param file `character(1)` with the name of the SDF file.
Expand All @@ -38,11 +38,11 @@
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#' + `compound_id`: the ID of the compound.
#' + `compound_name`: the compound's name.
#' + `name`: the compound's name.
#' + `inchi`: the InChI of the compound.
#' + `inchikey`: the InChI key.
#' + `formula`: the chemical formula.
#' + `mass`: the compound's mass.
#' + `exactmass`: the compound's (monoisotopic exact) mass.
#' + `synonyms`: the compound's synonyms (aliases). The type of this column is
#' by default a `list` to support multiple aliases per compound, unless
#' argument `collapse` is provided, in which case multiple synonyms are pasted
Expand Down Expand Up @@ -106,11 +106,11 @@ compound_tbl_sdf <- function(file, collapse) {
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#' + `compound_id`: the ID of the compound.
#' + `compound_name`: the compound's name.
#' + `name`: the compound's name.
#' + `inchi`: the InChI of the compound.
#' + `inchikey`: the InChI key.
#' + `formula`: the chemical formula.
#' + `mass`: the compound's mass.
#' + `exactmass`: the compound's (exact) mass.
#' + `synonyms`: the compound's synonyms (aliases). The type of this column is
#' by default a `list` to support multiple aliases per compound, unless
#' argument `collapse` is provided, in which case multiple synonyms are pasted
Expand Down Expand Up @@ -145,11 +145,11 @@ compound_tbl_lipidblast <- function(file, collapse) {
#' @description
#'
#' Internal function to extract compound information from a file in SDF format.
#'
#'
#' @param x what is returned by datablock2ma(datablock(read.SDFset)).
#'
#' @return A [tibble::tibble] with columns `"compound_id"`, `"compound_name"`,
#' `"inchi"`, `"formula"`, `"mass"`.
#' @return A [tibble::tibble] with columns `"compound_id"`, `"name"`,
#' `"inchi"`, `"formula"`, `"exactmass"`.
#'
#' @note
#' LIPID MAPS was tested August 2020. Older SDF files might not work as the field names were changed.
Expand Down Expand Up @@ -180,11 +180,11 @@ compound_tbl_lipidblast <- function(file, collapse) {
nms[nas] <- x[nas, "SYSTEMATIC_NAME"]
}
res <- data_frame(compound_id = x[, colmap["id"]],
compound_name = nms,
name = nms,
inchi = x[, colmap["inchi"]],
inchi_key = x[, colmap["inchi_key"]],
inchikey = x[, colmap["inchikey"]],
formula = x[, colmap["formula"]],
mass = as.numeric(x[, colmap["mass"]]),
exactmass = as.numeric(x[, colmap["exactmass"]]),
synonyms = syns
)
if (is.na(colmap["smiles"])) {
Expand All @@ -209,7 +209,7 @@ compound_tbl_lipidblast <- function(file, collapse) {
#'
#' Based on the provided `colnames` guess whether the file is from HMDB,
#' ChEBI, LIPID MAPS, PubChem or LipidBlast.
#'
#'
#'
#' @param x `character` with the column names of the data table.
#'
Expand Down Expand Up @@ -243,39 +243,39 @@ compound_tbl_lipidblast <- function(file, collapse) {
.hmdb_colmap <- c(id = "HMDB_ID",
name = "GENERIC_NAME",
inchi = "INCHI_IDENTIFIER",
inchi_key = "INCHI_KEY",
inchikey = "INCHI_KEY",
formula = "FORMULA",
mass = "EXACT_MASS",
exactmass = "EXACT_MASS",
synonyms = "SYNONYMS",
smiles = "SMILES"
)
.hmdb_separator <- "; "
.chebi_colmap <- c(id = "ChEBI ID",
name = "ChEBI Name",
inchi = "InChI",
inchi_key = "InChIKey",
inchikey = "InChIKey",
formula = "Formulae",
mass = "Monoisotopic Mass",
exactmass = "Monoisotopic Mass",
synonyms = "Synonyms",
smiles = "SMILES"
)
.chebi_separator <- " __ "
.lipidmaps_colmap <- c(id = "LM_ID",
name = "NAME",
inchi = "INCHI",
inchi_key = "INCHI_KEY",
inchikey = "INCHI_KEY",
formula = "FORMULA",
mass = "EXACT_MASS",
exactmass = "EXACT_MASS",
synonyms = "SYNONYMS",
smiles = NA
)
.lipidmaps_separator <- "; "
.pubchem_colmap <- c(id = "PUBCHEM_COMPOUND_CID",
name = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
inchi = "PUBCHEM_IUPAC_INCHI",
inchi_key = "PUBCHEM_IUPAC_INCHIKEY",
inchikey = "PUBCHEM_IUPAC_INCHIKEY",
formula = "PUBCHEM_MOLECULAR_FORMULA",
mass = "PUBCHEM_EXACT_MASS",
exactmass = "PUBCHEM_EXACT_MASS",
synonyms = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
smiles = "PUBCHEM_OPENEYE_CAN_SMILES"
# Others:
Expand All @@ -288,9 +288,9 @@ compound_tbl_lipidblast <- function(file, collapse) {
.mona_colmap <- c(id = "ID",
name = "NAME",
inchi = "INCHIKEY",
inchi_key = "INCHIKEY",
inchikey = "INCHIKEY",
formula = "FORMULA",
mass = "EXACT MASS",
exactmass = "EXACT MASS",
synonyms = "SYNONYMS",
smiles = NA
)
Expand Down Expand Up @@ -332,11 +332,11 @@ compound_tbl_lipidblast <- function(file, collapse) {
mass <- NA_character_
list(
compound_id = x$id,
compound_name = nms[1],
name = nms[1],
inchi = cmp$inchi,
inchi_key = NA_character_,
inchikey = NA_character_,
formula = frml,
mass = mass,
exactmass = mass,
synonyms = nms[-1]
)
}
Expand Down Expand Up @@ -369,11 +369,11 @@ compound_tbl_lipidblast <- function(file, collapse) {
#' Required columns for the `data.frame` providing the compound information (
#' parameter `x`) are:
#' + `"compound_id"`: the ID of the compound.
#' + `"compound_name"`: the compound's name.
#' + `"name"`: the compound's name.
#' + `"inchi"`: the InChI of the compound.
#' + `"inchikey"`: the InChI key.
#' + `"formula"`: the chemical formula.
#' + `"mass"`: the compound's mass.
#' + `"exactmass"`: the compound's (exact) mass.
#' + `"synonyms"`: additional synonyms/aliases for the compound. Should be
#' either a single character or a list of values for each compound.
#'
Expand Down Expand Up @@ -587,7 +587,7 @@ createCompDb <- function(x, metadata, msms_spectra, path = ".") {
}
## Creating indices
dbExecute(con, "create index compound_id_idx on compound (compound_id)")
dbExecute(con, "create index compound_name_idx on compound (compound_name)")
dbExecute(con, "create index compound_name_idx on compound (name)")
## Process spectra.
if (!missing(msms_spectra) && is.data.frame(msms_spectra)) {
comp_ids <- unique(x$compound_id)
Expand Down Expand Up @@ -667,8 +667,8 @@ createCompDb <- function(x, metadata, msms_spectra, path = ".") {

.required_metadata_keys <- c("source", "url", "source_version", "source_date",
"organism")
.required_compound_db_columns <- c("compound_id", "compound_name", "inchi",
"inchi_key", "formula", "mass")
.required_compound_db_columns <- c("compound_id", "name", "inchi",
"inchikey", "formula", "exactmass")
.required_compound_columns <- c(.required_compound_db_columns, "synonyms")

.required_msms_spectrum_columns <- c(spectrum_id = "integer",
Expand Down Expand Up @@ -783,8 +783,8 @@ createCompDb <- function(x, metadata, msms_spectra, path = ".") {
paste0(.required_compound_columns[!got_it],
collapse = ", ")))
} else {
if (!is.numeric(x$mass))
txt <- c(txt, "Column 'mass' should be numeric")
if (!is.numeric(x$exactmass))
txt <- c(txt, "Column 'exactmass' should be numeric")
}
## if (db) {
## ## Do not allow more columns than expected!
Expand Down Expand Up @@ -930,7 +930,7 @@ make_metadata <- function(source, url, source_version, source_date, organism) {
#' MoNa SDF files organize the data by individual spectra (i.e. each element
#' is one spectrum) and individual compounds cannot easily and consistently be
#' defined (i.e. not all entries have an InChI ID or other means to uniquely
#' identify compounds). Thus, the function returns a highly redundant compount
#' identify compounds). Thus, the function returns a highly redundant compound
#' table. Feedback on how to reduce this redundancy would be highly welcome!
#'
#' @param x `character(1)` being the SDF file name.
Expand Down
12 changes: 6 additions & 6 deletions R/mass-utility-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ adducts <- function(pattern, polarity, name, set, ...) {
#' @param x `numeric` with m/z values.
#'
#' @param cmps `data.frame` with a column containing monoisotopic masses in a
#' column named `"mass"`.
#' column named `"exactmass"`.
#'
#' @param adduct Adduct definition. Either a `data.frame` as returned by
#' [adducts()] or a `character` with the names of the adducts in that
Expand Down Expand Up @@ -202,15 +202,15 @@ adducts <- function(pattern, polarity, name, set, ...) {
not_found$ppm <- NA_real_
lapply(x, function(z) {
mss <- (adduct$charge * z - adduct$massdiff) / adduct$nmol
idx <- matchWithPpm(mss, cmps$mass, ppm = ppm)
idx <- matchWithPpm(mss, cmps$exactmass, ppm = ppm)
hits <- lengths(idx)
idx <- unlist(idx, use.names = FALSE)
if (length(idx)) {
res <- cmps[idx, , drop = FALSE]
rownames(res) <- NULL
res$adduct <- rep(adduct$name, hits)
rep_mss <- rep(mss, hits)
res$ppm <- abs(res$mass - rep_mss) * 1e6 / res$mass
res$ppm <- abs(res$exactmass - rep_mss) * 1e6 / res$exactmass
res[order(res$ppm), ]
} else not_found
})
Expand Down Expand Up @@ -241,7 +241,7 @@ adducts <- function(pattern, polarity, name, set, ...) {
#' equivalent) with a column containing the m/z values.
#'
#' @param compounds either a `data.frame` (or equivalent) or a [CompDb()] with
#' the reference annotations. A column named `"mass"` is mandatory if
#' the reference annotations. A column named `"exactmass"` is mandatory if
#' `compounds` is a `data.frame`.
#'
#' @param adduct adduct definition. Either a `data.frame` as returned by
Expand Down Expand Up @@ -297,8 +297,8 @@ setMethod(
"annotateMz", signature(object = "numeric",
compounds = "DataFrameOrEquivalent"),
function(object, compounds, adduct = adducts(), ppm = 10, ...) {
if (!any(colnames(compounds) == "mass"))
stop("Required column \"mass\" not found in 'compounds'",
if (!any(colnames(compounds) == "exactmass"))
stop("Required column \"exactmass\" not found in 'compounds'",
call. = FALSE)
.annotate_adduct_mz(object, compounds, adduct = adduct, ppm = ppm)
})
Expand Down
Loading

0 comments on commit 326e3a2

Please sign in to comment.