Merge branch 'developer' into Update-metal-extraction

jpquast · Oct 25, 2024 · 17a2ed6 · 17a2ed6
2 parents 5ce5140 + 216a44e
commit 17a2ed6
Show file tree

Hide file tree

Showing 27 changed files with 260 additions and 141 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: protti
 Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools
-Version: 0.9.0.9000
+Version: 0.9.1.9000
 Authors@R: 
     c(person(given = "Jan-Philipp",
            family = "Quast",
@@ -67,7 +67,9 @@ Suggests:
     iq,
     scales,
     farver,
-    ggforce
+    ggforce,
+    xml2,
+    jsonlite
 Depends: 
     R (>= 4.0)
 URL: https://github.com/jpquast/protti, https://jpquast.github.io/protti/

diff --git a/NAMESPACE b/NAMESPACE
@@ -134,6 +134,7 @@ importFrom(purrr,pluck)
 importFrom(purrr,pmap)
 importFrom(purrr,reduce)
 importFrom(purrr,set_names)
+importFrom(readr,read_csv)
 importFrom(readr,read_tsv)
 importFrom(readr,write_csv)
 importFrom(readr,write_tsv)

diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,15 @@
-# protti 0.9.0.9000
+# protti 0.9.1.9000
 
 ## Additional Changes
 
+* `assign_peptide_type` now takes the `start` argument, containing the start position of a peptide. If a protein does not have any peptide starting at position `1` and there is a peptide starting at position `2`, this peptide will be considered "tryptic" at the N-terminus. This is because the initial Methionine is likely missing due to processing for every copy of the protein and therefore position `2` is the true N-terminus.
 * `extract_metal_binders()` now uses keywords from UniProt as well. In addition, only "enables" GO terms are considered now.
+
+# protti 0.9.1
+
+## Bug fixes
+
+* `try_query()` now correctly handles errors that don't return a response object. We also handle gzip decompression problems better, since some databases' compressed responses were not handled correctly. 
 
 # protti 0.9.0
 

diff --git a/R/assign_peptide_type.R b/R/assign_peptide_type.R
@@ -24,7 +24,9 @@ peptide_type <- function(...) {
 #' peptide is located at the N- or C-terminus of a protein and fulfills the criterium to be
 #' fully-tryptic otherwise, it is also considered as fully-tryptic. Peptides that only fulfill the
 #' criterium on one terminus are semi-tryptic peptides. Lastly, peptides that are not fulfilling
-#' the criteria for both termini are non-tryptic peptides.
+#' the criteria for both termini are non-tryptic peptides. In addition, peptides that miss the initial
+#' Methionine of a protein are considered "tryptic" at that site if there is no other peptide
+#' starting at position 1 for that protein.
 #'
 #' @param data a data frame containing at least information about the preceding and C-terminal
 #' amino acids of peptides.
@@ -34,49 +36,90 @@ peptide_type <- function(...) {
 #' acid as one letter code.
 #' @param aa_after a character column in the \code{data} data frame that contains the following amino
 #' acid as one letter code.
+#' @param protein_id a character column in the \code{data} data frame that contains the protein
+#' accession numbers.
+#' @param start a numeric column in the \code{data} data frame that contains the start position of
+#' each peptide within the corresponding protein. This is used to check if the protein is consistently
+#' missing the initial Methionine, making peptides starting at position 2 "tryptic" on that site.
 #'
 #' @return A data frame that contains the input data and an additional column with the peptide
 #' type information.
 #' @import dplyr
 #' @importFrom magrittr %>%
 #' @importFrom rlang .data
+#' @importFrom stringr str_detect
 #' @export
 #'
 #' @examples
 #' data <- data.frame(
-#'   aa_before = c("K", "S", "T"),
-#'   last_aa = c("R", "K", "Y"),
-#'   aa_after = c("T", "R", "T")
+#'   aa_before = c("K", "M", "", "M", "S", "M", "-"),
+#'   last_aa = c("R", "K", "R", "R", "Y", "K", "K"),
+#'   aa_after = c("T", "R", "T", "R", "T", "R", "T"),
+#'   protein_id = c("P1", "P1", "P3", "P3", "P2", "P2", "P2"),
+#'   start = c(38, 2, 1, 2, 10, 2, 1)
 #' )
 #'
-#' assign_peptide_type(data, aa_before, last_aa, aa_after)
+#' assign_peptide_type(data, aa_before, last_aa, aa_after, protein_id, start)
 assign_peptide_type <- function(data,
                                 aa_before = aa_before,
                                 last_aa = last_aa,
-                                aa_after = aa_after) {
-  data %>%
-    dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}) %>%
-    dplyr::mutate(N_term_tryp = dplyr::if_else({{ aa_before }} == "" |
-      {{ aa_before }} == "K" |
-      {{ aa_before }} == "R",
-    TRUE,
-    FALSE
+                                aa_after = aa_after,
+                                protein_id = protein_id,
+                                start = start) {
+  # Check if there's any peptide starting at position 1 for each protein
+  start_summary <- data %>%
+    dplyr::group_by({{ protein_id }}) %>%
+    dplyr::summarize(has_start_1 = any({{ start }} == 1), .groups = "drop")
+
+  peptide_data <- data %>%
+    dplyr::distinct({{ aa_before }}, {{ last_aa }}, {{ aa_after }}, {{ protein_id }}, {{ start }}, .keep_all = TRUE) %>%
+    dplyr::left_join(start_summary, by = rlang::as_name(rlang::enquo(protein_id))) %>%
+    # Determine N-terminal trypticity
+    dplyr::mutate(N_term_tryp = dplyr::if_else(
+      !stringr::str_detect({{ aa_before }}, "[A-Y]") | {{ aa_before }} == "K" | {{ aa_before }} == "R",
+      TRUE,
+      FALSE
     )) %>%
-    dplyr::mutate(C_term_tryp = dplyr::if_else({{ last_aa }} == "K" |
-      {{ last_aa }} == "R" |
-      {{ aa_after }} == "",
-    TRUE,
-    FALSE
+    # Determine C-terminal trypticity
+    dplyr::mutate(C_term_tryp = dplyr::if_else(
+      {{ last_aa }} == "K" | {{ last_aa }} == "R" | !stringr::str_detect({{ aa_after }}, "[A-Y]"),
+      TRUE,
+      FALSE
     )) %>%
+    # Assign peptide type based on N-term and C-term trypticity
     dplyr::mutate(pep_type = dplyr::case_when(
-      .data$N_term_tryp + .data$C_term_tryp == 2 ~ "fully-tryptic",
-      .data$N_term_tryp + .data$C_term_tryp == 1 ~ "semi-tryptic",
-      .data$N_term_tryp + .data$C_term_tryp == 0 ~ "non-tryptic"
+      .data$N_term_tryp & .data$C_term_tryp ~ "fully-tryptic",
+      .data$N_term_tryp | .data$C_term_tryp ~ "semi-tryptic",
+      TRUE ~ "non-tryptic"
+    )) %>%
+    # Reassign semi-tryptic peptides at position 2 to fully-tryptic if no start == 1
+    dplyr::mutate(pep_type = dplyr::if_else(
+      .data$pep_type == "semi-tryptic" & {{ start }} == 2 & !.data$has_start_1 & .data$C_term_tryp,
+      "fully-tryptic",
+      .data$pep_type
+    )) %>%
+    # Reassign non-tryptic peptides at position 2 to semi-tryptic if no start == 1
+    dplyr::mutate(pep_type = dplyr::if_else(
+      .data$pep_type == "non-tryptic" & {{ start }} == 2 & !.data$has_start_1 & !.data$C_term_tryp,
+      "semi-tryptic",
+      .data$pep_type
     )) %>%
-    dplyr::select(-.data$N_term_tryp, -.data$C_term_tryp) %>%
-    dplyr::right_join(data, by = c(
-      rlang::as_name(rlang::enquo(aa_before)),
-      rlang::as_name(rlang::enquo(last_aa)),
-      rlang::as_name(rlang::enquo(aa_after))
-    ))
+    # Drop unnecessary columns
+    dplyr::select(-c("N_term_tryp", "C_term_tryp", "has_start_1"))
+
+  # Join back to original data to return the full result
+  result <- data %>%
+    dplyr::left_join(
+      peptide_data %>%
+        dplyr::select({{ aa_before }}, {{ last_aa }}, {{ aa_after }}, {{ protein_id }}, {{ start }}, "pep_type"),
+      by = c(
+        rlang::as_name(rlang::enquo(aa_before)),
+        rlang::as_name(rlang::enquo(last_aa)),
+        rlang::as_name(rlang::enquo(aa_after)),
+        rlang::as_name(rlang::enquo(protein_id)),
+        rlang::as_name(rlang::enquo(start))
+      )
+    )
+
+  return(result)
 }
diff --git a/R/calculate_protein_abundance.R b/R/calculate_protein_abundance.R
@@ -18,12 +18,11 @@
 #' for a protein to be included in the analysis. The default value is 3, which means
 #' proteins with fewer than three unique peptides will be excluded from the analysis.
 #' @param method a character value specifying with which method protein quantities should be
-#' calculated. Possible options include \code{"sum"}, which takes the sum of all precursor
-#' intensities as the protein abundance. Another option is \code{"iq"}, which performs protein
+#' calculated. Possible options include `"sum"`, which takes the sum of all precursor
+#' intensities as the protein abundance. Another option is `"iq"`, which performs protein
 #' quantification based on a maximal peptide ratio extraction algorithm that is adapted from the
 #' MaxLFQ algorithm of the MaxQuant software. Functions from the
-#' \href{https://academic.oup.com/bioinformatics/article/36/8/2611/5697917}{\code{iq}} package are
-#' used. Default is \code{"iq"}.
+#' `iq` package (\doi{10.1093/bioinformatics/btz961}) are used. Default is `"iq"`.
 #' @param for_plot a logical value indicating whether the result should be only protein intensities
 #' or protein intensities together with precursor intensities that can be used for plotting using
 #' \code{peptide_profile_plot()}. Default is \code{FALSE}.

diff --git a/R/data.R b/R/data.R
@@ -33,7 +33,7 @@
 #' @format A data frame containing peptide level data from a Spectronaut report.
 #' @source Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic
 #' approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200
-#' (2020). https://doi.org/10.1038/s41467-020-18071-x
+#' (2020). \doi{10.1038/s41467-020-18071-x}
 "rapamycin_10uM"
 
 #' Rapamycin dose response example data
@@ -47,13 +47,13 @@
 #' @format A data frame containing peptide level data from a Spectronaut report.
 #' @source Piazza, I., Beaton, N., Bruderer, R. et al. A machine learning-based chemoproteomic
 #' approach to identify drug targets and binding sites in complex proteomes. Nat Commun 11, 4200
-#' (2020). https://doi.org/10.1038/s41467-020-18071-x
+#' (2020). \doi{10.1038/s41467-020-18071-x}
 "rapamycin_dose_response"
 
 #' Structural analysis example data
 #'
 #' Example data used for the vignette about structural analysis. The data was obtained from
-#' \href{https://www.sciencedirect.com/science/article/pii/S0092867420316913}{Cappelletti 2021}
+#' Cappelletti et al. 2021 (\doi{10.1016/j.cell.2020.12.021})
 #' and corresponds to two separate experiments. Both experiments were limited proteolysis coupled to
 #' mass spectrometry (LiP-MS) experiments conducted on purified proteins. The first protein is
 #' phosphoglycerate kinase 1 (pgk) and it was treated with 25mM 3-phosphoglyceric acid (3PG).
@@ -69,7 +69,7 @@
 #' @source Cappelletti V, Hauser T, Piazza I, Pepelnjak M, Malinovska L, Fuhrer T, Li Y, Dörig C,
 #' Boersema P, Gillet L, Grossbach J, Dugourd A, Saez-Rodriguez J, Beyer A, Zamboni N, Caflisch A,
 #' de Souza N, Picotti P. Dynamic 3D proteomes reveal protein functional alterations at high
-#' resolution in situ. Cell. 2021 Jan 21;184(2):545-559.e22. doi: 10.1016/j.cell.2020.12.021.
+#' resolution in situ. Cell. 2021 Jan 21;184(2):545-559.e22. \doi{10.1016/j.cell.2020.12.021}.
 #' Epub 2020 Dec 23. PMID: 33357446; PMCID: PMC7836100.
 "ptsi_pgk"
 

diff --git a/R/fetch_eco.R b/R/fetch_eco.R
@@ -18,8 +18,7 @@
 #' essential to navigating the ever-growing (in size and complexity) corpus of scientific
 #' information."
 #'
-#' More information can be found in their
-#' \href{https://academic.oup.com/nar/article/47/D1/D1186/5165344?login=true}{publication}.
+#' More information can be found in their publication (\doi{10.1093/nar/gky1036}).
 #'
 #' @param return_relation a logical value that indicates if relational information should be returned instead
 #' of the main descriptive information. This data can be used to check the relations of ECO terms to each other.

diff --git a/R/fetch_mobidb.R b/R/fetch_mobidb.R
@@ -17,7 +17,7 @@
 #' @return A data frame that contains start and end positions for disordered and flexible protein
 #' regions. The \code{feature} column contains information on the source of this
 #' annotation. More information on the source can be found
-#' \href{https://mobidb.bio.unipd.it/about/mobidb}{here}.
+#' \href{https://mobidb.org/about/mobidb}{here}.
 #' @import progress
 #' @importFrom rlang .data
 #' @importFrom purrr map_dfr keep

diff --git a/R/qc_cvs.R b/R/qc_cvs.R
@@ -122,7 +122,7 @@ The function does not handle log2 transformed data.",
         dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>%
         dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>%
         dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>%
-        dplyr::select(-.data$type) %>%
+        dplyr::select(-"type") %>%
         dplyr::group_by({{ condition }}) %>%
         dplyr::mutate(median = stats::median(.data$values)) %>%
         dplyr::distinct()

diff --git a/R/try_query.R b/R/try_query.R
@@ -13,7 +13,6 @@
 #' @param type a character value that specifies the type of data at the target URL. Options are
 #' all options that can be supplied to httr::content, these include e.g.
 #' "text/tab-separated-values", "application/json" and "text/csv". Default is "text/tab-separated-values".
-#' Default is "tab-separated-values".
 #' @param timeout a numeric value that specifies the maximum request time. Default is 60 seconds.
 #' @param accept a character value that specifies the type of data that should be sent by the API if
 #' it uses content negotiation. The default is NULL and it should only be set for APIs that use
@@ -22,6 +21,7 @@
 #'
 #' @importFrom curl has_internet
 #' @importFrom httr GET timeout http_error message_for_status http_status content accept
+#' @importFrom readr read_tsv read_csv
 #'
 #' @return A data frame that contains the table from the url.
 try_query <-
@@ -77,18 +77,56 @@ try_query <-
       return(invisible("No internet connection"))
     }
 
-    if (httr::http_error(query_result)) {
+    # If response was an error return that error message
+    if (inherits(query_result, "response") && httr::http_error(query_result)) {
       if (!silent) httr::message_for_status(query_result)
       return(invisible(httr::http_status(query_result)$message))
     }
 
+    # Handle other types of errors separately from query errors
+    if (inherits(query_result, "character")) {
+      if (!silent) message(query_result)
+      return(invisible(query_result))
+    }
+
     # Record readr progress variable to set back later
     readr_show_progress <- getOption("readr.show_progress")
     on.exit(options(readr.show_progress = readr_show_progress))
     # Change variable to not show progress if readr is used
     options(readr.show_progress = FALSE)
 
-    result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...))
+    # Retrieve the content as raw bytes using httr::content
+    raw_content <- httr::content(query_result, type = "raw")
+    # Check for gzip magic number (1f 8b) before decompression
+    compressed <- length(raw_content) >= 2 && raw_content[1] == as.raw(0x1f) && raw_content[2] == as.raw(0x8b)
+
+    # Check if the content is gzip compressed
+    if (!is.null(query_result$headers[["content-encoding"]]) && query_result$headers[["content-encoding"]] == "gzip" && compressed) {
+      # Decompress the raw content using base R's `memDecompress`
+      decompressed_content <- memDecompress(raw_content, type = "gzip")
+
+      # Convert the raw bytes to a character string
+      text_content <- rawToChar(decompressed_content)
+
+      # Read the decompressed content based on the specified type
+      if (type == "text/tab-separated-values") {
+        result <- readr::read_tsv(text_content, ...)
+      } else if (type == "text/html") {
+        result <- xml2::read_html(text_content, ...)
+      } else if (type == "text/xml") {
+        result <- xml2::read_xml(text_content, ...)
+      } else if (type == "text/csv" || type == "txt/csv") {
+        result <- readr::read_csv(text_content, ...)
+      } else if (type == "application/json") {
+        result <- jsonlite::fromJSON(text_content, ...) # Using jsonlite for JSON parsing
+      } else if (type == "text") {
+        result <- text_content # Return raw text as-is
+      } else {
+        stop("Unsupported content type: ", type)
+      }
+    } else {
+      result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...))
+    }
 
     return(result)
   }
diff --git a/README.Rmd b/README.Rmd
@@ -26,7 +26,7 @@ knitr::opts_chunk$set(
 
 The goal of **protti** is to provide flexible functions and workflows for proteomics quality control and data analysis, within a single, user-friendly package. It can be used for label-free DDA, DIA and SRM data generated with search tools and software such as Spectronaut, MaxQuant, Proteome Discoverer and Skyline. Both limited proteolysis mass spectrometry (LiP-MS) and regular bottom-up proteomics experiments can be analysed.
 
-**protti** is developed  and maintained by members of the lab of Paola Picotti at ETH Zurich. Our lab is focused on protein structural changes that occur in response to perturbations such as metabolite, drug and protein binding-events, as well as protein aggregation and enzyme activation ([Piazza 2018](https://www.sciencedirect.com/science/article/pii/S0092867417314484), [Piazza 2020](https://www.nature.com/articles/s41467-020-18071-x#additional-information), [Cappelletti, Hauser & Piazza 2021](https://www.sciencedirect.com/science/article/pii/S0092867420316913)). We have devoloped mass spectrometry-based structural and chemical proteomic methods aimed at monitoring protein conformational changes in the complex cellular milieu ([Feng 2014](https://www.nature.com/articles/nbt.2999)). 
+**protti** is developed and maintained by members of the lab of Paola Picotti at ETH Zurich. Our lab is focused on protein structural changes that occur in response to perturbations such as metabolite, drug and protein binding-events, as well as protein aggregation and enzyme activation ([Piazza 2018](https://doi.org/10.1016/j.cell.2017.12.006), [Piazza 2020](https://doi.org/10.1038/s41467-020-18071-x), [Cappelletti, Hauser & Piazza 2021](https://doi.org/10.1016/j.cell.2020.12.021)). We have developed mass spectrometry-based structural and chemical proteomic methods aimed at monitoring protein conformational changes in the complex cellular milieu ([Feng 2014](https://doi.org/10.1038/nbt.2999)). 
 
 There is a wide range of functions **protti** provides to the user. The main areas of application are: