Merge pull request #261 from jpquast/developer

Developer
jpquast · Jul 16, 2024 · d5f7503 · d5f7503
2 parents 3acc2f5 + 645ce60
commit d5f7503
Show file tree

Hide file tree

Showing 32 changed files with 498 additions and 142 deletions.
diff --git a/.github/workflows/format-code.yml b/.github/workflows/format-code.yml
@@ -0,0 +1,75 @@
+on:  # Style workflow: restyles R sources with styler on push and commits the result
+  push:  # only pushes that touch the R-family paths listed below trigger a run
+    paths: ["**.[rR]", "**.[qrR]md", "**.[rR]markdown", "**.[rR]nw", "**.[rR]profile"]
+
+name: Style
+env:
+  GITHUB_ACTOR: "actions-user"  # used by the commit step as git user.name and e-mail prefix
+
+jobs:
+  style:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write  # the job pushes the style commit back to the repository
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 fetches the full history (see actions/checkout docs)
+
+      - name: Setup R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - name: Install dependencies
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::styler, any::roxygen2
+          needs: styler
+
+      - name: Enable styler cache
+        run: styler::cache_activate()
+        shell: Rscript {0}
+
+      - name: Determine cache location
+        id: styler-location  # its location output supplies the cache path in the next step
+        run: |
+          cat(
+            "location=",
+            styler::cache_info(format = "tabular")$location,
+            "\n",
+            file = Sys.getenv("GITHUB_OUTPUT"),
+            append = TRUE,
+            sep = ""
+          )
+        shell: Rscript {0}
+
+      - name: Cache styler
+        uses: actions/cache@v4
+        with:
+          path: ${{ steps.styler-location.outputs.location }}
+          key: ${{ runner.os }}-styler-${{ github.sha }}  # exact key is per commit; restore-keys below are prefix fallbacks
+          restore-keys: |
+            ${{ runner.os }}-styler-
+            ${{ runner.os }}-
+
+      - name: Style  # runs styler over the package; changed files are committed by the next step
+        run: styler::style_pkg()
+        shell: Rscript {0}
+
+      - name: Commit and push changes  # commits only modified R-family files, else just logs that nothing changed
+        run: |
+          if FILES_TO_COMMIT=($(git diff-index --name-only ${{ github.sha }} \
+              | egrep --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$'))
+          then
+            git config --local user.name "$GITHUB_ACTOR"
+            git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
+            git commit ${FILES_TO_COMMIT[*]} -m "Style code (GHA)"
+            git pull --ff-only
+            git push origin
+          else
+            echo "No changes to commit."
+          fi
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: protti
 Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools
-Version: 0.8.0
+Version: 0.9.0
 Authors@R: 
     c(person(given = "Jan-Philipp",
            family = "Quast",

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,27 @@
+# protti 0.9.0
+
+## New features 
+
+* `calculate_go_enrichment()` got additional arguments:
+  * `replace_long_name`: a logical argument that specifies if GO term names above 50 characters should be replaced by the GO ID instead for the plot. This ensures that the plotting area doesn't become too small due to the long name. The default is `TRUE`.
+  * `label_move_frac`: a numeric argument between 0 and 1 that specifies which labels should be moved outside of the bar. The default is 0.2, which means that the labels of all bars that have a size of 20% or less of the largest bar are moved to the right of the bar. This prevents labels from overlapping with the bar boundaries.
+* `fetch_alphafold_aligned_error()`, `fetch_alphafold_prediction()`, `fetch_mobidb()`, `fetch_quickgo()`, `fetch_uniprot()` and `fetch_uniprot_proteome()` got additional arguments:
+  * `timeout`: a numeric value specifying the time in seconds until the download times out.
+  * `max_tries`: a numeric value that specifies the number of times the function tries to download the data in case an error occurs.
+* Enhanced Flexibility in Protein Quantification: Introduced the `min_n_peptides` parameter to the `calculate_protein_abundance()` function. This allows users to specify the minimum number of peptides per protein needed for analysis. Default is set at three peptides.
+
+## Bug fixes
+
+* `fetch_uniprot()` previously had an issue where it incorrectly identified certain IDs as UniProt IDs, such as ENSEMBL IDs. For example, it would incorrectly interpret `"CON_ENSEMBL:ENSBTAP00000037665"` as `"P00000"`. To address this, the function now requires that UniProt IDs are not preceded or followed by letters or digits. This means that UniProt IDs should be recognized only if they stand alone or are separated by non-alphanumeric characters. For instance, in the string `"P02545;P20700"`, both `"P02545"` and `"P20700"` are correctly identified as UniProt IDs because they are separated by a semicolon and not attached to any other letters or digits. Fixes issue #245.
+* `calculate_go_enrichment()` now correctly uses the total number of provided proteins for the contingency table. Previously, it incorrectly considered only proteins with a GO annotation for the enrichment analysis.
+
+## Additional Changes
+
+* `fetch_uniprot()` and `fetch_uniprot_proteome()` are more resistant to database connection issues. They also give more informative messages as to why the data could not be retrieved. Fixes issue #252.
+* `qc_csv()` now properly works if the column supplied to the `condition` argument is a factor. Fixes issue #254.
+* The `analyse_functional_network()` function now includes enhanced error handling to ensure it fails gracefully in case of any issues. Fixes issue #259.
+* The default `version` parameter for `analyse_functional_network()` has been updated to 12.0, aligning with the latest STRINGdb version. Fixes issue #244.
+
 # protti 0.8.0
 
 ## New features

diff --git a/R/analyse_functional_network.R b/R/analyse_functional_network.R
@@ -40,7 +40,7 @@ network_analysis <-
 #' \href{https://string-db.org/cgi/input?sessionId=bpvps5GS2As6&input_page_show_search=on}{here}.
 #' H. sapiens: 9606, S. cerevisiae: 4932, E. coli: 511145.
 #' @param version a character value that specifies the version of STRINGdb to be used.
-#' Default is 11.5.
+#' Default is 12.0.
 #' @param score_threshold a numeric value specifying the interaction score that based on
 #' \href{https://string-db.org/cgi/info?sessionId=bBP5N4cIf0PA&footer_active_subpage=scores}{STRING}
 #' has to be between 0 and 1000. A score closer to 1000 is related to a higher confidence for the
@@ -109,7 +109,7 @@ analyse_functional_network <- function(data,
                                        protein_id,
                                        string_id,
                                        organism_id,
-                                       version = "11.5",
+                                       version = "12.0",
                                        score_threshold = 900,
                                        binds_treatment = NULL,
                                        halo_color = NULL,
@@ -121,22 +121,66 @@ analyse_functional_network <- function(data,
     return(invisible(NULL))
   }
 
-  STRINGdb <- get("STRINGdb", envir = loadNamespace("STRINGdb"))
+  # Ensure data frame is not empty and columns exist
+  if (nrow(data) == 0) {
+    stop("The input data frame is empty.")
+  }
+
+  required_columns <- c(ensym(protein_id), ensym(string_id))
+  missing_columns <- required_columns[!required_columns %in% colnames(data)]
+
+  if (length(missing_columns) > 0) {
+    stop(
+      "The following required columns are missing from the input data frame: ",
+      paste(missing_columns, collapse = ", ")
+    )
+  }
+
+  if (plot && nrow(data) > 400) {
+    stop("Please only provide the top 400 significant proteins for plots! STRING cannot plot more at once.")
+  }
+
 
   data <- data %>%
     dplyr::distinct({{ protein_id }}, {{ string_id }}, {{ binds_treatment }})
 
   if (length(unique(dplyr::pull(data, !!ensym(protein_id)))) != nrow(data)) {
     stop(strwrap("Please provide unique annotations for each protein! The number of proteins
-does not match the number of rows in your data.", prefix = "\n", initial = ""))
+    does not match the number of rows in your data.", prefix = "\n", initial = ""))
   }
 
-  string_db <- STRINGdb$new(
-    version = version,
-    species = organism_id, # Check on String database to get the right code (E.coli K12: 511145)
-    score_threshold = score_threshold, # Cutoff score to consider something an interaction
-    input_directory = ""
+  if (!curl::has_internet()) {
+    message("No internet connection.")
+    return(invisible(NULL))
+  }
+
+  STRINGdb <- get("STRINGdb", envir = loadNamespace("STRINGdb"))
+
+  string_db <- tryCatch(
+    {
+      withCallingHandlers(
+        expr = {
+          STRINGdb$new(
+            version = version,
+            species = organism_id, # Check on String database to get the right code (E.coli K12: 511145)
+            score_threshold = score_threshold, # Cutoff score to consider something an interaction
+            input_directory = ""
+          )
+        },
+        warning = function(w) {
+          message("A warning occurred during STRINGdb object creation: ", conditionMessage(w))
+          invokeRestart("muffleWarning")
+        }
+      )
+    },
+    error = function(e) {
+      e$message
+    }
   )
+  if (is.character(string_db)) {
+    message("An error occurred during the interaction network analysis: ", string_db)
+    return(invisible(NULL))
+  }
 
   input <- data %>%
     dplyr::mutate({{ string_id }} := stringr::str_extract({{ string_id }}, pattern = ".+[^;]")) %>%
@@ -148,35 +192,52 @@ does not match the number of rows in your data.", prefix = "\n", initial = ""))
 
   if (!missing(binds_treatment)) {
     if (missing(halo_color)) {
-      coloring <- input %>%
-        dplyr::filter({{ binds_treatment }}) %>%
-        dplyr::mutate(color = "#5680C1")
-    } else {
-      coloring <- input %>%
-        dplyr::filter({{ binds_treatment }}) %>%
-        dplyr::mutate(color = halo_color)
+      halo_color <- "#5680C1"
     }
+
+    coloring <- input %>%
+      dplyr::filter({{ binds_treatment }}) %>%
+      dplyr::mutate(color = halo_color)
+
     payload_id <- string_db$post_payload(dplyr::pull(coloring, {{ string_id }}),
       colors = coloring$color
     )
   }
-  if (plot == TRUE) {
-    if (length(unique(dplyr::pull(data, !!ensym(protein_id)))) > 400) {
-      stop(strwrap("Please only provide the top 400 significant proteins for plots! String
-cannot plot more at once.", prefix = "\n", initial = ""))
-    }
-    string_db$plot_network(string_ids, payload_id = payload_id)
-  } else {
-    mapping <- input %>%
-      dplyr::distinct({{ protein_id }}, {{ string_id }})
 
-    interactions <- string_db$get_interactions(string_ids) %>%
-      dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>%
-      dplyr::rename(from_protein = {{ protein_id }}) %>%
-      dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>%
-      dplyr::rename(to_protein = {{ protein_id }}) %>%
-      dplyr::distinct()
 
+  interactions <- tryCatch(
+    {
+      withCallingHandlers(
+        expr = {
+          if (plot) {
+            string_db$plot_network(string_ids, payload_id = payload_id)
+          } else {
+            mapping <- input %>%
+              dplyr::distinct({{ protein_id }}, {{ string_id }})
+
+            interactions <- string_db$get_interactions(string_ids) %>%
+              dplyr::left_join(mapping, by = c("from" = rlang::as_name(rlang::enquo(string_id)))) %>%
+              dplyr::rename(from_protein = {{ protein_id }}) %>%
+              dplyr::left_join(mapping, by = c("to" = rlang::as_name(rlang::enquo(string_id)))) %>%
+              dplyr::rename(to_protein = {{ protein_id }}) %>%
+              dplyr::distinct()
+          }
+        },
+        warning = function(w) {
+          message("A warning occurred during the interaction network analysis: ", conditionMessage(w))
+          invokeRestart("muffleWarning")
+        }
+      )
+    },
+    error = function(e) {
+      e$message
+    }
+  )
+
+  if (is.character(interactions)) {
+    message("An error occurred during the interaction network analysis: ", interactions)
+    return(invisible(NULL))
+  } else {
     return(interactions)
   }
 }