Merge pull request #255 from jpquast/fix-qc_cvs_issue_254

Fix qc cvs issue 254
jpquast · May 29, 2024 · f22ed13 · f22ed13
2 parents 5dab7ef + 8f0ad2d
commit f22ed13
Show file tree

Hide file tree

Showing 9 changed files with 112 additions and 30 deletions.
diff --git a/.github/workflows/format-code.yml b/.github/workflows/format-code.yml
@@ -0,0 +1,75 @@
+on:
+  push:
+    paths: ["**.[rR]", "**.[qrR]md", "**.[rR]markdown", "**.[rR]nw", "**.[rR]profile"]
+
+name: Style
+env:
+  GITHUB_ACTOR: "actions-user"
+
+jobs:
+  style:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # the last step commits and pushes the restyled files
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Setup R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - name: Install dependencies
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::styler, any::roxygen2
+          needs: styler
+
+      - name: Enable styler cache
+        run: styler::cache_activate()
+        shell: Rscript {0}
+
+      - name: Determine cache location
+        id: styler-location
+        run: |
+          cat(
+            "location=",
+            styler::cache_info(format = "tabular")$location,
+            "\n",
+            file = Sys.getenv("GITHUB_OUTPUT"),
+            append = TRUE,
+            sep = ""
+          )
+        shell: Rscript {0}
+
+      - name: Cache styler
+        uses: actions/cache@v4
+        with:
+          path: ${{ steps.styler-location.outputs.location }}
+          key: ${{ runner.os }}-styler-${{ github.sha }} # unique per commit; restore-keys fall back to older caches
+          restore-keys: |
+            ${{ runner.os }}-styler-
+            ${{ runner.os }}-
+
+      - name: Style
+        run: styler::style_pkg()
+        shell: Rscript {0}
+
+      - name: Commit and push changes # commits only R-family files that differ from the pushed commit
+        run: |
+          if FILES_TO_COMMIT=($(git diff-index --name-only ${{ github.sha }} \
+              | grep -E --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$'))
+          then
+            git config --local user.name "$GITHUB_ACTOR"
+            git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
+            git commit ${FILES_TO_COMMIT[*]} -m "Style code (GHA)"
+            git pull --ff-only
+            git push origin
+          else
+            echo "No changes to commit."
+          fi
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: protti
 Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools
-Version: 0.8.0
+Version: 0.8.0.9000
 Authors@R: 
     c(person(given = "Jan-Philipp",
            family = "Quast",

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# protti 0.8.0.9000
+
+## Bug fixes
+
+* `fetch_uniprot()` previously had an issue where it incorrectly identified certain IDs as UniProt IDs, such as ENSEMBL IDs. For example, it would incorrectly interpret `"CON_ENSEMBL:ENSBTAP00000037665"` as `"P00000"`. To address this, the function now requires that UniProt IDs are not preceded or followed by letters or digits. This means that UniProt IDs should be recognized only if they stand alone or are separated by non-alphanumeric characters. For instance, in the string `"P02545;P20700"`, both `"P02545"` and `"P20700"` are correctly identified as UniProt IDs because they are separated by a semicolon and not attached to any other letters or digits.
+* `qc_cvs()` now properly works if the column supplied to the `condition` argument is a factor. Fixes issue #254.
+
 # protti 0.8.0
 
 ## New features

diff --git a/R/calculate_sequence_coverage.R b/R/calculate_sequence_coverage.R
@@ -77,5 +77,4 @@ calculate_sequence_coverage <-
 
     data %>%
       dplyr::left_join(result, by = c(rlang::as_name(rlang::enquo(protein_sequence)), groups))
-
   }
diff --git a/R/fetch_uniprot.R b/R/fetch_uniprot.R
@@ -73,24 +73,24 @@ fetch_uniprot <-
     non_conform_ids <- uniprot_ids[!id_test]
     # if non_conform_ids contain IDs they are extracted and fetched.
     contains_valid_id <- non_conform_ids[stringr::str_detect(non_conform_ids,
-      pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+      pattern = "(?<![:alnum:])(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})(?![:alnum:])" # (?:...) makes both lookarounds guard both alternatives
     )]
 
     uniprot_ids_contain_valid <- uniprot_ids[stringr::str_detect(uniprot_ids,
-      pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+      pattern = "(?<![:alnum:])(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})(?![:alnum:])"
     )]
 
     valid_id_annotations <- tibble::tibble(input_id = contains_valid_id) %>%
       dplyr::mutate(accession = stringr::str_extract_all(.data$input_id,
-        pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+        pattern = "(?<![:alnum:])(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})(?![:alnum:])"
       )) %>%
       tidyr::unnest("accession") %>%
       dplyr::distinct()
 
     uniprot_ids_filtered <- unique(c(uniprot_ids_filtered, valid_id_annotations$accession))
 
     non_identifiable_id <- non_conform_ids[!stringr::str_detect(non_conform_ids,
-      pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+      pattern = "(?<![:alnum:])(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})(?![:alnum:])"
     )]
 
     if (length(non_identifiable_id) != 0) {
@@ -184,13 +184,13 @@ They were fetched and the original input ID can be found in the "input_id" colum
     new_ids <- new$new
 
     if (length(new_ids) == 0) {
-        original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>%
-          dplyr::left_join(valid_id_annotations, by = "input_id") %>%
-          dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession))
+      original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>%
+        dplyr::left_join(valid_id_annotations, by = "input_id") %>%
+        dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession))
 
-        result <- result %>%
-          dplyr::right_join(original_ids, by = "accession") %>%
-          dplyr::relocate(.data$accession, .data$input_id)
+      result <- result %>%
+        dplyr::right_join(original_ids, by = "accession") %>%
+        dplyr::relocate("accession", "input_id")
 
       return(result)
     }
@@ -222,7 +222,7 @@ They were fetched and the original input ID can be found in the "input_id" colum
 
     result <- result %>%
       dplyr::right_join(original_ids, by = "accession") %>%
-      dplyr::relocate(.data$accession, .data$input_id)
+      dplyr::relocate("accession", "input_id")
 
     result
   }
diff --git a/R/find_peptide.R b/R/find_peptide.R
@@ -52,8 +52,8 @@ find_peptide <-
         end = .data$end + 1
       ))
 
-      data %>% dplyr::left_join(result, c(
-        rlang::as_name(rlang::enquo(protein_sequence)),
-        rlang::as_name(rlang::enquo(peptide_sequence))
-      ))
+    data %>% dplyr::left_join(result, c(
+      rlang::as_name(rlang::enquo(protein_sequence)),
+      rlang::as_name(rlang::enquo(peptide_sequence))
+    ))
   }
diff --git a/R/find_peptide_in_structure.R b/R/find_peptide_in_structure.R
@@ -193,7 +193,7 @@ find_peptide_in_structure <- function(peptide_data,
           {{ end }} > .data$ref_end_seq_id)) %>%
       dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
       dplyr::mutate(n_peptides = dplyr::n_distinct({{ peptide }})) %>%
-      tidyr::drop_na(.data$pdb_ids) %>%
+      tidyr::drop_na("pdb_ids") %>%
       dplyr::mutate(n_peptides_in_structure = sum(.data$peptide_in_pdb)) %>%
       dplyr::ungroup() %>%
       dplyr::mutate(

diff --git a/R/qc_cvs.R b/R/qc_cvs.R
@@ -6,7 +6,7 @@
 #' information on conditions and intensity values for each peptide, precursor or protein.
 #' @param grouping a character column in the \code{data} data frame that contains the grouping
 #' variables (e.g. peptides, precursors or proteins).
-#' @param condition a column in the \code{data} data frame that contains condition information
+#' @param condition a character or factor column in the \code{data} data frame that contains condition information
 #' (e.g. "treated" and "control").
 #' @param intensity a numeric column in the \code{data} data frame that contains the corresponding
 #' raw or untransformed normalised intensity values for each peptide or precursor.
@@ -119,10 +119,11 @@ The function does not handle log2 transformed data.",
         dplyr::distinct({{ condition }}, {{ grouping }}, .data$cv_combined, .data$cv) %>%
         tidyr::drop_na() %>%
         tidyr::pivot_longer(cols = starts_with("cv"), names_to = "type", values_to = "values") %>%
-        dplyr::mutate(type = ifelse(.data$type == "cv", {{ condition }}, "combined")) %>%
-        dplyr::mutate(type = forcats::fct_relevel(as.factor(.data$type), "combined")) %>%
-        dplyr::select(-{{ condition }}) %>%
-        dplyr::group_by(.data$type) %>%
+        dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>%
+        dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>%
+        dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>%
+        dplyr::select(-"type") %>% # select by name: `.data$` inside tidyselect is deprecated
+        dplyr::group_by({{ condition }}) %>%
         dplyr::mutate(median = stats::median(.data$values)) %>%
         dplyr::distinct()
 
@@ -137,9 +138,9 @@ The function does not handle log2 transformed data.",
         plot <- ggplot2::ggplot(result) +
           ggplot2::geom_boxplot(
             aes(
-              x = .data$type,
+              x = {{ condition }},
               y = .data$values,
-              fill = .data$type
+              fill = {{ condition }}
             ),
             na.rm = TRUE
           ) +
@@ -165,7 +166,7 @@ The function does not handle log2 transformed data.",
       }
       if (plot_style == "density") {
         plot <- ggplot2::ggplot(result) +
-          ggplot2::geom_density(ggplot2::aes(x = .data$values, col = .data$type), size = 1, na.rm = TRUE) +
+          ggplot2::geom_density(ggplot2::aes(x = .data$values, col = {{ condition }}), size = 1, na.rm = TRUE) +
           ggplot2::labs(
             title = "Coefficients of variation",
             x = "Coefficient of variation [%]",
@@ -174,10 +175,10 @@ The function does not handle log2 transformed data.",
           ) +
           ggplot2::scale_x_continuous(limits = c(0, max_cv)) +
           geom_vline(
-            data = dplyr::distinct(result, .data$median, .data$type),
+            data = dplyr::distinct(result, .data$median, {{ condition }}),
             ggplot2::aes(
               xintercept = median,
-              col = .data$type
+              col = {{ condition }}
             ),
             size = 1,
             linetype = "dashed",
@@ -198,7 +199,7 @@ The function does not handle log2 transformed data.",
         return(plot)
       }
       if (plot_style == "violin") {
-        plot <- ggplot2::ggplot(result, aes(x = .data$type, y = .data$values, fill = .data$type)) +
+        plot <- ggplot2::ggplot(result, aes(x = {{ condition }}, y = .data$values, fill = {{ condition }})) +
           ggplot2::geom_violin(na.rm = TRUE) +
           ggplot2::geom_boxplot(width = 0.15, fill = "white", na.rm = TRUE, alpha = 0.6) +
           ggplot2::labs(

diff --git a/man/qc_cvs.Rd b/man/qc_cvs.Rd