Merge remote-tracking branch 'origin/developer' into improve-calculat…

…e_go_enrichment # Conflicts: # NEWS.md
jpquast · May 17, 2024 · 6b43b21 · 6b43b21
2 parents 25f9abb + 417867a
commit 6b43b21
Show file tree

Hide file tree

Showing 6 changed files with 18 additions and 18 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: protti
 Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools
-Version: 0.8.0
+Version: 0.8.0.9000
 Authors@R: 
     c(person(given = "Jan-Philipp",
            family = "Quast",

diff --git a/NEWS.md b/NEWS.md
@@ -8,6 +8,7 @@
 
 ## Bug fixes
 
+* `fetch_uniprot()` previously had an issue where it incorrectly identified certain IDs as UniProt IDs, such as ENSEMBL IDs. For example, it would incorrectly interpret `"CON_ENSEMBL:ENSBTAP00000037665"` as `"P00000"`. To address this, the function now requires that UniProt IDs are not preceded or followed by letters or digits. This means that UniProt IDs should be recognized only if they stand alone or are separated by non-alphanumeric characters. For instance, in the string `"P02545;P20700"`, both `"P02545"` and `"P20700"` are correctly identified as UniProt IDs because they are separated by a semicolon and not attached to any other letters or digits.
 * `calculate_go_enrichment()` now correctly uses the total number of provided proteins for the contingency table. Previously, it incorrectly considered only proteins with a GO annotation for the enrichment analysis.
 
 # protti 0.8.0

diff --git a/R/calculate_sequence_coverage.R b/R/calculate_sequence_coverage.R
@@ -77,5 +77,4 @@ calculate_sequence_coverage <-
 
     data %>%
       dplyr::left_join(result, by = c(rlang::as_name(rlang::enquo(protein_sequence)), groups))
-
   }
diff --git a/R/fetch_uniprot.R b/R/fetch_uniprot.R
@@ -73,24 +73,24 @@ fetch_uniprot <-
     non_conform_ids <- uniprot_ids[!id_test]
     # if non_conform_ids contain IDs they are extracted and fetched.
     contains_valid_id <- non_conform_ids[stringr::str_detect(non_conform_ids,
-      pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+      pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
     )]
 
     uniprot_ids_contain_valid <- uniprot_ids[stringr::str_detect(uniprot_ids,
-      pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+      pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
     )]
 
     valid_id_annotations <- tibble::tibble(input_id = contains_valid_id) %>%
       dplyr::mutate(accession = stringr::str_extract_all(.data$input_id,
-        pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+        pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
       )) %>%
       tidyr::unnest("accession") %>%
       dplyr::distinct()
 
     uniprot_ids_filtered <- unique(c(uniprot_ids_filtered, valid_id_annotations$accession))
 
     non_identifiable_id <- non_conform_ids[!stringr::str_detect(non_conform_ids,
-      pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+      pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
     )]
 
     if (length(non_identifiable_id) != 0) {
@@ -184,13 +184,13 @@ They were fetched and the original input ID can be found in the "input_id" colum
     new_ids <- new$new
 
     if (length(new_ids) == 0) {
-        original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>%
-          dplyr::left_join(valid_id_annotations, by = "input_id") %>%
-          dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession))
+      original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>%
+        dplyr::left_join(valid_id_annotations, by = "input_id") %>%
+        dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession))
 
-        result <- result %>%
-          dplyr::right_join(original_ids, by = "accession") %>%
-          dplyr::relocate(.data$accession, .data$input_id)
+      result <- result %>%
+        dplyr::right_join(original_ids, by = "accession") %>%
+        dplyr::relocate("accession", "input_id")
 
       return(result)
     }
@@ -222,7 +222,7 @@ They were fetched and the original input ID can be found in the "input_id" colum
 
     result <- result %>%
       dplyr::right_join(original_ids, by = "accession") %>%
-      dplyr::relocate(.data$accession, .data$input_id)
+      dplyr::relocate("accession", "input_id")
 
     result
   }
diff --git a/R/find_peptide.R b/R/find_peptide.R
@@ -52,8 +52,8 @@ find_peptide <-
         end = .data$end + 1
       ))
 
-      data %>% dplyr::left_join(result, c(
-        rlang::as_name(rlang::enquo(protein_sequence)),
-        rlang::as_name(rlang::enquo(peptide_sequence))
-      ))
+    data %>% dplyr::left_join(result, c(
+      rlang::as_name(rlang::enquo(protein_sequence)),
+      rlang::as_name(rlang::enquo(peptide_sequence))
+    ))
   }
diff --git a/R/find_peptide_in_structure.R b/R/find_peptide_in_structure.R
@@ -193,7 +193,7 @@ find_peptide_in_structure <- function(peptide_data,
           {{ end }} > .data$ref_end_seq_id)) %>%
       dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
       dplyr::mutate(n_peptides = dplyr::n_distinct({{ peptide }})) %>%
-      tidyr::drop_na(.data$pdb_ids) %>%
+      tidyr::drop_na("pdb_ids") %>%
       dplyr::mutate(n_peptides_in_structure = sum(.data$peptide_in_pdb)) %>%
       dplyr::ungroup() %>%
       dplyr::mutate(