Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/developer' into improve-calculat…
Browse files Browse the repository at this point in the history
…e_go_enrichment

# Conflicts:
#	NEWS.md
  • Loading branch information
jpquast committed May 17, 2024
2 parents 25f9abb + 417867a commit 6b43b21
Show file tree
Hide file tree
Showing 6 changed files with 18 additions and 18 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: protti
Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools
Version: 0.8.0
Version: 0.8.0.9000
Authors@R:
c(person(given = "Jan-Philipp",
family = "Quast",
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

## Bug fixes

* `fetch_uniprot()` previously had an issue where it incorrectly identified certain IDs as UniProt IDs, such as ENSEMBL IDs. For example, it would incorrectly interpret `"CON_ENSEMBL:ENSBTAP00000037665"` as `"P00000"`. To address this, the function now requires that UniProt IDs are not preceded or followed by letters or digits. This means that UniProt IDs should be recognized only if they stand alone or are separated by non-alphanumeric characters. For instance, in the string `"P02545;P20700"`, both `"P02545"` and `"P20700"` are correctly identified as UniProt IDs because they are separated by a semicolon and not attached to any other letters or digits.
* `calculate_go_enrichment()` now correctly uses the total number of provided proteins for the contingency table. Previously, it incorrectly considered only proteins with a GO annotation for the enrichment analysis.

# protti 0.8.0
Expand Down
1 change: 0 additions & 1 deletion R/calculate_sequence_coverage.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,4 @@ calculate_sequence_coverage <-

data %>%
dplyr::left_join(result, by = c(rlang::as_name(rlang::enquo(protein_sequence)), groups))

}
22 changes: 11 additions & 11 deletions R/fetch_uniprot.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,24 +73,24 @@ fetch_uniprot <-
non_conform_ids <- uniprot_ids[!id_test]
# if non_conform_ids contain IDs they are extracted and fetched.
contains_valid_id <- non_conform_ids[stringr::str_detect(non_conform_ids,
pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
)]

uniprot_ids_contain_valid <- uniprot_ids[stringr::str_detect(uniprot_ids,
pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
)]

valid_id_annotations <- tibble::tibble(input_id = contains_valid_id) %>%
dplyr::mutate(accession = stringr::str_extract_all(.data$input_id,
pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
)) %>%
tidyr::unnest("accession") %>%
dplyr::distinct()

uniprot_ids_filtered <- unique(c(uniprot_ids_filtered, valid_id_annotations$accession))

non_identifiable_id <- non_conform_ids[!stringr::str_detect(non_conform_ids,
pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
)]

if (length(non_identifiable_id) != 0) {
Expand Down Expand Up @@ -184,13 +184,13 @@ They were fetched and the original input ID can be found in the "input_id" colum
new_ids <- new$new

if (length(new_ids) == 0) {
original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>%
dplyr::left_join(valid_id_annotations, by = "input_id") %>%
dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession))
original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>%
dplyr::left_join(valid_id_annotations, by = "input_id") %>%
dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession))

result <- result %>%
dplyr::right_join(original_ids, by = "accession") %>%
dplyr::relocate(.data$accession, .data$input_id)
result <- result %>%
dplyr::right_join(original_ids, by = "accession") %>%
dplyr::relocate("accession", "input_id")

return(result)
}
Expand Down Expand Up @@ -222,7 +222,7 @@ They were fetched and the original input ID can be found in the "input_id" colum

result <- result %>%
dplyr::right_join(original_ids, by = "accession") %>%
dplyr::relocate(.data$accession, .data$input_id)
dplyr::relocate("accession", "input_id")

result
}
8 changes: 4 additions & 4 deletions R/find_peptide.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ find_peptide <-
end = .data$end + 1
))

data %>% dplyr::left_join(result, c(
rlang::as_name(rlang::enquo(protein_sequence)),
rlang::as_name(rlang::enquo(peptide_sequence))
))
data %>% dplyr::left_join(result, c(
rlang::as_name(rlang::enquo(protein_sequence)),
rlang::as_name(rlang::enquo(peptide_sequence))
))
}
2 changes: 1 addition & 1 deletion R/find_peptide_in_structure.R
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ find_peptide_in_structure <- function(peptide_data,
{{ end }} > .data$ref_end_seq_id)) %>%
dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
dplyr::mutate(n_peptides = dplyr::n_distinct({{ peptide }})) %>%
tidyr::drop_na(.data$pdb_ids) %>%
tidyr::drop_na("pdb_ids") %>%
dplyr::mutate(n_peptides_in_structure = sum(.data$peptide_in_pdb)) %>%
dplyr::ungroup() %>%
dplyr::mutate(
Expand Down

0 comments on commit 6b43b21

Please sign in to comment.