Skip to content

Commit

Permalink
Merge pull request #255 from jpquast/fix-qc_cvs_issue_254
Browse files Browse the repository at this point in the history
Fix qc cvs issue 254
  • Loading branch information
jpquast authored May 29, 2024
2 parents 5dab7ef + 8f0ad2d commit f22ed13
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 30 deletions.
75 changes: 75 additions & 0 deletions .github/workflows/format-code.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Workflow: restyle the package's R sources with styler and push the result.
# Triggered by any push that touches R, (R/q)md, Rmarkdown, Rnw or Rprofile files.
on:
  push:
    paths: ["**.[rR]", "**.[qrR]md", "**.[rR]markdown", "**.[rR]nw", "**.[rR]profile"]

name: Style
env:
  # Identity used for the automated commit in the final step.
  GITHUB_ACTOR: "actions-user"

jobs:
  style:
    runs-on: ubuntu-latest
    permissions:
      # Write access is required because the last step pushes a commit.
      contents: write
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup R
        uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - name: Install dependencies
        uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::styler, any::roxygen2
          needs: styler

      - name: Enable styler cache
        run: styler::cache_activate()
        shell: Rscript {0}

      # Export the styler cache directory as a step output so the cache action
      # below can persist it between runs.
      - name: Determine cache location
        id: styler-location
        run: |
          cat(
            "location=",
            styler::cache_info(format = "tabular")$location,
            "\n",
            file = Sys.getenv("GITHUB_OUTPUT"),
            append = TRUE,
            sep = ""
          )
        shell: Rscript {0}

      # The key is unique per commit; restore-keys fall back to the most recent
      # cache saved for the same runner OS.
      - name: Cache styler
        uses: actions/cache@v4
        with:
          path: ${{ steps.styler-location.outputs.location }}
          key: ${{ runner.os }}-styler-${{ github.sha }}
          restore-keys: |
            ${{ runner.os }}-styler-
            ${{ runner.os }}-

      - name: Style
        run: styler::style_pkg()
        shell: Rscript {0}

      # Commit only the restyled R-related files. The `if` succeeds only when
      # grep matched at least one changed file. `grep -E` is used instead of
      # the deprecated `egrep` alias.
      - name: Commit and push changes
        run: |
          if FILES_TO_COMMIT=($(git diff-index --name-only ${{ github.sha }} \
              | grep -E --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$'))
          then
            git config --local user.name "$GITHUB_ACTOR"
            git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
            git commit ${FILES_TO_COMMIT[*]} -m "Style code (GHA)"
            git pull --ff-only
            git push origin
          else
            echo "No changes to commit."
          fi
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: protti
Title: Bottom-Up Proteomics and LiP-MS Quality Control and Data Analysis Tools
Version: 0.8.0
Version: 0.8.0.9000
Authors@R:
c(person(given = "Jan-Philipp",
family = "Quast",
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# protti 0.8.0.9000

## Bug fixes

* `fetch_uniprot()` previously had an issue where it incorrectly identified certain IDs as UniProt IDs, such as ENSEMBL IDs. For example, it would incorrectly interpret `"CON_ENSEMBL:ENSBTAP00000037665"` as `"P00000"`. To address this, the function now requires that UniProt IDs are not preceded or followed by letters or digits. This means that UniProt IDs should be recognized only if they stand alone or are separated by non-alphanumeric characters. For instance, in the string `"P02545;P20700"`, both `"P02545"` and `"P20700"` are correctly identified as UniProt IDs because they are separated by a semicolon and not attached to any other letters or digits.
* `qc_cvs()` now works properly if the column supplied to the `condition` argument is a factor. Fixes issue #254.

# protti 0.8.0

## New features
Expand Down
1 change: 0 additions & 1 deletion R/calculate_sequence_coverage.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,4 @@ calculate_sequence_coverage <-

data %>%
dplyr::left_join(result, by = c(rlang::as_name(rlang::enquo(protein_sequence)), groups))

}
22 changes: 11 additions & 11 deletions R/fetch_uniprot.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,24 +73,24 @@ fetch_uniprot <-
non_conform_ids <- uniprot_ids[!id_test]
# if non_conform_ids contain IDs they are extracted and fetched.
contains_valid_id <- non_conform_ids[stringr::str_detect(non_conform_ids,
pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
)]

uniprot_ids_contain_valid <- uniprot_ids[stringr::str_detect(uniprot_ids,
pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
)]

valid_id_annotations <- tibble::tibble(input_id = contains_valid_id) %>%
dplyr::mutate(accession = stringr::str_extract_all(.data$input_id,
pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
)) %>%
tidyr::unnest("accession") %>%
dplyr::distinct()

uniprot_ids_filtered <- unique(c(uniprot_ids_filtered, valid_id_annotations$accession))

non_identifiable_id <- non_conform_ids[!stringr::str_detect(non_conform_ids,
pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
pattern = "(?<![:alnum:])[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(?![:alnum:])"
)]

if (length(non_identifiable_id) != 0) {
Expand Down Expand Up @@ -184,13 +184,13 @@ They were fetched and the original input ID can be found in the "input_id" colum
new_ids <- new$new

if (length(new_ids) == 0) {
original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>%
dplyr::left_join(valid_id_annotations, by = "input_id") %>%
dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession))
original_ids <- data.frame(input_id = uniprot_ids_contain_valid) %>%
dplyr::left_join(valid_id_annotations, by = "input_id") %>%
dplyr::mutate(accession = ifelse(is.na(.data$accession), .data$input_id, .data$accession))

result <- result %>%
dplyr::right_join(original_ids, by = "accession") %>%
dplyr::relocate(.data$accession, .data$input_id)
result <- result %>%
dplyr::right_join(original_ids, by = "accession") %>%
dplyr::relocate("accession", "input_id")

return(result)
}
Expand Down Expand Up @@ -222,7 +222,7 @@ They were fetched and the original input ID can be found in the "input_id" colum

result <- result %>%
dplyr::right_join(original_ids, by = "accession") %>%
dplyr::relocate(.data$accession, .data$input_id)
dplyr::relocate("accession", "input_id")

result
}
8 changes: 4 additions & 4 deletions R/find_peptide.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ find_peptide <-
end = .data$end + 1
))

data %>% dplyr::left_join(result, c(
rlang::as_name(rlang::enquo(protein_sequence)),
rlang::as_name(rlang::enquo(peptide_sequence))
))
data %>% dplyr::left_join(result, c(
rlang::as_name(rlang::enquo(protein_sequence)),
rlang::as_name(rlang::enquo(peptide_sequence))
))
}
2 changes: 1 addition & 1 deletion R/find_peptide_in_structure.R
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ find_peptide_in_structure <- function(peptide_data,
{{ end }} > .data$ref_end_seq_id)) %>%
dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
dplyr::mutate(n_peptides = dplyr::n_distinct({{ peptide }})) %>%
tidyr::drop_na(.data$pdb_ids) %>%
tidyr::drop_na("pdb_ids") %>%
dplyr::mutate(n_peptides_in_structure = sum(.data$peptide_in_pdb)) %>%
dplyr::ungroup() %>%
dplyr::mutate(
Expand Down
23 changes: 12 additions & 11 deletions R/qc_cvs.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' information on conditions and intensity values for each peptide, precursor or protein.
#' @param grouping a character column in the \code{data} data frame that contains the grouping
#' variables (e.g. peptides, precursors or proteins).
#' @param condition a column in the \code{data} data frame that contains condition information
#' @param condition a character or factor column in the \code{data} data frame that contains condition information
#' (e.g. "treated" and "control").
#' @param intensity a numeric column in the \code{data} data frame that contains the corresponding
#' raw or untransformed normalised intensity values for each peptide or precursor.
Expand Down Expand Up @@ -119,10 +119,11 @@ The function does not handle log2 transformed data.",
dplyr::distinct({{ condition }}, {{ grouping }}, .data$cv_combined, .data$cv) %>%
tidyr::drop_na() %>%
tidyr::pivot_longer(cols = starts_with("cv"), names_to = "type", values_to = "values") %>%
dplyr::mutate(type = ifelse(.data$type == "cv", {{ condition }}, "combined")) %>%
dplyr::mutate(type = forcats::fct_relevel(as.factor(.data$type), "combined")) %>%
dplyr::select(-{{ condition }}) %>%
dplyr::group_by(.data$type) %>%
dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>%
dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>%
dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>%
dplyr::select(-.data$type) %>%
dplyr::group_by({{ condition }}) %>%
dplyr::mutate(median = stats::median(.data$values)) %>%
dplyr::distinct()

Expand All @@ -137,9 +138,9 @@ The function does not handle log2 transformed data.",
plot <- ggplot2::ggplot(result) +
ggplot2::geom_boxplot(
aes(
x = .data$type,
x = {{ condition }},
y = .data$values,
fill = .data$type
fill = {{ condition }}
),
na.rm = TRUE
) +
Expand All @@ -165,7 +166,7 @@ The function does not handle log2 transformed data.",
}
if (plot_style == "density") {
plot <- ggplot2::ggplot(result) +
ggplot2::geom_density(ggplot2::aes(x = .data$values, col = .data$type), size = 1, na.rm = TRUE) +
ggplot2::geom_density(ggplot2::aes(x = .data$values, col = {{ condition }}), size = 1, na.rm = TRUE) +
ggplot2::labs(
title = "Coefficients of variation",
x = "Coefficient of variation [%]",
Expand All @@ -174,10 +175,10 @@ The function does not handle log2 transformed data.",
) +
ggplot2::scale_x_continuous(limits = c(0, max_cv)) +
geom_vline(
data = dplyr::distinct(result, .data$median, .data$type),
data = dplyr::distinct(result, .data$median, {{ condition }}),
ggplot2::aes(
xintercept = median,
col = .data$type
col = {{ condition }}
),
size = 1,
linetype = "dashed",
Expand All @@ -198,7 +199,7 @@ The function does not handle log2 transformed data.",
return(plot)
}
if (plot_style == "violin") {
plot <- ggplot2::ggplot(result, aes(x = .data$type, y = .data$values, fill = .data$type)) +
plot <- ggplot2::ggplot(result, aes(x = {{ condition }}, y = .data$values, fill = {{ condition }})) +
ggplot2::geom_violin(na.rm = TRUE) +
ggplot2::geom_boxplot(width = 0.15, fill = "white", na.rm = TRUE, alpha = 0.6) +
ggplot2::labs(
Expand Down
2 changes: 1 addition & 1 deletion man/qc_cvs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit f22ed13

Please sign in to comment.