Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add genomecov #411

Merged
merged 4 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export(bed_complement)
export(bed_coverage)
export(bed_fisher)
export(bed_flank)
export(bed_genomecov)
export(bed_glyph)
export(bed_intersect)
export(bed_jaccard)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# valr (development version)

* Added `bed_genomecov()` to compute interval coverage across a genome.

# valr 0.7.0

* `read_bed` and related functions now automatically calculate the fields. Use of `n_fields` was deprecated.
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ flank_impl <- function(df, genome, both = 0, left = 0, right = 0, fraction = FAL
.Call(`_valr_flank_impl`, df, genome, both, left, right, fraction, stranded, trim)
}

gcoverage_impl <- function(gdf, max_coords) {
.Call(`_valr_gcoverage_impl`, gdf, max_coords)
}

intersect_impl <- function(x, y, x_grp_indexes, y_grp_indexes, invert = FALSE, suffix_x = ".x", suffix_y = ".y") {
.Call(`_valr_intersect_impl`, x, y, x_grp_indexes, y_grp_indexes, invert, suffix_x, suffix_y)
}
Expand Down
121 changes: 121 additions & 0 deletions R/bed_genomecov.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#' Calculate coverage across a genome
#'
#' This function is useful for calculating interval coverage across an entire genome.
#'
#' @param x [ivl_df]
#' @param genome [genome_df]
#' @param zero_depth If TRUE, report intervals with zero depth. Zero depth intervals will
#' be reported with respect to groups.
#'
#' @template groups
#'
#' @return
#' [ivl_df] with the an additional column:
#'
#' - `.depth` depth of interval coverage
#'
#' @family single set operations
#'
#' @seealso
#' \url{https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html}
#'
#' @examples
#' x <- tibble::tribble(
#' ~chrom, ~start, ~end, ~strand,
#' "chr1", 20, 70, "+",
#' "chr1", 50, 100, "-",
#' "chr1", 200, 250, "+",
#' "chr1", 220, 250, "+"
#' )
#'
#' genome <- tibble::tribble(
#' ~chrom, ~size,
#' "chr1", 500,
#' "chr2", 1000
#' )
#'
#' bed_genomecov(x, genome)
#'
#' bed_genomecov(dplyr::group_by(x, strand), genome)
#'
#' bed_genomecov(dplyr::group_by(x, strand), genome, zero_depth = TRUE)
#'
#' @export
bed_genomecov <- function(x, genome, zero_depth = FALSE) {
check_required(x)
check_required(genome)

x <- check_interval(x)
genome <- check_genome(genome)

non_genome_chroms <- setdiff(unique(x$chrom), genome$chrom)
if (length(non_genome_chroms) > 0) {
cli::cli_warn(c(
paste0(
"The following chromosomes in bed intervals are not",
" in the genome and will be ignored:"
),
paste0(
non_genome_chroms,
sep = "\n"
)
))
x <- x[!x[["chrom"]] %in% non_genome_chroms, ]
}

grp_cols <- group_vars(x)
x <- bed_sort(x)

groups <- rlang::syms(unique(c("chrom", grp_cols)))
x <- group_by(x, !!!groups)

max_coords <- group_chrom_sizes(x, genome)

res <- gcoverage_impl(x, max_coords)
res <- tibble::as_tibble(res)

# drop non-grouped cols as values no longer match ivls
res <- select(res, chrom, start, end, one_of(grp_cols), .depth)

if (!zero_depth) {
res <- res[res[[".depth"]] > 0, ]
} else {
# handle any missing chromosome, zero depth intervals handled on cpp side
missing_chroms <- setdiff(genome$chrom, group_data(x)$chrom)
if (length(missing_chroms) > 0) {
missing_chrom_ivls <- genome[genome[["chrom"]] %in% missing_chroms, ]
missing_chrom_ivls[["start"]] <- 0L
missing_chrom_ivls[[".depth"]] <- 0L
missing_chrom_ivls <- select(missing_chrom_ivls, chrom, start, end = size, .depth)

if (length(groups) > 1) {
missing_chrom_ivls <- fill_missing_grouping(missing_chrom_ivls, x)
}
res <- bind_rows(res, missing_chrom_ivls)
res <- bed_sort(res)
}
}

res
}

# return chrom sizes for each group
group_chrom_sizes <- function(x, genome) {
xx <- left_join(group_data(x), genome, by = "chrom")
xx[["size"]]
}

# expand df to contain non-chrom groups from grp_df
fill_missing_grouping <- function(df, grp_df) {
grps <- get_group_data(grp_df)
grps <- grps[, setdiff(colnames(grps), "chrom")]

# expand df rows for new groups
df_grown <- df[rep(seq_len(nrow(df)), nrow(grps)), ]
# expand groups df for new groups
grp_grown <- grps[rep(seq_len(nrow(grps)), nrow(df)), ]

stopifnot(nrow(df_grown) == nrow(grp_grown))
res <- bind_cols(df_grown, grp_grown)
res
}
2 changes: 1 addition & 1 deletion R/globals.r
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ globalVariables(c(
"p.value", ".win_size", ".row_id",
"cds_start", "cds_end", "score",
"n_int", "sum_overlap", "sum_x", "sum_xy", "sum_y",
"temp_start", "temp_end", "seqnames"
"temp_start", "temp_end", "seqnames", ".depth"
))
3 changes: 3 additions & 0 deletions bench/benchmarks.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ res <- mark(
bed_partition(x),
bed_cluster(x),
bed_complement(x, genome),
bed_genomecov(x, genome),
# multi tbl functions
bed_closest(x, y),
bed_intersect(x, y),
Expand Down Expand Up @@ -141,6 +142,7 @@ res2 <- mark(
bed_subtract(x, y),
bed_makewindows(x, win_size = 100),
bed_sort(z),
bed_genomecov(x, genome),
# GRanges
flank(x_gr, 1000),
shift(x_gr,0),
Expand All @@ -151,6 +153,7 @@ res2 <- mark(
setdiff(x_gr, y_gr),
tile(x_gr, width = 100),
sort(z_gr),
coverage(x_gr),
min_time = Inf,
iterations = nrep,
check = FALSE,
Expand Down
1 change: 1 addition & 0 deletions man/bed_cluster.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_complement.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_flank.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

68 changes: 68 additions & 0 deletions man/bed_genomecov.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_merge.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_partition.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_shift.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_slop.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkgdown/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ reference:
- bed_cluster
- bed_complement
- bed_flank
- bed_genomecov
- bed_merge
- bed_partition
- bed_slop
Expand Down
12 changes: 12 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,18 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// gcoverage_impl
DataFrame gcoverage_impl(const ValrGroupedDataFrame& gdf, const IntegerVector& max_coords);
RcppExport SEXP _valr_gcoverage_impl(SEXP gdfSEXP, SEXP max_coordsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const ValrGroupedDataFrame& >::type gdf(gdfSEXP);
Rcpp::traits::input_parameter< const IntegerVector& >::type max_coords(max_coordsSEXP);
rcpp_result_gen = Rcpp::wrap(gcoverage_impl(gdf, max_coords));
return rcpp_result_gen;
END_RCPP
}
// intersect_impl
DataFrame intersect_impl(ValrGroupedDataFrame x, ValrGroupedDataFrame y, IntegerVector x_grp_indexes, IntegerVector y_grp_indexes, bool invert, const std::string& suffix_x, const std::string& suffix_y);
RcppExport SEXP _valr_intersect_impl(SEXP xSEXP, SEXP ySEXP, SEXP x_grp_indexesSEXP, SEXP y_grp_indexesSEXP, SEXP invertSEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP) {
Expand Down
Loading
Loading