Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for breakend variants from VCF #1399

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
364a999
Return features impacted by BND variants
nuno-agostinho Dec 7, 2022
dbb935a
Merge branch 'postreleasefix/109' into add/bnd-support
nuno-agostinho Apr 13, 2023
3766dde
Allow min_max regions per chromosome
nuno-agostinho Apr 14, 2023
1e71189
Minor changes
nuno-agostinho Apr 14, 2023
f621fea
Support breakend described in INFO field
nuno-agostinho Apr 18, 2023
9ebf7c3
Detect newer BND alternative allele format in VCF
nuno-agostinho Apr 18, 2023
47a8edb
Print BND alternative allele in output (if available)
nuno-agostinho Apr 18, 2023
2f45a48
Fix unit tests and add breakend examples
nuno-agostinho Apr 19, 2023
b1ecdd8
Fix unit tests
nuno-agostinho Apr 19, 2023
6b4f93a
Avoid experimental warnings
nuno-agostinho Apr 19, 2023
f1163eb
Fix unknown chromosome in features from unit tests
nuno-agostinho Apr 20, 2023
85a6706
Further fix unit tests
nuno-agostinho Apr 20, 2023
71049d8
Merge branch 'postreleasefix/110' into add/bnd-support
nuno-agostinho Jun 19, 2023
9cd0b75
Fix git merge changes
nuno-agostinho Jun 19, 2023
00efc83
Fix class SO term for breakends
nuno-agostinho Jun 20, 2023
a7281ea
Improve regex for validation of alternative alleles
nuno-agostinho Jun 21, 2023
ec973ac
Avoid nested routines
nuno-agostinho Jun 21, 2023
884bf9c
Update class SO term to chromosome_breakpoint
nuno-agostinho Jun 21, 2023
df0ed89
Avoid calculating overlap for breakpoints
nuno-agostinho Jun 26, 2023
179f3d1
Fix missing parameter on unit tests
nuno-agostinho Jun 27, 2023
a296499
Revert overlap flag check
nuno-agostinho Jun 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 108 additions & 40 deletions modules/Bio/EnsEMBL/VEP/AnnotationSource.pm
Original file line number Diff line number Diff line change
Expand Up @@ -140,18 +140,75 @@ sub get_all_features_by_InputBuffer {

# use filter_by_min_max to filter out features we know are not overlapped
# do it here so we're not affecting the cache
$self->filter_features_by_min_max(\@features, @{$buffer->min_max})
$self->filter_features_by_min_max(\@features, $buffer->min_max)
);
}

=head2 get_regions_from_coords

Arg 1 : string $chr
Arg 2 : int $start
Arg 3 : int $end
Arg 4 : hashref $min_max of array [$min, $max] by $chr
Arg 5 : int $cache_region_size
Arg 6 : int $up_down_size
Arg 7 : hashref $seen
Example : $regions = $as->get_regions_from_coords($chr, $start, $end, $min,
$max, $cache_region_size, $up_down_size, $seen)
Description: Fetches all regions that overlap the given coordinates. Regions
are a simple arrayref [$chr, $start], corresponding to bins of
genomic coordinates whose size is cache_region_size.
Returntype : arrayref of region arrayrefs and min_max coordinates
Exceptions : none
Caller : get_all_regions_by_InputBuffer()
Status : Stable

=cut

sub get_regions_from_coords {
my $self = shift;
my $chr = shift;
my $start = shift;
my $end = shift;
my $min_max = shift;
my $cache_region_size = shift;
my $up_down_size = shift;
my $seen = shift;

# find actual chromosome name used by AnnotationSource
my $source_chr = $self->get_source_chr_name($chr);

# allow for indels
($start, $end) = ($end, $start) if $start > $end;

# log min/max
my ($min, $max) = defined $min_max->{$source_chr} ? @{$min_max->{$source_chr}} : (1e10, 0);
$min = $start if $start < $min;
$max = $end if $end > $max;

# convert to region-size
my ($r_s, $r_e) = map {int(($_ - 1) / $cache_region_size)} ($start - $up_down_size, $end + $up_down_size);

# add all regions between r_s and r_e inclusive (if not previously returned)
my @regions;
for(my $s = $r_s; $s <= $r_e; $s++) {
my $key = join(':', ($source_chr, $s));
next if $seen->{$key};

push @regions, [$source_chr, $s];
$seen->{$key} = 1;
}
return ($source_chr, $min, $max, $seen, @regions);
}

=head2 get_all_regions_by_InputBuffer

Arg 1 : Bio::EnsEMBL::VEP::InputBuffer $ib
Example : $regions = $as->get_all_regions_by_InputBuffer($ib)
Description: Fetches all non-overlapping regions that overlap the variants currently
in the input buffer. Regions are a simple arrayref [$chr, $start],
corresponding to bins of genomic coordinates, size cache_region_size
Description: Fetches all regions that overlap the variants currently in the
input buffer. Regions are a simple arrayref [$chr, $start],
corresponding to bins of genomic coordinates with a size of
cache_region_size.
Returntype : arrayref of region arrayrefs
Exceptions : none
Caller : get_all_features_by_InputBuffer()
Expand All @@ -164,46 +221,42 @@ sub get_all_regions_by_InputBuffer {
my $buffer = shift;
my $cache_region_size = shift || $self->{cache_region_size};

my $seen;
my @regions = ();
my %seen = ();
my @new_regions;

my $up_down_size = defined($self->{up_down_size}) ? $self->{up_down_size} : $self->up_down_size();

my ($min, $max) = (1e10, 0);
my $min_max;
my $chr;
my $min;
my $max;

foreach my $vf(@{$buffer->buffer}) {

# skip long and unsupported types of SV; doing this here to avoid stopping looping
next if $vf->{vep_skip};

my $chr = $vf->{chr} || $vf->slice->seq_region_name;
throw("ERROR: Cannot get chromosome from VariationFeature") unless $chr;

# find actual chromosome name used by AnnotationSource
my $source_chr = $self->get_source_chr_name($chr);

my ($vf_s, $vf_e) = ($vf->{start}, $vf->{end});

# allow for indels
($vf_s, $vf_e) = ($vf_e, $vf_s) if $vf_s > $vf_e;

# log min/max
$min = $vf_s if $vf_s < $min;
$max = $vf_e if $vf_e > $max;

# convert to region-size
my ($r_s, $r_e) = map {int(($_ - 1) / $cache_region_size)} ($vf_s - $up_down_size, $vf_e + $up_down_size);

# add all regions between r_s and r_e inclusive
for(my $s = $r_s; $s <= $r_e; $s++) {
my $key = join(':', ($source_chr, $s));
next if $seen{$key};

push @regions, [$source_chr, $s];
$seen{$key} = 1;
$chr = $vf->{chr} || $vf->{slice}->{seq_region_name};
throw("ERROR: Cannot get chromosome $chr from VariationFeature") unless $chr;

($chr, $min, $max, $seen, @new_regions) = $self->get_regions_from_coords(
$chr, $vf->{start}, $vf->{end},
$min_max, $cache_region_size, $up_down_size, $seen);
push @regions, @new_regions;
$min_max->{$chr} = [$min, $max];

# process alternative alleles from breakend structural variants
foreach my $alt (@{$vf->{_parsed_allele}}) {
($chr, $min, $max, $seen, @new_regions) = $self->get_regions_from_coords(
$alt->{chr}, $alt->{pos}, $alt->{pos},
$min_max, $cache_region_size, $up_down_size, $seen);
push @regions, @new_regions;
$min_max->{$chr} = [$min, $max];
}
}

$buffer->min_max([$min, $max]);
$buffer->min_max($min_max);

return \@regions;
}
Expand Down Expand Up @@ -231,6 +284,25 @@ sub get_features_by_regions_cached {
}


sub _check_overlap {
# Check overlap of $elem around $min_max region
my $self = shift;
my $elem = shift;
my $min_max = shift;
my $up_down_size = shift;

# if there is no chromosome info, return first key of $min_max
my $chr = exists $elem->{slice} ? $elem->{slice}->{seq_region_name} : $elem->{chr};
$chr = defined $chr ? $self->get_source_chr_name($chr) : (keys %$min_max)[0];

my $check = exists $min_max->{$chr} &&
overlap($elem->{start}, $elem->{end},
@{$min_max->{$chr}}[0] - $up_down_size,
@{$min_max->{$chr}}[1] + $up_down_size);
return $check;
}


=head2 filter_features_by_min_max

Arg 1 : arrayref $features
Expand All @@ -251,16 +323,12 @@ sub get_features_by_regions_cached {
sub filter_features_by_min_max {
my $self = shift;
my $features = shift;
my $min = shift;
my $max = shift;
my $min_max = shift;

my $up_down_size = defined($self->{up_down_size}) ? $self->{up_down_size} : $self->up_down_size();
$min -= $up_down_size;
$max += $up_down_size;


return [
grep {overlap($_->{start}, $_->{end}, $min, $max)}
@$features
grep { $self->_check_overlap($_, $min_max, $up_down_size) } @$features
];
}

Expand Down Expand Up @@ -351,4 +419,4 @@ sub info {
return $_[0]->{info} || {};
}

1;
1;
5 changes: 4 additions & 1 deletion modules/Bio/EnsEMBL/VEP/BaseVEP.pm
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,10 @@ sub get_source_chr_name {
if(!exists($chr_name_map->{$chr})) {
my $mapped_name = $chr;

@$valids = @{$self->can('valid_chromosomes') ? $self->valid_chromosomes : []} unless @$valids;
if (!@$valids && $self->can('valid_chromosomes')) {
my $valid_chr = $self->valid_chromosomes;
@$valids = ref($valid_chr) eq 'HASH' ? [ keys %$valid_chr ] : @$valid_chr;
}
my %valid = map {$_ => 1} @$valids;

unless($valid{$chr}) {
Expand Down
41 changes: 32 additions & 9 deletions modules/Bio/EnsEMBL/VEP/InputBuffer.pm
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ sub get_overlapping_vfs {
@all_vfs = @{$tree->fetch($start - 1, $end)};
}
else{
$tree = $self->hash_tree unless($tree);
$tree = $self->hash_tree unless($tree);
@all_vfs = values %{{
map {$_->{_hash_tree_id} => $_} # use _hash_tree_id to uniquify
map {@{$tree->{$_} || []}} # tree might be empty
Expand All @@ -302,18 +302,24 @@ sub get_overlapping_vfs {
(int($end / $HASH_TREE_SIZE) + 1) # end of range
)
}};
}
}

my @vfs;
foreach my $vf (@all_vfs) {
if (overlap($vf->{start}, $vf->{end}, $start, $end)){
push(@vfs, $vf);
}
if((defined($vf->{unshifted_end}) && (defined($vf->{unshifted_start})))) {

if ((defined($vf->{unshifted_end}) && (defined($vf->{unshifted_start})))) {
if (overlap($vf->{unshifted_start}, $vf->{unshifted_end}, $start, $end)) {
push(@vfs, $vf);
}
}

# check overlap with complex alternative alleles (such as breakend structural variants)
for my $alt (@{ $vf->{_parsed_allele} }) {
push(@vfs, $vf) if overlap($alt->{pos}, $alt->{pos}, $start, $end);
nuno-agostinho marked this conversation as resolved.
Show resolved Hide resolved
}
}
return [@vfs];
}
Expand Down Expand Up @@ -344,6 +350,13 @@ sub interval_tree {
my ($s, $e) = ($vf->{start}, $vf->{end});
($s, $e) = ($e, $s) if $s > $e;
$tree->insert($vf, $s - 1, $e);

# add same variation feature to alternative allele coordinates
# (such as breakend structural variants)
for my $alt (@{ $vf->{_parsed_allele} }) {
$s = $e = $alt->{pos};
$tree->insert($vf, $s - 1, $e);
}
}

$self->{temp}->{interval_tree} = $tree;
Expand Down Expand Up @@ -406,10 +419,10 @@ sub hash_tree {

=head2 min_max

Example : my ($min, $max) = @{$ib->min_max()};
Description: Gets the minimum and maximum coordinates spanned
Example : my %min_max = %{$ib->min_max()};
Description: Gets the chromosome and minimum and maximum coordinates spanned
by variants in the buffer.
Returntype : arrayref [$min, $max]
Returntype : hash of array [$min, $max] by $chr
Exceptions : none
Caller : AnnotationSource
Status : Stable
Expand All @@ -422,9 +435,17 @@ sub min_max {
if(!exists($self->{temp}->{min_max})) {
return $self->{temp}->{min_max} = shift if @_;

my ($min, $max) = (1e10, 0);
my %min_max;
my $chr;
my $min;
my $max;

foreach my $vf(@{$self->buffer}) {
$chr = $vf->{chr} || $vf->slice->seq_region_name;
throw("ERROR: Cannot get chromosome $chr from VariationFeature") unless $chr;

($min, $max) = defined $min_max{$chr} ? @{$min_max{$chr}} : (1e10, 0);

my ($vf_s, $vf_e) = ($vf->{start}, $vf->{end});

if($vf_s > $vf_e) {
Expand All @@ -435,9 +456,11 @@ sub min_max {
$min = $vf_s if $vf_s < $min;
$max = $vf_e if $vf_e > $max;
}

$min_max{$chr} = [$min, $max]
}

$self->{temp}->{min_max} = [$min, $max];
$self->{temp}->{min_max} = \%min_max;
}

return $self->{temp}->{min_max};
Expand Down
18 changes: 10 additions & 8 deletions modules/Bio/EnsEMBL/VEP/OutputFactory.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1883,7 +1883,7 @@ sub BaseStructuralVariationOverlapAllele_to_output_hash {

my $svf = $vfoa->base_variation_feature;

$hash->{Allele} = $svf->class_SO_term;
$hash->{Allele} = $svf->{allele_string} || $svf->class_SO_term;

# allele number
$hash->{ALLELE_NUM} = $vfoa->allele_number if $self->{allele_number};
Expand Down Expand Up @@ -1940,14 +1940,16 @@ sub StructuralVariationOverlapAllele_to_output_hash {
$hash->{Feature_type} = $feature_type;
$hash->{Feature} = $feature->stable_id;

# work out overlap amounts
my $overlap_start = (sort {$a <=> $b} ($svf->start, $feature->start))[-1];
my $overlap_end = (sort {$a <=> $b} ($svf->end, $feature->end))[0];
my $overlap_length = ($overlap_end - $overlap_start) + 1;
my $overlap_pc = 100 * ($overlap_length / (($feature->end - $feature->start) + 1));
# work out overlap amounts (except for breakpoints)
if ($svf->class_SO_term !~ /breakpoint/) {
my $overlap_start = (sort {$a <=> $b} ($svf->start, $feature->start))[-1];
my $overlap_end = (sort {$a <=> $b} ($svf->end, $feature->end))[0];
my $overlap_length = ($overlap_end - $overlap_start) + 1;
my $overlap_pc = 100 * ($overlap_length / (($feature->end - $feature->start) + 1));

$hash->{OverlapBP} = $overlap_length if $overlap_length > 0;
$hash->{OverlapPC} = sprintf("%.2f", $overlap_pc) if $overlap_pc > 0;
$hash->{OverlapBP} = $overlap_length if $overlap_length > 0;
$hash->{OverlapPC} = sprintf("%.2f", $overlap_pc) if $overlap_pc > 0;
}

# cell types
$hash->{CELL_TYPE} = $self->get_cell_types($feature)
Expand Down
2 changes: 1 addition & 1 deletion modules/Bio/EnsEMBL/VEP/Parser.pm
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ sub get_SO_term {
DUP => 'duplication',
CNV => 'copy_number_variation',
INV => 'inversion',
BND => 'breakpoint'
BND => 'chromosome_breakpoint'
);

return $terms{$abbrev};
Expand Down
Loading