Skip to content

Commit

Permalink
Better support of duplicate removal of symbolic alleles
Browse files Browse the repository at this point in the history
For example, these variants should not be removed with `-d exact`

    POS=1 ALT=<DEL> SVLEN=-1000
    POS=1 ALT=<DEL> SVLEN=-2000
    POS=1 ALT=<DEL> SVLEN=-3000

Resolves #2182
  • Loading branch information
pd3 committed May 15, 2024
1 parent caafc8a commit 6d53540
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 7 deletions.
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ Changes affecting specific commands:

- Fix in reporting reference allele genotypes with `--multi-overlaps .` (#2160)

- Support of duplicate removal of symbolic alleles of the same type but different SVLEN (#2182)



## Release 1.20 (15th April 2024)
Expand Down
9 changes: 9 additions & 0 deletions test/norm.rmdup.3.1.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##contig=<ID=1>
#CHROM POS ID REF ALT QUAL FILTER INFO
1 1 . A <DEL> . . SVLEN=-1000
1 1 . A <DEL> . . SVLEN=-2000
1 1 . A <DEL> . . SVLEN=-3000
1 18 . AC A . . .
7 changes: 7 additions & 0 deletions test/norm.rmdup.3.2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##contig=<ID=1>
#CHROM POS ID REF ALT QUAL FILTER INFO
1 1 . A <DEL> . . SVLEN=-1000
1 18 . AC A . . .
2 changes: 2 additions & 0 deletions test/norm.rmdup.3.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>1
ACGTACGTCCCGTACGTACCCCCGT
15 changes: 15 additions & 0 deletions test/norm.rmdup.3.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
##fileformat=VCFv4.1
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##contig=<ID=1>
#CHROM POS ID REF ALT QUAL FILTER INFO
1 10 . C <DEL> . . SVLEN=-1000
1 10 . C <DEL> . . SVLEN=-1000
1 10 . C <DEL> . . SVLEN=-2000
1 10 . C <DEL> . . SVLEN=-1000
1 10 . C <DEL> . . SVLEN=-3000
1 10 . C <DEL> . . SVLEN=-2000
1 10 . C <DEL> . . SVLEN=-1000
1 10 . C <DEL> . . SVLEN=-3000
1 10 . C <DEL> . . SVLEN=-2000
1 20 . CCC CC . . .
1 20 . CC C . . .
2 changes: 2 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,8 @@
run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d any');
run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d both');
run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d snps');
run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.3',fai=>'norm.rmdup.3',out=>'norm.rmdup.3.1.out',args=>'-d exact');
run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.3',fai=>'norm.rmdup.3',out=>'norm.rmdup.3.2.out',args=>'-d all');
run_test(\&test_vcf_norm,$opts,in=>'norm.2',fai=>'norm.2',out=>'norm.2.out',args=>'-c s -a');
run_test(\&test_vcf_norm,$opts,in=>'norm.iupac',fai=>'norm.iupac',out=>'norm.iupac.out',args=>'-c s');
run_test(\&test_vcf_norm,$opts,in=>'norm.3',fai=>'norm.3',out=>'norm.3.out',args=>'-c s');
Expand Down
40 changes: 33 additions & 7 deletions vcfnorm.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ typedef struct
{
int n; // number of alleles
char *ref, *alt;
void *hash;
void *hash; // str2int hash
}
cmpals1_t;

Expand Down Expand Up @@ -2014,18 +2014,41 @@ static bcf1_t *mrows_flush(args_t *args)
args->mrows_first++;
return args->mrows[ibeg];
}
static void cmpals_add(cmpals_t *ca, bcf1_t *rec)
static char *strdup_alt_svlen(args_t *args, bcf1_t *rec, int ial)
{
if ( rec->d.allele[ial][0]!='<' ) return strdup(rec->d.allele[ial]);

int n = bcf_get_info_int32(args->hdr, rec, "SVLEN", &args->tmp_arr1, &args->ntmp_arr1);
if ( n<=0 ) return strdup(rec->d.allele[ial]);

if ( n+1 != rec->n_allele )
{
// there should be as many SVLEN numbers as there are ALT alleles
static int warned = 0;
if ( !warned )
{
fprintf(stderr,"TODO: different number of ALT alleles and SVLEN fields %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,rec),rec->pos+1);
warned = 1;
}
}

kstring_t str = {0,0,0};
ksprintf(&str,"%s.%d",rec->d.allele[ial],args->tmp_arr1[ial-1]);
return str.s;
}
static void cmpals_add(args_t *args, cmpals_t *ca, bcf1_t *rec)
{
ca->ncmpals++;
hts_expand0(cmpals1_t, ca->ncmpals, ca->mcmpals, ca->cmpals);
cmpals1_t *cmpals = ca->cmpals + ca->ncmpals - 1;
free(cmpals->ref);
cmpals->ref = strdup(rec->d.allele[0]);
cmpals->n = rec->n_allele;

if ( rec->n_allele==2 )
{
free(cmpals->alt);
cmpals->alt = strdup(rec->d.allele[1]);
cmpals->alt = strdup_alt_svlen(args,rec,1);
}
else
{
Expand All @@ -2036,9 +2059,10 @@ static void cmpals_add(cmpals_t *ca, bcf1_t *rec)
khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i]));
}
}
static int cmpals_match(cmpals_t *ca, bcf1_t *rec)
static int cmpals_match(args_t *args, cmpals_t *ca, bcf1_t *rec)
{
int i, j;
char *alt_svlen = rec->n_allele==2 ? strdup_alt_svlen(args,rec,1) : NULL;
for (i=0; i<ca->ncmpals; i++)
{
cmpals1_t *cmpals = ca->cmpals + i;
Expand All @@ -2050,7 +2074,8 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec)
// the most frequent case
if ( rec->n_allele==2 )
{
if ( strcasecmp(rec->d.allele[1], cmpals->alt) ) continue;
if ( strcasecmp(alt_svlen, cmpals->alt) ) continue;
free(alt_svlen);
return 1;
}

Expand All @@ -2060,6 +2085,7 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec)
if ( j<rec->n_allele ) continue;
return 1;
}
free(alt_svlen);
return 0;
}
static void cmpals_reset(cmpals_t *ca) { ca->ncmpals = 0; }
Expand Down Expand Up @@ -2102,7 +2128,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only
if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_out, args->lines[k]) ) continue;
if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) continue;
}
else
{
Expand All @@ -2112,7 +2138,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_out);
}
prev_type |= line_type;
if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]);
if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args,&args->cmpals_out, args->lines[k]);
}
if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
}
Expand Down

0 comments on commit 6d53540

Please sign in to comment.