Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend the -O, --overlap option #1811

Merged
merged 1 commit into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions annot-tsv.1
Original file line number Diff line number Diff line change
Expand Up @@ -170,13 +170,15 @@ Ignore the headers completely and use numeric indexes even when a header exists
Suppress index numbers in the printed header. If given twice, drop the entire header.
.RE
.PP
.BR \-O ", " \-\-overlap " FLOAT"
.BR \-O ", " \-\-overlap " FLOAT,[FLOAT]"
.RS 4
Minimum overlap as a fraction of region length in at least one of the overlapping regions. If also
Minimum overlap as a fraction of region length in SRC and TGT, respectively (with two numbers), or in
at least one of the overlapping regions (with a single number). If also
.BR \-r ", " \-\-reciprocal
is given, require at least
.I FLOAT
overlap with respect to both regions
overlap with respect to both regions. Two identical numbers are equivalent to running with
.BR \-r ", " \-\-reciprocal
.RE
.PP
.BR \-r ", " \-\-reciprocal
Expand Down
50 changes: 36 additions & 14 deletions annot-tsv.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ typedef struct
char *core_str, *match_str, *transfer_str, *annots_str, *headers_str, *delim_str;
char *temp_dir, *out_fname;
BGZF *out_fp;
int allow_dups, reciprocal, max_annots, mode, no_write_hdr;
double overlap;
int allow_dups, max_annots, mode, no_write_hdr, overlap_either;
double overlap_src, overlap_dst;
regidx_t *idx;
regitr_t *itr;
kstring_t tmp_kstr;
Expand Down Expand Up @@ -736,18 +736,20 @@ void process_line(args_t *args, char *line, size_t size)
int has_match = 0, annot_len = 0;
while ( regitr_overlap(args->itr) )
{
if ( args->overlap )
if ( args->overlap_src || args->overlap_dst )
{
double len1 = end - beg + 1;
double len2 = args->itr->end - args->itr->beg + 1;
double len_dst = end - beg + 1;
double len_src = args->itr->end - args->itr->beg + 1;
double isec = (args->itr->end < end ? args->itr->end : end) - (args->itr->beg > beg ? args->itr->beg : beg) + 1;
if ( args->reciprocal )
int pass_dst = isec/len_dst < args->overlap_dst ? 0 : 1;
int pass_src = isec/len_src < args->overlap_src ? 0 : 1;
if ( args->overlap_either )
{
if ( isec/len1 < args->overlap || isec/len2 < args->overlap ) continue;
if ( !pass_dst && !pass_src ) continue;
}
else
{
if ( isec/len1 < args->overlap && isec/len2 < args->overlap ) continue;
if ( !pass_dst || !pass_src ) continue;
}
}
cols_t *src_cols = regitr_payload(args->itr,cols_t*);
Expand Down Expand Up @@ -885,8 +887,9 @@ static const char *usage_text(void)
" -H, --ignore-headers Use numeric indices, ignore the headers completely\n"
" -I, --no-header-idx Suppress index numbers in the printed header. If given\n"
" twice, drop the entire header\n"
" -O, --overlap FLOAT Minimum required overlap (non-reciprocal, unless -r\n"
" is given)\n"
" -O, --overlap FLOAT[,FLOAT] Minimum required overlap with respect to SRC,TGT.\n"
" If single value, the bigger overlap is considered.\n"
" Identical values are equivalent to running with -r.\n"
" -r, --reciprocal Apply the -O requirement to both overlapping\n"
" intervals\n"
" -x, --drop-overlaps Drop overlapping regions (precludes -f)\n"
Expand Down Expand Up @@ -941,6 +944,7 @@ int main(int argc, char **argv)
};
char *tmp = NULL;
int c;
int reciprocal = 0;
while ((c = getopt_long(argc, argv, "c:f:m:o:s:t:a:HO:rxh:Id:",loptions,NULL)) >= 0)
{
switch (c)
Expand All @@ -960,16 +964,24 @@ int main(int argc, char **argv)
case 'd': args->delim_str = optarg; break;
case 'h': args->headers_str = optarg; break;
case 'H': args->headers_str = "0:0"; break;
case 'r': args->reciprocal = 1; break;
case 'r': reciprocal = 1; break;
case 'c': args->core_str = optarg; break;
case 't': args->dst.fname = optarg; break;
case 'm': args->match_str = optarg; break;
case 'a': args->annots_str = optarg; break;
case 'o': args->out_fname = optarg; break;
case 'O':
args->overlap = strtod(optarg, &tmp);
if ( tmp==optarg || *tmp ) error("Could not parse --overlap %s\n", optarg);
if ( args->overlap<0 || args->overlap>1 ) error("Expected value from the interval [0,1]: --overlap %s\n", optarg);
args->overlap_src = strtod(optarg, &tmp);
if ( tmp==optarg || (*tmp && *tmp!=',') ) error("Could not parse --overlap %s\n", optarg);
if ( args->overlap_src<0 || args->overlap_src>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg);
if ( *tmp )
{
args->overlap_dst = strtod(tmp+1, &tmp);
if ( *tmp ) error("Could not parse --overlap %s\n", optarg);
if ( args->overlap_dst<0 || args->overlap_dst>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg);
}
else
args->overlap_either = 1;
break;
case 's': args->src.fname = optarg; break;
case 'f': args->transfer_str = optarg; break;
Expand All @@ -994,6 +1006,16 @@ int main(int argc, char **argv)
else args->mode = PRINT_MATCHING|PRINT_NONMATCHING;
}
if ( (args->transfer_str || args->annots_str) && !(args->mode & PRINT_MATCHING) ) error("The option -x cannot be combined with -f and -a\n");
if ( reciprocal )
{
if ( args->overlap_dst && args->overlap_src && args->overlap_dst!=args->overlap_src )
error("The combination of --reciprocal with --overlap %f,%f makes no sense: expected single value or identical values\n",args->overlap_src,args->overlap_dst);
if ( !args->overlap_src )
args->overlap_src = args->overlap_dst;
else
args->overlap_dst = args->overlap_src;
args->overlap_either = 0;
}

init_data(args);
write_header(args, &args->dst);
Expand Down
2 changes: 2 additions & 0 deletions test/annot-tsv/out.13.1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long long,short
1 15 15 short long,short
2 changes: 2 additions & 0 deletions test/annot-tsv/out.13.2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long long
1 15 15 short short
2 changes: 2 additions & 0 deletions test/annot-tsv/out.13.3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long long
1 15 15 short long,short
2 changes: 2 additions & 0 deletions test/annot-tsv/out.13.4.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long long,short
1 15 15 short short
2 changes: 2 additions & 0 deletions test/annot-tsv/src.13.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 10 20 long
1 15 15 short
6 changes: 6 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -1472,4 +1472,10 @@ sub test_annot_tsv
run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2 -I');
run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.12.txt',out=>'out.12.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d ,');
run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>q[-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d $',:\t']);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5 -r]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5,0.5]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.3.txt',args=>q[-c 1,2,3 -f 4:5 -O 0,1]);
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.4.txt',args=>q[-c 1,2,3 -f 4:5 -O 1,0]);
}