From 525b843a70f8a72c79b097d6404c45924c99748d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 21 Apr 2021 09:35:09 +0100 Subject: [PATCH] Add a --seed option to mpileup. Random numbers are used by ks_shuffle, which is called in htslib's errmod code for sub-sampling the data in regions of > 255 depth. This corrects the OS specific randomness seen in #1459. --- Makefile | 2 +- doc/bcftools.txt | 2 ++ mpileup.c | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 12bf43f38..01b6c78a2 100644 --- a/Makefile +++ b/Makefile @@ -273,7 +273,7 @@ dist.o: dist.c dist.h cols.o: cols.c cols.h regidx.o: regidx.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) regidx.h consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) regidx.h $(bcftools_h) rbuf.h $(filter_h) -mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) regidx.h $(bcftools_h) $(bam2bcf_h) $(bam_sample_h) $(gvcf_h) +mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(htslib_hts_os_h) regidx.h $(bcftools_h) $(bam2bcf_h) $(bam_sample_h) $(gvcf_h) bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h) diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 899eb5447..47240c423 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1870,6 +1870,8 @@ multiple regions and many alignment files are processed. *-x, --ignore-overlaps*:: Disable read-pair overlap detection. +*--seed* 'INT':: + Set the random number seed used when sub-sampling deep regions [0]. ==== Output options diff --git a/mpileup.c b/mpileup.c index 0bfb72495..f4881f9de 100644 --- a/mpileup.c +++ b/mpileup.c @@ -39,6 +39,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include "regidx.h" #include "bcftools.h" @@ -887,6 +888,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" " -x, --ignore-overlaps disable read-pair overlap detection\n" +" --seed INT random number seed used for sampling deep regions [0]\n" "\n" "Output options:\n" " -a, --annotate LIST optional tags to output; '?' to list available tags []\n" @@ -949,6 +951,8 @@ int main_mpileup(int argc, char *argv[]) mplp.bsmpl = bam_smpl_init(); mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag() + hts_srand48(0); + static const struct option lopts[] = { {"rf", required_argument, NULL, 1}, // require flag @@ -998,6 +1002,7 @@ int main_mpileup(int argc, char *argv[]) {"per-sample-mF", no_argument, NULL, 'p'}, {"per-sample-mf", no_argument, NULL, 'p'}, {"platforms", required_argument, NULL, 'P'}, + {"seed", required_argument, NULL, 13}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) { @@ -1092,6 +1097,7 @@ int main_mpileup(int argc, char *argv[]) } mplp.fmt_flag |= parse_format_flag(optarg); break; + case 13: hts_srand48(atoi(optarg)); break; default: fprintf(stderr,"Invalid option: '%c'\n", c); return 1;