diff --git a/README.md b/README.md index e52bf1b..ac9a67c 100755 --- a/README.md +++ b/README.md @@ -440,10 +440,13 @@ optional arguments: -n NUMBER, --number NUMBER Number of reads to be simulated (Default = 20000) -max MAX_LEN, --max_len MAX_LEN - The maximum length for simulated reads (Default = - Infinity) + The maximum length for simulated unaligned reads. Note + that this is not used for simulating aligned reads. + (Default = Infinity) -min MIN_LEN, --min_len MIN_LEN - The minimum length for simulated reads (Default = 50) + The minimum length for simulated unaligned reads. Note + that this is not used for simulating aligned reads. + (Default = 50) --seed SEED Manually seeds the pseudo-random number generator -hp, --homopolymer Simulate homopolymer lengths (Default = False) -k KMERBIAS, --KmerBias KMERBIAS diff --git a/src/simulator.py b/src/simulator.py index 168807d..718b86d 100755 --- a/src/simulator.py +++ b/src/simulator.py @@ -893,6 +893,9 @@ def simulation_aligned_metagenome(min_l, max_l, median_l, sd_l, out_reads, out_e new_read_name = new_read_name + "_perfect_" + str(sequence_index) read_mutated = case_convert(new_read) # not mutated actually, just to be consistent with per == False + + if len(read_mutated) < min_l or len(read_mutated) > max_l: + continue head = 0 tail = 0 @@ -1008,6 +1011,9 @@ def simulation_aligned_metagenome(min_l, max_l, median_l, sd_l, out_reads, out_e read_mutated = ''.join(np.random.choice(BASES, head)) + read_mutated + \ ''.join(np.random.choice(BASES, tail)) + if len(read_mutated) < min_l or len(read_mutated) > max_l: + continue + # Reverse complement half of the reads if is_reversed: read_mutated = reverse_complement(read_mutated) @@ -1420,6 +1426,9 @@ def simulation_aligned_genome(dna_type, min_l, max_l, median_l, sd_l, out_reads, read_mutated = ''.join(np.random.choice(BASES, head)) + read_mutated + \ ''.join(np.random.choice(BASES, tail)) + if len(read_mutated) < min_l or len(read_mutated) > max_l: + continue + # Reverse complement half of the reads if is_reversed: read_mutated = reverse_complement(read_mutated) @@ -1504,6 +1513,9 @@ def simulation_unaligned(dna_type, min_l, max_l, median_l, sd_l, out_reads, fast new_read = case_convert(new_read) # no quals returned here since unaligned quals are not based on mis/ins/match qual distributions read_mutated, _ = mutate_read(new_read, new_read_name, None, error_dict, error_count, False, False) + + if len(read_mutated) < min_l or len(read_mutated) > max_l: + continue if fastq: base_quals = model_base_quals.predict_base_qualities(lognorm_base_qual["unmapped"]["sd"], lognorm_base_qual["unmapped"]["loc"], np.exp(lognorm_base_qual["unmapped"]["mu"]), len(read_mutated)) @@ -2081,9 +2093,9 @@ def main(): default="simulated") parser_t.add_argument('-n', '--number', help='Number of reads to be simulated (Default = 20000)', type=int, default=20000) - parser_t.add_argument('-max', '--max_len', help='The maximum length for simulated reads (Default = Infinity)', + parser_t.add_argument('-max', '--max_len', help='The maximum length for simulated unaligned reads. Note that this is not used for simulating aligned reads. (Default = Infinity)', type=int, default=float("inf")) - parser_t.add_argument('-min', '--min_len', help='The minimum length for simulated reads (Default = 50)', + parser_t.add_argument('-min', '--min_len', help='The minimum length for simulated unaligned reads. Note that this is not used for simulating aligned reads. (Default = 50)', type=int, default=50) parser_t.add_argument('--seed', help='Manually seeds the pseudo-random number generator', type=int, default=None) parser_t.add_argument('-hp', '--homopolymer', help='Simulate homopolymer lengths (Default = False)', @@ -2389,6 +2401,11 @@ def main(): sys.stderr.write("\nMaximum read length must be longer than Minimum read length!\n") parser_mg.print_help(sys.stderr) sys.exit(1) + + if perfect and chimeric: + print("\nPerfect reads cannot be chimeric\n") + parser_mg.print_help(sys.stderr) + sys.exit(1) print("\nrunning the code with following parameters:\n") print("genome_list", genome_list)