Skip to content

Commit

Permalink
Fixes for -min and -max parameters (#234)
Browse files Browse the repository at this point in the history
* Add extra length check for genome mode

* Extra check for lengths when simulating unaligned reads, update README.md

- Clarify that the -min and -max parameters in transcriptome mode are ONLY used for unaligned read simulation

* Add extra checks for min/max lengths in metagenome mode

* Don't allow --chimeric and --perfect in metagenome mode

- counter-intuitive to have chimeric reads in perfect mode
- This update is consistent with genome mode
  • Loading branch information
lcoombe authored Oct 9, 2024
1 parent 19002e1 commit 54a49aa
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 5 deletions.
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -440,10 +440,13 @@ optional arguments:
-n NUMBER, --number NUMBER
Number of reads to be simulated (Default = 20000)
-max MAX_LEN, --max_len MAX_LEN
The maximum length for simulated reads (Default =
Infinity)
The maximum length for simulated unaligned reads. Note
that this is not used for simulating aligned reads.
(Default = Infinity)
-min MIN_LEN, --min_len MIN_LEN
The minimum length for simulated reads (Default = 50)
The minimum length for simulated unaligned reads. Note
that this is not used for simulating aligned reads.
(Default = 50)
--seed SEED Manually seeds the pseudo-random number generator
-hp, --homopolymer Simulate homopolymer lengths (Default = False)
-k KMERBIAS, --KmerBias KMERBIAS
Expand Down
21 changes: 19 additions & 2 deletions src/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -893,6 +893,9 @@ def simulation_aligned_metagenome(min_l, max_l, median_l, sd_l, out_reads, out_e

new_read_name = new_read_name + "_perfect_" + str(sequence_index)
read_mutated = case_convert(new_read) # not mutated actually, just to be consistent with per == False

if len(read_mutated) < min_l or len(read_mutated) > max_l:
continue

head = 0
tail = 0
Expand Down Expand Up @@ -1008,6 +1011,9 @@ def simulation_aligned_metagenome(min_l, max_l, median_l, sd_l, out_reads, out_e
read_mutated = ''.join(np.random.choice(BASES, head)) + read_mutated + \
''.join(np.random.choice(BASES, tail))

if len(read_mutated) < min_l or len(read_mutated) > max_l:
continue

# Reverse complement half of the reads
if is_reversed:
read_mutated = reverse_complement(read_mutated)
Expand Down Expand Up @@ -1420,6 +1426,9 @@ def simulation_aligned_genome(dna_type, min_l, max_l, median_l, sd_l, out_reads,
read_mutated = ''.join(np.random.choice(BASES, head)) + read_mutated + \
''.join(np.random.choice(BASES, tail))

if len(read_mutated) < min_l or len(read_mutated) > max_l:
continue

# Reverse complement half of the reads
if is_reversed:
read_mutated = reverse_complement(read_mutated)
Expand Down Expand Up @@ -1504,6 +1513,9 @@ def simulation_unaligned(dna_type, min_l, max_l, median_l, sd_l, out_reads, fast
new_read = case_convert(new_read)
# no quals returned here since unaligned quals are not based on mis/ins/match qual distributions
read_mutated, _ = mutate_read(new_read, new_read_name, None, error_dict, error_count, False, False)

if len(read_mutated) < min_l or len(read_mutated) > max_l:
continue

if fastq:
base_quals = model_base_quals.predict_base_qualities(lognorm_base_qual["unmapped"]["sd"], lognorm_base_qual["unmapped"]["loc"], np.exp(lognorm_base_qual["unmapped"]["mu"]), len(read_mutated))
Expand Down Expand Up @@ -2081,9 +2093,9 @@ def main():
default="simulated")
parser_t.add_argument('-n', '--number', help='Number of reads to be simulated (Default = 20000)', type=int,
default=20000)
parser_t.add_argument('-max', '--max_len', help='The maximum length for simulated reads (Default = Infinity)',
parser_t.add_argument('-max', '--max_len', help='The maximum length for simulated unaligned reads. Note that this is not used for simulating aligned reads. (Default = Infinity)',
type=int, default=float("inf"))
parser_t.add_argument('-min', '--min_len', help='The minimum length for simulated reads (Default = 50)',
parser_t.add_argument('-min', '--min_len', help='The minimum length for simulated unaligned reads. Note that this is not used for simulating aligned reads. (Default = 50)',
type=int, default=50)
parser_t.add_argument('--seed', help='Manually seeds the pseudo-random number generator', type=int, default=None)
parser_t.add_argument('-hp', '--homopolymer', help='Simulate homopolymer lengths (Default = False)',
Expand Down Expand Up @@ -2389,6 +2401,11 @@ def main():
sys.stderr.write("\nMaximum read length must be longer than Minimum read length!\n")
parser_mg.print_help(sys.stderr)
sys.exit(1)

if perfect and chimeric:
print("\nPerfect reads cannot be chimeric\n")
parser_mg.print_help(sys.stderr)
sys.exit(1)

print("\nrunning the code with following parameters:\n")
print("genome_list", genome_list)
Expand Down

0 comments on commit 54a49aa

Please sign in to comment.