Fixes for -min and -max parameters (#234)

* Add extra length check for genome mode * Extra check for lengths when simulating unaligned reads, update README.md - Clarify that the -min and -max parameters in transcriptome mode are ONLY used for unaligned read simulation * Add extra checks for min/max lengths in metagenome mode * Don't allow --chimeric and --perfect in metagenome mode - counter-intuitive to have chimeric reads in perfect mode - This update is consistent with genome mode
bcgsc · Oct 9, 2024 · 54a49aa · 54a49aa
1 parent 19002e1
commit 54a49aa
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -440,10 +440,13 @@ optional arguments:
   -n NUMBER, --number NUMBER
                         Number of reads to be simulated (Default = 20000)
   -max MAX_LEN, --max_len MAX_LEN
-                        The maximum length for simulated reads (Default =
-                        Infinity)
+                        The maximum length for simulated unaligned reads. Note
+                        that this is not used for simulating aligned reads.
+                        (Default = Infinity)
   -min MIN_LEN, --min_len MIN_LEN
-                        The minimum length for simulated reads (Default = 50)
+                        The minimum length for simulated unaligned reads. Note
+                        that this is not used for simulating aligned reads.
+                        (Default = 50)
   --seed SEED           Manually seeds the pseudo-random number generator
   -hp, --homopolymer    Simulate homopolymer lengths (Default = False)
   -k KMERBIAS, --KmerBias KMERBIAS

diff --git a/src/simulator.py b/src/simulator.py
@@ -893,6 +893,9 @@ def simulation_aligned_metagenome(min_l, max_l, median_l, sd_l, out_reads, out_e
 
                 new_read_name = new_read_name + "_perfect_" + str(sequence_index)
                 read_mutated = case_convert(new_read)  # not mutated actually, just to be consistent with per == False
+
+                if len(read_mutated) < min_l or len(read_mutated) > max_l:
+                    continue
 
                 head = 0
                 tail = 0
@@ -1008,6 +1011,9 @@ def simulation_aligned_metagenome(min_l, max_l, median_l, sd_l, out_reads, out_e
             read_mutated = ''.join(np.random.choice(BASES, head)) + read_mutated + \
                            ''.join(np.random.choice(BASES, tail))
 
+            if len(read_mutated) < min_l or len(read_mutated) > max_l:
+                continue
+
             # Reverse complement half of the reads
             if is_reversed:
                 read_mutated = reverse_complement(read_mutated)
@@ -1420,6 +1426,9 @@ def simulation_aligned_genome(dna_type, min_l, max_l, median_l, sd_l, out_reads,
             read_mutated = ''.join(np.random.choice(BASES, head)) + read_mutated + \
                            ''.join(np.random.choice(BASES, tail))
 
+            if len(read_mutated) < min_l or len(read_mutated) > max_l:
+                continue
+
             # Reverse complement half of the reads
             if is_reversed:
                 read_mutated = reverse_complement(read_mutated)
@@ -1504,6 +1513,9 @@ def simulation_unaligned(dna_type, min_l, max_l, median_l, sd_l, out_reads, fast
             new_read = case_convert(new_read)
             # no quals returned here since unaligned quals are not based on mis/ins/match qual distributions
             read_mutated, _ = mutate_read(new_read, new_read_name, None, error_dict, error_count, False, False)
+
+            if len(read_mutated) < min_l or len(read_mutated) > max_l:
+                continue
 
             if fastq:
                 base_quals = model_base_quals.predict_base_qualities(lognorm_base_qual["unmapped"]["sd"], lognorm_base_qual["unmapped"]["loc"], np.exp(lognorm_base_qual["unmapped"]["mu"]), len(read_mutated))
@@ -2081,9 +2093,9 @@ def main():
                           default="simulated")
     parser_t.add_argument('-n', '--number', help='Number of reads to be simulated (Default = 20000)', type=int,
                           default=20000)
-    parser_t.add_argument('-max', '--max_len', help='The maximum length for simulated reads (Default = Infinity)',
+    parser_t.add_argument('-max', '--max_len', help='The maximum length for simulated unaligned reads. Note that this is not used for simulating aligned reads. (Default = Infinity)',
                           type=int, default=float("inf"))
-    parser_t.add_argument('-min', '--min_len', help='The minimum length for simulated reads (Default = 50)',
+    parser_t.add_argument('-min', '--min_len', help='The minimum length for simulated unaligned reads. Note that this is not used for simulating aligned reads.  (Default = 50)',
                           type=int, default=50)
     parser_t.add_argument('--seed', help='Manually seeds the pseudo-random number generator', type=int, default=None)
     parser_t.add_argument('-hp', '--homopolymer', help='Simulate homopolymer lengths (Default = False)',
@@ -2389,6 +2401,11 @@ def main():
             sys.stderr.write("\nMaximum read length must be longer than Minimum read length!\n")
             parser_mg.print_help(sys.stderr)
             sys.exit(1)
+
+        if perfect and chimeric:
+            print("\nPerfect reads cannot be chimeric\n")
+            parser_mg.print_help(sys.stderr)
+            sys.exit(1)
 
         print("\nrunning the code with following parameters:\n")
         print("genome_list", genome_list)