Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for -min and -max parameters #234

Merged
merged 4 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -440,10 +440,13 @@ optional arguments:
-n NUMBER, --number NUMBER
Number of reads to be simulated (Default = 20000)
-max MAX_LEN, --max_len MAX_LEN
The maximum length for simulated reads (Default =
Infinity)
The maximum length for simulated unaligned reads. Note
that this is not used for simulating aligned reads.
(Default = Infinity)
-min MIN_LEN, --min_len MIN_LEN
The minimum length for simulated reads (Default = 50)
The minimum length for simulated unaligned reads. Note
that this is not used for simulating aligned reads.
(Default = 50)
--seed SEED Manually seeds the pseudo-random number generator
-hp, --homopolymer Simulate homopolymer lengths (Default = False)
-k KMERBIAS, --KmerBias KMERBIAS
Expand Down
21 changes: 19 additions & 2 deletions src/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -893,6 +893,9 @@ def simulation_aligned_metagenome(min_l, max_l, median_l, sd_l, out_reads, out_e

new_read_name = new_read_name + "_perfect_" + str(sequence_index)
read_mutated = case_convert(new_read) # not mutated actually, just to be consistent with per == False

if len(read_mutated) < min_l or len(read_mutated) > max_l:
continue

head = 0
tail = 0
Expand Down Expand Up @@ -1008,6 +1011,9 @@ def simulation_aligned_metagenome(min_l, max_l, median_l, sd_l, out_reads, out_e
read_mutated = ''.join(np.random.choice(BASES, head)) + read_mutated + \
''.join(np.random.choice(BASES, tail))

if len(read_mutated) < min_l or len(read_mutated) > max_l:
continue

# Reverse complement half of the reads
if is_reversed:
read_mutated = reverse_complement(read_mutated)
Expand Down Expand Up @@ -1420,6 +1426,9 @@ def simulation_aligned_genome(dna_type, min_l, max_l, median_l, sd_l, out_reads,
read_mutated = ''.join(np.random.choice(BASES, head)) + read_mutated + \
''.join(np.random.choice(BASES, tail))

if len(read_mutated) < min_l or len(read_mutated) > max_l:
continue

# Reverse complement half of the reads
if is_reversed:
read_mutated = reverse_complement(read_mutated)
Expand Down Expand Up @@ -1504,6 +1513,9 @@ def simulation_unaligned(dna_type, min_l, max_l, median_l, sd_l, out_reads, fast
new_read = case_convert(new_read)
# no quals returned here since unaligned quals are not based on mis/ins/match qual distributions
read_mutated, _ = mutate_read(new_read, new_read_name, None, error_dict, error_count, False, False)

if len(read_mutated) < min_l or len(read_mutated) > max_l:
continue

if fastq:
base_quals = model_base_quals.predict_base_qualities(lognorm_base_qual["unmapped"]["sd"], lognorm_base_qual["unmapped"]["loc"], np.exp(lognorm_base_qual["unmapped"]["mu"]), len(read_mutated))
Expand Down Expand Up @@ -2081,9 +2093,9 @@ def main():
default="simulated")
parser_t.add_argument('-n', '--number', help='Number of reads to be simulated (Default = 20000)', type=int,
default=20000)
parser_t.add_argument('-max', '--max_len', help='The maximum length for simulated reads (Default = Infinity)',
parser_t.add_argument('-max', '--max_len', help='The maximum length for simulated unaligned reads. Note that this is not used for simulating aligned reads. (Default = Infinity)',
type=int, default=float("inf"))
parser_t.add_argument('-min', '--min_len', help='The minimum length for simulated reads (Default = 50)',
parser_t.add_argument('-min', '--min_len', help='The minimum length for simulated unaligned reads. Note that this is not used for simulating aligned reads. (Default = 50)',
type=int, default=50)
parser_t.add_argument('--seed', help='Manually seeds the pseudo-random number generator', type=int, default=None)
parser_t.add_argument('-hp', '--homopolymer', help='Simulate homopolymer lengths (Default = False)',
Expand Down Expand Up @@ -2389,6 +2401,11 @@ def main():
sys.stderr.write("\nMaximum read length must be longer than Minimum read length!\n")
parser_mg.print_help(sys.stderr)
sys.exit(1)

if perfect and chimeric:
print("\nPerfect reads cannot be chimeric\n")
parser_mg.print_help(sys.stderr)
sys.exit(1)

print("\nrunning the code with following parameters:\n")
print("genome_list", genome_list)
Expand Down