Skip to content

Commit

Permalink
Pooled multi region map fix (pinellolab#505)
Browse files Browse the repository at this point in the history
* Fixes a bug in CRISPRessoPooled where if two input regions map to the same genomic locus the demultiplexing will create two processes that collide to write on the same file.

* Remove print statements

* Check to see that no amplicons and their reverse complement exist in the input regions
  • Loading branch information
kclem authored Nov 27, 2024
1 parent 6a363b7 commit 0232b06
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions CRISPResso2/CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,13 @@ def main():
duplicated_entries = df_template.amplicon_seq[df_template.amplicon_seq.duplicated()]
raise Exception('The amplicon sequences must be distinct! (Duplicated entries: ' + str(duplicated_entries.values) + ')')

#check to see that no sequences and their reverse complements are present
amp_seqs = df_template.amplicon_seq.values #Beware, this is a numpy array of dtype str and if you add these arrays amp_seqs + rc_amp_seqs, it will concat the strings, not the arrays....
rc_amp_seqs = [CRISPRessoShared.reverse_complement(amp_seq) for amp_seq in amp_seqs]
for seq in amp_seqs:
if seq in rc_amp_seqs:
raise Exception('Amplicon sequences must be distinct! The amplicon sequence %s is the reverse complement of another amplicon sequence in the region file. Please provide only one of the two sequences.' % seq)

if not len(df_template.amplicon_name.unique())==df_template.shape[0]:
duplicated_entries = df_template.amplicon_name[df_template.amplicon_name.duplicated()]
raise Exception('The amplicon names must be distinct! (Duplicated names: ' + str(duplicated_entries.values) + ')')
Expand Down Expand Up @@ -1056,7 +1063,7 @@ def rreplace(s, old, new):
if can_finish_incomplete_run and 'genome_demultiplexing' in crispresso2_info['running_info']['finished_steps'] and os.path.isfile(REPORT_ALL_DEPTH):
info('Using previously-computed demultiplexing of genomic reads')
df_all_demux = pd.read_csv(REPORT_ALL_DEPTH, sep='\t')
df_all_demux['loc'] = df_all_demux['chr_id'].apply(str) + ' ' + df_all_demux['start'].apply(str) + ' '+df_all_demux['end'].apply(str)
df_all_demux['loc'] = df_all_demux['chr_id'] + ' ' + df_all_demux['start'].apply(str) + ' '+df_all_demux['end'].apply(str)
df_all_demux.set_index(['loc'], inplace=True)
else:
#REDISCOVER LOCATIONS and DEMULTIPLEX READS
Expand Down Expand Up @@ -1097,8 +1104,9 @@ def rreplace(s, old, new):
for idx, row in df_template.iterrows():
chr_output_filename = _jp('MAPPED_REGIONS/chr%s_%s_%s.info' % (row.chr_id, row.bpstart, row.bpend))
sub_chr_command = cmd.replace('__REGIONCHR__', str(row.chr_id)).replace('__REGIONSTART__',str(row.bpstart)).replace('__REGIONEND__',str(row.bpend)).replace("__DEMUX_CHR_LOGFILENAME__", chr_output_filename)
chr_commands.append(sub_chr_command)
chr_output_filenames.append(chr_output_filename)
if chr_output_filename not in chr_output_filenames: # sometimes multiple amplicons map to the same region so we don't want the region to be written to by multiple processes
chr_commands.append(sub_chr_command)
chr_output_filenames.append(chr_output_filename)

# if we should demultiplex everwhere (not just where amplicons aligned)
else:
Expand Down Expand Up @@ -1274,9 +1282,7 @@ def rreplace(s, old, new):
if fastq_filename_region in files_to_match:
files_to_match.remove(fastq_filename_region)
fastq_region_filenames.append(fastq_filename_region)
#else:
#info('Warning: Fastq filename ' + fastq_filename_region + ' is not in ' + str(files_to_match))
#debug here??

if N_READS >= args.min_reads_to_use_region and fastq_filename_region != "":
info('\nThe amplicon [%s] has enough reads (%d) mapped to it! Running CRISPResso!\n' % (idx, N_READS))

Expand Down

0 comments on commit 0232b06

Please sign in to comment.