diff --git a/CRISPResso2/CRISPRessoPooledCORE.py b/CRISPResso2/CRISPRessoPooledCORE.py index 03341cfb..6cdeea0c 100644 --- a/CRISPResso2/CRISPRessoPooledCORE.py +++ b/CRISPResso2/CRISPRessoPooledCORE.py @@ -735,6 +735,13 @@ def main(): duplicated_entries = df_template.amplicon_seq[df_template.amplicon_seq.duplicated()] raise Exception('The amplicon sequences must be distinct! (Duplicated entries: ' + str(duplicated_entries.values) + ')') + #check to see that no sequences and their reverse complements are present + amp_seqs = df_template.amplicon_seq.values #Beware, this is a numpy array of dtype str and if you add these arrays amp_seqs + rc_amp_seqs, it will concat the strings, not the arrays.... + rc_amp_seqs = [CRISPRessoShared.reverse_complement(amp_seq) for amp_seq in amp_seqs] + for seq in amp_seqs: + if seq in rc_amp_seqs: + raise Exception('Amplicon sequences must be distinct! The amplicon sequence %s is the reverse complement of another amplicon sequence in the region file. Please provide only one of the two sequences.' % seq) + if not len(df_template.amplicon_name.unique())==df_template.shape[0]: duplicated_entries = df_template.amplicon_name[df_template.amplicon_name.duplicated()] raise Exception('The amplicon names must be distinct! (Duplicated names: ' + str(duplicated_entries.values) + ')') @@ -1056,7 +1063,7 @@ def rreplace(s, old, new): if can_finish_incomplete_run and 'genome_demultiplexing' in crispresso2_info['running_info']['finished_steps'] and os.path.isfile(REPORT_ALL_DEPTH): info('Using previously-computed demultiplexing of genomic reads') df_all_demux = pd.read_csv(REPORT_ALL_DEPTH, sep='\t') - df_all_demux['loc'] = df_all_demux['chr_id'].apply(str) + ' ' + df_all_demux['start'].apply(str) + ' '+df_all_demux['end'].apply(str) + df_all_demux['loc'] = df_all_demux['chr_id'] + ' ' + df_all_demux['start'].apply(str) + ' '+df_all_demux['end'].apply(str) df_all_demux.set_index(['loc'], inplace=True) else: #REDISCOVER LOCATIONS and DEMULTIPLEX READS @@ -1097,8 +1104,9 @@ def rreplace(s, old, new): for idx, row in df_template.iterrows(): chr_output_filename = _jp('MAPPED_REGIONS/chr%s_%s_%s.info' % (row.chr_id, row.bpstart, row.bpend)) sub_chr_command = cmd.replace('__REGIONCHR__', str(row.chr_id)).replace('__REGIONSTART__',str(row.bpstart)).replace('__REGIONEND__',str(row.bpend)).replace("__DEMUX_CHR_LOGFILENAME__", chr_output_filename) - chr_commands.append(sub_chr_command) - chr_output_filenames.append(chr_output_filename) + if chr_output_filename not in chr_output_filenames: # sometimes multiple amplicons map to the same region so we don't want the region to be written to by multiple processes + chr_commands.append(sub_chr_command) + chr_output_filenames.append(chr_output_filename) # if we should demultiplex everwhere (not just where amplicons aligned) else: @@ -1274,9 +1282,7 @@ def rreplace(s, old, new): if fastq_filename_region in files_to_match: files_to_match.remove(fastq_filename_region) fastq_region_filenames.append(fastq_filename_region) - #else: - #info('Warning: Fastq filename ' + fastq_filename_region + ' is not in ' + str(files_to_match)) - #debug here?? + if N_READS >= args.min_reads_to_use_region and fastq_filename_region != "": info('\nThe amplicon [%s] has enough reads (%d) mapped to it! Running CRISPResso!\n' % (idx, N_READS))