From 798f661031236ee1aa5611f491ca135ec0432dc9 Mon Sep 17 00:00:00 2001 From: Kendell Clement Date: Thu, 14 Jan 2021 23:51:29 -0500 Subject: [PATCH] Allow for int-looking chromosome names in WGS input file In CRISPRessoWGS, the region file contains a 'chr_id' column which is sometimes mis-recognized as ints when read by pandas if using the chromosome notation without 'chr' (e.g. 1,2,3 instead of chr1,chr2,chr3). This bug fix forces chr_ids to be read as strs. --- CRISPResso2/CRISPRessoWGSCORE.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py index 8be565ba..adf2741e 100644 --- a/CRISPResso2/CRISPRessoWGSCORE.py +++ b/CRISPResso2/CRISPRessoWGSCORE.py @@ -444,7 +444,7 @@ def rreplace(s, old, new): #Load and validate the REGION FILE df_regions=pd.read_csv(args.region_file,names=[ 'chr_id','bpstart','bpend','Name','sgRNA', - 'Expected_HDR','Coding_sequence'],comment='#',sep='\t',dtype={'Name':str}) + 'Expected_HDR','Coding_sequence'],comment='#',sep='\t',dtype={'Name':str,'chr_id':str}) #remove empty amplicons/lines @@ -542,7 +542,7 @@ def set_filenames(row): if can_finish_incomplete_run and num_rows_without_fastq == 0 and os.path.isfile(report_reads_aligned_filename) and 'generation_of_fastq_files_for_each_amplicon' in crispresso2_info['finished_steps']: info('Skipping generation of fastq files for each amplicon.') - df_regions = pd.read_csv(report_reads_aligned_filename,sep="\t") + df_regions = pd.read_csv(report_reads_aligned_filename,comment='#',sep='\t',dtype={'Name':str,'chr_id':str}) df_regions.set_index('Name',inplace=True) else: