Allow for int-looking chromosome names in WGS input file

In CRISPRessoWGS, the region file contains a 'chr_id' column that pandas sometimes misreads as integers when chromosome names are written without the 'chr' prefix (e.g. 1, 2, 3 instead of chr1, chr2, chr3). This bug fix forces 'chr_id' values to be read as strings, both when the region file is first loaded and when the regions table is reloaded from the aligned-reads report.
pinellolab · Jan 15, 2021 · 798f661 · 798f661
1 parent 92c0086
commit 798f661
Showing 1 changed file with 2 additions and 2 deletions.
diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py
@@ -444,7 +444,7 @@ def rreplace(s, old, new):
         #Load and validate the REGION FILE
         df_regions=pd.read_csv(args.region_file,names=[
                 'chr_id','bpstart','bpend','Name','sgRNA',
-                'Expected_HDR','Coding_sequence'],comment='#',sep='\t',dtype={'Name':str})
+                'Expected_HDR','Coding_sequence'],comment='#',sep='\t',dtype={'Name':str,'chr_id':str})
 
 
         #remove empty amplicons/lines
@@ -542,7 +542,7 @@ def set_filenames(row):
 
         if can_finish_incomplete_run and num_rows_without_fastq == 0 and os.path.isfile(report_reads_aligned_filename) and 'generation_of_fastq_files_for_each_amplicon' in crispresso2_info['finished_steps']:
             info('Skipping generation of fastq files for each amplicon.')
-            df_regions = pd.read_csv(report_reads_aligned_filename,sep="\t")
+            df_regions = pd.read_csv(report_reads_aligned_filename,comment='#',sep='\t',dtype={'Name':str,'chr_id':str})
             df_regions.set_index('Name',inplace=True)
 
         else: