From 798f661031236ee1aa5611f491ca135ec0432dc9 Mon Sep 17 00:00:00 2001
From: Kendell Clement <k.clement.dev@gmail.com>
Date: Thu, 14 Jan 2021 23:51:29 -0500
Subject: [PATCH] Allow for int-looking chromosome names in WGS input file

In CRISPRessoWGS, the region file contains a 'chr_id' column whose values are sometimes misinterpreted as ints when read by pandas if the chromosome notation without the 'chr' prefix is used (e.g. 1,2,3 instead of chr1,chr2,chr3). This bug fix forces chr_id values to be read as strs.
---
 CRISPResso2/CRISPRessoWGSCORE.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py
index 8be565ba..adf2741e 100644
--- a/CRISPResso2/CRISPRessoWGSCORE.py
+++ b/CRISPResso2/CRISPRessoWGSCORE.py
@@ -444,7 +444,7 @@ def rreplace(s, old, new):
         #Load and validate the REGION FILE
         df_regions=pd.read_csv(args.region_file,names=[
                 'chr_id','bpstart','bpend','Name','sgRNA',
-                'Expected_HDR','Coding_sequence'],comment='#',sep='\t',dtype={'Name':str})
+                'Expected_HDR','Coding_sequence'],comment='#',sep='\t',dtype={'Name':str,'chr_id':str})
 
 
         #remove empty amplicons/lines
@@ -542,7 +542,7 @@ def set_filenames(row):
 
         if can_finish_incomplete_run and num_rows_without_fastq == 0 and os.path.isfile(report_reads_aligned_filename) and 'generation_of_fastq_files_for_each_amplicon' in crispresso2_info['finished_steps']:
             info('Skipping generation of fastq files for each amplicon.')
-            df_regions = pd.read_csv(report_reads_aligned_filename,sep="\t")
+            df_regions = pd.read_csv(report_reads_aligned_filename,comment='#',sep='\t',dtype={'Name':str,'chr_id':str})
             df_regions.set_index('Name',inplace=True)
 
         else: