new plot replim work

edilytics · Dec 16, 2024 · e60965f · e60965f
1 parent 0232b06
commit e60965f
Show file tree

Hide file tree

Showing 3 changed files with 578 additions and 0 deletions.
diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py
@@ -4574,6 +4574,10 @@ def count_alternate_alleles(sub_base_vectors, ref_name, ref_sequence, ref_total_
             crispresso2_info['results']['refs'][ref_name]['plot_10g_captions'] = []
             crispresso2_info['results']['refs'][ref_name]['plot_10g_datas'] = []
 
+            crispresso2_info['results']['refs'][ref_name]['plot_10h_roots'] = []
+            crispresso2_info['results']['refs'][ref_name]['plot_10h_captions'] = []
+            crispresso2_info['results']['refs'][ref_name]['plot_10h_datas'] = []
+
             for sgRNA_ind, sgRNA_seq in enumerate(sgRNA_sequences):
                 cut_point = sgRNA_cut_points[sgRNA_ind]
                 plot_cut_point = sgRNA_plot_cut_points[sgRNA_ind]
@@ -4779,6 +4783,65 @@ def count_alternate_alleles(sub_base_vectors, ref_name, ref_sequence, ref_total_
                         crispresso2_info['results']['refs'][ref_name]['plot_10g_captions'].append("Figure 10g: Non-reference base counts. For target nucleotides in the plotting window, this plot shows the number of non-reference (non-" + args.conversion_nuc_from + ") bases. The number of each target base is annotated on the reference sequence at the bottom of the plot.")
                         crispresso2_info['results']['refs'][ref_name]['plot_10g_datas'].append([('Nucleotide frequencies at ' + args.conversion_nuc_from +'s', os.path.basename(quant_window_sel_nuc_freq_filename))])
 
+
+                        plot_half_window = max(1, args.plot_window_size)
+                        df_alleles_around_cut=CRISPRessoShared.get_base_edit_dataframe_around_cut(df_alleles.loc[df_alleles['Reference_Name'] == ref_name], cut_point, plot_half_window, args.conversion_nuc_from)
+                        count_total = counts_total[ref_name]
+                        if args.allele_plot_pcts_only_for_assigned_reference:
+                            df_alleles_around_cut['%AllReads']=df_alleles_around_cut['%Reads']
+                            df_alleles_around_cut['%Reads']=df_alleles_around_cut['#Reads']/count_total*100
+
+                        #write alleles table to file
+                        base_edit_allele_filename = _jp(ref_plot_name + 'base_edit_' + args.conversion_nuc_from + 's_quilt_' + sgRNA_label + '.txt')
+                        df_alleles_around_cut.to_csv(base_edit_allele_filename, sep='\t', header=True)
+                        crispresso2_info['results']['refs'][ref_name]['allele_frequency_files'].append(os.path.basename(base_edit_allele_filename))
+
+                        ref_seq_around_cut=refs[ref_name]['sequence'][cut_point-plot_half_window+1:cut_point+plot_half_window+1]
+                        fig_filename_root = _jp('10h.'+ref_plot_name+'base_edit_'+args.conversion_nuc_from+'s_quilt_'+sgRNA_label)
+                        n_good = df_alleles_around_cut[df_alleles_around_cut['%Reads']>=args.min_frequency_alleles_around_cut_to_plot].shape[0]
+                        if not args.suppress_plots and n_good > 0:
+                        # Plot 10h: Edit Quilt around cut site
+                            df_to_plot = df_alleles_around_cut
+                            if not args.expand_allele_plots_by_quantification:
+                                df_to_plot = df_alleles_around_cut.groupby(['Aligned_Sequence', 'Reference_Sequence']).sum().reset_index().set_index('Aligned_Sequence')
+                                df_to_plot.sort_values(by=['#Reads', 'Aligned_Sequence', 'Reference_Sequence'], inplace=True, ascending=[False, True, True])
+
+                            new_sgRNA_intervals = []
+                            #adjust coordinates of sgRNAs
+                            new_sel_cols_start = cut_point - plot_half_window
+                            for (int_start, int_end) in refs[ref_name]['sgRNA_intervals']:
+                                new_sgRNA_intervals += [(int_start - new_sel_cols_start - 1, int_end - new_sel_cols_start - 1)]
+
+
+                            prepped_df_alleles, annotations, y_labels, insertion_dict, per_element_annot_kws, is_reference = CRISPRessoPlot.prep_alleles_table(
+                                df_to_plot,
+                                ref_seq_around_cut,
+                                args.max_rows_alleles_around_cut_to_plot,
+                                args.min_frequency_alleles_around_cut_to_plot,
+                            )
+                            plot_10h_input = {
+                                'reference_seq': ref_seq_around_cut,
+                                'prepped_df_alleles': prepped_df_alleles,
+                                'annotations': annotations,
+                                'y_labels': y_labels,
+                                'insertion_dict': insertion_dict,
+                                'per_element_annot_kws': per_element_annot_kws,
+                                'is_reference': is_reference,
+                                'fig_filename_root': fig_filename_root,
+                                'custom_colors': custom_config["colors"],
+                                'SAVE_ALSO_PNG': save_png,
+                                'plot_cut_point': plot_cut_point,
+                                'sgRNA_intervals': new_sgRNA_intervals,
+                                'sgRNA_names': sgRNA_names,
+                                'sgRNA_mismatches': sgRNA_mismatches,
+                                'annotate_wildtype_allele': args.annotate_wildtype_allele,
+                            }
+                            breakpoint()
+                            debug('Plotting allele distribution around cut for {0}'.format(ref_name))
+                            plot(CRISPRessoPlot.plot_alleles_table_prepped, plot_10h_input)
+                            crispresso2_info['results']['refs'][ref_name]['plot_10h_roots'].append(os.path.basename(fig_filename_root))
+                            crispresso2_info['results']['refs'][ref_name]['plot_10h_captions'].append("Figure 10f: Quilt of Base Edits for " + args.conversion_nuc_from + 'around cut site for ' + sgRNA_legend + ". Nucleotides are indicated by unique colors (A = green; C = red; G = yellow; T = purple). Substitutions are shown in bold font. Red rectangles highlight inserted sequences. Horizontal dashed lines indicate deleted sequences. The vertical dashed line indicates the predicted cleavage site.")
+                            crispresso2_info['results']['refs'][ref_name]['plot_10h_datas'].append([('Allele frequency table', os.path.basename(base_edit_allele_filename))])
             info('Done!')
 
             #END GUIDE SPECIFIC PLOTS

diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py
@@ -1206,6 +1206,50 @@ def split_interleaved_fastq(fastq_filename, output_filename_r1, output_filename_
     return output_filename_r1, output_filename_r2
 
 
+def get_base_edit_row_around_cut(row, cut_point, offset, conversion_nuc_from):
+
+    cut_idx = row['ref_positions'].index(cut_point)
+    include_inds = [i for i,c in enumerate(row['Reference_Sequence']) if c == conversion_nuc_from]
+
+    filtered_aligned_seq = ''.join([row['Aligned_Sequence'][i] for i in include_inds])
+    filtered_ref_seq = ''.join([row['Reference_Sequence'][i] for i in include_inds])
+
+    return (
+        filtered_aligned_seq, # row['Aligned_Sequence'][cut_idx - offset + 1:cut_idx + offset + 1],
+        filtered_ref_seq, # row['Reference_Sequence'][cut_idx - offset + 1:cut_idx + offset + 1],
+        row['Read_Status'] == 'UNMODIFIED', 
+        row['n_deleted'], 
+        row['n_inserted'], 
+        row['n_mutated'], 
+        row['#Reads'], 
+        row['%Reads']
+        )
+
+
+def get_base_edit_dataframe_around_cut(df_alleles, cut_point, offset, conversion_nuc_inds, collapse_by_sequence=True):
+    if df_alleles.shape[0] == 0:
+        return df_alleles
+    ref1 = df_alleles['Reference_Sequence'].iloc[0]
+    ref1 = ref1.replace('-', '')
+    if (cut_point + offset + 1 > len(ref1)):
+        raise (BadParameterException(
+            'The plotting window cannot extend past the end of the amplicon. Amplicon length is ' + str(
+                len(ref1)) + ' but plot extends to ' + str(cut_point + offset + 1)))
+
+    df_alleles_around_cut = pd.DataFrame(
+        list(df_alleles.apply(lambda row: get_base_edit_row_around_cut(row, cut_point, offset, conversion_nuc_inds), axis=1).values),
+        columns=['Aligned_Sequence', 'Reference_Sequence', 'Unedited', 'n_deleted', 'n_inserted', 'n_mutated', '#Reads',
+                 '%Reads'])
+
+    df_alleles_around_cut = df_alleles_around_cut.groupby(
+        ['Aligned_Sequence', 'Reference_Sequence', 'Unedited', 'n_deleted', 'n_inserted',
+         'n_mutated']).sum().reset_index().set_index('Aligned_Sequence')
+
+    df_alleles_around_cut.sort_values(by=['#Reads', 'Aligned_Sequence', 'Reference_Sequence'], inplace=True, ascending=[False, True, True])
+    df_alleles_around_cut['Unedited'] = df_alleles_around_cut['Unedited'] > 0
+    breakpoint()
+    return df_alleles_around_cut
+
 ######
 # allele modification functions
 ######