Skip to content

Commit

Permalink
new plot replim work
Browse files Browse the repository at this point in the history
  • Loading branch information
mbowcut2 committed Dec 16, 2024
1 parent 0232b06 commit e60965f
Show file tree
Hide file tree
Showing 3 changed files with 578 additions and 0 deletions.
63 changes: 63 additions & 0 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -4574,6 +4574,10 @@ def count_alternate_alleles(sub_base_vectors, ref_name, ref_sequence, ref_total_
crispresso2_info['results']['refs'][ref_name]['plot_10g_captions'] = []
crispresso2_info['results']['refs'][ref_name]['plot_10g_datas'] = []

crispresso2_info['results']['refs'][ref_name]['plot_10h_roots'] = []
crispresso2_info['results']['refs'][ref_name]['plot_10h_captions'] = []
crispresso2_info['results']['refs'][ref_name]['plot_10h_datas'] = []

for sgRNA_ind, sgRNA_seq in enumerate(sgRNA_sequences):
cut_point = sgRNA_cut_points[sgRNA_ind]
plot_cut_point = sgRNA_plot_cut_points[sgRNA_ind]
Expand Down Expand Up @@ -4779,6 +4783,65 @@ def count_alternate_alleles(sub_base_vectors, ref_name, ref_sequence, ref_total_
crispresso2_info['results']['refs'][ref_name]['plot_10g_captions'].append("Figure 10g: Non-reference base counts. For target nucleotides in the plotting window, this plot shows the number of non-reference (non-" + args.conversion_nuc_from + ") bases. The number of each target base is annotated on the reference sequence at the bottom of the plot.")
crispresso2_info['results']['refs'][ref_name]['plot_10g_datas'].append([('Nucleotide frequencies at ' + args.conversion_nuc_from +'s', os.path.basename(quant_window_sel_nuc_freq_filename))])


plot_half_window = max(1, args.plot_window_size)
df_alleles_around_cut=CRISPRessoShared.get_base_edit_dataframe_around_cut(df_alleles.loc[df_alleles['Reference_Name'] == ref_name], cut_point, plot_half_window, args.conversion_nuc_from)
count_total = counts_total[ref_name]
if args.allele_plot_pcts_only_for_assigned_reference:
df_alleles_around_cut['%AllReads']=df_alleles_around_cut['%Reads']
df_alleles_around_cut['%Reads']=df_alleles_around_cut['#Reads']/count_total*100

#write alleles table to file
base_edit_allele_filename = _jp(ref_plot_name + 'base_edit_' + args.conversion_nuc_from + 's_quilt_' + sgRNA_label + '.txt')
df_alleles_around_cut.to_csv(base_edit_allele_filename, sep='\t', header=True)
crispresso2_info['results']['refs'][ref_name]['allele_frequency_files'].append(os.path.basename(base_edit_allele_filename))

ref_seq_around_cut=refs[ref_name]['sequence'][cut_point-plot_half_window+1:cut_point+plot_half_window+1]
fig_filename_root = _jp('10h.'+ref_plot_name+'base_edit_'+args.conversion_nuc_from+'s_quilt_'+sgRNA_label)
n_good = df_alleles_around_cut[df_alleles_around_cut['%Reads']>=args.min_frequency_alleles_around_cut_to_plot].shape[0]
if not args.suppress_plots and n_good > 0:
# Plot 10h: Edit Quilt around cut site
df_to_plot = df_alleles_around_cut
if not args.expand_allele_plots_by_quantification:
df_to_plot = df_alleles_around_cut.groupby(['Aligned_Sequence', 'Reference_Sequence']).sum().reset_index().set_index('Aligned_Sequence')
df_to_plot.sort_values(by=['#Reads', 'Aligned_Sequence', 'Reference_Sequence'], inplace=True, ascending=[False, True, True])

new_sgRNA_intervals = []
#adjust coordinates of sgRNAs
new_sel_cols_start = cut_point - plot_half_window
for (int_start, int_end) in refs[ref_name]['sgRNA_intervals']:
new_sgRNA_intervals += [(int_start - new_sel_cols_start - 1, int_end - new_sel_cols_start - 1)]


prepped_df_alleles, annotations, y_labels, insertion_dict, per_element_annot_kws, is_reference = CRISPRessoPlot.prep_alleles_table(
df_to_plot,
ref_seq_around_cut,
args.max_rows_alleles_around_cut_to_plot,
args.min_frequency_alleles_around_cut_to_plot,
)
plot_10h_input = {
'reference_seq': ref_seq_around_cut,
'prepped_df_alleles': prepped_df_alleles,
'annotations': annotations,
'y_labels': y_labels,
'insertion_dict': insertion_dict,
'per_element_annot_kws': per_element_annot_kws,
'is_reference': is_reference,
'fig_filename_root': fig_filename_root,
'custom_colors': custom_config["colors"],
'SAVE_ALSO_PNG': save_png,
'plot_cut_point': plot_cut_point,
'sgRNA_intervals': new_sgRNA_intervals,
'sgRNA_names': sgRNA_names,
'sgRNA_mismatches': sgRNA_mismatches,
'annotate_wildtype_allele': args.annotate_wildtype_allele,
}
breakpoint()
debug('Plotting allele distribution around cut for {0}'.format(ref_name))
plot(CRISPRessoPlot.plot_alleles_table_prepped, plot_10h_input)
crispresso2_info['results']['refs'][ref_name]['plot_10h_roots'].append(os.path.basename(fig_filename_root))
crispresso2_info['results']['refs'][ref_name]['plot_10h_captions'].append("Figure 10f: Quilt of Base Edits for " + args.conversion_nuc_from + 'around cut site for ' + sgRNA_legend + ". Nucleotides are indicated by unique colors (A = green; C = red; G = yellow; T = purple). Substitutions are shown in bold font. Red rectangles highlight inserted sequences. Horizontal dashed lines indicate deleted sequences. The vertical dashed line indicates the predicted cleavage site.")
crispresso2_info['results']['refs'][ref_name]['plot_10h_datas'].append([('Allele frequency table', os.path.basename(base_edit_allele_filename))])
info('Done!')

#END GUIDE SPECIFIC PLOTS
Expand Down
44 changes: 44 additions & 0 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -1206,6 +1206,50 @@ def split_interleaved_fastq(fastq_filename, output_filename_r1, output_filename_
return output_filename_r1, output_filename_r2


def get_base_edit_row_around_cut(row, cut_point, offset, conversion_nuc_from):

cut_idx = row['ref_positions'].index(cut_point)
include_inds = [i for i,c in enumerate(row['Reference_Sequence']) if c == conversion_nuc_from]

filtered_aligned_seq = ''.join([row['Aligned_Sequence'][i] for i in include_inds])
filtered_ref_seq = ''.join([row['Reference_Sequence'][i] for i in include_inds])

return (
filtered_aligned_seq, # row['Aligned_Sequence'][cut_idx - offset + 1:cut_idx + offset + 1],
filtered_ref_seq, # row['Reference_Sequence'][cut_idx - offset + 1:cut_idx + offset + 1],
row['Read_Status'] == 'UNMODIFIED',
row['n_deleted'],
row['n_inserted'],
row['n_mutated'],
row['#Reads'],
row['%Reads']
)


def get_base_edit_dataframe_around_cut(df_alleles, cut_point, offset, conversion_nuc_inds, collapse_by_sequence=True):
if df_alleles.shape[0] == 0:
return df_alleles
ref1 = df_alleles['Reference_Sequence'].iloc[0]
ref1 = ref1.replace('-', '')
if (cut_point + offset + 1 > len(ref1)):
raise (BadParameterException(
'The plotting window cannot extend past the end of the amplicon. Amplicon length is ' + str(
len(ref1)) + ' but plot extends to ' + str(cut_point + offset + 1)))

df_alleles_around_cut = pd.DataFrame(
list(df_alleles.apply(lambda row: get_base_edit_row_around_cut(row, cut_point, offset, conversion_nuc_inds), axis=1).values),
columns=['Aligned_Sequence', 'Reference_Sequence', 'Unedited', 'n_deleted', 'n_inserted', 'n_mutated', '#Reads',
'%Reads'])

df_alleles_around_cut = df_alleles_around_cut.groupby(
['Aligned_Sequence', 'Reference_Sequence', 'Unedited', 'n_deleted', 'n_inserted',
'n_mutated']).sum().reset_index().set_index('Aligned_Sequence')

df_alleles_around_cut.sort_values(by=['#Reads', 'Aligned_Sequence', 'Reference_Sequence'], inplace=True, ascending=[False, True, True])
df_alleles_around_cut['Unedited'] = df_alleles_around_cut['Unedited'] > 0
breakpoint()
return df_alleles_around_cut

######
# allele modification functions
######
Expand Down
Loading

0 comments on commit e60965f

Please sign in to comment.