Skip to content

Commit

Permalink
Complete redesign of command line interface to simplify it (closes #74).
Browse files Browse the repository at this point in the history
  • Loading branch information
PeteHaitch committed Jun 11, 2014
1 parent d4118ac commit d44825d
Show file tree
Hide file tree
Showing 5 changed files with 271 additions and 224 deletions.
2 changes: 1 addition & 1 deletion comethylation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version_info__ = ('1','0','99')
__version_info__ = ('1','1','0')
__version__ = '.'.join(__version_info__)

import funcs
Expand Down
74 changes: 36 additions & 38 deletions comethylation/funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@

#### Function definitions ####
def make_ignores_list(ic):
"""Make a list from a string of sequencing cycles that are to be ignored.
"""Make a list from a string of read positions that are to be ignored.
Args:
ic: A string of cycles to ignore. Multiple values should be comma-delimited and ranges can be specified by use of the hyphen, For example:
ic: A string of read positions to ignore. Multiple values should be comma-delimited and ranges can be specified by use of the hyphen, For example:
'1-5, 80, 98-100'
corresponds to ignoring cycles 1, 2, 3, 4, 5, 80, 98, 99, 100.
corresponds to ignoring read positions 1, 2, 3, 4, 5, 80, 98, 99, 100.
Returns:
A Python list of the positions to be ignored.
Expand All @@ -31,15 +31,15 @@ def make_ignores_list(ic):
elif len(z) == 1:
val = val + [int(z[0])]
else:
exit_msg = ''.join(['ERROR: --ignoreCycles_r1 and --ignoreCycles_r2 must be comma-delimited. Ranges can be specified by use of the hyphen, e.g. \'1-5, 80, 98-100\''])
exit_msg = ''.join(['ERROR: -ir1p and -ir2p must be comma-delimited. Ranges can be specified by use of the hyphen, e.g. \'1-5, 80, 98-100\''])
sys.exit(exit_msg)
if not all(isinstance(i, int) for i in val):
exit_msg = ''.join(['ERROR: --ignoreCycles_r1 and --ignoreCycles_r2 must be comma-delimited. Ranges can be specified by use of the hyphen, e.g. \'1-5, 80, 98-100\''])
exit_msg = ''.join(['ERROR: -ir1p and -ir2p must be comma-delimited. Ranges can be specified by use of the hyphen, e.g. \'1-5, 80, 98-100\''])
sys.exit(exit_msg)
return val

def ignore_cycles(read, methylation_index, ignore_cycles_list):
"""Ignore methylation loci in a read that appear in the ignore_cycles_list. A methylation locus may be one of CpG, CHH, CHG or CNN.
def ignore_read_pos(read, methylation_index, ignore_read_pos_list):
"""Ignore methylation loci in a read that appear in the ignore_read_pos_list. A methylation locus may be one of CpG, CHH, CHG or CNN.
Args:
read: A pysam.AlignedRead instance.
Expand All @@ -48,7 +48,7 @@ def ignore_cycles(read, methylation_index, ignore_cycles_list):
[0, 5]
corresponds to a read with a methylation locus at the first and sixth positions of the read.
ignore_cycles_list: The list of sequencing cycles to be ignored.
ignore_read_pos_list: The list of read positions to be ignored.
Returns:
An updated version of methylation_index. Will report a warning if the FLAG does not encode whether the read is part of a paired-end or which mate of the paired-end read it is. Will report an error and call sys.exit() if the XR-tag or XG-tag is incompatible or missing.
Expand All @@ -58,25 +58,25 @@ def ignore_cycles(read, methylation_index, ignore_cycles_list):
# Single-end reads
if not read.is_paired:
if strand == 'OT':
mi_updated = [mi for mi in methylation_index if mi not in ignore_cycles_list]
mi_updated = [mi for mi in methylation_index if mi not in ignore_read_pos_list]
elif strand == 'OB':
ignore_cycles_list = [read.rlen - ic - 1 for ic in ignore_cycles_list]
mi_updated = [mi for mi in methylation_index if mi not in ignore_cycles_list]
ignore_read_pos_list = [read.rlen - ic - 1 for ic in ignore_read_pos_list]
mi_updated = [mi for mi in methylation_index if mi not in ignore_read_pos_list]

# Paired-end reads: read_1
elif read.is_paired and read.is_read1:
if strand == 'OT':
mi_updated = [mi for mi in methylation_index if mi not in ignore_cycles_list]
mi_updated = [mi for mi in methylation_index if mi not in ignore_read_pos_list]
elif strand == 'OB':
ignore_cycles_list = [read.rlen - ic - 1 for ic in ignore_cycles_list]
mi_updated = [mi for mi in methylation_index if mi not in ignore_cycles_list]
ignore_read_pos_list = [read.rlen - ic - 1 for ic in ignore_read_pos_list]
mi_updated = [mi for mi in methylation_index if mi not in ignore_read_pos_list]
# Paired-end reads: read_2
elif read.is_paired and read.is_read2:
if strand == 'OT':
ignore_cycles_list = [read.rlen - ic - 1 for ic in ignore_cycles_list]
mi_updated = [mi for mi in methylation_index if mi not in ignore_cycles_list]
ignore_read_pos_list = [read.rlen - ic - 1 for ic in ignore_read_pos_list]
mi_updated = [mi for mi in methylation_index if mi not in ignore_read_pos_list]
if strand == 'OB':
mi_updated = [mi for mi in methylation_index if mi not in ignore_cycles_list]
mi_updated = [mi for mi in methylation_index if mi not in ignore_read_pos_list]

# Return updated methylation_index
return mi_updated
Expand Down Expand Up @@ -131,7 +131,7 @@ def fix_old_bismark(read):
elif read.flag == 179:
read.flag = 163
else:
exit_msg = ''.join(['ERROR: Unexpected FLAG (', str(read.flag), ') for read ', read.qname, 'Sorry, --oldBismark is unable to deal with this FLAG. Please log an issue at www.github.com/PeteHaitch/comethylation describing the error or email me at peter.hickey@gmail.com.'])
exit_msg = ''.join(['ERROR: Unexpected FLAG (', str(read.flag), ') for read ', read.qname, 'Sorry, --aligner Bismark_old is unable to deal with this FLAG. Please log an issue at www.github.com/PeteHaitch/comethylation describing the error or email me at peter.hickey@gmail.com.'])
sys.exit(exit_msg)
return read

Expand Down Expand Up @@ -269,7 +269,7 @@ def ignore_overlapping_sequence(read_1, read_2, methylation_index_1, methylation
sys.exit(exit_msg)
return methylation_index_1, methylation_index_2

def extract_and_update_methylation_index_from_single_end_read(read, BAM, methylation_m_tuples, m, methylation_type, methylation_pattern, ignore_cycles_r1, min_qual, phred_offset, ob_strand_offset):
def extract_and_update_methylation_index_from_single_end_read(read, BAM, methylation_m_tuples, m, methylation_type, methylation_pattern, ignore_read1_pos, min_qual, phred_offset, ob_strand_offset):
"""Extracts m-tuples of methylation loci from a single-end read and adds the comethylation m-tuple to the methylation_m_tuples object.
Args:
Expand All @@ -279,7 +279,7 @@ def extract_and_update_methylation_index_from_single_end_read(read, BAM, methyla
methylation_type: A string of the methylation type, e.g. CG for CpG methylation. Must be a valid option for the MTuple class.
methylation_pattern: A regular expression of the methylation loci, e.g. '[Zz]' for CpG-methylation
m: Is the "m" in "m-tuple", i.e. the size of the m-tuple. m must be an integer greater than or equal to 1. WARNING: No error or warning produced if this condition is violated.
ignore_cycles_r1: Ignore this list of sequencing cycles from each read.
ignore_read1_pos: Ignore this list of read positions from each read.
min_qual: Ignore bases with quality-score less than this value.
phred_offset: The offset in the Phred scores. Phred33 corresponds to phred_offset = 33 and Phred64 corresponds to phred_offset 64.
ob_strand_offset: How many bases a methylation loci on the OB-strand must be moved to the left in order to line up with the C on the OT-strand; e.g. ob_strand_offset = 1 for CpGs.
Expand All @@ -288,10 +288,9 @@ def extract_and_update_methylation_index_from_single_end_read(read, BAM, methyla
n_methylation_loci: The number of methylation loci extracted from the read.
"""
# Identify methylation events in read, e.g. CpGs or CHHs. The methylation_pattern is specified by a command line argument (e.g. Z/z corresponds to CpG)
# TODO: Translate read-positions to sequencing cycles - should be able to do from CIGAR string
methylation_index = [midx.start() for midx in re.finditer(methylation_pattern, read.opt('XM'))]
# Ignore any sequencing cycles specified in ignore_cycles_r1
methylation_index = ignore_cycles(read, methylation_index, ignore_cycles_r1)
# Ignore any read positions specified in ignore_read1_pos
methylation_index = ignore_read_pos(read, methylation_index, ignore_read1_pos)
# Ignore any positions with a base quality less than min_qual
methylation_index = ignore_low_quality_bases(read, methylation_index, min_qual, phred_offset)
n_methylation_loci = len(methylation_index)
Expand All @@ -314,7 +313,7 @@ def extract_and_update_methylation_index_from_single_end_read(read, BAM, methyla
methylation_m_tuples.increment_count(this_m_tuple_positions, this_comethylation_pattern, read, None)
return methylation_m_tuples, n_methylation_loci

def extract_and_update_methylation_index_from_paired_end_reads(read_1, read_2, BAM, methylation_m_tuples, m, methylation_type, methylation_pattern, ignore_cycles_r1, ignore_cycles_r2, min_qual, phred_offset, ob_strand_offset, overlap_check, n_fragment_skipped_due_to_bad_overlap, FAILED_QC):
def extract_and_update_methylation_index_from_paired_end_reads(read_1, read_2, BAM, methylation_m_tuples, m, methylation_type, methylation_pattern, ignore_read1_pos, ignore_read2_pos, min_qual, phred_offset, ob_strand_offset, overlap_check, n_fragment_skipped_due_to_bad_overlap, FAILED_QC):
"""Extracts m-tuples of methylation loci from a readpair and adds the comethylation m-tuple to the methylation_m_tuples object.
Args:
Expand All @@ -325,8 +324,8 @@ def extract_and_update_methylation_index_from_paired_end_reads(read_1, read_2, B
m: Is the "m" in "m-tuple", i.e. the size of the m-tuple. m must be an integer greater than or equal to 1. WARNING: No error or warning produced if this condition is violated.
methylation_type: A string of the methylation type, e.g. CG for CpG methylation. Must be a valid option for the MTuple class.
methylation_pattern: A regular expression of the methylation loci, e.g. '[Zz]' for CpG-methylation
ignore_cycles_r1: Ignore this list of sequencing cycles from each read_1.
ignore_cycles_r2: Ignore this list of sequencing cycles from each read_2.
ignore_read1_pos: Ignore this list of positions from each read_1.
ignore_read2_pos: Ignore this list of positions from each read_2.
min_qual: Ignore bases with quality-score less than this value.
phred_offset: The offset in the Phred scores. Phred33 corresponds to phred_offset = 33 and Phred64 corresponds to phred_offset 64.
ob_strand_offset: How many bases a methylation loci on the OB-strand must be moved to the left in order to line up with the C on the OT-strand; e.g. ob_strand_offset = 1 for CpGs.
Expand All @@ -338,12 +337,11 @@ def extract_and_update_methylation_index_from_paired_end_reads(read_1, read_2, B
n_methylation_loci: The number of methylation loci extracted from the read.
"""
# Identify methylation events in read, e.g. CpGs or CHHs. The methylation_pattern is specified by a command line argument (e.g. Z/z corresponds to CpG)
# TODO: Translate read-positions to sequencing cycles - should be able to do from CIGAR string
methylation_index_1 = [midx.start() for midx in re.finditer(methylation_pattern, read_1.opt('XM'))]
methylation_index_2 = [midx.start() for midx in re.finditer(methylation_pattern, read_2.opt('XM'))]
# Ignore any sequencing cycles specified in ignore_cycles_r1 or ignore_cycles_r1
methylation_index_1 = ignore_cycles(read_1, methylation_index_1, ignore_cycles_r1)
methylation_index_2 = ignore_cycles(read_2, methylation_index_2, ignore_cycles_r2)
# Ignore any read positions specified in ignore_read1_pos or ignore_read1_pos
methylation_index_1 = ignore_read_pos(read_1, methylation_index_1, ignore_read1_pos)
methylation_index_2 = ignore_read_pos(read_2, methylation_index_2, ignore_read2_pos)
# Ignore any positions with a base quality less than min_qual
methylation_index_1 = ignore_low_quality_bases(read_1, methylation_index_1, min_qual, phred_offset)
methylation_index_2 = ignore_low_quality_bases(read_2, methylation_index_2, min_qual, phred_offset)
Expand All @@ -359,7 +357,7 @@ def extract_and_update_methylation_index_from_paired_end_reads(read_1, read_2, B
if is_overlapping_sequence_identical(read_1, read_2, n_overlap, overlap_check):
methylation_index_1, methylation_index_2 = ignore_overlapping_sequence(read_1, read_2, methylation_index_1, methylation_index_2, n_overlap, overlap_check)
else:
failed_read_msg = '\t'.join([read_1.qname, ''.join(['failed the --overlappingPairedEndFilter ', overlap_check, '\n'])])
failed_read_msg = '\t'.join([read_1.qname, ''.join(['failed the --overlap-filter ', overlap_check, '\n'])])
FAILED_QC.write(failed_read_msg)
n_fragment_skipped_due_to_bad_overlap += 1
methylation_index_1 = []
Expand Down Expand Up @@ -483,11 +481,11 @@ def get_strand(read):
"""
## Single-end
if not read.is_paired:
## Check if aligned to CT- or CTOT-strand
## Check if aligned to OT- or CTOT-strand, i.e., informative for OT-strand.
if (read.opt('XR') == 'CT' and read.opt('XG') == 'CT') or (read.opt('XR') == 'GA' and read.opt('XG') == 'CT'):
# if read_1.opt('XG') == 'CT'
strand = 'OT'
## Else, check if aligned to OB- or CTOB-strand
## Else, check if aligned to OB- or CTOB-strand, i.e., informative for OB-strand.
elif (read.opt('XR') == 'CT' and read.opt('XG') == 'GA') or (read.opt('XR') == 'GA' and read.opt('XG') == 'GA'):
# elif read_1.opt('XG') == 'GA'
strand = 'OB'
Expand All @@ -498,11 +496,11 @@ def get_strand(read):
## Paired-end
elif read.is_paired:
if read.is_read1:
## Check if aligned to CT- or CTOT-strand
## Check if aligned to CT- or CTOT-strand, i.e., informative for OT-strand.
if (read.opt('XR') == 'CT' and read.opt('XG') == 'CT') or (read.opt('XR') == 'GA' and read.opt('XG') == 'CT'):
#if read.opt('XG') == 'CT':
strand = 'OT'
## Else, check if aligned to OB- or CTOB-strand
## Else, check if aligned to OB- or CTOB-strand, i.e., informative for OB-strand.
elif (read.opt('XR') == 'CT' and read.opt('XG') == 'GA') or (read.opt('XR') == 'GA' and read.opt('XG') == 'GA'):
#elif read.opt('XG') == 'GA':
strand = 'OB'
Expand All @@ -511,10 +509,10 @@ def get_strand(read):
exit_msg = ''.join(['ERROR: Read ', read.qname, ' has incompatible or missing XG-tag or XR-tag. Please log an issue at www.github.com/PeteHaitch/comethylation describing the error or email me at peter.hickey@gmail.com'])
sys.exit(exit_msg)
elif read.is_read2:
## Check if aligned CT or CTOT-strand
## Check if aligned CT or CTOT-strand, i.e., informative for OT-strand.
if (read.opt('XR') == 'GA' and read.opt('XG') == 'CT') or (read.opt('XR') == 'CT' and read.opt('XG') == 'CT'):
strand = 'OT'
## Else, check if aligned OB- or CTOB-strand
## Else, check if aligned OB- or CTOB-strand, i.e., informative for OB-strand.
elif (read.opt('XR') == 'GA' and read.opt('XG') == 'GA') or (read.opt('XR') == 'CT' and read.opt('XG') == 'GA'):
strand = 'OB'
## Else, something odd about this read
Expand All @@ -526,7 +524,7 @@ def get_strand(read):
return strand

__all__ = [
'ignore_cycles',
'ignore_read_pos',
'ignore_low_quality_bases',
'fix_old_bismark',
'is_overlapping_sequence_identical',
Expand Down
Loading

0 comments on commit d44825d

Please sign in to comment.