Skip to content

Commit

Permalink
Header CLI proposal. See #119
Browse files Browse the repository at this point in the history
  • Loading branch information
agalitsyna committed Apr 8, 2022
1 parent b002dbe commit 28eb983
Show file tree
Hide file tree
Showing 5 changed files with 819 additions and 13 deletions.
1 change: 1 addition & 0 deletions pairtools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,4 @@ def wrapper(*args, **kwargs):
from .pairtools_stats import stats
from .pairtools_sample import sample
from .pairtools_filterbycov import filterbycov
from .pairtools_header import header
115 changes: 102 additions & 13 deletions pairtools/_headerops.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,22 @@
PAIRS_FORMAT_VERSION = "1.0.0"
SEP_COLS = " "
SEP_CHROMS = " "
COMMENT_CHAR = "#"

def get_header(instream, comment_char="#"):
def get_stream_handlers(instream):
# get peekable buffer for the instream
readline_f, peek_f = None, None
if hasattr(instream, "buffer"):
peek_f = instream.buffer.peek
readline_f = instream.buffer.readline
elif hasattr(instream, "peek"):
peek_f = instream.peek
readline_f = instream.readline
else:
raise ValueError("Cannot find the peek() function of the provided stream!")
return readline_f, peek_f

def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False):
"""Returns a header from the stream and an the reaminder of the stream
with the actual data.
Parameters
Expand All @@ -25,6 +39,8 @@ def get_header(instream, comment_char="#"):
comment_char : str
The character prepended to header lines (use '@' when parsing sams,
'#' when parsing pairsams).
ignore_warning : bool
If True, then no warning will be generated if header of pairs file is empty.
Returns
-------
header : list
Expand All @@ -37,17 +53,8 @@ def get_header(instream, comment_char="#"):
if not comment_char:
raise ValueError("Please, provide a comment char!")
comment_byte = comment_char.encode()
# get peekable buffer for the instream
read_f, peek_f = None, None
if hasattr(instream, "buffer"):
peek_f = instream.buffer.peek
readline_f = instream.buffer.readline
elif hasattr(instream, "peek"):
peek_f = instream.peek
readline_f = instream.readline
else:
raise ValueError("Cannot find the peek() function of the provided stream!")

readline_f, peek_f = get_stream_handlers(instream)
current_peek = peek_f(1)
while current_peek.startswith(comment_byte):
# consuming a line from buffer guarantees
Expand All @@ -62,6 +69,10 @@ def get_header(instream, comment_char="#"):
current_peek = peek_f(1)
# apparently, next line does not start with the comment
# return header and the instream, advanced to the beginning of the data

if len(header)==0 and not ignore_warning:
warnings.warn("Headerless input, please, add the header by `pairtools header generate` or `pairtools header transfer`")

return header, instream


Expand All @@ -75,7 +86,7 @@ def extract_fields(header, field_name, save_rest=False):
fields = []
rest = []
for l in header:
if l.lstrip("#").startswith(field_name + ":"):
if l.lstrip(COMMENT_CHAR).startswith(field_name + ":"):
fields.append(l.split(":", 1)[1].strip())
elif save_rest:
rest.append(l)
Expand All @@ -98,6 +109,52 @@ def extract_column_names(header):
return []


def validate_cols(stream, columns):
"""
Validate that the number of columns coincides between stream and columns.
Checks only the first line in the pairs stream!
Note that it irreversibly removes the header from the stream.
Parameters
----------
stream: input stream, body or full .pairs file
columns: columns to validate against
Returns
-------
True if the number of columns is identical between file and columns
"""

comment_byte = COMMENT_CHAR.encode()
readline_f, peek_f = get_stream_handlers(stream)

current_peek = peek_f(1)
while current_peek.startswith(comment_byte):
# consuming a line from buffer guarantees
# that the remainder of the buffer starts
# with the beginning of the line.
line = readline_f()
# peek into the remainder of the instream
current_peek = peek_f(1)

line = readline_f()
if isinstance(line, bytes):
line = line.decode()

ncols_body = len(line.split(_pairsam_format.PAIRSAM_SEP))
ncols_reference = len(columns) if isinstance(columns, list) else columns.split(SEP_COLS)

return ncols_body==ncols_reference


def validate_header_cols(stream, header):
""" Validate that the number of columns corresponds between the stream and header """

columns = extract_column_names(header)
return validate_cols(stream, header)


def extract_chromsizes(header):
"""
Extract chromosome sizes from header lines.
Expand Down Expand Up @@ -129,6 +186,20 @@ def get_chromsizes_from_pysam_header(samheader):
return dict(chromsizes)


def get_chromsizes_from_file(chroms_file):
"""
Produce an "enumeration" of chromosomes based on the list
of chromosomes
"""
chrom_sizes = dict()
with open(chroms_file, "rt") as f:
for line in f:
chrom, size = line.strip().split("\t")
chrom_sizes[chrom] = int(size)

return chrom_sizes


def get_chrom_order(chroms_file, sam_chroms=None):
"""
Produce an "enumeration" of chromosomes based on the list
Expand Down Expand Up @@ -244,7 +315,7 @@ def _update_header_entry(header, field, new_value):
found = False
newline = "#{}: {}".format(field, new_value)
for i in range(len(header)):
if header[i].startswith("#" + field):
if header[i].startswith(COMMENT_CHAR + field):
header[i] = newline
found = True
if not found:
Expand Down Expand Up @@ -641,6 +712,24 @@ def append_columns(header, columns):
header[i] += SEP_COLS + SEP_COLS.join(columns)
return header

def set_columns(header, columns):
"""
Set columns to the header, separated by SEP_COLS
Parameters
----------
header: Previous header
columns: List of column names to append
Returns
-------
Modified header (appended columns to the field "#columns")
"""
for i in range(len(header)):
if header[i].startswith("#columns:"):
header[i] = "#columns:"+ SEP_COLS + SEP_COLS.join(columns)
return header

# def _guess_genome_assembly(samheader):
# PG = [l for l in samheader if l.startswith('@PG') and '\tID:bwa' in l][0]
# CL = [field for field in PG.split('\t') if field.startswith('CL:')]
Expand Down
8 changes: 8 additions & 0 deletions pairtools/_pairsam_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,18 @@
COL_SAM1 = 8
COL_SAM2 = 9

# All possible columns
COLUMNS = ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2',
'strand1', 'strand2', 'pair_type', 'sam1', 'sam2',
'junction_index']

# Required columns for formats:
COLUMNS_PAIRSAM = ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2',
'strand1', 'strand2', 'pair_type', 'sam1', 'sam2']

COLUMNS_PAIRS = ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2',
'strand1', 'strand2', 'pair_type']

UNMAPPED_CHROM = '!'
UNMAPPED_POS = 0
UNMAPPED_STRAND = '-'
Expand Down
Loading

0 comments on commit 28eb983

Please sign in to comment.