Skip to content

Commit

Permalink
Merge pull request #162 from open2c/dedup-cols
Browse files Browse the repository at this point in the history
Allow column names in all dedup backends, and allow sorting by arbitrary columns
  • Loading branch information
golobor authored Mar 17, 2024
2 parents 7cd8c9a + 75cc094 commit 91b40fd
Show file tree
Hide file tree
Showing 7 changed files with 517 additions and 122 deletions.
69 changes: 40 additions & 29 deletions pairtools/cli/dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@

logger = get_logger()

from ..lib import fileio, pairsam_format, headerops
from . import cli, common_io_options

import click

from ..lib import fileio, headerops, pairsam_format
from ..lib.dedup import streaming_dedup, streaming_dedup_cython
from ..lib.stats import PairCounter
from . import cli, common_io_options

UTIL_NAME = "pairtools_dedup"

Expand Down Expand Up @@ -179,45 +178,45 @@
)
@click.option(
"--c1",
type=int,
default=pairsam_format.COL_C1,
help=f"Chrom 1 column; default {pairsam_format.COL_C1}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[1],
help=f"Chrom 1 column; default {pairsam_format.COLUMNS_PAIRS[1]}"
"[input format option]",
)
@click.option(
"--c2",
type=int,
default=pairsam_format.COL_C2,
help=f"Chrom 2 column; default {pairsam_format.COL_C2}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[3],
help=f"Chrom 2 column; default {pairsam_format.COLUMNS_PAIRS[3]}"
"[input format option]",
)
@click.option(
"--p1",
type=int,
default=pairsam_format.COL_P1,
help=f"Position 1 column; default {pairsam_format.COL_P1}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[2],
help=f"Position 1 column; default {pairsam_format.COLUMNS_PAIRS[2]}"
"[input format option]",
)
@click.option(
"--p2",
type=int,
default=pairsam_format.COL_P2,
help=f"Position 2 column; default {pairsam_format.COL_P2}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[4],
help=f"Position 2 column; default {pairsam_format.COLUMNS_PAIRS[4]}"
"[input format option]",
)
@click.option(
"--s1",
type=int,
default=pairsam_format.COL_S1,
help=f"Strand 1 column; default {pairsam_format.COL_S1}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[5],
help=f"Strand 1 column; default {pairsam_format.COLUMNS_PAIRS[5]}"
"[input format option]",
)
@click.option(
"--s2",
type=int,
default=pairsam_format.COL_S2,
help=f"Strand 2 column; default {pairsam_format.COL_S2}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[6],
help=f"Strand 2 column; default {pairsam_format.COLUMNS_PAIRS[6]}"
"[input format option]",
)
@click.option(
"--unmapped-chrom",
Expand Down Expand Up @@ -505,8 +504,8 @@ def dedup_py(
extra_cols2 = []
if extra_col_pair is not None:
    for col1, col2 in extra_col_pair:
        # Each entry is a string holding either a column name or a numeric
        # position in column_names. A numeric string must be converted with
        # int() before it can be used as an index; indexing with the raw
        # string raises TypeError. str.isdecimal() is used because it accepts
        # exactly the digit strings that int() can parse.
        extra_cols1.append(column_names[int(col1)] if col1.isdecimal() else col1)
        extra_cols2.append(column_names[int(col2)] if col2.isdecimal() else col2)

if backend == "cython":
# warnings.warn(
Expand All @@ -516,6 +515,12 @@ def dedup_py(
# )
# Translate every column name into its position in column_names before the
# values are handed to streaming_dedup_cython. A name that is absent from
# column_names raises ValueError.
extra_cols1 = list(map(column_names.index, extra_cols1))
extra_cols2 = list(map(column_names.index, extra_cols2))
c1, c2, p1, p2, s1, s2 = map(column_names.index, (c1, c2, p1, p2, s1, s2))
streaming_dedup_cython(
method,
max_mismatch,
Expand Down Expand Up @@ -555,6 +560,12 @@ def dedup_py(
out_stat=out_stat,
backend=backend,
n_proc=n_proc,
c1=c1,
c2=c2,
p1=p1,
p2=p2,
s1=s1,
s2=s2,
)
else:
raise ValueError("Unknown backend")
Expand Down
134 changes: 107 additions & 27 deletions pairtools/cli/sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,50 @@
" If the path ends with .gz or .lz4, the output is compressed by bgzip "
"or lz4, correspondingly. By default, the output is printed into stdout.",
)
@click.option(
"--c1",
type=str,
default=pairsam_format.COLUMNS_PAIRS[1],
help=f"Chrom 1 column; default {pairsam_format.COLUMNS_PAIRS[1]}"
"[input format option]",
)
@click.option(
"--c2",
type=str,
default=pairsam_format.COLUMNS_PAIRS[3],
help=f"Chrom 2 column; default {pairsam_format.COLUMNS_PAIRS[3]}"
"[input format option]",
)
@click.option(
"--p1",
type=str,
default=pairsam_format.COLUMNS_PAIRS[2],
help=f"Position 1 column; default {pairsam_format.COLUMNS_PAIRS[2]}"
"[input format option]",
)
@click.option(
"--p2",
type=str,
default=pairsam_format.COLUMNS_PAIRS[4],
help=f"Position 2 column; default {pairsam_format.COLUMNS_PAIRS[4]}"
"[input format option]",
)
@click.option(
"--pt",
type=str,
default=pairsam_format.COLUMNS_PAIRS[7],
help=f"Pair type column; default {pairsam_format.COLUMNS_PAIRS[7]}"
"[input format option]",
)
@click.option(
"--extra-col",
nargs=1,
type=str,
multiple=True,
help="Extra column (name or numerical index) that is also used for sorting."
"The option can be provided multiple times."
'Example: --extra-col "phase1" --extra-col "phase2". [output format option]',
)
@click.option(
"--nproc",
type=int,
Expand Down Expand Up @@ -56,7 +100,21 @@
"otherwise.",
)
@common_io_options
def sort(pairs_path, output, nproc, tmpdir, memory, compress_program, **kwargs):
def sort(
pairs_path,
output,
c1,
c2,
p1,
p2,
pt,
extra_col,
nproc,
tmpdir,
memory,
compress_program,
**kwargs,
):
"""Sort a .pairs/.pairsam file.
Sort pairs in the lexicographic order along chrom1 and chrom2, in the
Expand All @@ -67,10 +125,38 @@ def sort(pairs_path, output, nproc, tmpdir, memory, compress_program, **kwargs):
input is decompressed by bgzip or lz4c, correspondingly. By default, the
input is read as text from stdin.
"""
sort_py(pairs_path, output, nproc, tmpdir, memory, compress_program, **kwargs)
sort_py(
pairs_path,
output,
c1,
c2,
p1,
p2,
pt,
extra_col,
nproc,
tmpdir,
memory,
compress_program,
**kwargs,
)


def sort_py(pairs_path, output, nproc, tmpdir, memory, compress_program, **kwargs):
def sort_py(
pairs_path,
output,
c1,
c2,
p1,
p2,
pt,
extra_col,
nproc,
tmpdir,
memory,
compress_program,
**kwargs,
):

instream = fileio.auto_open(
pairs_path,
Expand Down Expand Up @@ -104,35 +190,29 @@ def sort_py(pairs_path, output, nproc, tmpdir, memory, compress_program, **kwarg
)
compress_program = "gzip"

command = r"""
column_names = headerops.extract_column_names(header)
columns = [c1, c2, p1, p2, pt] + list(extra_col)
# Build one "-k <i>,<i><mode>" sort-key expression per column.
# A column whose name has an integer dtype in pairsam_format.DTYPES_PAIRSAM
# is sorted numerically (mode "n"); every other column is sorted as text.
cols = []
for col in columns:
    # A numeric value is used directly as the sort field number; a column
    # name is looked up in the header and shifted by one because sort
    # numbers its fields from 1.
    colindex = int(col) if col.isnumeric() else column_names.index(col) + 1
    dtype = pairsam_format.DTYPES_PAIRSAM.get(column_names[colindex - 1], str)
    # dtype is a type object, so it must be tested with issubclass();
    # isinstance(dtype, int) is always False and would silently disable
    # numeric sorting. The isinstance(dtype, type) guard keeps a non-class
    # entry from raising TypeError.
    is_integer = isinstance(dtype, type) and issubclass(dtype, int)
    cols.append(f"-k {colindex},{colindex}{'n' if is_integer else ''}")
cols = " ".join(cols)
command = rf"""
/bin/bash -c 'export LC_COLLATE=C; export LANG=C; sort
-k {0},{0} -k {1},{1} -k {2},{2}n -k {3},{3}n -k {4},{4}
{cols}
--stable
--field-separator=$'\''{5}'\''
{6}
{7}
-S {8}
{9}
--field-separator=$'\''{pairsam_format.PAIRSAM_SEP_ESCAPE}'\''
--parallel={nproc}
{f'--temporary-directory={tmpdir}' if tmpdir else ''}
-S {memory}
{f'--compress-program={compress_program}' if compress_program else ''}'
""".replace(
"\n", " "
).format(
pairsam_format.COL_C1 + 1,
pairsam_format.COL_C2 + 1,
pairsam_format.COL_P1 + 1,
pairsam_format.COL_P2 + 1,
pairsam_format.COL_PTYPE + 1,
pairsam_format.PAIRSAM_SEP_ESCAPE,
" --parallel={} ".format(nproc) if nproc > 0 else " ",
" --temporary-directory={} ".format(tmpdir) if tmpdir else " ",
memory,
(
" --compress-program={} ".format(compress_program)
if compress_program
else " "
),
)
command += "'"

with subprocess.Popen(
command, stdin=subprocess.PIPE, bufsize=-1, shell=True, stdout=outstream
) as process:
Expand Down
Loading

0 comments on commit 91b40fd

Please sign in to comment.