Skip to content

Commit

Permalink
[feature] filters
Browse files Browse the repository at this point in the history
- filtering problematic mutations isn't hard-coded anymore
- uses a new filter mini-format
- old filters converted into YAML
- tests to compare with hard-coded version
- document syntax in README.md
  • Loading branch information
DrYak committed Jul 5, 2024
1 parent 051ea4c commit c78c0bb
Show file tree
Hide file tree
Showing 6 changed files with 357 additions and 1 deletion.
45 changes: 45 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ Options:
-k, --deconv-config, --dec YAML
Configuration of parameters for kernel
deconvolution [required]
--filters YAML List of filters for removing problematic
mutations from tally
-l, --loc, --location, --wwtp, --catchment NAME
Name(s) of location/wastewater treatment
plant/catchment area to process
Expand Down Expand Up @@ -288,6 +290,49 @@ var_dates:
```
see [variants_dates_example.yaml](variants_dates_example.yaml).

#### Filters (optional)

Some mutations might be problematic and need to be taken out --- e.g.
due to drop-outs in the multiplex PCR amplification, they do not show up in the data
and this could be misinterpreted by LolliPop as proof of absence of a variant.
This optional file contains a collection of named filters; each filter is a list of statements
that select entries based on the values found in the columns of the tally table.
The general syntax of statements is:
```text
- <column> <op> <value>
```
Valid _op_ are:
- `==` on that line, the value in _column_ is exactly _value_
- for simple strings this can be omitted: `- proto v3` is synonymous with `- proto == v3`
- `<=` the value is less than or equal to _value_
- `>=` the value is greater than or equal to _value_
- `<` the value is less than _value_
- `>` the value is greater than _value_
- `!=` the value is **not** _value_
- `in` the value is found in the list specified in _value_
- `~` the value matches the regular expression in _value_
- regex can be quoted using `/` or `@`
- `!~` the value does **not** match the regular expression in _value_

Any arbitrary column found in the input file can be used.

All statements of a filter are combined with a logical `and`; lines matching all of them are removed from the tally table.

For example:
```yaml
# filter to remove test samples
remove_test:
- sample ~ /^Test/
# filter to remove an amplicon that has drop-outs
amplicon75:
- proto v3
- date > 2021-11-20
- pos >= 22428
- pos <= 22785
```
see [example in filters_preprint.yaml](filters_preprint.yaml).

#### Running it

```bash
Expand Down
53 changes: 53 additions & 0 deletions filters_preprint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
bad_mutations:
- proto v3
- mutations in [ 28461G, 11201G, 26801C, -28461G, -11201G, -26801C ]

amplicon75:
- proto v3
- date > 2021-11-20
- pos >= 22428
- pos <= 22785

amplicon76:
- proto v3
- date > 2021-11-20
- pos >= 22677
- pos <= 23028

amplicon77:
- proto v3
- date > 2021-11-20
- pos >= 22974
- pos <= 23327

amplicon88:
- proto v3
- date > 2021-11-20
- pos >= 26277
- pos <= 26635

amplicon90:
- proto v3
- date > 2021-11-20
- pos >= 26895
- pos <= 27256

other_0:
- proto v3
- date > 2021-11-20
- pos == 26709

other_1:
- proto v3
- date > 2021-11-20
- pos == 27807

other_2:
- proto v3
- date > 2021-11-20
- pos == 2832

other_3:
- proto v3
- date > 2021-11-20
- pos == 10449
19 changes: 18 additions & 1 deletion lollipop/cli/deconvolute.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@
default=None,
help="Name(s) of location/wastewater treatment plant/catchment area to process",
)
@click.option(
"--filters",
"-fl",
metavar="YAML",
required=False,
default=None,
type=str,
help="List of filters for removing problematic mutations from tally",
)
@click.option(
"--seed",
"-s",
Expand All @@ -109,6 +118,7 @@ def deconvolute(
variants_dates,
deconv_config,
loc,
filters,
seed,
output,
fmt_columns,
Expand All @@ -135,6 +145,13 @@ def deconvolute(
with open(deconv_config, "r") as file:
deconv = yaml.load(file)

# problematic mutation filters
if filters:
with open(filters, "r") as file:
filters = yaml.load(file)

print(f"{len(filters)} filter{ '' if len(filters) == 1 else 's' } loaded")

# data
try:
df_tally = pd.read_csv(
Expand Down Expand Up @@ -287,7 +304,7 @@ def deconvolute(
no_date=no_date,
remove_deletions=remove_deletions,
)
preproc = preproc.filter_mutations()
preproc = preproc.filter_mutations(filters=filters)

print("deconvolve all")
np.random.seed(seed)
Expand Down
90 changes: 90 additions & 0 deletions lollipop/preprocessors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import pandas as pd
import numpy as np
from functools import reduce
import re
import sys
from pandas.api.types import is_numeric_dtype


class DataPreprocesser:
Expand Down Expand Up @@ -118,6 +121,93 @@ def general_preprocess(

def filter_mutations(self, filters=None):
"""filter out hardcoded problematic mutations"""
if filters is None:
return self

types = self.df_tally.dtypes

rxprser = re.compile(
r"^ *(?:(?P<col>"
+ r"|".join(self.df_tally.columns)
+ r")|(?P<bad>\w+)) *(?P<op>in|[<>=~!]*) *(?P<qv>['\"]?)(?P<val>.+)(?P=qv) *$"
)

def apply_filter_statement(name, fs):
"""parse a single statement from a filter and apply it, returning a boolean series"""
m = rxprser.search(fs)
assert m, f"Cannot parse statement <{fs}> in filter {name}"
m = m.groupdict()

assert m[
"col"
], f"bad column name {m['bad']}, not in list: {self.df_tally.columns}, while parsing statement <{fs}> in filter {name}"

# HACK handle 'date' column differently, to force datatypes
col = (
pd.to_datetime(self.df_tally["date"])
if "date" == m["col"]
else self.df_tally[m["col"]]
)
val = (
np.datetime64(m["val"])
if "date" == m["col"]
else (
pd.to_numeric(m["val"])
if is_numeric_dtype(types[m["col"]])
else m["val"]
)
)

# apply operator
match m["op"]:
case "=" | "==" | "" as e:
if e == "":
assert (
" " not in val
), "Do not use values with space <{val}> when using no operator (implicit 'equals'). (while parsing statement <{fs}> in filter {name})"
return col == val
case "!=" | "!":
return col != val
case "<":
return col < val
case "<=" | "=<":
return col <= val
case ">=" | ">=":
return col >= val
case ">":
return col > val
case "in":
# unpack list
return col.isin(
[
v.strip("\"' ")
for v in val.lstrip("[ ").rstrip(" ]").split(",")
]
)
case "~" | "=~" | "~=":
return col.str.contains(
val[1, -2] if val[0] == val[-1] in "/@" else val
)
case "!~" | "~!":
return ~(
col.str.contains(
val[1, -2] if val[0] == val[-1] in "/@" else val
)
)
case _ as o:
raise ValueError(
f"unknown operator {o}, while parsing statement <{fs}> in filter {name}"
)

for name, fl in filters.items():
print(f"filter {name}")

self.df_tally = self.df_tally[
~reduce(
(lambda x, y: x & y),
[apply_filter_statement(name, fstatmt) for fstatmt in fl],
)
]

# HACK completely disable filters
return self
Loading

0 comments on commit c78c0bb

Please sign in to comment.