Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for writing Anopheles SNP data to the plink binary file format #515

Merged
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
ea54da7
add plink converyt function, faff around with some tests
Mar 26, 2024
6afd608
temp fix for avoiding error in mapping all-allele -> biallelic allele…
Apr 11, 2024
3268640
updated plink converter to use biallelic_snp_calls with tmp. amended …
Apr 11, 2024
ab6f28c
Merge branch 'master' into plink-converter-2024-03-26
tristanpwdennis Aug 8, 2024
5e502bf
Added chunking of allele_mapping array so that da.map blocks works
Aug 8, 2024
31f6469
numba works fine
Aug 8, 2024
4ef5e9c
clear up notebook
Aug 9, 2024
98767fc
Merge branch 'master' into plink-converter-2024-03-26
jonbrenas Aug 20, 2024
af47433
remove dud notebook, tidy up to_plink.py
Aug 27, 2024
96eb157
add plink converter test
Sep 19, 2024
98700e3
update plink converter test to include test for created files
Sep 19, 2024
6db9db8
tidy, add tests, refactor snp_data call
Sep 19, 2024
03ab293
Tidy tests and remove allelism selection from converter, add tests to…
Oct 7, 2024
e7a35c8
move bedreader to bottom of pyproject list
Oct 7, 2024
201f9d4
Merge branch 'master' into plink-converter-2024-03-26
jonbrenas Oct 9, 2024
7823580
Merge branch 'malariagen:master' into plink-converter-2024-03-26
tristanpwdennis Oct 15, 2024
6335208
Change the default chunks value.
jonbrenas Oct 16, 2024
c84cd1a
checkout changes from upstream to snpdata.py
Oct 18, 2024
51f9174
Update tests/anoph/test_plink_converter.py
tristanpwdennis Nov 13, 2024
ae7d474
Update malariagen_data/anoph/to_plink.py
tristanpwdennis Nov 13, 2024
e33bfcd
Update malariagen_data/anoph/to_plink.py
tristanpwdennis Nov 13, 2024
23ff3df
Update malariagen_data/anoph/to_plink.py
tristanpwdennis Nov 13, 2024
f2ceb46
update plink converter
Nov 13, 2024
df75274
fix to_plink.py
Nov 13, 2024
2c13752
fix to_plink and tests
Nov 13, 2024
df936c6
Merge branch 'plink-converter-2024-03-26' of https://github.com/trist…
Nov 13, 2024
90e72d5
update ac and an params to be same as pca, replace out dir
Nov 13, 2024
51d6c56
added overwrite param
Nov 14, 2024
0e196af
Merge branch 'master' into plink-converter-2024-03-26
alimanfoo Nov 19, 2024
ebb7ae8
Add Tristan Dennis as author
alimanfoo Nov 19, 2024
fc70d08
poetry update
alimanfoo Nov 19, 2024
643534e
fix relative import
alimanfoo Nov 19, 2024
8c6dec0
fix test plink converter notebook and run tests
Nov 20, 2024
7d10281
update notebook to fix output dir and relative output path
Nov 20, 2024
38b9510
Update malariagen_data/anoph/to_plink.py
tristanpwdennis Nov 24, 2024
b48abf4
add bool type definition
Nov 24, 2024
cc5ed2d
Merge branch 'plink-converter-2024-03-26' of https://github.com/trist…
Nov 24, 2024
6730123
add docstring
Nov 25, 2024
92e8340
fix function definitions
Nov 25, 2024
fb08e1c
add overwrite to base params and finish docstring
Nov 25, 2024
dab86ec
fix docstring and base params
Nov 25, 2024
033fc8a
update output dir type def
Nov 25, 2024
c3d7118
make plink_params.py containing overwrite and output dir, fix type de…
Nov 25, 2024
d235b76
Update malariagen_data/anoph/plink_params.py
tristanpwdennis Nov 26, 2024
c62307c
Update malariagen_data/anoph/to_plink.py
tristanpwdennis Nov 26, 2024
4605612
Update malariagen_data/anoph/to_plink.py
tristanpwdennis Nov 26, 2024
23d8c3a
Update malariagen_data/anoph/to_plink.py
tristanpwdennis Nov 26, 2024
c562106
Update malariagen_data/anoph/to_plink.py
tristanpwdennis Nov 26, 2024
de0b36c
coerce tmp_path from posix to str
tristanpwdennis Nov 27, 2024
31abfea
Merge branch 'master' into plink-converter-2024-03-26
jonbrenas Nov 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
debugging_biallelic_snp_calls.ipynb
.idea
.vscode
__pycache__
Expand Down
13 changes: 12 additions & 1 deletion malariagen_data/anoph/snp_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1627,12 +1627,23 @@ def biallelic_snp_calls(
# Store alleles, transformed.
variant_allele = ds_bi["variant_allele"].data
variant_allele = variant_allele.rechunk((variant_allele.chunks[0], -1))

# Chunk allele mapping according to same variant_allele.
allele_mapping_chunked = da.from_array(
allele_mapping, chunks=variant_allele.chunks
)

# Apply allele mapping blockwise to variant_allele.
variant_allele_out = da.map_blocks(
lambda block: apply_allele_mapping(block, allele_mapping, max_allele=1),
lambda allele, map: apply_allele_mapping(allele, map, max_allele=1),
variant_allele,
allele_mapping_chunked,
dtype=variant_allele.dtype,
chunks=(variant_allele.chunks[0], [2]),
)
# variant_allele_out = apply_allele_mapping(
# variant_allele.compute(), allele_mapping, max_allele=1
# )
data_vars["variant_allele"] = ("variants", "alleles"), variant_allele_out

# Store allele counts, transformed, so we don't have to recompute.
Expand Down
182 changes: 182 additions & 0 deletions malariagen_data/anoph/to_plink.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
from typing import Optional

import allel # type: ignore
import numpy as np
import os
import bed_reader
from dask.diagnostics import ProgressBar

from .snp_data import AnophelesSnpData
from . import base_params


class PlinkConverter(
AnophelesSnpData,
):
def __init__(
self,
**kwargs,
):
# N.B., this class is designed to work cooperatively, and
# so it's important that any remaining parameters are passed
# to the superclass constructor.
super().__init__(**kwargs)

def _create_plink_outfile(
self,
*,
results_dir,
region,
n_snps,
min_minor_ac,
thin_offset,
max_missing_an,
):
return f"{results_dir}/{region}.{n_snps}.{min_minor_ac}.{thin_offset}.{max_missing_an}"

def _biallelic_snps_to_plink(
self,
*,
results_dir,
region,
n_snps,
thin_offset,
sample_sets,
sample_query,
sample_indices,
site_mask,
min_minor_ac,
max_missing_an,
random_seed,
inline_array,
chunks,
):
# Define output file
plink_file_path = self._create_plink_outfile(
results_dir=results_dir,
region=region,
n_snps=n_snps,
thin_offset=thin_offset,
min_minor_ac=min_minor_ac,
max_missing_an=max_missing_an,
)

bed_file_path = f"{plink_file_path}.bed"
if os.path.exists(bed_file_path):
return plink_file_path
tristanpwdennis marked this conversation as resolved.
Show resolved Hide resolved

# Get snps
ds_snps = self.biallelic_snp_calls(
region=region,
sample_sets=sample_sets,
sample_query=sample_query,
sample_indices=sample_indices,
site_mask=site_mask,
random_seed=random_seed,
inline_array=inline_array,
chunks=chunks,
n_snps=n_snps,
)

# Filter SNPs for segregating sites only
with self._spinner("Subsetting to segregating sites"):
gt = ds_snps["call_genotype"].data.compute()
print("count alleles")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove this print statement?

with ProgressBar():
ac = allel.GenotypeArray(gt).count_alleles(max_allele=3)
print("ascertain segregating sites")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same thing here.

n_chroms = ds_snps.dims["samples"] * 2
an_called = ac.sum(axis=1)
an_missing = n_chroms - an_called
min_ref_ac = min_minor_ac
max_ref_ac = n_chroms - min_minor_ac
loc_sites = (
(ac[:, 0] >= min_ref_ac)
& (ac[:, 0] <= max_ref_ac)
& (an_missing <= max_missing_an)
)
print(f"ascertained {np.count_nonzero(loc_sites):,} sites")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same thing here.


# Set up dataset with required vars for plink conversion
print("Set up dataset")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same thing here.

ds_snps_asc = ds_snps[
[
"variant_contig",
"variant_position",
"variant_allele",
"sample_id",
"call_genotype",
]
].isel(alleles=slice(0, 2))

# Compute gt ref counts
with self._spinner("Computing genotype ref counts"):
gt_asc = ds_snps_asc["call_genotype"].data.compute()
gn_ref = allel.GenotypeDaskArray(gt_asc).to_n_ref(fill=-127)
with ProgressBar():
gn_ref = gn_ref.compute()
tristanpwdennis marked this conversation as resolved.
Show resolved Hide resolved

loc_var = np.any(gn_ref != gn_ref[:, 0, np.newaxis], axis=1)

with ProgressBar():
ds_snps_final = ds_snps_asc[
["variant_contig", "variant_position", "variant_allele", "sample_id"]
].isel(variants=loc_var)

# Init vars for input to bed reader
gn_ref_final = gn_ref[loc_var]
val = gn_ref_final.T
alleles = ds_snps_final["variant_allele"].values
properties = {
"iid": ds_snps_final["sample_id"].values,
"chromosome": ds_snps_final["variant_contig"].values,
"bp_position": ds_snps_final["variant_position"].values,
"allele_1": alleles[:, 0],
"allele_2": alleles[:, 1],
}
tristanpwdennis marked this conversation as resolved.
Show resolved Hide resolved

bed_reader.to_bed(
filepath=bed_file_path,
val=val,
properties=properties,
count_A1=True,
)

print(f"PLINK files written to to: {plink_file_path}")
tristanpwdennis marked this conversation as resolved.
Show resolved Hide resolved

return plink_file_path

def biallelic_snps_to_plink(
tristanpwdennis marked this conversation as resolved.
Show resolved Hide resolved
self,
results_dir,
tristanpwdennis marked this conversation as resolved.
Show resolved Hide resolved
region: base_params.regions,
n_snps: base_params.n_snps,
thin_offset: base_params.thin_offset = 0,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
sample_indices: Optional[base_params.sample_indices] = None,
site_mask: Optional[base_params.site_mask] = None,
min_minor_ac: Optional[base_params.min_minor_ac] = 0,
max_missing_an: Optional[base_params.max_missing_an] = 0,
tristanpwdennis marked this conversation as resolved.
Show resolved Hide resolved
random_seed: base_params.random_seed = 42,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
):
params = dict(
results_dir=results_dir,
region=region,
n_snps=n_snps,
thin_offset=thin_offset,
sample_sets=sample_sets,
sample_query=sample_query,
sample_indices=sample_indices,
site_mask=site_mask,
min_minor_ac=min_minor_ac,
max_missing_an=max_missing_an,
random_seed=random_seed,
)

filepath = self._biallelic_snps_to_plink(
inline_array=inline_array, chunks=chunks, **params
)
return filepath
2 changes: 2 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from .anoph.pca import AnophelesPca
from .anoph.sample_metadata import AnophelesSampleMetadata, locate_cohorts
from .anoph.snp_data import AnophelesSnpData
from .anoph.to_plink import PlinkConverter
from .anoph.g123 import AnophelesG123Analysis
from .anoph.fst import AnophelesFstAnalysis
from .anoph.h12 import AnophelesH12Analysis
Expand Down Expand Up @@ -105,6 +106,7 @@ class AnophelesDataResource(
AnophelesG123Analysis,
AnophelesFstAnalysis,
AnophelesSnpFrequencyAnalysis,
PlinkConverter,
AnophelesPca,
AnophelesIgv,
AnophelesAimData,
Expand Down
Loading