Skip to content

Commit

Permalink
Add Pf8 (#659)
Browse files Browse the repository at this point in the history
* initial pf8 gcs

PR ready Pf8()

PR ready Pf8()

* file path change

* s3 access rewired via fsspec

* fixes the check for annotation file

* removes extra path

* altlen commented in test units

* missed fix for altlen

* fixes for PR feedbacks

* fix-2 for PR feedbacks

* fix for syntax error

* chained urls into s3 fs

* make handling of chained URLs consistent between GCS and S3

* fix bug

---------

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
  • Loading branch information
eselimnl and alimanfoo authored Dec 3, 2024
1 parent 6052cdc commit 6cd964b
Show file tree
Hide file tree
Showing 7 changed files with 542 additions and 13 deletions.
1 change: 1 addition & 0 deletions malariagen_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .amin1 import Amin1
from .anopheles import AnophelesDataResource, Region
from .pf7 import Pf7
from .pf8 import Pf8
from .pv4 import Pv4
from .util import SiteClass

Expand Down
43 changes: 43 additions & 0 deletions malariagen_data/pf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os

from .plasmodium import PlasmodiumDataResource


class Pf8(PlasmodiumDataResource):
    """Provides access to data from the Pf8 release.

    Parameters
    ----------
    url : str, optional
        Base path to data. Default uses Google Cloud Storage "gs://pf8-release/",
        or specify a local path on your file system if data have been downloaded.
    data_config : str, optional
        Path to config for structure of Pf8 data resource. Defaults to config included
        with the malariagen_data package.
    **kwargs
        Passed through to fsspec when setting up file system access.

    Examples
    --------
    Access data from Google Cloud Storage (default):

    >>> import malariagen_data
    >>> pf8 = malariagen_data.Pf8()

    Access data downloaded to a local file system:

    >>> pf8 = malariagen_data.Pf8("/local/path/to/pf8-release/")

    """

    def __init__(
        self,
        url=None,
        data_config=None,
        **kwargs,
    ):
        # Fall back to the config file that ships next to this module when the
        # caller does not supply one.
        if not data_config:
            working_dir = os.path.dirname(os.path.abspath(__file__))
            data_config = os.path.join(working_dir, "pf8_config.json")

        # Forward **kwargs so that the documented file system options are not
        # silently dropped. With no extra keyword arguments this call is
        # identical to the previous one.
        # NOTE(review): assumes PlasmodiumDataResource.__init__ accepts
        # **kwargs and hands them to the file system setup -- confirm.
        super().__init__(data_config=data_config, url=url, **kwargs)
118 changes: 118 additions & 0 deletions malariagen_data/pf8_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{
"default_url": "gs://pf8-release/",
"metadata_path": "metadata/Pf8_samples.txt",
"reference_path": "reference/PlasmoDB-54-Pfalciparum3D7-Genome.zarr/",
"reference_contigs": [
"Pf3D7_01_v3",
"Pf3D7_02_v3",
"Pf3D7_03_v3",
"Pf3D7_04_v3",
"Pf3D7_05_v3",
"Pf3D7_06_v3",
"Pf3D7_07_v3",
"Pf3D7_08_v3",
"Pf3D7_09_v3",
"Pf3D7_10_v3",
"Pf3D7_11_v3",
"Pf3D7_12_v3",
"Pf3D7_13_v3",
"Pf3D7_14_v3",
"Pf3D7_API_v3",
"Pf3D7_MIT_v3"
],
"annotations_path": "annotations/PlasmoDB-55_Pfalciparum3D7.gff.gz",
"variant_calls_zarr_path": "zarr/",
"default_variant_variables": {
"FILTER_PASS": ["variants"],
"is_snp": ["variants"],
"numalt": ["variants"],
"CDS": ["variants"]
},
"extended_calldata_variables": {
"DP": ["variants", "samples"],
"GQ": ["variants", "samples"],
"MIN_DP": ["variants", "samples"],
"PGT": ["variants", "samples"],
"PID": ["variants", "samples"],
"PS": ["variants", "samples"],
"RGQ": ["variants", "samples"],
"PL": ["variants", "samples", "genotypes"],
"SB": ["variants", "samples", "sb_statistics"]
},
"extended_variant_fields": {
"AC": ["variants", "alt_alleles"],
"AF": ["variants", "alt_alleles"],
"AN": ["variants"],
"ANN_AA_length": ["variants", "alt_alleles"],
"ANN_AA_pos": ["variants", "alt_alleles"],
"ANN_Allele": ["variants", "alt_alleles"],
"ANN_Annotation": ["variants", "alt_alleles"],
"ANN_Annotation_Impact": ["variants", "alt_alleles"],
"ANN_CDS_length": ["variants", "alt_alleles"],
"ANN_CDS_pos": ["variants", "alt_alleles"],
"ANN_Distance": ["variants", "alt_alleles"],
"ANN_Feature_ID": ["variants", "alt_alleles"],
"ANN_Feature_Type": ["variants", "alt_alleles"],
"ANN_Gene_ID": ["variants", "alt_alleles"],
"ANN_Gene_Name": ["variants", "alt_alleles"],
"ANN_HGVS_c": ["variants", "alt_alleles"],
"ANN_HGVS_p": ["variants", "alt_alleles"],
"ANN_Rank": ["variants", "alt_alleles"],
"ANN_Transcript_BioType": ["variants", "alt_alleles"],
"ANN_cDNA_length": ["variants", "alt_alleles"],
"ANN_cDNA_pos": ["variants", "alt_alleles"],
"AS_BaseQRankSum": ["variants", "alt_alleles"],
"AS_FS": ["variants", "alt_alleles"],
"AS_InbreedingCoeff": ["variants", "alt_alleles"],
"AS_MQ": ["variants", "alt_alleles"],
"AS_MQRankSum": ["variants", "alt_alleles"],
"AS_QD": ["variants", "alt_alleles"],
"AS_ReadPosRankSum": ["variants", "alt_alleles"],
"AS_SOR": ["variants", "alt_alleles"],
"BaseQRankSum": ["variants"],
"DP": ["variants"],
"DS": ["variants"],
"END": ["variants"],
"ExcessHet": ["variants"],
"FILTER_Apicoplast": ["variants"],
"FILTER_Centromere": ["variants"],
"FILTER_InternalHypervariable": ["variants"],
"FILTER_LowQual": ["variants"],
"FILTER_Low_VQSLOD": ["variants"],
"FILTER_Mitochondrion": ["variants"],
"FILTER_SubtelomericHypervariable": ["variants"],
"FILTER_SubtelomericRepeat": ["variants"],
"FILTER_VQSRTrancheINDEL99.50to99.60": ["variants"],
"FILTER_VQSRTrancheINDEL99.60to99.80": ["variants"],
"FILTER_VQSRTrancheINDEL99.80to99.90": ["variants"],
"FILTER_VQSRTrancheINDEL99.90to99.95": ["variants"],
"FILTER_VQSRTrancheINDEL99.95to100.00+": ["variants"],
"FILTER_VQSRTrancheINDEL99.95to100.00": ["variants"],
"FILTER_VQSRTrancheSNP99.50to99.60": ["variants"],
"FILTER_VQSRTrancheSNP99.60to99.80": ["variants"],
"FILTER_VQSRTrancheSNP99.80to99.90": ["variants"],
"FILTER_VQSRTrancheSNP99.90to99.95": ["variants"],
"FILTER_VQSRTrancheSNP99.95to100.00+": ["variants"],
"FILTER_VQSRTrancheSNP99.95to100.00": ["variants"],
"FS": ["variants"],
"ID": ["variants"],
"InbreedingCoeff": ["variants"],
"LOF": ["variants"],
"MLEAC": ["variants", "alt_alleles"],
"MLEAF": ["variants", "alt_alleles"],
"MQ": ["variants"],
"MQRankSum": ["variants"],
"NEGATIVE_TRAIN_SITE": ["variants"],
"NMD": ["variants"],
"POSITIVE_TRAIN_SITE": ["variants"],
"QD": ["variants"],
"QUAL": ["variants"],
"RAW_MQandDP": ["variants", "ploidy"],
"ReadPosRankSum": ["variants"],
"RegionType": ["variants"],
"SOR": ["variants"],
"VQSLOD": ["variants"],
"culprit": ["variants"],
"set": ["variants"]
}
}
2 changes: 1 addition & 1 deletion malariagen_data/plasmodium.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def genome_sequence(self, region="*", inline_array=True, chunks="native"):
)
return d

def genome_features(self, attributes=("ID", "Parent", "Name", "alias")):
def genome_features(self, attributes=("ID", "Parent", "Name")):
"""Access genome feature annotations.
Parameters
Expand Down
34 changes: 29 additions & 5 deletions malariagen_data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,18 +450,42 @@ def init_filesystem(url, **kwargs):
scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

# Ensure credentials are passed through to gcsfs.
kwargs.setdefault("token", credentials)

# Ensure options are passed through to gcsfs, even if URL is chained.
if url.startswith("gs://") or url.startswith("gcs://"):
kwargs["token"] = credentials
storage_options = kwargs
elif "gs://" in url:
# Chained URL.
kwargs["gs"] = dict(token=credentials)
storage_options = {"gs": kwargs}
elif "gcs://" in url:
# Chained URL.
kwargs["gcs"] = dict(token=credentials)
storage_options = {"gcs": kwargs}

elif "s3://" in url:
# N.B., we currently assume any S3 URLs refer to buckets hosted at Sanger.
config = {
"signature_version": "s3",
"s3": {"addressing_style": "virtual"},
}

# Create an S3FileSystem with custom endpoint if specified.
kwargs.setdefault("anon", True) # Default to anonymous access.
kwargs.setdefault("endpoint_url", "https://cog.sanger.ac.uk")
kwargs.setdefault("config_kwargs", config)

if url.startswith("s3://"):
storage_options = kwargs
else:
# Chained URL.
storage_options = {"s3": kwargs}

else:
# Some other kind of URL, pass through kwargs as-is.
storage_options = kwargs

# Process the URL using fsspec.
fs, path = url_to_fs(url, **kwargs)
fs, path = url_to_fs(url, **storage_options)

# Path compatibility, fsspec/gcsfs behaviour varies between versions.
while path.endswith("/"):
Expand Down
Loading

0 comments on commit 6cd964b

Please sign in to comment.