From 3b3bf3414c7b5813895d506710f3993e46fd8a35 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Wed, 23 Oct 2024 11:53:09 +0100
Subject: [PATCH] Fix combine CLI producing empty output with invalid data (#56)

* fix variant model to support failed harmonisation
* combine: fix CLI not raising exceptions
* add test with variants that fail harmonisation
* bump patch version
* make sure an output file exists
* add test to make sure ValidationErrors are correctly raised
* add hm_rsID to field validators
* relax rsID format check
* disable rsid format check when variants are harmonised by liftover
---
 pgscatalog.core/pyproject.toml                |  2 +-
 .../src/pgscatalog/core/cli/combine_cli.py    | 46 +++++++-----
 .../src/pgscatalog/core/lib/_normalise.py     |  2 +-
 .../src/pgscatalog/core/lib/models.py         | 74 +++++++++++++++----
 pgscatalog.core/tests/data/bad_harmonised.txt | 23 ++++++
 .../tests/data/invalid_scorefile.txt          | 18 +++++
 pgscatalog.core/tests/test_combine_cli.py     | 46 ++++++++++++
 7 files changed, 177 insertions(+), 34 deletions(-)
 create mode 100644 pgscatalog.core/tests/data/bad_harmonised.txt
 create mode 100644 pgscatalog.core/tests/data/invalid_scorefile.txt

diff --git a/pgscatalog.core/pyproject.toml b/pgscatalog.core/pyproject.toml
index 6bef853..f4e1f32 100644
--- a/pgscatalog.core/pyproject.toml
+++ b/pgscatalog.core/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pgscatalog.core"
-version = "0.3.1"
+version = "0.3.2"
 description = "Core tools for working with polygenic scores (PGS) and the PGS Catalog"
 license = "Apache-2.0"
 authors = ["Benjamin Wingfield ", "Samuel Lambert ", "Laurent Gil "]
diff --git a/pgscatalog.core/src/pgscatalog/core/cli/combine_cli.py b/pgscatalog.core/src/pgscatalog/core/cli/combine_cli.py
index 080d7a5..d61e0d1 100755
--- a/pgscatalog.core/src/pgscatalog/core/cli/combine_cli.py
+++ b/pgscatalog.core/src/pgscatalog/core/cli/combine_cli.py
@@ -6,6 +6,7 @@
 import textwrap
 from typing import Optional
 
+import pydantic
 from tqdm import tqdm
 
 from pgscatalog.core.lib.models import ScoreLog, ScoreLogs, ScoreVariant, VariantLog
@@ -39,7 +40,9 @@ def _combine(
             {"accession", "row_nr", "hm_source", "is_complex"}
         )
         # it's important to create the list here to raise EffectTypeErrors
-        # for the largest scoring files this can use quite a lot of memory (~16GB)
+        # for the largest scoring files this can use quite a lot of memory (~32GB)
+        # TODO: materialise and write these variants in batches to reduce memory usage
+        # don't forget to think about deleting a partially written file (i.e. bad batch?)
         dumped_variants = list(x.model_dump(include=fields) for x in normalised_score)
         logger.info(f"Finished processing {scorefile.pgs_id}")
     except EffectTypeError:
@@ -47,27 +50,34 @@ def _combine(
             f"Unsupported non-additive effect types in {scorefile=}, skipping"
         )
         is_compatible = False
+    except pydantic.ValidationError:
+        logger.critical(
+            f"{scorefile.pgs_id} contains invalid data, stopping and exploding"
+        )
+        raise
     else:
+        # no exceptions found, good to write out
         logger.info("Writing variants to file")
         writer = TextFileWriter(compress=compress_output, filename=out_path)
         writer.write(dumped_variants)
         logger.info("Finished writing")
-    finally:
-        variant_logs: Optional[list[VariantLog]] = None
-        if dumped_variants is not None:
-            variant_logs = [VariantLog(**x) for x in dumped_variants]
-
-        log: ScoreLog = ScoreLog(
-            header=scorefile.header,
-            variant_logs=variant_logs,
-            compatible_effect_type=is_compatible,
+
+    # (don't use finally clause here, it was hiding helpful exceptions)
+    variant_logs: Optional[list[VariantLog]] = None
+    if dumped_variants is not None:
+        variant_logs = [VariantLog(**x) for x in dumped_variants]
+
+    log: ScoreLog = ScoreLog(
+        header=scorefile.header,
+        variant_logs=variant_logs,
+        compatible_effect_type=is_compatible,
+    )
+    if log.variants_are_missing:
+        logger.warning(
+            f"{log.variant_count_difference} fewer variants in output compared to original file"
         )
-        if log.variants_are_missing:
-            logger.warning(
-                f"{log.variant_count_difference} fewer variants in output compared to original file"
-            )
-        logger.info("Normalisation complete, returning score log")
-        return log
+    logger.info("Normalisation complete, returning score log")
+    return log
 
 
 def run():
@@ -77,7 +87,7 @@ def run():
         logger.setLevel(logging.DEBUG)
         logger.debug("Verbose logging enabled")
 
-    out_path = pathlib.Path(args.outfile)
+    out_path = pathlib.Path(args.outfile).resolve()
 
     if out_path.exists():
         logger.critical(f"Output file already exists: {args.outfile}")
@@ -155,7 +165,7 @@ def run():
         )
         variant_log.append(log)
 
-    if n_finished == 0:
+    if n_finished == 0 or not out_path.exists():
         raise ValueError(
             "Couldn't process any scoring files. Did they all have non-additive weights?"
         )
diff --git a/pgscatalog.core/src/pgscatalog/core/lib/_normalise.py b/pgscatalog.core/src/pgscatalog/core/lib/_normalise.py
index 4be9314..2348467 100644
--- a/pgscatalog.core/src/pgscatalog/core/lib/_normalise.py
+++ b/pgscatalog.core/src/pgscatalog/core/lib/_normalise.py
@@ -155,7 +155,7 @@ def remap_harmonised(variants, harmonised, target_build):
     Perhaps authors submitted rsID and effect allele originally:
 
     >>> from pgscatalog.core.lib.models import ScoreVariant
-    >>> variant = ScoreVariant(**{"chr_position": 1, "rsID": None, "chr_name": "2", "effect_allele": "A", "effect_weight": 5, "accession": "test", "hm_chr": "1", "hm_pos": 100, "hm_rsID": "testrsid", "hm_inferOtherAllele": "A", "row_nr": 0})
+    >>> variant = ScoreVariant(**{"chr_position": 1, "rsID": None, "chr_name": "2", "effect_allele": "A", "effect_weight": 5, "accession": "test", "hm_chr": "1", "hm_pos": 100, "hm_rsID": "rsTEST", "hm_inferOtherAllele": "A", "row_nr": 0})
     >>> variant
     ScoreVariant(..., effect_allele=Allele(allele='A', is_snp=True), other_allele=None, ...
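The combine_cli.py change above is mostly about exception flow: the old `_combine()` returned the score log from inside a `finally` clause, and a `return` in `finally` silently discards any in-flight exception, so `pydantic.ValidationError`s never reached the caller and an empty combined file could look like success. A minimal standalone sketch of that Python behaviour (illustration only, not part of the patch; the function names are made up):

```python
def swallowed() -> str:
    """Mimics the old _combine(): returning from finally hides the error."""
    try:
        raise ValueError("bad variant data")
    finally:
        return "score log"  # the ValueError is silently discarded here


def propagated() -> str:
    """Mimics the patched _combine(): log, re-raise, build the log after the try block."""
    try:
        raise ValueError("bad variant data")
    except ValueError:
        raise  # like the new pydantic.ValidationError handler
    return "score log"  # only reached when no exception was raised


print(swallowed())  # prints "score log"; the ValueError never surfaces
try:
    propagated()
except ValueError as err:
    print(f"caught: {err}")  # the caller now sees the error
```

With the `finally` clause gone, `run()` can fail loudly, and the new `n_finished == 0 or not out_path.exists()` guard covers the case where nothing was ever written.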
diff --git a/pgscatalog.core/src/pgscatalog/core/lib/models.py b/pgscatalog.core/src/pgscatalog/core/lib/models.py
index 113704b..0b85f62 100644
--- a/pgscatalog.core/src/pgscatalog/core/lib/models.py
+++ b/pgscatalog.core/src/pgscatalog/core/lib/models.py
@@ -424,7 +424,12 @@ class CatalogScoreVariant(BaseModel):
     ]
 
     # helpful class attributes (not used by pydantic to instantiate a class)
-    harmonised_columns: ClassVar[tuple[str, str, str]] = ("hm_rsID", "hm_chr", "hm_pos")
+    harmonised_columns: ClassVar[tuple[str, str, str, str]] = (
+        "hm_source",
+        "hm_rsID",
+        "hm_chr",
+        "hm_pos",
+    )
     complex_columns: ClassVar[tuple[str, str, str]] = (
         "is_haplotype",
         "is_diplotype",
@@ -457,6 +462,14 @@ def is_harmonised(self) -> bool:
                 return True
         return False
 
+    @cached_property
+    def is_hm_bad(self) -> bool:
+        """Was harmonisation OK?"""
+        # not a computed field because never used in serialisation
+        if self.is_harmonised and self.hm_source == "Unknown":
+            return True
+        return False
+
     @computed_field  # type: ignore
     @cached_property
     def is_complex(self) -> bool:
@@ -518,6 +531,7 @@ def alleles_must_parse(cls, value: Any) -> Allele:
         "chr_position",
         "is_haplotype",
         "is_diplotype",
+        "hm_rsID",
         "hm_chr",
         "hm_pos",
         "hm_match_chr",
@@ -532,21 +546,32 @@ def empty_string_to_none(cls, v: Any) -> Optional[Any]:
             return None
         return v
 
-    @field_validator("rsID", mode="after")
+    @field_validator("rsID", "hm_rsID", mode="after")
     @classmethod
-    def check_rsid_format(cls, rsid: Optional[str]) -> Optional[str]:
+    def set_missing_rsid(cls, rsid: Optional[str]) -> Optional[str]:
+        # rsID is a special case where . means missing
         if rsid == ".":
             rsid = None
 
-        if (
-            rsid is None
-            or rsid.startswith("rs")
-            or rsid.startswith("ss")
-            or rsid.startswith("HLA")
-        ):
-            return rsid
-        else:
-            raise ValueError("rsid field must start with rs or ss or HLA")
+        return rsid
+
+    @model_validator(mode="after")
+    def check_rsid_format(self) -> Self:
+        if self.is_hm_bad or self.hm_source == "liftover":
+            # disable this check when harmonisation fails
+            # variants that have been harmonised by liftover will put coordinates in rsID column
+            return self
+
+        for x in (self.rsID, self.hm_rsID):
+            if not (
+                x is None
+                or x.startswith("rs")
+                or x.startswith("ss")
+                or x.startswith("HLA")
+            ):
+                raise ValueError("rsid field must start with rs or ss or HLA")
+
+        return self
 
     @field_validator(
         "effect_weight",
@@ -617,7 +642,8 @@ def check_position(self) -> Self:
                 # mandatory rsid with optional coordinates
                 pass
             case _:
-                if self.is_complex:
+                if self.is_complex or self.is_hm_bad:
+                    # harmonisation may fail and create variants with bad coordinates
                     # complex variants can have odd non-standard positions
                     # e.g. e2 allele of APOE gene
                     pass
@@ -682,6 +708,23 @@ class ScoreVariant(CatalogScoreVariant):
     >>> variant_complex = ScoreVariant(**{"rsID": None, "chr_name": "1", "chr_position": 1, "effect_allele": "A", "effect_weight": 0.5, "is_haplotype": True, "row_nr": 0, "accession": "test"})
     >>> variant_complex.is_complex
     True
+
+    The harmonisation process might fail, so variants can be missing mandatory fields.
+
+    This must be supported by the model:
+
+    >>> bad_hm_variant = ScoreVariant(**{"rsID": "a_weird_rsid", "chr_name": None, "chr_position": None, "effect_allele": "G", "effect_weight": 0.5, "hm_chr": None, "hm_pos": None, "hm_rsID": None, "hm_source": "Unknown", "row_nr": 0, "accession": "test"})
+    >>> bad_hm_variant.is_harmonised
+    True
+    >>> bad_hm_variant.is_hm_bad
+    True
+    >>> harmonised_variant.is_hm_bad
+    False
+
+    rsID format validation (i.e. starts with rs, ss...) is disabled when harmonisation fails:
+
+    >>> bad_hm_variant.rsID
+    'a_weird_rsid'
     """
 
     model_config = ConfigDict(use_enum_values=True)
@@ -1043,7 +1086,10 @@ def variant_count_difference(self) -> Optional[int]:
 
     @property
     def variants_are_missing(self) -> bool:
-        return self.variant_count_difference != 0
+        return (
+            self.variant_count_difference is not None
+            and self.variant_count_difference != 0
+        )
 
 
 class ScoreLogs(RootModel):
diff --git a/pgscatalog.core/tests/data/bad_harmonised.txt b/pgscatalog.core/tests/data/bad_harmonised.txt
new file mode 100644
index 0000000..73137e1
--- /dev/null
+++ b/pgscatalog.core/tests/data/bad_harmonised.txt
@@ -0,0 +1,23 @@
+###PGS CATALOG SCORING FILE - see https://www.pgscatalog.org/downloads/#dl_ftp_scoring for additional information
+#format_version=2.0
+##POLYGENIC SCORE (PGS) INFORMATION
+#pgs_id=PGS003581
+#pgs_name=MTAG.Envs.LifetimeMDD
+#trait_reported=Major Depressive Disorder (Lifetime)
+#trait_mapped=major depressive disorder
+#trait_efo=MONDO_0002009
+#genome_build=GRCh37
+#variants_number=4861398
+#weight_type=beta
+##SOURCE INFORMATION
+#pgp_id=PGP000461
+#citation=Dahl A et al. bioRxiv (2022). doi:10.1101/2022.08.15.503980
+##HARMONIZATION DETAILS
+#HmPOS_build=GRCh38
+#HmPOS_date=2023-04-12
+#HmPOS_match_chr={"True": null, "False": null}
+#HmPOS_match_pos={"True": null, "False": null}
+rsID effect_allele other_allele effect_weight allelefrequency_effect variant_description hm_source hm_rsID hm_chr hm_pos hm_inferOtherAllele
+. T C 0.004040743 0.329683 VARIANT_ID=1:2556125_C_T Unknown
+. T C 0.004025092 0.329689 VARIANT_ID=1:2556548_C_T Unknown
+. A G 0.004013246 0.329686 VARIANT_ID=1:2556709_G_A Unknown
diff --git a/pgscatalog.core/tests/data/invalid_scorefile.txt b/pgscatalog.core/tests/data/invalid_scorefile.txt
new file mode 100644
index 0000000..640371f
--- /dev/null
+++ b/pgscatalog.core/tests/data/invalid_scorefile.txt
@@ -0,0 +1,18 @@
+###PGS CATALOG SCORING FILE - see https://www.pgscatalog.org/downloads/#dl_ftp_scoring for additional information
+#format_version=2.0
+##POLYGENIC SCORE (PGS) INFORMATION
+#pgs_id=PGS003581
+#pgs_name=MTAG.Envs.LifetimeMDD
+#trait_reported=Major Depressive Disorder (Lifetime)
+#trait_mapped=major depressive disorder
+#trait_efo=MONDO_0002009
+#genome_build=GRCh37
+#variants_number=4861398
+#weight_type=beta
+##SOURCE INFORMATION
+#pgp_id=PGP000461
+#citation=Dahl A et al. bioRxiv (2022). doi:10.1101/2022.08.15.503980
+rsID effect_allele other_allele effect_weight allelefrequency_effect variant_description
+. T C 0.004040743 0.329683 VARIANT_ID=1:2556125_C_T
+. T C 0.004025092 0.329689 VARIANT_ID=1:2556548_C_T
+. A G 0.004013246 0.329686 VARIANT_ID=1:2556709_G_A
\ No newline at end of file
diff --git a/pgscatalog.core/tests/test_combine_cli.py b/pgscatalog.core/tests/test_combine_cli.py
index 62a8f08..9d1c7b4 100644
--- a/pgscatalog.core/tests/test_combine_cli.py
+++ b/pgscatalog.core/tests/test_combine_cli.py
@@ -4,6 +4,8 @@
 import itertools
 import json
 from unittest.mock import patch
+
+import pydantic
 import pytest
 
 from pgscatalog.core.cli.combine_cli import run
@@ -83,6 +85,50 @@ def lift_scorefiles(request):
     )
 
 
+@pytest.fixture(scope="package")
+def fail_harmonised(request):
+    """This scoring file only contains variants that failed harmonisation."""
+    return request.path.parent / "data" / "bad_harmonised.txt"
+
+
+@pytest.fixture(scope="package")
+def invalid_scorefile(request):
+    """This scoring file contains invalid variants missing mandatory fields"""
+    return request.path.parent / "data" / "invalid_scorefile.txt"
+
+
+def test_invalid_scorefile(tmp_path, invalid_scorefile):
+    """There's nothing that can be done for this file, except explode loudly"""
+    out_path = tmp_path / "combined.txt"
+    path = [str(invalid_scorefile)]
+
+    args = [("pgscatalog-combine", "-s"), path, ("-o", str(out_path), "-t", "GRCh37")]
+    flargs = list(itertools.chain(*args))
+
+    # make sure the correct exception type is being raised by the CLI
+    with pytest.raises(pydantic.ValidationError):
+        with patch("sys.argv", flargs):
+            run()
+
+
+def test_fail_harmonised(tmp_path, fail_harmonised):
+    """Variants that have failed harmonisation will be missing mandatory fields, but we should accept them as a special case and write out like normal"""
+    out_path = tmp_path / "combined.txt"
+    path = [str(fail_harmonised)]
+
+    args = [("pgscatalog-combine", "-s"), path, ("-o", str(out_path), "-t", "GRCh38")]
+    flargs = list(itertools.chain(*args))
+
+    with patch("sys.argv", flargs):
+        run()
+
+    # https://github.com/PGScatalog/pygscatalog/issues/55
+    assert out_path.exists()
+    with open(out_path, mode="rt") as f:
+        x = list(csv.reader(f, delimiter="\t"))
+        assert len(x) == 4  # 3 variants + header
+
+
 def test_combine_nonadditive(tmp_path, non_additive_scorefile_grch38):
     """Test normalising a single non-additive scoring file fails."""
    out_path = tmp_path / "combined.txt.gz"
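A note on the validator split in models.py: `set_missing_rsid` stays a per-field validator, but the format check becomes a `model_validator(mode="after")` because the relaxed rule depends on another field (`hm_source`), which a per-field validator cannot see. A condensed standalone sketch of that pattern (a toy `MiniVariant` model for illustration, not the real `CatalogScoreVariant`; the skip condition is simplified from `is_hm_bad or hm_source == "liftover"`):

```python
from typing import Optional

from pydantic import BaseModel, field_validator, model_validator
from typing_extensions import Self


class MiniVariant(BaseModel):
    """Toy model (illustration only) mirroring the patched rsID validation."""

    rsID: Optional[str] = None
    hm_source: Optional[str] = None

    @field_validator("rsID", mode="after")
    @classmethod
    def set_missing_rsid(cls, rsid: Optional[str]) -> Optional[str]:
        # rsID is a special case where . means missing
        return None if rsid == "." else rsid

    @model_validator(mode="after")
    def check_rsid_format(self) -> Self:
        # a field_validator only sees one value; checking hm_source too
        # needs the whole model, hence the model_validator
        if self.hm_source in ("Unknown", "liftover"):
            # failed harmonisation or liftover coordinates: skip the format check
            return self
        if self.rsID is not None and not self.rsID.startswith(("rs", "ss", "HLA")):
            raise ValueError("rsid field must start with rs or ss or HLA")
        return self


MiniVariant(rsID=".")                                    # rsID coerced to None
MiniVariant(rsID="1:2556125_C_T", hm_source="liftover")  # format check skipped
# MiniVariant(rsID="a_weird_rsid")                       # raises pydantic.ValidationError
```

In the patch itself the same idea extends to coordinates: `check_position` tolerates missing positions when `is_hm_bad` is true, which is why `test_fail_harmonised` writes output normally while `test_invalid_scorefile` (no harmonisation columns at all) still fails validation.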