Skip to content

Commit

Permalink
50 harmonised file conforms to new standard format (#71)
Browse files Browse the repository at this point in the history
* ticket #49 #50

This branch is designed to solve ticket 49 (read the genome build from the YAML file — here assumed to be in the same folder) and ticket 50 (rearrange the output file into the new format).

* Update map_to_build.nf

* Update common_constants.py

main.py now uses `--rsid_col rsid` instead of `variant_id`, to handle the case where there are multiple reference records for one site.

* Update main_pysam.py

keep harmonised header in specific order

* Update gwascatalogharm.nf

change the input file as [GCST,yaml path, input path]

* bare bones meta client

* Create GCSTtest.tsv.ymal

* Update and rename GCSTtest_b37.tsv to GCSTtest.tsv

* Rename test_data/GCSTtest.tsv.ymal to test_data/homes/yueji/.nextflow/assets/EBISPOT/gwas-sumstats-harmoniser/test_data/GCSTtest.tsv-meta.yamll

* test yaml file

add test yaml file

* Update test.config

new format test data

* Create GCSTtest.tsv-meta.yaml

* Update GCSTtest.tsv

test data modify

* Update GCSTtest.tsv-meta.yaml

* Update GCSTtest.tsv

* Update main_pysam.py

tmp: keep the original variant_id and hm_varid columns temporarily because the QC step needs them; the columns are fixed after QC.

* Update GCSTtest.tsv

modify the test input file

* Update qc.nf

will add bgzip and tabix in the qc.py using pysam

* Update main_pysam.py

Fix the hm_code and hm_coordinate_conversion positions so they appear after the mandatory fields.

* Update harmonization_log.nf

new hm_code position

* Yaml

make raw_yaml available until the qc step

* keep raw content in col 4

keep the raw content in column 4

* Update map_to_build_nf.py

update master #65 update to the branch

* Update main_pysam.py

Allow standard_error to be absent from the raw data.

* Update main_pysam.py

* Update qc.nf

sort the qc result for tabix

* sort chr and pos

Update sorting by chr and pos, also for data with mandatory columns only.

* Update harmonization.nf

* adding metadata model, update yaml file and publish

* add coord system to metadata

* #70

#70 liftover with specified coordinate system

* #70

Further update on converting the coordinate system:
1. map_to_build.py: bp-coordinate -> liftover -> bp'+1
2. main_pysam.py:
if 0-base and indels and hm_coordinate_conversion="lo":
     vcf_rec=tabix(chr, bp-2,bp)
else:
     vcf_rec=tabix(chr, bp-1,bp)
3. harmonisation.nf: read yaml file
4. test_data: 1_base and 0_base tsv file.

* #70

* add one new row of the test data

* #70

* import schema from gwas-sumstats-tools, camelCase to snake_case

* add container.config

* auto map col names to scheme

use utils.py to map args for strand count

* rename test input file

* Create GCST0.tsv-meta.yaml

* Create GCST1.tsv-meta.yaml

* Update main_harm.nf

* change on flip_beta

* Change variant_id to rsid for map_to_build

* Update container.config

* QC based on rsid, not variant_id

* using gwas_sumstats_tools to reorder the columns

reorder the output columns using the gwas_sumstats_tools function : _set_header_order()

* Update main_pysam.py

* Update main_pysam.py

* wait for all previous process to finish

* wait until all chr finish ten_sc process

* Update main_pysam.py

* Update container.config

resetting default container config

---------

Co-authored-by: jdhayhurst <hayhurst.jd@gmail.com>
Co-authored-by: jdhayhurst <38317975+jdhayhurst@users.noreply.github.com>
  • Loading branch information
3 people authored May 23, 2023
1 parent 04caa52 commit f1542de
Show file tree
Hide file tree
Showing 44 changed files with 488 additions and 152 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
FROM python:3.7-slim-buster
FROM python:3.9-slim-buster

RUN apt-get update \
&& apt-get install -y --no-install-recommends gcc wget python-dev libmagic-dev tabix \
&& rm -rf /var/lib/apt/lists/*

COPY . .

RUN pip install --upgrade pip
RUN pip install -r environments/requirements.txt \
&& apt-get purge -y --auto-remove gcc python-dev
10 changes: 6 additions & 4 deletions bin/basic_qc_nf.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def get_synonyms(self, rsid):
}


REQUIRED_HEADERS = [SNP_DSET, PVAL_DSET, CHR_DSET, BP_DSET]
REQUIRED_HEADERS = [RSID, PVAL_DSET, CHR_DSET, BP_DSET]
BLANK_SET = {'', ' ', '-', '.', 'na', None, 'none', 'nan', 'nil'}

# hm codes to drop
Expand Down Expand Up @@ -151,9 +151,10 @@ def drop_last_element_from_filename(filename):
return '-'.join(filename_parts[:-1])


def resolve_invalid_rsids(row, header, ensembl_client=None, sql_client=None):
"""
def resolve_invalid_rsids(row, header, ensembl_client=None, sql_client=None):
hm_rsid_idx = header.index('hm_rsid')
snp_idx = header.index(SNP_DSET)
snp_idx = header.index(RSID)
# if possible, set variant_id to harmonised rsid
if row[hm_rsid_idx].startswith('rs'):
# check that if rsID already present is not synonym of that found in vcf
Expand All @@ -180,6 +181,7 @@ def resolve_invalid_rsids(row, header, ensembl_client=None, sql_client=None):
if not row[snp_idx].startswith('rs'):
row[snp_idx] = 'NA'
return row
"""


def get_csv_reader(csv_file):
Expand Down Expand Up @@ -237,7 +239,7 @@ def main():
# Checks for blanks, integers and floats:
sql_client = sqlClient(db) if db else None
ensembl_client = EnsemblRestClient() if not db else None
row = resolve_invalid_rsids(row, header, ensembl_client, sql_client)
#row = resolve_invalid_rsids(row, header, ensembl_client, sql_client)
row = blanks_to_NA(row)
row = map_chr_values_to_numbers(row, header)
unharmonisable = remove_row_if_unharmonisable(row, header)
Expand Down
29 changes: 20 additions & 9 deletions bin/common_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
EFFECT_DSET = 'effect_allele'
OTHER_DSET = 'other_allele'
FREQ_DSET = 'effect_allele_frequency'
INFO_DSET='info'
HM_CC_DSET = 'hm_coordinate_conversion'
HM_OR_DSET = 'hm_odds_ratio'
HM_RANGE_U_DSET = 'hm_ci_upper'
Expand All @@ -26,7 +27,7 @@
HM_CODE = 'hm_code'


DSET_TYPES = {SNP_DSET: str, PVAL_DSET: float, MANTISSA_DSET: float, EXP_DSET: int, STUDY_DSET: str,
DSET_TYPES = {SNP_DSET: str, RSID: str, PVAL_DSET: float, MANTISSA_DSET: float, EXP_DSET: int, STUDY_DSET: str,
CHR_DSET: int, BP_DSET: int, OR_DSET: float, RANGE_U_DSET: float, RANGE_L_DSET: float, BETA_DSET: float, SE_DSET: float,
EFFECT_DSET: str, OTHER_DSET: str, FREQ_DSET: float, HM_EFFECT_DSET: str,
HM_OTHER_DSET: str, HM_BETA_DSET: float, HM_OR_DSET: float, HM_FREQ_DSET: float, HM_CODE: int,
Expand All @@ -36,36 +37,46 @@
HARMONISATION_PREFIX = 'hm_'
GWAS_CATALOG_STUDY_PREFIX = 'GCST'

TO_DISPLAY_DEFAULT = {SNP_DSET, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET,
TO_DISPLAY_DEFAULT = {SNP_DSET, RSID, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET,
HM_BETA_DSET, HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_CODE}

TO_DISPLAY_RAW = {SNP_DSET, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET,
TO_DISPLAY_RAW = {SNP_DSET, RSID, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET,
SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET}


TO_LOAD_DSET_HEADERS_DEFAULT = {SNP_DSET, PVAL_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET,
TO_LOAD_DSET_HEADERS_DEFAULT = {SNP_DSET, RSID, PVAL_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET,
SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET,
HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_CODE}
TO_STORE_DSETS_DEFAULT = {SNP_DSET, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET,
TO_STORE_DSETS_DEFAULT = {SNP_DSET, RSID, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET,
BETA_DSET, SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET,
HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_VAR_ID, HM_CODE}
TO_QUERY_DSETS_DEFAULT = {SNP_DSET, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET,
TO_QUERY_DSETS_DEFAULT = {SNP_DSET, RSID, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET,
SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET,
HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_VAR_ID, HM_CODE}
TO_INDEX = {SNP_DSET, PVAL_DSET, CHR_DSET, BP_DSET}
TO_INDEX = {SNP_DSET, RSID, PVAL_DSET, CHR_DSET, BP_DSET}
REQUIRED = {CHR_DSET, PVAL_DSET, SNP_DSET}#, EFFECT_DSET, OTHER_DSET}

HARMONISER_ARG_MAP = {
CHR_DSET: "--chrom_col",
BP_DSET: "--pos_col",
EFFECT_DSET: "--effAl_col",
OTHER_DSET: "--otherAl_col",
SNP_DSET: "--rsid_col",
RSID: "--rsid_col",
BETA_DSET: "--beta_col",
OR_DSET: "--or_col",
RANGE_L_DSET: "--or_col_lower",
RANGE_U_DSET: "--or_col_upper",
FREQ_DSET: "--eaf_col"
FREQ_DSET: "--eaf_col",
HM_CC_DSET:"--hm_coordinate_conversion"
}

STRAND_COUNT_ARG_MAP = {
CHR_DSET: "--chrom_col",
BP_DSET: "--pos_col",
EFFECT_DSET: "--effAl_col",
OTHER_DSET: "--otherAl_col",
RSID: "--rsid_col",
HM_CC_DSET:"--hm_coordinate_conversion"
}

DEFAULT_CHROMS = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT']
7 changes: 0 additions & 7 deletions bin/convert_to_standard.py

This file was deleted.

71 changes: 71 additions & 0 deletions bin/gwas_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env python

import yaml
import argparse

from gwas_sumstats_tools.schema.metadata import SumStatsMetadata


class MetadataClient:
    """Thin wrapper around SumStatsMetadata for reading, editing and
    writing summary-statistics metadata YAML files.
    """

    def __init__(self, meta_dict=None,
                 in_file=None,
                 out_file=None) -> None:
        """
        Arguments:
            meta_dict: optional dict of metadata fields to start from
            in_file: optional path of a metadata YAML file to read
            out_file: optional path to write the metadata YAML file to
        """
        self.metadata = SumStatsMetadata.construct()
        # Keep a plain-dict mirror of the metadata so incremental updates
        # can be merged before re-parsing into the model. Default to {}
        # so update_metadata() works even when no dict/file was given.
        self._meta_dict = meta_dict if meta_dict is not None else {}
        self._in_file = in_file
        self._out_file = out_file

    def from_file(self) -> None:
        """Load metadata from self._in_file (YAML) into the model."""
        with open(self._in_file, "r") as fh:
            # safe_load returns None for an empty file; normalise to {}
            # so the subsequent dict update does not fail.
            self._meta_dict = yaml.safe_load(fh) or {}
        self.update_metadata(self._meta_dict)

    def to_file(self) -> None:
        """Write the current metadata to self._out_file as YAML."""
        with open(self._out_file, "w") as fh:
            # NOTE: do not pass encoding= here — with a text-mode stream,
            # yaml.dump(..., encoding=...) emits bytes and write() raises
            # TypeError on a text handle.
            yaml.dump(self.metadata.dict(exclude_none=True), fh)

    def update_metadata(self, data_dict) -> None:
        """
        Create a copy of the model and update (no validation occurs)
        """
        if self._meta_dict is None:
            # Defensive: allows edit-mode updates without a prior dict/file.
            self._meta_dict = {}
        self._meta_dict.update(data_dict)
        self.metadata = self.metadata.parse_obj(self._meta_dict)

    def __repr__(self) -> str:
        """
        YAML str representation of metadata.
        """
        return yaml.dump(self.metadata.dict())


def main():
    """CLI entry point: read (-i), optionally edit (-e plus arbitrary
    `--key=value` / `-key value` pairs), and write (-o) a metadata YAML
    file via MetadataClient.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument('-i', help='Input metadata yaml file')
    argparser.add_argument('-o', help='output metadata yaml file')
    argparser.add_argument('-e', help='Edit mode, provide params to edit e.g. `-GWASID GCST123456` to edit/add that value', action='store_true')
    # Dynamically register any unknown flags so arbitrary metadata fields
    # can be set from the command line.
    _, unknown = argparser.parse_known_args()
    registered = set()
    for arg in unknown:
        if arg.startswith(("-", "--")):
            flag = arg.split('=')[0]
            # Guard against the same flag given twice: re-adding an
            # existing option makes argparse raise ArgumentError.
            if flag not in registered:
                registered.add(flag)
                argparser.add_argument(flag)
    args = argparser.parse_args()
    known_args = {'i', 'o', 'e'}
    in_file = args.i
    out_file = args.o
    edit_mode = args.e
    m = MetadataClient(in_file=in_file, out_file=out_file)
    if in_file:
        m.from_file()
        print(f"===========\nMetadata in\n===========\n{m}")
    if edit_mode:
        # Everything except the known options is treated as a metadata field.
        data_dict = {k: v for k, v in vars(args).items() if k not in known_args}
        m.update_metadata(data_dict)
    if out_file:
        print(f"============\nMetadata out\n============\n{m}")
        m.to_file()


if __name__ == "__main__":
    main()
7 changes: 4 additions & 3 deletions bin/lib/SumStatRecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ class SumStatRecord:
""" Class to hold a summary statistic record.
"""
def __init__(self, chrom, pos, other_al, effect_al, beta, oddsr,
oddsr_lower, oddsr_upper, eaf, rsid, data):
oddsr_lower, oddsr_upper, eaf, rsid, data,hm_coordinate_conversion):

# Set raw info
self.chrom = chrom
Expand All @@ -17,7 +17,8 @@ def __init__(self, chrom, pos, other_al, effect_al, beta, oddsr,
self.oddsr = safe_float(oddsr) if oddsr is not None else None
self.oddsr_lower = safe_float(oddsr_lower) if oddsr_lower is not None else None
self.oddsr_upper = safe_float(oddsr_upper) if oddsr_upper is not None else None
self.rsid = rsid
self.rsid = str(rsid) if rsid is not None else None
self.lifmethod= str(hm_coordinate_conversion)

# Effect allele frequency is not required if we assume +ve strand
if eaf:
Expand Down Expand Up @@ -73,7 +74,7 @@ def flip_beta(self):
to flipping.
"""
# Flip beta
if self.beta != 0:
if self.beta and self.beta != 0:
self.beta = self.beta * -1
# Flip OR
if self.oddsr:
Expand Down
12 changes: 8 additions & 4 deletions bin/liftover.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ def isNumber(value):
return False


def map_bp_to_build_via_liftover(chromosome, bp, build_map, coordinate):
    """Lift a base-pair position over to the target genome build.

    Arguments:
        chromosome: chromosome name without the 'chr' prefix
        bp: base-pair position (string or number)
        build_map: pyliftover-style object exposing convert_coordinate()
        coordinate: offset of the input coordinate system subtracted
            before liftover (convert_coordinate expects 0-based input);
            the result is shifted back by +1 — assumes the pipeline's
            1-based output convention, per ticket #70 (confirm)

    Returns:
        The lifted-over position as an int, or None when bp is not
        numeric or the position cannot be mapped.
    """
    if not isNumber(bp):
        return None
    data = build_map.convert_coordinate('chr' + str(chromosome),
                                        int(bp) - int(coordinate))
    # convert_coordinate returns None or a (possibly empty) list of hits.
    if data:
        return data[0][1] + 1
    return None


Expand Down Expand Up @@ -77,10 +77,12 @@ def open_file_and_process(file, from_build, to_build):
for row in csv_reader:
chromosome = row[CHR_DSET].replace('23', 'X').replace('24', 'Y')
bp = row[BP_DSET]
bp = row[BP_DSET]
coordinate=coordinate[0]

# do the bp location mapping if needed
if from_build != to_build:
mapped_bp = map_bp_to_build_via_liftover(chromosome=chromosome, bp=bp, build_map=build_map)
mapped_bp = map_bp_to_build_via_liftover(chromosome=chromosome, bp=bp, build_map=build_map, coordinate=coordinate)
if mapped_bp is None:
mapped_bp = map_bp_to_build_via_ensembl(chromosome=chromosome, bp=bp, from_build=from_build, to_build=to_build)
row[BP_DSET] = mapped_bp
Expand All @@ -101,6 +103,8 @@ def main():
from_build = args.original
to_build = args.mapped



open_file_and_process(file, from_build, to_build)

if __name__ == "__main__":
Expand Down
Loading

0 comments on commit f1542de

Please sign in to comment.