Merge pull request #80 from EBISPOT/Master_genome_build
Master genome build
jdhayhurst authored Nov 13, 2023
2 parents c58b095 + 7f299a8 commit d9b4c6c
Showing 7 changed files with 37 additions and 16 deletions.
bin/liftover.py (2 additions, 2 deletions)
@@ -42,8 +42,8 @@ def isNumber(value):
 
 
 def map_bp_to_build_via_liftover(chromosome, bp, build_map, coordinate):
-    if isNumber(bp):
-        data = build_map.convert_coordinate('chr' + str(chromosome), int(bp)-int(coordinate))
+    if isNumber(chromosome) and isNumber(bp):
+        data = build_map.convert_coordinate('chr' + str(int(chromosome)), int(bp)-int(coordinate))
     if data is not None:
         if len(data) > 0:
             return str(data[0][1]+int(1))
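The extra isNumber(chromosome) check and the int() cast guard against chromosome values that arrive as floats (a chromosome column with missing entries is typically read back as float, so 1 becomes 1.0): 'chr' + str(1.0) gives 'chr1.0', which the liftover chain file does not contain. A minimal sketch of the difference, assuming the wrapped library is pyliftover:

from pyliftover import LiftOver

build_map = LiftOver("hg19", "hg38")
chromosome, bp = 1.0, 1000000            # chromosome column parsed as a float

# Old behaviour: 'chr1.0' is not a contig in the chain file, so no mapping is found
build_map.convert_coordinate("chr" + str(chromosome), bp)       # None or []

# New behaviour: cast to int first, so the query becomes 'chr1'
build_map.convert_coordinate("chr" + str(int(chromosome)), bp)  # [('chr1', <new_pos>, '+', <score>)]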
bin/main_pysam.py (9 additions, 3 deletions)
@@ -64,8 +64,14 @@ def main():
         if not ss_rec.hm_code:
             # Get VCF reference variants for this recordls
             coordinate=args.coordinate
-            if int(coordinate[0])==0 and ss_rec.lifmethod=="lo" and len(str(ss_rec.effect_al))+len(str(ss_rec.other_al))>2:
-                vcf_recs =get_vcf_records_0base(
+            if int(coordinate[0])==0 and ss_rec.lifmethod=="lo":
+                if len(str(ss_rec.effect_al))+len(str(ss_rec.other_al))>2:
+                    vcf_recs =get_vcf_records_0base(
                     tbx,
                     ss_rec.chrom,
                     ss_rec.pos)
+                else:
+                    vcf_recs = get_vcf_records(
+                        tbx,
+                        ss_rec.chrom,
+                        ss_rec.pos)
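The restructured condition keeps the 0-based lookup for indels (combined allele length > 2) and sends everything else through the ordinary lookup when the input is 0-based and was lifted over ("lo"). Both helpers are defined elsewhere in main_pysam.py; a rough sketch of how a 0-based versus 1-based tabix query differs with pysam (the bodies below are assumptions, not the pipeline's code):

import pysam

tbx = pysam.TabixFile("reference.vcf.gz")   # hypothetical reference VCF

def get_vcf_records(tbx, chrom, pos):
    # 1-based position: tabix regions are 0-based half-open, so query [pos-1, pos)
    return list(tbx.fetch(str(chrom), int(pos) - 1, int(pos)))

def get_vcf_records_0base(tbx, chrom, pos):
    # position treated as already 0-based: query [pos, pos+1) instead
    return list(tbx.fetch(str(chrom), int(pos), int(pos) + 1))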
@@ -161,7 +167,7 @@ def main():
             out_raw["rsid"] = ss_rec.hm_rsid if vcf_rec and ss_rec.is_harmonised else args.na_rep_out
             out_raw["standard_error"]=ss_rec.data["standard_error"] if ss_rec.data["standard_error"] is not None else args.na_rep_out
             # Add other data from summary stat file
-            outed=["chromosome","base_pair_location","p_value","effect_allele","other_allele","effect_allele_frequency","beta","odds_ratio","standard_error","rsid","ci_upper","ci_lower","ref_allele","hm_coordinate_conversion"]
+            outed=["chromosome","base_pair_location","p_value","effect_allele","other_allele","effect_allele_frequency","beta","odds_ratio","standard_error","rsid","ci_upper","ci_lower","hm_coordinate_conversion"]
             for key in ss_rec.data:
                 if key not in outed:
                     value = ss_rec.data[key] if ss_rec.data[key] else args.na_rep_out
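With "ref_allele" dropped from outed, the generic loop below the list no longer skips that column, so it is carried into the output like any other extra column. A toy illustration of that pass-through (column names and values invented):

na_rep_out = "NA"

# columns already written explicitly above (truncated here for brevity)
outed = ["chromosome", "base_pair_location", "p_value"]

ss_data = {"p_value": "0.01", "ref_allele": "A", "n": ""}   # invented record

out_raw = {}
for key in ss_data:
    if key not in outed:
        out_raw[key] = ss_data[key] if ss_data[key] else na_rep_out

print(out_raw)   # {'ref_allele': 'A', 'n': 'NA'}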
bin/map_to_build_nf.py (19 additions, 6 deletions)
@@ -13,7 +13,23 @@
 import argparse
 from ast import literal_eval
 
+# Allow very large fields in input file-------------
+import sys
+import csv
+
+maxInt = sys.maxsize
+
+while True:
+    # decrease the maxInt value by factor 10
+    # as long as the OverflowError occurs.
+
+    try:
+        csv.field_size_limit(maxInt)
+        break
+    except OverflowError:
+        maxInt = int(maxInt/10)
+
 # map_to_build----------------------------------------------------
 def merge_ss_vcf(ss, vcf, from_build, to_build, chroms, coordinate):
     vcfs = glob.glob(vcf)
     ssdf = pd.read_table(ss, sep=None, engine='python', dtype=str)
@@ -32,8 +48,8 @@ def merge_ss_vcf(ss, vcf, from_build, to_build, chroms, coordinate):
     # merge on rsid, update chr and position from vcf ref
     mergedf = ssdf_with_rsid.merge(vcf_df, left_on=RSID, right_on="ID", how="left")
     mapped = mergedf.dropna(subset=["ID"]).drop([CHR_DSET, BP_DSET], axis=1)
-    mapped[CHR_DSET] = mapped["CHR"].astype("str").str.replace("\..*$","")
-    mapped[BP_DSET] = mapped["POS"].astype("str").str.replace("\..*$","")
+    mapped[CHR_DSET] = mapped["CHR"].astype("str").str.replace("\..*$","",regex=True)
+    mapped[BP_DSET] = mapped["POS"].astype("str").str.replace("\..*$","",regex=True)
     mapped = mapped[header]
     mapped[HM_CC_DSET] = "rs"
     outfile = os.path.join("{}.merged".format(chrom))
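The explicit regex=True matters on current pandas: the default of Series.str.replace flipped to regex=False in pandas 2.0, so without the flag the pattern "\..*$" is matched as a literal substring and the trailing ".0" left by float-parsed coordinates never gets stripped. For example:

import pandas as pd

pos = pd.Series(["12345.0", "67890"])

pos.str.replace(r"\..*$", "")               # pandas >= 2.0: literal match, nothing replaced
pos.str.replace(r"\..*$", "", regex=True)   # regex match: ['12345', '67890']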
@@ -50,20 +66,17 @@ def merge_ss_vcf(ss, vcf, from_build, to_build, chroms, coordinate):
     ssdf[HM_CC_DSET] = "lo"
     build_map = lft.LiftOver(lft.ucsc_release.get(from_build), lft.ucsc_release.get(to_build)) if from_build != to_build else None
     if build_map:
-        ssdf[BP_DSET] = [lft.map_bp_to_build_via_liftover(chromosome=x, bp=str(int(y)), build_map=build_map,coordinate=coordinate[0]) for x, y in zip(ssdf[CHR_DSET], ssdf[BP_DSET])]
+        ssdf[BP_DSET] = [lft.map_bp_to_build_via_liftover(chromosome=x, bp=y, build_map=build_map,coordinate=coordinate[0]) for x, y in zip(ssdf[CHR_DSET], ssdf[BP_DSET])]
     for chrom in chroms:
         print(chrom)
         df = ssdf.loc[ssdf[CHR_DSET].astype("str") == chrom]
         df = df.dropna(subset=[BP_DSET])
-        df[BP_DSET] = df[BP_DSET].astype("str").str.replace("\..*$","")
         outfile = os.path.join("{}.merged".format(chrom))
         if os.path.isfile(outfile):
             print("df to {}".format(outfile))
-            print(df)
             df.to_csv(outfile, sep="\t", mode='a', header=False, index=False, na_rep="NA")
         else:
             print("df to {}".format(outfile))
-            print(df)
             df.to_csv(outfile, sep="\t", mode='w', index=False, na_rep="NA")
     print("liftover complete")
     no_chr_df = ssdf[ssdf[CHR_DSET].isnull()]
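For reference, the per-chromosome write-out pattern the loop relies on: the first write creates the file with a header, later writes append without one. A minimal standalone sketch (toy data invented):

import os
import pandas as pd

df = pd.DataFrame({"chromosome": ["1", "1", "2"], "base_pair_location": ["100", "200", "300"]})

for chrom in ["1", "2"]:
    chunk = df[df["chromosome"] == chrom]
    outfile = "{}.merged".format(chrom)
    if os.path.isfile(outfile):
        # file already exists from an earlier chunk: append rows, no header
        chunk.to_csv(outfile, sep="\t", mode="a", header=False, index=False, na_rep="NA")
    else:
        # first chunk for this chromosome: create the file with a header
        chunk.to_csv(outfile, sep="\t", mode="w", index=False, na_rep="NA")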
config/basic.config (2 additions, 2 deletions)
@@ -4,8 +4,8 @@ process {
    memory = { 1.GB * task.attempt }
    time = { 1.h * task.attempt }
 
-   errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'ignore' }
-   maxRetries = { task.exitStatus in 137..140 ? 4 : 1 }
+   errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'ignore' }
+   maxRetries = 3
    maxErrors = '-1'
 
    withName:map_to_build {
modules/local/map_to_build.nf (3 additions, 1 deletion)
@@ -16,7 +16,9 @@ process map_to_build {
    shell:
    """
    coordinate=\$(grep coordinate_system $yaml | awk -F ":" '{print \$2}' | tr -d "[:blank:]" )
-   from_build=\$(grep genome_assembly $yaml | rev | cut -c 1-2 | rev )
+   from_build=\$((grep genome_assembly $yaml | grep -Eo '[0-9][0-9]') || (echo \$(basename $tsv) | grep -Eo '[bB][a-zA-Z]*[0-9][0-9]' | grep -Eo '[0-9][0-9]'))
+   [[ -z "\$from_build" ]] && { echo "Parameter from_build is empty" ; exit 1; }
    map_to_build_nf.py \
        -f $tsv \
        -from_build \$from_build \
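The new extraction takes the two-digit build number from the genome_assembly line of the metadata yaml and, failing that, from a build token in the file name, instead of blindly keeping the last two characters of the yaml line. The same logic sketched in Python for clarity (yaml content and file names invented):

import re

def infer_from_build(yaml_text, tsv_name):
    # First choice: two digits on the genome_assembly line, e.g. "genome_assembly: GRCh37" -> "37"
    for line in yaml_text.splitlines():
        if "genome_assembly" in line:
            match = re.search(r"\d{2}", line)
            if match:
                return match.group()
    # Fallback: a build token in the file name, e.g. "Build37" or "b38"
    match = re.search(r"[bB][a-zA-Z]*(\d{2})", tsv_name)
    if match:
        return match.group(1)
    raise ValueError("Parameter from_build is empty")

print(infer_from_build("genome_assembly: GRCh38", "GCST123456.tsv"))  # 38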
workflows/gwascatalogharm.nf (1 addition, 1 deletion)
@@ -86,7 +86,7 @@ workflow GWASCATALOGHARM {
 
 def input_files(Path input)
 {
-    return [input.getName().split('\\.')[0],input+"-meta.yaml",input]
+    return [(input.getName()=~ /GCST\d+/).findAll()[0],input+"-meta.yaml",input]
 }
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
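Both workflow helpers (here and in gwascatalogharm_gwascatalog.nf below) now pull the GCST accession out of the file name with a regex rather than taking everything before the first dot, which only works when the accession is the entire leading token. The same behaviour in Python terms (file names invented for illustration):

import re

def study_id(filename):
    # take the GCST accession wherever it appears in the name
    return re.findall(r"GCST\d+", filename)[0]

print(study_id("GCST90000123.tsv"))             # GCST90000123
print(study_id("GCST90000123_build38.tsv.gz"))  # GCST90000123
# the old split('.')[0] would have returned 'GCST90000123_build38' for the second name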
workflows/gwascatalogharm_gwascatalog.nf (1 addition, 1 deletion)
@@ -87,7 +87,7 @@ workflow GWASCATALOGHARM_GWASCATALOG {
 }
 
 def input_list(Path input) {
-    return [input.getName().split('\\.')[0],input+"-meta.yaml",input]
+    return [(input.getName()=~ /GCST\d+/).findAll()[0],input+"-meta.yaml",input]
 }
 
 
