Merge pull request #80 from EBISPOT/Master_genome_build
Master genome build
jdhayhurst authored Nov 13, 2023
2 parents c58b095 + 7f299a8 commit d9b4c6c
Showing 7 changed files with 37 additions and 16 deletions.
bin/liftover.py (2 additions, 2 deletions)
@@ -42,8 +42,8 @@ def isNumber(value):
 
 
 def map_bp_to_build_via_liftover(chromosome, bp, build_map, coordinate):
-    if isNumber(bp):
-        data = build_map.convert_coordinate('chr' + str(chromosome), int(bp)-int(coordinate))
+    if isNumber(chromosome) and isNumber(bp):
+        data = build_map.convert_coordinate('chr' + str(int(chromosome)), int(bp)-int(coordinate))
     if data is not None:
         if len(data) > 0:
             return str(data[0][1]+int(1))
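The extra isNumber(chromosome) check and the int() cast guard against chromosome values that arrive as floats (a chromosome column with missing entries is typically read back as float, so 1 becomes 1.0): 'chr' + str(1.0) gives 'chr1.0', which the liftover chain file does not contain. A minimal sketch of the difference, assuming the wrapped library is pyliftover:

from pyliftover import LiftOver

build_map = LiftOver("hg19", "hg38")
chromosome, bp = 1.0, 1000000            # chromosome column parsed as a float

# Old behaviour: 'chr1.0' is not a contig in the chain file, so no mapping is found
build_map.convert_coordinate("chr" + str(chromosome), bp)       # None or []

# New behaviour: cast to int first, so the query becomes 'chr1'
build_map.convert_coordinate("chr" + str(int(chromosome)), bp)  # [('chr1', <new_pos>, '+', <score>)]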
bin/main_pysam.py (9 additions, 3 deletions)
@@ -64,8 +64,14 @@ def main():
         if not ss_rec.hm_code:
             # Get VCF reference variants for this recordls
             coordinate=args.coordinate
-            if int(coordinate[0])==0 and ss_rec.lifmethod=="lo" and len(str(ss_rec.effect_al))+len(str(ss_rec.other_al))>2:
-                vcf_recs =get_vcf_records_0base(
+            if int(coordinate[0])==0 and ss_rec.lifmethod=="lo":
+                if len(str(ss_rec.effect_al))+len(str(ss_rec.other_al))>2:
+                    vcf_recs =get_vcf_records_0base(
                     tbx,
                     ss_rec.chrom,
                     ss_rec.pos)
+                else:
+                    vcf_recs = get_vcf_records(
+                        tbx,
+                        ss_rec.chrom,
+                        ss_rec.pos)
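The restructured condition keeps the 0-based lookup for indels (combined allele length > 2) and sends everything else through the ordinary lookup when the input is 0-based and was lifted over ("lo"). Both helpers are defined elsewhere in main_pysam.py; a rough sketch of how a 0-based versus 1-based tabix query differs with pysam (the bodies below are assumptions, not the pipeline's code):

import pysam

tbx = pysam.TabixFile("reference.vcf.gz")   # hypothetical reference VCF

def get_vcf_records(tbx, chrom, pos):
    # 1-based position: tabix regions are 0-based half-open, so query [pos-1, pos)
    return list(tbx.fetch(str(chrom), int(pos) - 1, int(pos)))

def get_vcf_records_0base(tbx, chrom, pos):
    # position treated as already 0-based: query [pos, pos+1) instead
    return list(tbx.fetch(str(chrom), int(pos), int(pos) + 1))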
@@ -161,7 +167,7 @@ def main():
             out_raw["rsid"] = ss_rec.hm_rsid if vcf_rec and ss_rec.is_harmonised else args.na_rep_out
             out_raw["standard_error"]=ss_rec.data["standard_error"] if ss_rec.data["standard_error"] is not None else args.na_rep_out
             # Add other data from summary stat file
-            outed=["chromosome","base_pair_location","p_value","effect_allele","other_allele","effect_allele_frequency","beta","odds_ratio","standard_error","rsid","ci_upper","ci_lower","ref_allele","hm_coordinate_conversion"]
+            outed=["chromosome","base_pair_location","p_value","effect_allele","other_allele","effect_allele_frequency","beta","odds_ratio","standard_error","rsid","ci_upper","ci_lower","hm_coordinate_conversion"]
             for key in ss_rec.data:
                 if key not in outed:
                     value = ss_rec.data[key] if ss_rec.data[key] else args.na_rep_out
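With "ref_allele" dropped from outed, the generic loop below the list no longer skips that column, so it is carried into the output like any other extra column. A toy illustration of that pass-through (column names and values invented):

na_rep_out = "NA"

# columns already written explicitly above (truncated here for brevity)
outed = ["chromosome", "base_pair_location", "p_value"]

ss_data = {"p_value": "0.01", "ref_allele": "A", "n": ""}   # invented record

out_raw = {}
for key in ss_data:
    if key not in outed:
        out_raw[key] = ss_data[key] if ss_data[key] else na_rep_out

print(out_raw)   # {'ref_allele': 'A', 'n': 'NA'}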
bin/map_to_build_nf.py (19 additions, 6 deletions)
@@ -13,7 +13,23 @@
 import argparse
 from ast import literal_eval
 
+# Allow very large fields in input file-------------
+import sys
+import csv
+
+maxInt = sys.maxsize
+
+while True:
+    # decrease the maxInt value by factor 10
+    # as long as the OverflowError occurs.
+
+    try:
+        csv.field_size_limit(maxInt)
+        break
+    except OverflowError:
+        maxInt = int(maxInt/10)
+
 # map_to_build----------------------------------------------------
 def merge_ss_vcf(ss, vcf, from_build, to_build, chroms, coordinate):
     vcfs = glob.glob(vcf)
     ssdf = pd.read_table(ss, sep=None, engine='python', dtype=str)
@@ -32,8 +48,8 @@ def merge_ss_vcf(ss, vcf, from_build, to_build, chroms, coordinate):
     # merge on rsid, update chr and position from vcf ref
     mergedf = ssdf_with_rsid.merge(vcf_df, left_on=RSID, right_on="ID", how="left")
     mapped = mergedf.dropna(subset=["ID"]).drop([CHR_DSET, BP_DSET], axis=1)
-    mapped[CHR_DSET] = mapped["CHR"].astype("str").str.replace("\..*$","")
-    mapped[BP_DSET] = mapped["POS"].astype("str").str.replace("\..*$","")
+    mapped[CHR_DSET] = mapped["CHR"].astype("str").str.replace("\..*$","",regex=True)
+    mapped[BP_DSET] = mapped["POS"].astype("str").str.replace("\..*$","",regex=True)
     mapped = mapped[header]
     mapped[HM_CC_DSET] = "rs"
     outfile = os.path.join("{}.merged".format(chrom))
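The explicit regex=True matters on current pandas: the default of Series.str.replace flipped to regex=False in pandas 2.0, so without the flag the pattern "\..*$" is matched as a literal substring and the trailing ".0" left by float-parsed coordinates never gets stripped. For example:

import pandas as pd

pos = pd.Series(["12345.0", "67890"])

pos.str.replace(r"\..*$", "")               # pandas >= 2.0: literal match, nothing replaced
pos.str.replace(r"\..*$", "", regex=True)   # regex match: ['12345', '67890']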
@@ -50,20 +66,17 @@ def merge_ss_vcf(ss, vcf, from_build, to_build, chroms, coordinate):
     ssdf[HM_CC_DSET] = "lo"
     build_map = lft.LiftOver(lft.ucsc_release.get(from_build), lft.ucsc_release.get(to_build)) if from_build != to_build else None
     if build_map:
-        ssdf[BP_DSET] = [lft.map_bp_to_build_via_liftover(chromosome=x, bp=str(int(y)), build_map=build_map,coordinate=coordinate[0]) for x, y in zip(ssdf[CHR_DSET], ssdf[BP_DSET])]
+        ssdf[BP_DSET] = [lft.map_bp_to_build_via_liftover(chromosome=x, bp=y, build_map=build_map,coordinate=coordinate[0]) for x, y in zip(ssdf[CHR_DSET], ssdf[BP_DSET])]
     for chrom in chroms:
         print(chrom)
         df = ssdf.loc[ssdf[CHR_DSET].astype("str") == chrom]
         df = df.dropna(subset=[BP_DSET])
-        df[BP_DSET] = df[BP_DSET].astype("str").str.replace("\..*$","")
         outfile = os.path.join("{}.merged".format(chrom))
         if os.path.isfile(outfile):
             print("df to {}".format(outfile))
-            print(df)
             df.to_csv(outfile, sep="\t", mode='a', header=False, index=False, na_rep="NA")
         else:
             print("df to {}".format(outfile))
-            print(df)
             df.to_csv(outfile, sep="\t", mode='w', index=False, na_rep="NA")
     print("liftover complete")
     no_chr_df = ssdf[ssdf[CHR_DSET].isnull()]
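For reference, the per-chromosome write-out pattern the loop relies on: the first write creates the file with a header, later writes append without one. A minimal standalone sketch (toy data invented):

import os
import pandas as pd

df = pd.DataFrame({"chromosome": ["1", "1", "2"], "base_pair_location": ["100", "200", "300"]})

for chrom in ["1", "2"]:
    chunk = df[df["chromosome"] == chrom]
    outfile = "{}.merged".format(chrom)
    if os.path.isfile(outfile):
        # file already exists from an earlier chunk: append rows, no header
        chunk.to_csv(outfile, sep="\t", mode="a", header=False, index=False, na_rep="NA")
    else:
        # first chunk for this chromosome: create the file with a header
        chunk.to_csv(outfile, sep="\t", mode="w", index=False, na_rep="NA")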
config/basic.config (2 additions, 2 deletions)
@@ -4,8 +4,8 @@ process {
    memory = { 1.GB * task.attempt }
    time = { 1.h * task.attempt }
 
-   errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'ignore' }
-   maxRetries = { task.exitStatus in 137..140 ? 4 : 1 }
+   errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'ignore' }
+   maxRetries = 3
    maxErrors = '-1'
 
    withName:map_to_build {
modules/local/map_to_build.nf (3 additions, 1 deletion)
@@ -16,7 +16,9 @@ process map_to_build {
    shell:
    """
    coordinate=\$(grep coordinate_system $yaml | awk -F ":" '{print \$2}' | tr -d "[:blank:]" )
-   from_build=\$(grep genome_assembly $yaml | rev | cut -c 1-2 | rev )
+   from_build=\$((grep genome_assembly $yaml | grep -Eo '[0-9][0-9]') || (echo \$(basename $tsv) | grep -Eo '[bB][a-zA-Z]*[0-9][0-9]' | grep -Eo '[0-9][0-9]'))
+   [[ -z "\$from_build" ]] && { echo "Parameter from_build is empty" ; exit 1; }
    map_to_build_nf.py \
        -f $tsv \
        -from_build \$from_build \
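The new extraction takes the two-digit build number from the genome_assembly line of the metadata yaml and, failing that, from a build token in the file name, instead of blindly keeping the last two characters of the yaml line. The same logic sketched in Python for clarity (yaml content and file names invented):

import re

def infer_from_build(yaml_text, tsv_name):
    # First choice: two digits on the genome_assembly line, e.g. "genome_assembly: GRCh37" -> "37"
    for line in yaml_text.splitlines():
        if "genome_assembly" in line:
            match = re.search(r"\d{2}", line)
            if match:
                return match.group()
    # Fallback: a build token in the file name, e.g. "Build37" or "b38"
    match = re.search(r"[bB][a-zA-Z]*(\d{2})", tsv_name)
    if match:
        return match.group(1)
    raise ValueError("Parameter from_build is empty")

print(infer_from_build("genome_assembly: GRCh38", "GCST123456.tsv"))  # 38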
workflows/gwascatalogharm.nf (1 addition, 1 deletion)
@@ -86,7 +86,7 @@ workflow GWASCATALOGHARM {
 
 def input_files(Path input)
 {
-    return [input.getName().split('\\.')[0],input+"-meta.yaml",input]
+    return [(input.getName()=~ /GCST\d+/).findAll()[0],input+"-meta.yaml",input]
 }
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
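Both workflow helpers (here and in gwascatalogharm_gwascatalog.nf below) now pull the GCST accession out of the file name with a regex rather than taking everything before the first dot, which only works when the accession is the entire leading token. The same behaviour in Python terms (file names invented for illustration):

import re

def study_id(filename):
    # take the GCST accession wherever it appears in the name
    return re.findall(r"GCST\d+", filename)[0]

print(study_id("GCST90000123.tsv"))             # GCST90000123
print(study_id("GCST90000123_build38.tsv.gz"))  # GCST90000123
# the old split('.')[0] would have returned 'GCST90000123_build38' for the second name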
workflows/gwascatalogharm_gwascatalog.nf (1 addition, 1 deletion)
@@ -87,7 +87,7 @@ workflow GWASCATALOGHARM_GWASCATALOG {
 }
 
 def input_list(Path input) {
-    return [input.getName().split('\\.')[0],input+"-meta.yaml",input]
+    return [(input.getName()=~ /GCST\d+/).findAll()[0],input+"-meta.yaml",input]
 }
 
 
