Merge pull request #44 from maxplanck-ie/develop

v1.3.2
maxplanck-ie · Feb 13, 2024 · 476cb05 · 476cb05
2 parents 01acf94 + dc5211c
commit 476cb05
Show file tree

Hide file tree

Showing 12 changed files with 268 additions and 22 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,32 @@
+name: build
+
+on:
+  push:
+
+defaults:
+  run:
+    shell: bash -l {0}
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: mamba-org/setup-micromamba@main
+        with:
+          environment-file: requirements.yaml
+          cache-downloads: true
+          environment-name: ultraheatmap
+      - name: build
+        run: |
+          micromamba activate ultraheatmap
+          pip install .
+      - name: addFeatureToMatrix
+        run: |
+          micromamba activate ultraheatmap
+          bash ultraheatmap/test/test_data/addFeatureToMatrix/addFeatureToMatrix.sh
+      - name: computeOrderedMatrix
+        run: |
+          micromamba activate ultraheatmap
+          bash ultraheatmap/test/test_data/computeOrderedMatrix/computeOrderedMatrix.sh
+
diff --git a/README.md b/README.md
@@ -38,7 +38,7 @@ Then activate the environment:
 
 To install the program in this environment:
 
-              $ python setup.py install
+              $ pip install .
 
 
 from the ultraheatmap directory.
@@ -48,7 +48,7 @@ Alternatively, `pip` or `conda` can be used to install the package. We highly
 recommend you to create a new conda environment prior to the installation and
 install it after activating this environment. This can be done as follows:
 
-              $ conda create -n ultraheatmap python=3.6
+              $ conda create -n ultraheatmap python=3.10
 
               $ conda activate ultraheatmap
 

diff --git a/meta.yaml b/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "ultraheatmap" %}
-{% set version = "1.3.1" %}
+{% set version = "1.3.2" %}
 
 package:
   name: "{{ name|lower }}"
@@ -21,7 +21,7 @@ requirements:
   run:
     - python >=3.10
     - bedtools >2
-    - deeptools >3.5.3
+    - deeptools >=3.5.3
     - gffutils
     - pybedtools
     - pybigwig

diff --git a/requirements.yaml b/requirements.yaml
@@ -3,11 +3,10 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - python >= 3.6
-  - bedtools=2.27.1
-  - deeptools >= 3.3.1
-  - gffutils=0.9
-  - pybedtools=0.7.10
-  - pybigwig=0.3.10
-  - pyyaml >= 5.1
-  - pysam >= 0.16.0.1
+  - python >= 3.10
+  - bedtools
+  - deeptools >= 3.5.3
+  - gffutils
+  - pybedtools
+  - pybigwig
+  - pyyaml
diff --git a/script/multi_heatmap.py b/script/multi_heatmap.py
@@ -0,0 +1,176 @@
+# generating hybrid matrices and plot them by plotHeatmap
+
+#!/usr/bin/env python
+
+import os
+import subprocess as sp
+import glob
+import sys
+import argparse
+import yaml
+
+def parse_args():
+    """
+    parsing arguments
+    """
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    required = parser.add_argument_group('required arguments')
+    optional = parser.add_argument_group('optional arguments')
+
+    # required argumnets:
+    required.add_argument("--config",
+                          "-c",
+                          dest="config",
+                          help="config to use to plot heatmap (yaml file)",
+                          required=True)
+
+    # optional arguments:
+    optional.add_argument("--heatmapOnly",
+                           "-ho",
+                           default=False,
+                           action='store_true',
+                           dest="heatmapOnly",
+                           help="only plot heatmap for given matrices in yaml file")
+
+
+
+    return parser
+
+
+def main():
+    """
+    compute_matrix > ultraheatmap (optional) > heatmap (or profile only if no ultraheatmap)
+    """
+    parser = parse_args()
+    args = parser.parse_args()
+    with open(os.path.join(args.config), 'r') as stream:
+        config = yaml.safe_load(stream)
+
+
+    # transcriptomics data
+    try:
+        path_2_rna = config["path_2_rna"]
+    except:
+        path_2_rna = ""
+
+    # regions to plot on
+    regions_to_plot = config["regions_to_plot"]
+
+    # output params
+    output_path = config["output_path"]
+
+    matrix_name = os.path.join(output_path, config["matrix_name"])
+    scale_region = config["scale_region"]
+
+    heatmap_name = os.path.join(output_path, config["heatmap_name"])
+
+    bws = " "
+    names = " "
+
+    for k, v in config["bws"].items():
+        bws += v+" "
+        names += k+" "
+
+    deg_list =[]
+    deg_names = " "
+    if config["ultraheatmap_matrix_name"] != "":
+        for i, deg in enumerate( config["deg_list"]):
+            deg_list.append(os.path.join(config["path_2_rna"], deg))
+            # deg_list = [os.path.join(path_2_rna, "nej_vs_wt_shrunk.tsv")]
+            deg_names += config["deg_names"][i]+" "
+            ultraheatmap_matrix_name = os.path.join(output_path, config["ultraheatmap_matrix_name"])
+
+
+    if not args.heatmapOnly:
+        if scale_region:
+            cmd = "computeMatrix scale-regions "
+            cmd += " -S "+bws
+            cmd += " -R "+regions_to_plot
+            cmd += " -p 20 "
+            cmd += "-bs "+str(config["binsize"])
+            cmd += " --skipZeros --regionBodyLength "+str(config["regionBodyLength"])
+            cmd += " -a "+str(config["after_region"])
+            cmd += " -b "+str(config["before_region"])
+            cmd += " --missingDataAsZero --skipZeros --samplesLabel "+names
+            cmd += " -o "+ matrix_name
+            if config['averageTypeBins']:
+                cmd += ' --averageTypeBins '+config['averageTypeBins']
+            print(cmd)
+            sp.check_output(cmd, shell = True)
+        else:
+            cmd = "computeMatrix reference-point --referencePoint "+config["refpoint"]
+            cmd += " -S "+bws
+            cmd += " -R "+regions_to_plot
+            cmd += " -p 20 --skipZeros "
+            cmd += " -a "+str(config["after_region"])
+            cmd += " -b "+str(config["before_region"])
+            cmd += " --missingDataAsZero --skipZeros --samplesLabel "+names
+            cmd += " -o "+ matrix_name
+            if config['averageTypeBins']:
+                cmd += ' --averageTypeBins '+config['averageTypeBins']
+            sp.check_output(cmd, shell = True)
+        if config["ultraheatmap_matrix_name"] != "":
+            w_mapping = config["w_mapping"]
+            if w_mapping:
+                cmd = "addFeatureToMatrix "
+                cmd += "-m "+ matrix_name
+                cmd += " -o "+ultraheatmap_matrix_name
+                cmd += " -t "+" ".join(deg_list)
+                cmd += " --featureNames  "+config['featureNames']
+                cmd += " --referencePoint TSS "
+                cmd += " --genomeGtf "+config["gtf"]
+                sp.check_output(cmd, shell = True)
+            else:
+                cmd = "addFeatureToMatrix "
+                cmd += "-m "+ matrix_name
+                cmd += " -o "+ultraheatmap_matrix_name
+                cmd += " -t "+" ".join(deg_list)
+                if config['featureNames']:
+                    cmd += " --featureNames "+config['featureNames']
+                if config['featureIdColumn']:
+                    cmd += " --featureIdColumn "+config['featureIdColumn']
+                print(cmd)
+                sp.check_output(cmd, shell = True)
+
+
+    # plot heatmap
+    all_samples_names = names
+    matrix_to_plot = matrix_name
+    if config["ultraheatmap_matrix_name"] != "":
+        all_samples_names+= deg_names
+        matrix_to_plot = ultraheatmap_matrix_name
+    cmd = "plotHeatmap -m "
+    cmd += matrix_to_plot
+    if config["zmin"]:
+        cmd += " --zMin  "+config["zmin"]
+    if config["zmax"]:
+        cmd+= " --zMax  "+config["zmax"]
+    if config["colors"]:
+        cmd += " --colorMap "+config["colors"]
+    cmd += " -o "+heatmap_name
+    cmd += " --samplesLabel "+all_samples_names
+    cmd += " --whatToShow 'heatmap and colorbar' "
+    if sort_by_sample != "":
+        cmd += " --sortUsingSamples "+config["sort_by_sample"]
+    if config["sortUsing"]:
+        cmd += " --sortUsing "+config["sortUsing"]
+    if config["sorting_direction"]
+        "--sortRegions "+config["sorting_direction"]
+    if config["sorted_regions"] != "":
+        sorted_regions = os.path.join(output_path, config["sorted_regions"])
+        cmd += " --outFileSortedRegions "+sorted_regions
+    if config["sorted_matrix"] != "":
+        sorted_matrix = os.path.join(output_path, config["sorted_matrix"])
+        cmd += "  --outFileNameMatrix "+sorted_matrix
+    if config["kmeans"]:
+        cmd += " --kmeans "+config["kmeans"]
+        if config["clusterUsingSamples"]:
+            cmd += " --clusterUsingSamples "+config["clusterUsingSamples"]
+    else:
+        if config["reg_labels"]:
+            cmd += " --regionsLabel "+config["reg_labels"]
+    print(cmd)
+    sp.check_output(cmd, shell = True)
+if __name__ == "__main__":
+   main()
diff --git a/script/multi_track_matrix_on_groseq_scaleregion-2000_with_deg_wo_mapping.yaml b/script/multi_track_matrix_on_groseq_scaleregion-2000_with_deg_wo_mapping.yaml
@@ -0,0 +1,36 @@
+bws:
+  bw1: "/path/to/bw1.bw"
+  bw2: "/path/to/bw2.bw"
+
+path_2_rna: "/path/to/rna"
+regions_to_plot: "/path/region.bed"
+# output params
+output_path: "/path/to/output"
+matrix_name: "output.gz"
+scale_region: True
+before_region: 2000
+after_region: 0
+regionBodyLength: 3500
+binsize: 10
+refpoint: ""
+deg_list: ["rn1.tsv", "rna2.tsv"]
+deg_names: ["rn1", "rna2"]
+ultraheatmap_matrix_name: "appended_output.gz"
+w_mapping: False
+gtf:""
+featureIdColumn: 'GeneID'
+featureNames: 'log2FoldChange'
+zmin: "" #lowest intensity to plot
+zmax: "" #highest intensity to plot
+colors: "" #matplot lib colors
+averageTypeBins: ""
+sortUsing: ""
+sort_by_sample: ""
+sorting_direction: " descend "
+reg_labels: " "
+heatmap_name: "plotname.png"
+sorted_regions: ""
+sorted_matrix: ""
+remove_na: False
+kmeans: ""
+clusterUsingSamples: ""
diff --git a/ultraheatmap/__init__.py b/ultraheatmap/__init__.py
@@ -1 +1 @@
-__version__ = '1.3.1'
+__version__ = '1.3.2'
diff --git a/ultraheatmap/addFeatureToMatrix.py b/ultraheatmap/addFeatureToMatrix.py
@@ -128,6 +128,7 @@ def main():
                                             args.referencePoint,
                                             args.closestGenesOutput)  # XXX instead of all these arguments i can simply add args.
         # paste an extra column per table to the input matrix
+        print("closest gene is found")
         extract_ge_folchange_per_peak(regions, args.tables, closestMapping,
                                       args.Features, args.idcolumn, hm)
 

diff --git a/ultraheatmap/computeOrderedMatrix.py b/ultraheatmap/computeOrderedMatrix.py
@@ -196,7 +196,7 @@ def main():
     # 1. Read the default config file
     with open(os.path.join(configDir, "configs", "computeOrderedMatrix.yaml"),
               'r') as stream:
-        defaultconfigfile = yaml.load(stream)
+        defaultconfigfile = yaml.safe_load(stream)
 
     # 2. Parse the arguments
     parser = parse_args(defaultconfigfile)

diff --git a/ultraheatmap/parseTables.py b/ultraheatmap/parseTables.py
@@ -84,16 +84,18 @@ def extract_ge_folchange_per_peak(peaks, tables, closestMapping, features,
     Peaks = BedTool(peaks)
     Peaks=Peaks.sort()
     field_count = Peaks.field_count()
-    keyMap_closest = keymap_from_closest_genes(closestMapping, peaks, field_count)
+    keyMap_closest = keymap_from_closest_genes(closestMapping, peaks, field_count) ##TODO very inefficient!!
+    print("start updating the matrix")
     __update_matrix_values(peaks, keyMap_closest, tables,features,IdColumn,hm)
 
 def __getValuesFromGETable(peaks, keyMap_closest, table, features, IdColumn):
     """
 
     """
+    print(table[IdColumn].values)
     count = 0
     v = np.empty((len(peaks), len(features)), dtype=float)
-    for i, peak in enumerate(peaks):
+    for i, peak in enumerate(peaks): ##TODO get rid of the for loop
         key = ';'.join(map(str,peak))
         value = keyMap_closest[key]
         if value in table[IdColumn].values: #value is geneId
@@ -106,7 +108,8 @@ def __getValuesFromGETable(peaks, keyMap_closest, table, features, IdColumn):
                 v[i,j] = x
         else:
             v[i] = [ np.nan ]*len(features)
-
+
+
     return v
 
 
@@ -120,7 +123,7 @@ def __getValuesFromNameTable(peaks, table, features, IdColumn):
         name = peak[3]
         if name in table[IdColumn].values:
            for j, feature in enumerate(features):
-                x = float(table[table[IdColumn] == name][feature])
+                x = float(table[table[IdColumn] == name][feature].iloc[0])
                 if np.isnan(x):
                      x = np.nan
                 v[i,j] = x
@@ -138,7 +141,6 @@ def __update_matrix_values(peaks, keyMap_closest, tables, features, IdColumn, hm
     for i, table in enumerate(tables):
         table = parseTable(table)
         values = __getValuesFromGETable(peaks, keyMap_closest, table, features, IdColumn)
-
         valuesTab[:,i*len(features):(i*len(features)+len(features))] = values
         for feature in features:
             hm.matrix.sample_labels = hm.matrix.sample_labels + ["table"+str(i)+"_"+feature]

diff --git a/ultraheatmap/test/test_data/addFeatureToMatrix/addFeatureToMatrix.sh b/ultraheatmap/test/test_data/addFeatureToMatrix/addFeatureToMatrix.sh
@@ -1,3 +1,3 @@
 #!/usr/bin/env bash
-computeMatrix reference-point -S ../scores.bw -R ../regions.bed -o matrix.gz
-addFeatureToMatrix -m matrix.gz -o appended_matrix.gz -t features.bed -f score1 score2 --featureIdColumn name
+computeMatrix reference-point -S ultraheatmap/test/test_data/scores.bw -R ultraheatmap/test/test_data/regions.bed -o matrix.gz
+addFeatureToMatrix -m matrix.gz -o appended_matrix.gz -t ultraheatmap/test/test_data/addFeatureToMatrix/features.bed -f score1 score2 --featureIdColumn name
diff --git a/ultraheatmap/test/test_data/computeOrderedMatrix/computeOrderedMatrix.sh b/ultraheatmap/test/test_data/computeOrderedMatrix/computeOrderedMatrix.sh
@@ -1,2 +1,2 @@
 #!/usr/bin/env bash
-computeOrderedMatrix -S ../scores.bw ../scores.bw -R ../regions.bed -o matrix_ordered.gz -g 1 --kmean 2
+computeOrderedMatrix -S ultraheatmap/test/test_data/scores.bw ultraheatmap/test/test_data/scores.bw -R ultraheatmap/test/test_data/regions.bed -o matrix_ordered.gz -g 1 --kmean 2