kircherlab · visze · Nov 20, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml
@@ -1,5 +1,5 @@
 ---
-name: PR
+name: "Lint PR"
 on:
   pull_request_target:
     types:
@@ -8,12 +8,14 @@ on:
       - edited
       - synchronize
 
+permissions:
+  pull-requests: read
+
 jobs:
-  title-format:
+  main:
+    name: Validate PR title
     runs-on: ubuntu-latest
     steps:
-      - uses: amannn/action-semantic-pull-request@v4
+      - uses: amannn/action-semantic-pull-request@v5
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          validateSingleCommit: true
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -50,7 +50,7 @@ jobs:
         with:
           directory: .
           snakefile: workflow/Snakefile
-          args: "--lint --configfile config/example_config.yaml"
+          args: "--lint --configfile config/example_config.yaml --config skip_version_check=True"
   # Testing:
   #   runs-on: ubuntu-latest
   #   needs:

diff --git a/.release-please-manifest.json b/.release-please-manifest.json
@@ -1,3 +1,3 @@
 {
-  ".": "0.1.1"
+  ".": "0.3.0"
 }
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,37 @@
 # Changelog
 
+## [0.3.0](https://github.com/kircherlab/MPRAsnakeflow/compare/MPRAsnakeflow-v0.2.0...MPRAsnakeflow-v0.3.0) (2024-11-20)
+
+
+### ⚠ BREAKING CHANGES
+
+* versioned config ([#140](https://github.com/kircherlab/MPRAsnakeflow/issues/140))
+
+### Features
+
+* versioned config ([#140](https://github.com/kircherlab/MPRAsnakeflow/issues/140))
+* MAD outlier removal is completely removed ([#140](https://github.com/kircherlab/MPRAsnakeflow/issues/140))
+* default is NO outlier detection (none is not present anymore) ([#140](https://github.com/kircherlab/MPRAsnakeflow/issues/140))
+* global config is removed. splits moved now within mapping in assignment ([#140](https://github.com/kircherlab/MPRAsnakeflow/issues/140))
+
+## [0.2.0](https://github.com/kircherlab/MPRAsnakeflow/compare/MPRAsnakeflow-v0.1.1...MPRAsnakeflow-v0.2.0) (2024-11-05)
+
+### ⚠ BREAKING CHANGES
+
+* Support only snakemake >=8.24.1 ([#130](https://github.com/kircherlab/MPRAsnakeflow/pull/130))
+* File output formats and locations changed
+* Normalization changed which may result in different outputs
+
+### Features
+
+ * outlier removal methods ([#132](https://github.com/kircherlab/MPRAsnakeflow/pull/132))
+ * No min max length for bbmap. default mapq is 30. ([#131](https://github.com/kircherlab/MPRAsnakeflow/pull/131))
+ * IGVF outputs ([#129](https://github.com/kircherlab/MPRAsnakeflow/pull/129))
+ * Documentation improvements
+
+
+### Bug Fixes
+
 ## [0.1.1](https://github.com/kircherlab/MPRAsnakeflow/compare/MPRAsnakeflow-v0.1.0...MPRAsnakeflow-v0.1.1) (2024-09-30)
 
 ### Bug Fixes
@@ -19,3 +51,55 @@ First release of MPRAsnakeflow!
 * Barcode count output
 * Snakemake 8 support
 * Extended documentation: https://mprasnakeflow.readthedocs.io
+
+
+## older development
+
+
+### ⚠ BREAKING CHANGES
+
+* latest development for new release ([#133](https://github.com/kircherlab/MPRAsnakeflow/issues/133))
+* pseudocounts were not used correctly when RNA or DNA set to 0
+* DNA and RNA join correction
+
+### Features
+
+* Add assignment_merge thread configuration ([26e68c2](https://github.com/kircherlab/MPRAsnakeflow/commit/26e68c26f315c524cf28692d636127fbf3bdeb2b))
+* better assignment BC statistics ([00187e6](https://github.com/kircherlab/MPRAsnakeflow/commit/00187e689b2fad10fd317aa2efbd0214fad14434))
+* configurable min mapping quality ([28045ae](https://github.com/kircherlab/MPRAsnakeflow/commit/28045aea23d6fa03f3883b3dc44b3cbc3e8f6205))
+* extending figure width ([8bf81c4](https://github.com/kircherlab/MPRAsnakeflow/commit/8bf81c45e45f9b4c23856c0915bd527f9699b6cd))
+* faster design check ([315b402](https://github.com/kircherlab/MPRAsnakeflow/commit/315b402499d92850382d4110e153602020381e8a))
+* fastq-join implementation ([aaf5315](https://github.com/kircherlab/MPRAsnakeflow/commit/aaf5315364ebb3e3117c3996c2fc357aa9c4d595))
+* latest development for new release ([#133](https://github.com/kircherlab/MPRAsnakeflow/issues/133)) ([bdfc557](https://github.com/kircherlab/MPRAsnakeflow/commit/bdfc557a64cecc19d1d86eead8bdb691a1ff2166))
+* make filtering consistent ([5f7a4c5](https://github.com/kircherlab/MPRAsnakeflow/commit/5f7a4c5a2a3389a75b8d6b7e9aaf34485127b3a4))
+* master variant table ([6bda47c](https://github.com/kircherlab/MPRAsnakeflow/commit/6bda47c78021bc1728bb81a716f5e6daaf6ac084))
+* new final output file with merged replicates ([66cf017](https://github.com/kircherlab/MPRAsnakeflow/commit/66cf0172cb6b556e507be4daabf7e859447787f3))
+* only link assignment fasta when possible ([d7d3822](https://github.com/kircherlab/MPRAsnakeflow/commit/d7d3822933c98d790f3c96bcbfdef1a7ea70c7df)), closes [#50](https://github.com/kircherlab/MPRAsnakeflow/issues/50)
+* remove space, speedup BC extraction ([70e9bd0](https://github.com/kircherlab/MPRAsnakeflow/commit/70e9bd06b91ccb37333e0a69c47917a5eacbf639))
+* replace merging by NGmerge ([0aa8cad](https://github.com/kircherlab/MPRAsnakeflow/commit/0aa8cad6884a953f9c89a2fdd7af397e4e9ccf3e))
+* snakemake 8 compatibility ([cf38ed9](https://github.com/kircherlab/MPRAsnakeflow/commit/cf38ed9de68367d0d1700ccff262e91ad6f1fbc0))
+* snakemake 8 ready with workflow profile ([d637e1f](https://github.com/kircherlab/MPRAsnakeflow/commit/d637e1fdbebfca0616d944101898fbf522df9c82))
+* statistic for assignment workflow ([10c3b26](https://github.com/kircherlab/MPRAsnakeflow/commit/10c3b2677ada59925ddd3de777f7488c9a20e981))
+* using reverse complement BCs ([d009a6c](https://github.com/kircherlab/MPRAsnakeflow/commit/d009a6c3de7de50a210479b73f5d41969287e234))
+
+
+### Bug Fixes
+
+* batch size issue in sort ([487ba8c](https://github.com/kircherlab/MPRAsnakeflow/commit/487ba8ce059517030fcab3708c3cea40ac210f7e))
+* correct use of assignment configs ([58b64f1](https://github.com/kircherlab/MPRAsnakeflow/commit/58b64f1e753477f7410233ac546701ddbd60f9f2))
+* corrected qc_report_assoc ([afb0127](https://github.com/kircherlab/MPRAsnakeflow/commit/afb012750bc1c3c39f2348b283c23ff97695f672))
+* Detach from anaconda ([#122](https://github.com/kircherlab/MPRAsnakeflow/issues/122)) ([16bcea2](https://github.com/kircherlab/MPRAsnakeflow/commit/16bcea2f04190a5965ad1865cf30f6dd44f1b6a0))
+* DNA and RNA join correction ([7214743](https://github.com/kircherlab/MPRAsnakeflow/commit/7214743008dc6796077e45e62646174ffaf52290))
+* filter config ([38ee37e](https://github.com/kircherlab/MPRAsnakeflow/commit/38ee37ecfcf4a71b840575504811512e0d64609a))
+* issue with stats and assignment ([d935fa1](https://github.com/kircherlab/MPRAsnakeflow/commit/d935fa1f62825dfdcd2cd77e4c73bc37686519a0))
+* memory resources for bbmap ([#123](https://github.com/kircherlab/MPRAsnakeflow/issues/123)) ([af93f58](https://github.com/kircherlab/MPRAsnakeflow/commit/af93f588e9387ddf91197f5587d36c3481499b38))
+* plots per insert only used last experiment. not all. ([c2fd82b](https://github.com/kircherlab/MPRAsnakeflow/commit/c2fd82b6d4b545cc3a1acc5ecb145eb3c93af49d))
+* pseudocounts were not used correctly when RNA or DNA set to 0 ([d2483f9](https://github.com/kircherlab/MPRAsnakeflow/commit/d2483f9c7724e0b63cec4f251519d449831ecf04))
+* remove illegal characters from reference ([0ebee81](https://github.com/kircherlab/MPRAsnakeflow/commit/0ebee81d74f3f6170ce4b8083e18c746550154db))
+* rename barcode output header ([635f043](https://github.com/kircherlab/MPRAsnakeflow/commit/635f0431c78d3d5bf9b77a16f6ce26d9ff6c82c2))
+* rule make_master_tables fix ([df42845](https://github.com/kircherlab/MPRAsnakeflow/commit/df42845b6dfa9a7b64f187b38f1f15518f3e4a31))
+* statistic total counts ([6381b92](https://github.com/kircherlab/MPRAsnakeflow/commit/6381b928fd6c14eb16801a459b8546fa37004c74))
+* typo in report ([ace8cca](https://github.com/kircherlab/MPRAsnakeflow/commit/ace8ccacb3d7ece04af43c9b0b1dc9c9c087a2c4))
+* upgrade code to new pandas version ([aaea236](https://github.com/kircherlab/MPRAsnakeflow/commit/aaea236bc83f459e7a6c2d3fee96d49c79762325))
+* using correct threads ([6dcad7d](https://github.com/kircherlab/MPRAsnakeflow/commit/6dcad7d34173f37d4538644b1ba0d918afd8f149))
+* using multiple fastq inputs in counts ([95935cf](https://github.com/kircherlab/MPRAsnakeflow/commit/95935cfe69956ca50307a9c6a774c4b96dff860f))
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-ARG VERSION=0.1.1
+ARG VERSION=0.2.0
 
 FROM condaforge/miniforge3:latest
 LABEL io.github.snakemake.containerized="true"

diff --git a/config/example_assignment_bbmap.yaml b/config/example_assignment_bbmap.yaml
@@ -1,11 +1,10 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 15
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: bbmap
       configs:
         min_mapping_quality: 30 # 30 is default for bbmap

diff --git a/config/example_assignment_bwa.yaml b/config/example_assignment_bwa.yaml
@@ -1,11 +1,10 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 15
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: bwa
       configs:
         min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design_file

diff --git a/config/example_assignment_exact_lazy.yaml b/config/example_assignment_exact_lazy.yaml
@@ -1,11 +1,10 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 15
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: exact # bwa or exact
       configs:
         sequence_length: 171 # sequence length of design excluding adapters.

diff --git a/config/example_assignment_exact_linker.yaml b/config/example_assignment_exact_linker.yaml
@@ -1,13 +1,12 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 20
     BC_rev_comp: true
     linker: TCTAGACCGTCACTAACTAACAGTGGGTACCC
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: exact # bwa or exact
       configs:
         sequence_length: 171 # sequence length of design excluding adapters.

diff --git a/config/example_config.yaml b/config/example_config.yaml
@@ -1,11 +1,10 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 15
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: exact # bbmap, bwa or exact
       configs:
         sequence_length: 171 # sequence length of design excluding adapters.

diff --git a/config/example_count.yaml b/config/example_count.yaml
@@ -1,4 +1,5 @@
 ---
+version: "0.3"
 experiments:
   exampleCount:
     bc_length: 15

diff --git a/docs/config.rst b/docs/config.rst
@@ -4,7 +4,7 @@
 Config File
 =====================
 
-The config file is a yaml file that contains the configuration. Different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`global` (general settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants). This is a full example file with default configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`.
+The config file is a yaml file that contains the configuration. Different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`version` (version of MPRAsnakeflow used), :code:`assignments` (assignment workflow), and :code:`experiments` (count workflow). This is a full example file with default configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`.
 
 .. literalinclude:: ../config/example_config.yaml
    :language: yaml
@@ -14,21 +14,18 @@ The config file is a yaml file that contains the configuration. Different runs c
 Note that the config file is controlled by json schema. This means that the config file is validated against the schema. If the config file is not valid, the program will exit with an error message. The schema is located in :download:`workflow/schemas/config.schema.yaml <../workflow/schemas/config.schema.yaml>`.
 
 ----------------
-General settings
+Version settings
 ----------------
 
-The general settings are located in the :code:`global` section. The following settings are possible:
+Set the version of MPRAsnakeflow that this configuration is written for. This is important for future updates. The version is used to check if the config file is compatible with the current version of the workflow. If the versions are not compatible, the workflow will exit with an error message.
 
 .. literalinclude:: ../workflow/schemas/config.schema.yaml
    :language: yaml
-   :start-after: start_global
+   :start-after: start_version
    :end-before: start_assignments
 
-:assignments:
-    Global parameters that hold for the assignment workflow.
-
-    :split_number:
-        To parallize mapping for assignment the reads are split into :code:`split_number` files. E.g. setting to 300 means that the reads are split into 300 files and each file is mapped in parallel. This is only useful when using on a cluster. Running the workflow only on one machine the default value should be used. The default is set to 1. 
+:version:
+    A string like "0.2.0" or "1.2". When the major version is "0", the minor version has to match that of MPRAsnakeflow, e.g. "0.2.0" is compatible with MPRAsnakeflow 0.2.0 as well as 0.2.1 or 0.2.2. When the major version is greater than 0, the major version has to match that of MPRAsnakeflow, e.g. a config of "1.2.1" also fits MPRAsnakeflow 1.7 or 1.0.
 
 --------------------
 Assignment workflow
@@ -43,9 +40,12 @@ The assignment workflow is configured in the :code:`assignments` section. The fo
 
 For each assignment you want to process you have to give him a name like :code:`example_assignment`. The name is used to name the output files.
 
+
 :alignment_tool:
     Alignment tool configuration that is used to map the reads to the oligos.
-
+
+    :split_number:
+        To parallelize mapping for the assignment, the reads are split into :code:`split_number` files. E.g. setting it to 300 means that the reads are split into 300 files and each file is mapped in parallel. This is only useful when running on a cluster. When running the workflow on a single machine, the default value should be used. The default is set to :code:`1`. (For technical reasons, when multiple assignments are defined, all of them will be set to the maximum value defined in the config.)
     :tool:
         Alignment tool that is used. Currently :code:`bbmap` :code:`bwa`, :code:`exact` are supported. Default is :code:`bbmap`.
     :configs:

diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -37,7 +37,7 @@ MPRAsnakeflow exoists of two subworkflows, :ref:`Assignment` and :ref:`Experimen
 
 3. Set up the config file
 
-The config file is the heart of MPRAsnakflow. Here different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`global` (general settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants).
+The config file is the heart of MPRAsnakeflow. Here different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`version` (used MPRAsnakeflow version), :code:`assignments` (assignment workflow), and :code:`experiments` (count workflow).
 
 See :ref:`Config` for more details about the config file. Here is an example running only the count experiments and using a provided assignment file.
 

diff --git a/resources/assoc_basic/config.yml b/resources/assoc_basic/config.yml
@@ -1,11 +1,10 @@
 ---
-global:
-  assignments:
-    split_number: 30
+version: "0.3"
 assignments:
   assocBasic:
     bc_length: 15
     alignment_tool:
+      split_number: 30
       tool: bbmap
       configs:
         sequence_length: 171

diff --git a/resources/combined_basic/config.yml b/resources/combined_basic/config.yml
@@ -1,11 +1,10 @@
 ---
-global:
-  assignments:
-    split_number: 30
+version: "0.3"
 assignments:
   assocBasic:
     bc_length: 15
     alignment_tool:
+      split_number: 30
       tool: bbmap
       configs:
         sequence_length: 171
@@ -30,7 +29,7 @@ experiments:
       fromWorkflow:
         type: config
         assignment_name: assocBasic
-        assignment_config: configs
+        assignment_config: default
     design_file: design.fa
     configs:
       default: {}
diff --git a/resources/count_basic/config.yml b/resources/count_basic/config.yml
@@ -1,4 +1,5 @@
 ---
+version: "0.3"
 experiments:
   exampleCount:
     bc_length: 15
@@ -13,10 +14,6 @@ experiments:
     design_file: design.fa
     configs:
       default: {}
-      outlierNone:
-        filter:
-          outlier_detection:
-            method: none
       outlierZscore:
         filter:
           outlier_detection:

diff --git a/version.txt b/version.txt
@@ -1 +1 @@
-0.1.1
+0.3.0
diff --git a/workflow/rules/assigned_counts.smk b/workflow/rules/assigned_counts.smk
@@ -115,20 +115,12 @@ rule assigned_counts_dna_rna_merge:
             % config["experiments"][wc.project]["configs"][wc.config]["filter"][
                 "outlier_detection"
             ]["method"]
-            if config["experiments"][wc.project]["configs"][wc.config]["filter"][
+            if "method"
+            in config["experiments"][wc.project]["configs"][wc.config]["filter"][
                 "outlier_detection"
-            ]["method"]
-            != "none"
+            ]
             else ""
         ),
-        outlier_mad_bins=lambda wc: "--outlier-ratio-mad-bins %d"
-        % config["experiments"][wc.project]["configs"][wc.config]["filter"][
-            "outlier_detection"
-        ]["mad_bins"],
-        outlier_mad_times=lambda wc: "--outlier-ratio-mad-times %f"
-        % config["experiments"][wc.project]["configs"][wc.config]["filter"][
-            "outlier_detection"
-        ]["times_mad"],
         outlier_zscore_times=lambda wc: "--outlier-rna-zscore-times %f"
         % config["experiments"][wc.project]["configs"][wc.config]["filter"][
             "outlier_detection"
@@ -143,7 +135,7 @@ rule assigned_counts_dna_rna_merge:
         --minRNACounts {params.minRNACounts} --minDNACounts {params.minDNACounts} \
         --assignment {input.association} \
         {params.outlier_detection} --outlier-barcodes {output.removed_bcs} \
-        {params.outlier_mad_bins} {params.outlier_mad_times} {params.outlier_zscore_times} \
+        {params.outlier_zscore_times} \
         --output {output.counts} \
         --bcOutput {output.bc_counts} \
         --statistic {output.statistic} &> {log}