Quantification strategy #178

Merged
merged 21 commits into from
May 7, 2024
Changes from all commits
63 changes: 63 additions & 0 deletions .github/workflows/SIRVs.Set4.options.yml
@@ -0,0 +1,63 @@
name: SIRVs real ONT various options

on:
workflow_dispatch:
schedule:
- cron: '0 21 * * 6'

env:
RUN_NAME: SIRVs.Set4.R10.OPTS
LAUNCHER: ${{github.workspace}}/tests/github/run_pipeline.py
CFG_DIR: /abga/work/andreyp/ci_isoquant/data
BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/
OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/

concurrency:
group: ${{github.workflow}}
cancel-in-progress: false

jobs:
launch-runner:
runs-on:
labels: [self-hosted]
name: 'Running IsoQuant and QC'

steps:
- name: 'Cleanup'
run: >
set -e &&
shopt -s dotglob &&
rm -rf *

- name: 'Checkout'
uses: actions/checkout@v3
with:
fetch-depth: 1

- name: 'IsoQuant SIRVs R10.4 pipeline options'
if: always()
shell: bash
env:
STEP_NAME: SIRVs.Set4.R10.opts1
run: |
export PATH=$PATH:${{env.BIN_PATH}}
python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}}

- name: 'IsoQuant SIRVs R10.4 read filtering'
if: always()
shell: bash
env:
STEP_NAME: SIRVs.Set4.R10.opts2
run: |
export PATH=$PATH:${{env.BIN_PATH}}
python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}}

- name: 'IsoQuant SIRVs R10.4 strategies'
if: always()
shell: bash
env:
STEP_NAME: SIRVs.Set4.R10.opts3
run: |
export PATH=$PATH:${{env.BIN_PATH}}
python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}}

41 changes: 27 additions & 14 deletions README.md
@@ -462,18 +462,18 @@ where `FILE` is the file name, `READ_COL` is column with read ids (0 if not set)

#### Quantification

`--transcript_quantification` Transcript quantification strategy, should be one of:

* `unique_only` - only reads that are uniquely assigned and consistent with a transcript are used for quantification;
* `with_ambiguous` - ambiguously assigned reads are split between transcripts with equal weights (e.g. 1/2 when a read is assigned to 2 transcripts simultaneously, default mode);
* `with_inconsistent` - uniquely assigned reads with non-intronic inconsistencies (i.e. alternative poly-A site, TSS etc) are also included;
* `all` - all of the above.

`--gene_quantification` Gene quantification strategy, should be one of:

* `unique_only` - only reads that are uniquely assigned to a gene and consistent with any of gene's isoforms are used for quantification;
* `with_ambiguous` - ambiguously assigned reads are split between genes with equal weights (e.g. 1/2 when a read is assigned to 2 genes simultaneously);
* `with_inconsistent` - only reads that are uniquely assigned to a gene but are not necessarily consistent with genes isoforms (default);
`--transcript_quantification` Transcript quantification strategy;
`--gene_quantification` Gene quantification strategy;

Available options for quantification:

* `unique_only` - use only reads that are uniquely assigned and consistent with a transcript/gene
(i.e. flagged as unique/unique_minor_difference), default for transcript quantification;
* `with_ambiguous` - in addition to unique reads, ambiguously assigned consistent reads are split between features with equal weights
(e.g. 1/2 when a read is assigned to 2 features simultaneously);
* `unique_splicing_consistent` - uses uniquely assigned reads that do not contradict annotated splice sites
(i.e. flagged as unique/unique_minor_difference or inconsistent_non_intronic), default for gene quantification;
* `unique_inconsistent` - uses uniquely assigned reads allowing any kind of inconsistency;
* `all` - all of the above.
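
To make the weighting concrete, here is a minimal Python sketch (not IsoQuant's actual implementation; the toy `assignments` list and the helper `count` function are hypothetical, only the flag names follow the classification above). It shows which reads each strategy would count and how ambiguous reads are split with equal weights:

```python
from collections import defaultdict

# Hypothetical read assignments: (read_id, candidate_features, flag)
assignments = [
    ("read1", ["tx1"], "unique"),
    ("read2", ["tx1"], "unique_minor_difference"),
    ("read3", ["tx1", "tx2"], "ambiguous"),            # consistent with two transcripts
    ("read4", ["tx2"], "inconsistent_non_intronic"),   # e.g. alternative TSS / polyA site
    ("read5", ["tx2"], "inconsistent"),                # contradicts annotated introns
]

def count(assignments, strategy):
    counts = defaultdict(float)
    for _, features, flag in assignments:
        unique = len(features) == 1
        consistent = flag in ("unique", "unique_minor_difference")
        if strategy == "unique_only":
            use = unique and consistent
        elif strategy == "with_ambiguous":
            use = consistent or flag == "ambiguous"
        elif strategy == "unique_splicing_consistent":
            use = unique and (consistent or flag == "inconsistent_non_intronic")
        elif strategy == "unique_inconsistent":
            use = unique
        else:  # "all"
            use = True
        if use:
            for feature in features:
                counts[feature] += 1.0 / len(features)  # ambiguous reads split equally
    return dict(counts)

for s in ["unique_only", "with_ambiguous", "unique_splicing_consistent",
          "unique_inconsistent", "all"]:
    print(s, count(assignments, s))
```

With this toy data, `unique_only` yields `{tx1: 2.0}`, while `with_ambiguous` adds 0.5 to each of `tx1` and `tx2` from the ambiguous read.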


@@ -552,20 +552,23 @@ We recommend _not_ to modify these options unless you are clearly aware of their consequences.
`--no_gzip`
Do not compress large output files.

`--no_gtf_check`
Do not perform input GTF checks.

`--no_secondary`
Ignore secondary alignments.

`--aligner`
Force the use of this alignment method; can be `starlong` or `minimap2` (`minimap2` is used by default). Make sure the specified aligner is in the `$PATH` variable.

`--no_junc_bed`
Do not use annotation for read mapping.
Do not use gene annotation for read mapping.

`--junc_bed_file`
Annotation in BED12 format produced by `paftools.js gff2bed` (distributed with `minimap2`); will be created automatically if not provided.

`--delta`
Delta for inexact splice junction comparison, chosen automatically based on data type.
Delta for inexact splice junction comparison, chosen automatically based on data type (e.g. 4 bp for PacBio, 6 bp for ONT).
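
As a simplified sketch (not the actual comparison code), a read splice junction can be treated as matching an annotated one when both intron coordinates differ by at most `delta` bp; the `junctions_match` helper below is hypothetical:

```python
def junctions_match(read_junction, isoform_junction, delta=4):
    """Inexact comparison: (intron_start, intron_end) pairs match
    if both coordinates are within delta bp of each other."""
    return (abs(read_junction[0] - isoform_junction[0]) <= delta and
            abs(read_junction[1] - isoform_junction[1]) <= delta)

# A junction shifted by 3 bp matches with delta=4,
# a 6 bp shift only matches with the larger ONT delta of 6.
print(junctions_match((1000, 2000), (1003, 2000), delta=4))  # True
print(junctions_match((1000, 2000), (1006, 2000), delta=4))  # False
print(junctions_match((1000, 2000), (1006, 2000), delta=6))  # True
```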

`--genedb_output`
If your output folder is located on a shared storage (e.g. NFS share), use this option to set another path
@@ -585,6 +588,16 @@ but may improve running time when disk I/O is relatively slow.
`--simple_alignments_mapq_cutoff`
Filters out alignments with 1 or 2 exons and MAPQ less than this value (works only in annotation-free mode, default is 1).

`--normalization_method`
Method for normalizing non-grouped counts into TPMs:
* `simple` - standard method, the scale factor equals 1 million divided by the sum of counts (default);
* `usable_reads` - the scale factor equals 1 million divided by the number of all assigned reads,
including reads that were filtered out during quantification (i.e. inconsistent or ambiguous).
In this case the sum of all gene/transcript TPMs may not add up to 1 million.
Experiments with simulated data show that this method can give more accurate estimates.
However, the choice of normalization method does not affect correlations or relative proportions;
see the sketch below.
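
A back-of-the-envelope sketch with made-up numbers (not IsoQuant's internal code) showing how the two scale factors would differ:

```python
# Hypothetical example: 800,000 reads were actually counted towards transcripts,
# but 1,000,000 reads were assigned to features in total (including ambiguous and
# inconsistent reads filtered out during quantification).
counts = {"tx1": 500_000.0, "tx2": 300_000.0}
total_assigned_reads = 1_000_000

# `simple`: scale so that TPMs sum to exactly 1 million.
simple_factor = 1_000_000 / sum(counts.values())
simple_tpm = {tx: c * simple_factor for tx, c in counts.items()}

# `usable_reads`: scale by the number of all assigned reads;
# TPMs no longer necessarily sum to 1 million.
usable_factor = 1_000_000 / total_assigned_reads
usable_tpm = {tx: c * usable_factor for tx, c in counts.items()}

print(simple_tpm)  # {'tx1': 625000.0, 'tx2': 375000.0} -> sums to 1,000,000
print(usable_tpm)  # {'tx1': 500000.0, 'tx2': 300000.0} -> sums to 800,000
```

Relative proportions (here 5:3 between `tx1` and `tx2`) are identical under both methods; only the absolute TPM values change.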


### Examples
<a name="examples"></a>
81 changes: 48 additions & 33 deletions isoquant.py
@@ -22,7 +22,7 @@
import gffutils
import pyfaidx

from src.gtf2db import convert_gtf_to_db, check_gtf_duplicates
from src.gtf2db import convert_gtf_to_db
from src.read_mapper import (
DATA_TYPE_ALIASES,
SUPPORTED_STRANDEDNESS,
@@ -35,7 +35,7 @@
from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies
from src.graph_based_model_construction import StrandnessReportingLevel
from src.long_read_assigner import AmbiguityResolvingMethod
from src.long_read_counter import COUNTING_STRATEGIES
from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy, NormalizationMethod
from src.input_data_storage import InputDataStorage
from src.multimap_resolver import MultimapResolvingStrategy
from src.stats import combine_counts
@@ -56,6 +56,7 @@ def parse_args(cmd_args=None, namespace=None):
input_args_group = parser.add_argument_group('Input data')
output_args_group = parser.add_argument_group('Output naming')
pipeline_args_group = parser.add_argument_group('Pipeline options')
algo_args_group = parser.add_argument_group('Algorithm settings')

other_options = parser.add_argument_group("Additional options:")
show_full_help = '--full_help' in cmd_args
@@ -132,37 +133,40 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
help="reads represent FL transcripts; both ends of the read are considered to be reliable")

# ALGORITHM
add_additional_option("--delta", type=int, default=None,
help="delta for inexact splice junction comparison, chosen automatically based on data type")
add_hidden_option("--graph_clustering_distance", type=int, default=None,
help="intron graph clustering distance, "
"splice junctions less that this number of bp apart will not be differentiated")
add_additional_option("--matching_strategy", choices=["exact", "precise", "default", "loose"],
help="read-to-isoform matching strategy from the most strict to least", type=str, default=None)
add_additional_option("--splice_correction_strategy", choices=["none", "default_pacbio", "default_ont", "conservative_ont", "all", "assembly"],
help="read alignment correction strategy to use",
type=str, default=None)
add_additional_option("--model_construction_strategy", choices=["reliable", "default_pacbio",
"sensitive_pacbio", "fl_pacbio", "default_ont",
"sensitive_ont", "all", "assembly"],
help="transcript model construction strategy to use",
type=str, default=None)

add_additional_option("--transcript_quantification", choices=COUNTING_STRATEGIES,
help="transcript quantification strategy", type=str, default="with_ambiguous")
add_additional_option("--gene_quantification", choices=COUNTING_STRATEGIES,
help="gene quantification strategy", type=str, default="with_inconsistent")
add_additional_option("--report_novel_unspliced", "-u", type=bool_str,
help="report novel monoexonic transcripts (true/false), "
"default: false for ONT, true for other data types")
add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel],
help="reporting level for novel transcripts based on canonical splice sites;"
" default: " + StrandnessReportingLevel.auto.name,
default=StrandnessReportingLevel.only_stranded.name)
add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies],
help="require polyA tails to be present when reporting transcripts; "
"default: auto (requires polyA only when polyA percentage is >= 70%%)",
default=PolyAUsageStrategies.auto.name)
add_additional_option_to_group(algo_args_group, "--report_novel_unspliced", "-u", type=bool_str,
help="report novel monoexonic transcripts (true/false), "
"default: false for ONT, true for other data types")
add_additional_option_to_group(algo_args_group, "--report_canonical", type=str,
choices=[e.name for e in StrandnessReportingLevel],
help="reporting level for novel transcripts based on canonical splice sites;"
" default: " + StrandnessReportingLevel.auto.name,
default=StrandnessReportingLevel.only_stranded.name)
add_additional_option_to_group(algo_args_group, "--polya_requirement", type=str,
choices=[e.name for e in PolyAUsageStrategies],
help="require polyA tails to be present when reporting transcripts; "
"default: auto (requires polyA only when polyA percentage is >= 70%%)",
default=PolyAUsageStrategies.auto.name)

add_additional_option_to_group(algo_args_group, "--transcript_quantification", choices=COUNTING_STRATEGIES,
help="transcript quantification strategy", type=str,
default=CountingStrategy.unique_only.name)
add_additional_option_to_group(algo_args_group, "--gene_quantification", choices=COUNTING_STRATEGIES,
help="gene quantification strategy", type=str,
default=CountingStrategy.unique_splicing_consistent.name)

add_additional_option_to_group(algo_args_group, "--matching_strategy",
choices=["exact", "precise", "default", "loose"],
help="read-to-isoform matching strategy from the most strict to least",
type=str, default=None)
add_additional_option_to_group(algo_args_group, "--splice_correction_strategy",
choices=["none", "default_pacbio", "default_ont",
"conservative_ont", "all", "assembly"],
help="read alignment correction strategy to use", type=str, default=None)
add_additional_option_to_group(algo_args_group, "--model_construction_strategy",
choices=["reliable", "default_pacbio", "sensitive_pacbio", "fl_pacbio",
"default_ont", "sensitive_ont", "all", "assembly"],
help="transcript model construction strategy to use", type=str, default=None)

# OUTPUT PROPERTIES
pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int,
default="16")
@@ -191,8 +195,15 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
help="align reads to reference without running further analysis")

# ADDITIONAL
add_additional_option("--delta", type=int, default=None,
help="delta for inexact splice junction comparison, chosen automatically based on data type")
add_hidden_option("--graph_clustering_distance", type=int, default=None,
help="intron graph clustering distance, "
"splice junctions less that this number of bp apart will not be differentiated")
add_additional_option("--no_gzip", help="do not gzip large output files", dest="gzipped",
action='store_false', default=True)
add_additional_option("--no_gtf_check", help="do not perform GTF checks", dest="gtf_check",
action='store_false', default=True)
add_additional_option("--high_memory", help="increase RAM consumption (store alignment and the genome in RAM)",
action='store_true', default=False)
add_additional_option("--no_junc_bed", action="store_true", default=False,
@@ -210,6 +221,10 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
add_additional_option("--simple_alignments_mapq_cutoff", help="ignore alignments with 1 or 2 exons and "
"MAPQ < this (works only in annotation-free mode, "
"default=1)", type=int, default=1)
add_additional_option("--normalization_method", type=str, choices=[e.name for e in NormalizationMethod],
help="TPM normalization method: simple - conventional normalization using all counted reads;"
"usable_reads - includes all assigned reads.",
default=NormalizationMethod.simple.name)

add_additional_option_to_group(pipeline_args_group, "--keep_tmp", help="do not remove temporary files "
"in the end", action='store_true',
6 changes: 4 additions & 2 deletions src/alignment_processor.py
@@ -284,7 +284,7 @@ def process_alignments_in_region(self, current_region, alignment_storage):
if gene_info.empty():
assignment_storage = self.process_intergenic(alignment_storage, current_region)
else:
assignment_storage = self.process_genic(alignment_storage, gene_info)
assignment_storage = self.process_genic(alignment_storage, gene_info, current_region)

return gene_info, assignment_storage

@@ -332,6 +332,7 @@ def process_intergenic(self, alignment_storage, region):
alignment_info.polya_info.internal_polyt_pos != -1)
read_assignment.polya_info = alignment_info.polya_info
read_assignment.cage_found = len(alignment_info.cage_hits) > 0
read_assignment.genomic_region = region
read_assignment.exons = alignment_info.read_exons
read_assignment.corrected_exons = corrector.correct_read(alignment_info)
read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
@@ -346,7 +347,7 @@ def process_intergenic(self, alignment_storage, region):
assignment_storage.append(read_assignment)
return assignment_storage

def process_genic(self, alignment_storage, gene_info):
def process_genic(self, alignment_storage, gene_info, region):
assigner = LongReadAssigner(gene_info, self.params)
profile_constructor = CombinedProfileConstructor(gene_info, self.params)
exon_corrector = ExonCorrector(gene_info, self.params, self.chr_record)
@@ -388,6 +389,7 @@ def process_genic(self, alignment_storage, gene_info):
alignment_info.polya_info.internal_polyt_pos != -1)
read_assignment.polya_info = alignment_info.polya_info
read_assignment.cage_found = len(alignment_info.cage_hits) > 0
read_assignment.genomic_region = region
read_assignment.exons = alignment_info.read_exons
read_assignment.corrected_exons = exon_corrector.correct_assigned_read(alignment_info,
read_assignment)
14 changes: 9 additions & 5 deletions src/assignment_io.py
@@ -21,8 +21,10 @@
read_short_int,
SHORT_TERMINATION_INT
)
from .isoform_assignment import match_subtype_to_str_with_additional_info, ReadAssignment, MatchClassification
from .long_read_assigner import ReadAssignmentType
from .isoform_assignment import (match_subtype_to_str_with_additional_info,
ReadAssignment,
MatchClassification,
ReadAssignmentType)
from .gene_info import GeneInfo


@@ -241,11 +243,13 @@ def add_read_info(self, read_assignment):
read_introns, isoform_introns)
for x in m.match_subclassifications])
strand = read_assignment.strand
line = read_assignment.read_id + "\t" + read_assignment.chr_id + "\t" + strand + "\t" + \
m.assigned_transcript + "\t" + m.assigned_gene + "\t" + \
read_assignment.assignment_type.name + "\t" + event_string + "\t" + range_list_to_str(read_exons)
line = (read_assignment.read_id + "\t" + read_assignment.chr_id + "\t" + strand + "\t" +
m.assigned_transcript + "\t" + m.assigned_gene + "\t" +
read_assignment.assignment_type.name + "\t" + event_string + "\t" +
range_list_to_str(read_exons))

additional_info = []
additional_info.append("gene_assignment=" + str(read_assignment.gene_assignment_type.name) + ";")
additional_info.append("PolyA=" + str(read_assignment.polyA_found) + ";")
if self.params.cage is not None:
additional_info.append("CAGE=" + str(read_assignment.cage_found) + ";")