From 7d08208970f5e20a633ed2ce093b968ed96ca34d Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 31 Oct 2024 13:31:53 -0700
Subject: [PATCH 1/2] Configure pre-commit checks

Files copied from pathogen-repo-guide at commit
https://github.com/nextstrain/pathogen-repo-guide/commit/71feb3374a0c1356af0661aef0ffa12c5847e0db
---
 .github/workflows/pre-commit.yaml | 14 +++++++++++
 .pre-commit-config.yaml           | 41 +++++++++++++++++++++++++++++++
 README.md                         | 21 ++++++++++++++++
 3 files changed, 76 insertions(+)
 create mode 100644 .github/workflows/pre-commit.yaml
 create mode 100644 .pre-commit-config.yaml

diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml
new file mode 100644
index 0000000..70da533
--- /dev/null
+++ b/.github/workflows/pre-commit.yaml
@@ -0,0 +1,14 @@
+name: pre-commit  # workflow for the repository's pre-commit checks
+
+on:
+  - push  # push is the only trigger listed
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4  # check out the repository contents
+      - uses: actions/setup-python@v5  # set up the Python version given below
+        with:
+          python-version: "3.12"  # quoted so the version stays a string
+      - uses: pre-commit/action@v3.0.1  # presumably runs the hooks in .pre-commit-config.yaml; confirm in the action docs
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..b43bdcf
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,41 @@
+default_language_version:
+  python: python3  # default interpreter for Python-language hooks
+exclude: '\.(tsv|fasta|gb)$|^ingest/vendored/'  # skip paths ending in .tsv/.fasta/.gb and anything under ingest/vendored/
+repos:
+  - repo: https://github.com/pre-commit/sync-pre-commit-deps
+    rev: v0.0.1
+    hooks:
+      - id: sync-pre-commit-deps
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.10.0.1
+    hooks:
+      - id: shellcheck
+  - repo: https://github.com/rhysd/actionlint
+    rev: v1.6.27
+    hooks:
+      - id: actionlint
+        entry: env SHELLCHECK_OPTS='--exclude=SC2027' actionlint  # overrides the hook entry: run actionlint with SHELLCHECK_OPTS set to exclude SC2027
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: trailing-whitespace
+      - id: check-ast
+      - id: check-case-conflict
+      - id: check-docstring-first
+      - id: check-json
+      - id: check-executables-have-shebangs
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: check-symlinks
+      - id: check-toml
+      - id: check-yaml
+      - id: destroyed-symlinks
+      - id: detect-private-key
+      - id: end-of-file-fixer
+      - id: fix-byte-order-marker
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.4.6
+    hooks:
+      # Run the linter.
+      - id: ruff  # only the lint hook is enabled here; no formatter hook is listed
diff --git a/README.md b/README.md
index 3951b6e..7f61775 100644
--- a/README.md
+++ b/README.md
@@ -24,3 +24,24 @@ nextstrain view .
 ## Documentation
 
 - [Running a pathogen workflow](https://docs.nextstrain.org/en/latest/tutorials/running-a-workflow.html)
+
+## Working on this repo
+
+This repo is configured to use [pre-commit](https://pre-commit.com),
+to help automatically catch common coding errors and syntax issues
+with changes before they are committed to the repo.
+
+If you will be writing new code or otherwise working within this repo,
+please do the following to get started:
+
+1. install `pre-commit` by running either `python -m pip install
+   pre-commit` or `brew install pre-commit`, depending on your
+   preferred package management solution
+2. install the local git hooks by running `pre-commit install` from
+   the root of the repo
+3. when problems are detected, correct them in your local working tree
+   before committing them.
+
+Note that these pre-commit checks are also run in a GitHub Action when
+changes are pushed to GitHub, so correcting issues locally will
+prevent extra cycles of correction.

From 3d5e0c3e94d8a231eba66efe52e0e7513c2ecd6d Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 31 Oct 2024 13:45:21 -0700
Subject: [PATCH 2/2] Fix pre-commit violations

These changes were automatically generated by running `pre-commit run
--all-files`.
---
 .github/workflows/ingest-to-phylogenetic.yaml  |  8 ++++----
 ingest/bin/parse-measles-genotype-names.py     | 16 ++++++++--------
 nextclade/README.md                            |  6 +++---
 nextclade/Snakefile                            |  2 +-
 nextclade/rules/annotate_phylogeny.smk         |  2 +-
 phylogenetic/defaults/auspice_config.json      |  2 +-
 phylogenetic/defaults/auspice_config_N450.json |  2 +-
 phylogenetic/rules/prepare_sequences.smk       |  3 +--
 8 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
index 44ac4b6..ff25be8 100644
--- a/.github/workflows/ingest-to-phylogenetic.yaml
+++ b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -66,7 +66,7 @@ jobs:
         ingest/benchmarks/
         ingest/logs/
         ingest/.snakemake/log/
-  
+
   # Check if ingest results include new data by checking for the cache
   # of the file with the results' Metadata.sh256sum (which should have been added within upload-to-s3)
   # GitHub will remove any cache entries that have not been accessed in over 7 days,
@@ -89,8 +89,8 @@ jobs:
 
           # Code below is modified from ingest/upload-to-s3
           # https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29
-          
-          
+
+
           no_hash=0000000000000000000000000000000000000000000000000000000000000000
 
           for s3_url in "${s3_urls[@]}"; do
@@ -109,7 +109,7 @@ jobs:
           path: ingest-output-sha256sum
           key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
           lookup-only: true
-  
+
   phylogenetic:
     needs: [check-new-data]
     if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }}
diff --git a/ingest/bin/parse-measles-genotype-names.py b/ingest/bin/parse-measles-genotype-names.py
index c321c54..c8eebfe 100755
--- a/ingest/bin/parse-measles-genotype-names.py
+++ b/ingest/bin/parse-measles-genotype-names.py
@@ -24,9 +24,9 @@ def parse_args():
 
 def _set_genotype_name(record):
     genotype_name = record["genotype_ncbi"]
-    
+
     genotype_name = genotype_name.replace('Measles virus genotype ', '')
-    genotype_name = re.sub(r'Measles morbillivirus.*$', r'', genotype_name)   
+    genotype_name = re.sub(r'Measles morbillivirus.*$', r'', genotype_name)
     genotype_name = re.sub(r'.*?\[(.*)\]$', r'\1', genotype_name) # If square brackets present at end of string, keep only the text inside the brackets
     genotype_name = re.sub(r'Measles virus MVs.*$', r'', genotype_name)
     genotype_name = re.sub(r'Measles virus MVi.*$', r'', genotype_name)
@@ -34,12 +34,12 @@ def _set_genotype_name(record):
     genotype_name = genotype_name.replace('Measles virus strain ', '')
     genotype_name = re.sub(r'Measles virus.*$', r'', genotype_name)
     genotype_name = re.sub(r'A-vaccine.*$', r'A', genotype_name)
-    genotype_name = re.sub(r'B3.1', r'B3', genotype_name) 
-    genotype_name = re.sub(r'B3.2', r'B3', genotype_name) 
-    genotype_name = re.sub(r'D4a', r'D4', genotype_name) 
-    genotype_name = re.sub(r'D4b', r'D4', genotype_name) 
-    genotype_name = re.sub(r'H1a', r'H1', genotype_name) 
-    genotype_name = re.sub(r'H1b', r'H1', genotype_name) 
+    genotype_name = re.sub(r'B3.1', r'B3', genotype_name)
+    genotype_name = re.sub(r'B3.2', r'B3', genotype_name)
+    genotype_name = re.sub(r'D4a', r'D4', genotype_name)
+    genotype_name = re.sub(r'D4b', r'D4', genotype_name)
+    genotype_name = re.sub(r'H1a', r'H1', genotype_name)
+    genotype_name = re.sub(r'H1b', r'H1', genotype_name)
 
     return (
         genotype_name)
diff --git a/nextclade/README.md b/nextclade/README.md
index 79bfa4f..b40cc84 100644
--- a/nextclade/README.md
+++ b/nextclade/README.md
@@ -1,11 +1,11 @@
 
 # Measles Nextclade Dataset Tree
 
-This workflow creates a phylogenetic tree that can be used as part of a Nextclade dataset to assign genotypes to measles samples based on [criteria outlined by the WHO](https://www.who.int/publications/i/item/WER8709). 
+This workflow creates a phylogenetic tree that can be used as part of a Nextclade dataset to assign genotypes to measles samples based on [criteria outlined by the WHO](https://www.who.int/publications/i/item/WER8709).
 
-The WHO has defined 24 measles genotypes based on N gene and H gene sequences from 28 reference strains. For new measles samples, genotypes can be assigned based on genetic similarity to the reference strains at the "N450" region (a 450 bp region of the N gene). 
+The WHO has defined 24 measles genotypes based on N gene and H gene sequences from 28 reference strains. For new measles samples, genotypes can be assigned based on genetic similarity to the reference strains at the "N450" region (a 450 bp region of the N gene).
 
-The tree created here includes N450 sequences for the 28 reference strains, along with other representative strains for each genotype. 
+The tree created here includes N450 sequences for the 28 reference strains, along with other representative strains for each genotype.
 
 The workflow includes the following steps:
 * Build a tree using samples from the `ingest` output, with the following sampling criteria:
diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index dd876ff..d30974c 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -1,4 +1,4 @@
-configfile: "defaults/config.yaml" 
+configfile: "defaults/config.yaml"
 
 rule all:
     input:
diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk
index 5726f92..fc66be9 100644
--- a/nextclade/rules/annotate_phylogeny.smk
+++ b/nextclade/rules/annotate_phylogeny.smk
@@ -65,7 +65,7 @@ rule timeout:
     run:
         import json
         with open(input[0], 'r') as fh:
-            data = json.load(fh)        
+            data = json.load(fh)
         new_nodes = {}
         for name, attrs in data['nodes'].items():
             new_nodes[name] = {'mutation_length': attrs.get('mutation_length')}
diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json
index 81c05c7..d6d7635 100644
--- a/phylogenetic/defaults/auspice_config.json
+++ b/phylogenetic/defaults/auspice_config.json
@@ -10,7 +10,7 @@
       "url": "https://www.ncbi.nlm.nih.gov/genbank/"
     }
   ],
-  "build_url": "https://github.com/nextstrain/measles",  
+  "build_url": "https://github.com/nextstrain/measles",
   "colorings": [
     {
       "key": "gt",
diff --git a/phylogenetic/defaults/auspice_config_N450.json b/phylogenetic/defaults/auspice_config_N450.json
index 00a350c..c6111bc 100644
--- a/phylogenetic/defaults/auspice_config_N450.json
+++ b/phylogenetic/defaults/auspice_config_N450.json
@@ -67,7 +67,7 @@
     "map",
     "entropy",
     "frequencies"
-  ],  
+  ],
   "metadata_columns": [
     "author"
   ]
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index 90154d7..b06b433 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -64,7 +64,7 @@ rule filter:
             --group-by {params.group_by} \
             --sequences-per-group {params.sequences_per_group} \
             --min-date {params.min_date} \
-            --min-length {params.min_length} 
+            --min-length {params.min_length}
         """
 
 rule align:
@@ -86,4 +86,3 @@ rule align:
             --fill-gaps \
             --remove-reference
         """
-