Skip to content

Commit

Permalink
Merge pull request #225 from andersen-lab/get-lineage-def
Browse files Browse the repository at this point in the history
Get lineage def
  • Loading branch information
dylanpilz authored Mar 29, 2024
2 parents 1e1ddc4 + a918c79 commit 289895e
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 27 deletions.
2 changes: 2 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ To ensure reproducibility of results, we provide old (timestamped) barcodes and
src/installation
src/usage/demix
src/usage/variants
src/usage/barcode-build
src/usage/get-lineage-def
src/usage/update
src/usage/boot
src/usage/aggregate
Expand Down
5 changes: 5 additions & 0 deletions docs/src/usage/barcode-build.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.. click:: freyja._cli:barcode_build
:prog: freyja barcode-build
:nested: full
:commands: barcode-build

2 changes: 1 addition & 1 deletion docs/src/usage/covariants.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ to ``freyja/data/NC_045512_Hu-1.fasta``. If you are using a different
build to perfrom alignment, it is important to pass that file in to
``--ref-genome`` instead. Optionally, a gff file
(e.g. ``freyja/data/NC_045512_Hu-1.gff``) may be included via the
``--gff-file`` option to output amino acid mutations alongside
``--annot`` option to output amino acid mutations alongside
nucleotide mutations. Inclusion thresholds for read-mapping quality and
the number of observed instances of a set of covariants can be set using
``--min_quality`` and ``--min_count`` respectively.
39 changes: 39 additions & 0 deletions docs/src/usage/get-lineage-def.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
.. click:: freyja._cli:get_lineage_def
:prog: freyja get-lineage-def
:nested: full
:commands: get_lineage_def

------------

**Example Usage:**

This command fetches the lineage-defining mutations for the specified lineage. Defaults to stdout if no output file is specified.

.. code-block:: bash
freyja get-lineage-def B.1.1.7
C241T
C913T
C3037T
C3267T
C5388A
(cont...)
If a gene annotation file (gff3) is provided along with a reference genome (fasta), the mutations will include the respective amino acid changes for non-synonymous mutations.

.. code-block:: bash
freyja get-lineage-def BA.2.12 --annot freyja/data/NC_045512_Hu-1.gff --ref freyja/data/NC_045512_Hu-1.fasta
...
C21618T(S:T19I)
T22200G(S:V213G)
G22578A(S:G339D)
C22674T(S:S371F)
T22679C(S:S373P)
C22686T(S:S375F)
A22688G(S:T376A)
G22775A(S:D405N)
A22786C(S:R408S)
G22813T(S:K417N)
(cont...)
59 changes: 56 additions & 3 deletions freyja/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,59 @@ def barcode_build(pb, outdir, noncl):
os.remove(lineagePath)


@cli.command()
@click.argument('lineage', type=str)
@click.option('--barcodes', default='data/usher_barcodes.csv',
help='Path to custom barcode file', show_default=True)
@click.option('--annot', default=None,
help='Path to annotation file in gff3 format. '
'If included, shows amino acid mutations. ')
@click.option('--ref', default=None,
help='Reference file in fasta format. '
'Required to display amino acid mutations',
show_default=True)
@click.option('--output', default=None,
help='Output file to save lineage definition. '
'Defaults to stdout.')
def get_lineage_def(lineage, barcodes, annot, ref, output):
"""
Get the mutations defining a LINEAGE of interest
from provided barcodes
"""
from freyja.read_analysis_utils import parse_gff, translate_snps

if barcodes == 'data/usher_barcodes.csv':
barcodes = os.path.join(locDir, barcodes)

df = pd.read_csv(barcodes, index_col=0)
try:
target = df.loc[lineage]
except KeyError:
raise KeyError(f'Lineage "{lineage}" not found in barcodes')
target = target[target > 0]
target_muts = list(target.index)

if annot is not None:
if ref is None:
raise ValueError('Both annot and ref must be provided '
'to show amino acid mutations')
else:
gene_positions = parse_gff(annot)
aa_mut = translate_snps(target_muts, ref, gene_positions)
target_muts = [f'{mut}({aa_mut[mut]})' if aa_mut[mut] is not None
else f'{mut}' for mut in target_muts]

target_muts = sorted(target_muts, key=lambda x: int(x.split('(')[0][1:-1]))
target_muts = '\n'.join(target_muts)

if output is not None:
with open(output, 'w') as f:
f.write(target_muts)
else:
print(target_muts)
return target_muts


@cli.command()
@click.argument('bamfile',
type=click.Path(exists=True))
Expand Down Expand Up @@ -763,7 +816,7 @@ def filter(query_mutations, input_bam, min_site, max_site, output):
help='path to save co-occurring mutations', show_default=True)
@click.option('--ref-genome', type=click.Path(),
default='data/NC_045512_Hu-1.fasta', show_default=True)
@click.option('--gff-file', type=click.Path(exists=True),
@click.option('--annot', type=click.Path(exists=True),
default=None,
help=('path to gff file corresponding to reference genome. If '
'included, outputs amino acid mutations in addition to '
Expand All @@ -783,7 +836,7 @@ def filter(query_mutations, input_bam, min_site, max_site, output):
'(in descending order). Set to "site" to sort patterns by '
'start site (n ascending order).'), show_default=True)
def covariants(input_bam, min_site, max_site, output,
ref_genome, gff_file, min_quality, min_count, spans_region,
ref_genome, annot, min_quality, min_count, spans_region,
sort_by):
"""
Finds mutations co-occurring on the same read pair
Expand All @@ -796,7 +849,7 @@ def covariants(input_bam, min_site, max_site, output,

from freyja.read_analysis_tools import covariants as _covariants
_covariants(input_bam, min_site, max_site, output,
ref_genome, gff_file, min_quality, min_count, spans_region,
ref_genome, annot, min_quality, min_count, spans_region,
sort_by)


Expand Down
34 changes: 12 additions & 22 deletions freyja/read_analysis_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from freyja.read_analysis_utils import nt_position, get_colnames_and_sites, \
read_pair_generator, \
filter_covariants_output
filter_covariants_output, parse_gff


def extract(query_mutations, input_bam, output, same_read):
Expand Down Expand Up @@ -330,27 +330,17 @@ def get_gene(locus):

# Load gene annotations (if present)
if gff_file is not None:
gene_positions = {}
with open(gff_file) as f:
for line in f.readlines():
line = line.split('\t')
if 'gene' in line:
attrs = line[-1].split(';')
for attr in attrs:
if attr.startswith('Name='):
gene_name = attr.split('=')[1]
gene_positions[gene_name] = (int(line[3]),
int(line[4]))

# Split ORF1ab for SARS-CoV-2
if 'ORF1ab' in gene_positions:
del gene_positions['ORF1ab']
gene_positions['ORF1a'] = (266, 13468)
gene_positions['ORF1b'] = (13468, 21555)
elif 'orf1ab' in gene_positions:
del gene_positions['orf1ab']
gene_positions['orf1a'] = (266, 13468)
gene_positions['orf1b'] = (13468, 21555)
gene_positions = parse_gff(gff_file)

# Split ORF1ab for SARS-CoV-2
if 'ORF1ab' in gene_positions:
del gene_positions['ORF1ab']
gene_positions['ORF1a'] = (266, 13468)
gene_positions['ORF1b'] = (13468, 21555)
elif 'orf1ab' in gene_positions:
del gene_positions['orf1ab']
gene_positions['orf1a'] = (266, 13468)
gene_positions['orf1b'] = (13468, 21555)

# Open input bam file for reading
samfile = pysam.AlignmentFile(input_bam, 'rb')
Expand Down
61 changes: 61 additions & 0 deletions freyja/read_analysis_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import pandas as pd
from Bio.Seq import MutableSeq
from Bio import SeqIO


def nt_position(x):
Expand Down Expand Up @@ -80,3 +82,62 @@ def filter_covariants_output(cluster, nt_muts, min_mutations):
return pd.NA
else:
return list(dict.fromkeys(cluster_final))


def parse_gff(gff_file):
gene_positions = {}
with open(gff_file) as f:
for line in f.readlines():
line = line.split('\t')
if 'gene' in line:
attrs = line[-1].split(';')
for attr in attrs:
if attr.startswith('Name='):
gene_name = attr.split('=')[1]
gene_positions[gene_name] = (int(line[3]),
int(line[4]))
return gene_positions


def translate_snps(snps, ref, gene_positions):

# Load reference genome
ref = MutableSeq(next(SeqIO.parse(ref, 'fasta')).seq)

output = {snp: None for snp in snps}
for snp in snps:
locus = int(snp[1:-1])

# Find the key in gene_positions that corresponds to the gene
# containing the SNP
gene_info = None
for gene in gene_positions:
start, end = gene_positions[gene]
if start <= locus <= end:
gene_info = gene, start
break
if gene_info is None:
continue

codon_position = (locus - start) % 3
aa_locus = ((locus - codon_position - start) // 3) + 1

ref_codon = ref[locus - codon_position - 1:
locus - codon_position + 2]
ref_aa = ref_codon.translate()

alt_codon = MutableSeq(str(ref_codon))
alt_codon[codon_position] = snp[-1]

if len(alt_codon) % 3 != 0 or len(alt_codon) == 0 \
or 'N' in alt_codon:
continue
alt_aa = alt_codon.translate()

if ref_aa == alt_aa or '*' in alt_aa:
continue # Synonymous/stop codon mutation

aa_mut = f'{gene}:{ref_aa}{aa_locus}{alt_aa}'
output[snp] = aa_mut

return output
9 changes: 9 additions & 0 deletions freyja/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ def test_dash(self):
--output test_dash.html')
self.assertTrue(file_exists('.', "test_dash.html"))

def test_get_lineage_def(self):
os.system('freyja get-lineage-def B.1.1.7 '
'--annot freyja/data/NC_045512_Hu-1.gff '
'--ref freyja/data/NC_045512_Hu-1.fasta '
'--output lineage_def.txt')
self.assertTrue(file_exists('.', "lineage_def.txt"))
with open('lineage_def.txt') as f:
self.assertEqual(len(f.readlines()), 28)


if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion freyja/tests/test_read_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def test_filter_deletions(self):

def test_covariants(self):
cmd = ['freyja', 'covariants', self.input_bam,
'21563', '25384', '--gff-file',
'21563', '25384', '--annot',
'freyja/data/NC_045512_Hu-1.gff', '--output',
'freyja/data/test_covar.tsv']
err = subprocess.run(cmd)
Expand Down

0 comments on commit 289895e

Please sign in to comment.