diff --git a/lapis2-docs/astro.config.mjs b/lapis2-docs/astro.config.mjs index f511cc00..8fb2cc94 100644 --- a/lapis2-docs/astro.config.mjs +++ b/lapis2-docs/astro.config.mjs @@ -75,6 +75,10 @@ export default defineConfig({ label: 'Mutation filters', link: '/concepts/mutation-filters/', }, + { + label: 'Ambiguous symbols', + link: '/concepts/ambiguous-symbols/', + }, { label: 'Pango lineage query', link: '/concepts/pango-lineage-query/', diff --git a/lapis2-docs/src/components/MutationFilters/MaybeAminoAcidMutationExample.astro b/lapis2-docs/src/components/MutationFilters/MaybeAminoAcidMutationExample.astro new file mode 100644 index 00000000..67949f01 --- /dev/null +++ b/lapis2-docs/src/components/MutationFilters/MaybeAminoAcidMutationExample.astro @@ -0,0 +1,5 @@ +--- +import AminoAcidMutationExample from './AminoAcidMutationExample.astro'; +--- + +MAYBE() diff --git a/lapis2-docs/src/content/docs/architecture-and-dev-docs/99-glossary.mdx b/lapis2-docs/src/content/docs/architecture-and-dev-docs/99-glossary.mdx index 9dfa3f7c..05321dcd 100644 --- a/lapis2-docs/src/content/docs/architecture-and-dev-docs/99-glossary.mdx +++ b/lapis2-docs/src/content/docs/architecture-and-dev-docs/99-glossary.mdx @@ -5,45 +5,10 @@ description: Explanation of terms used in the context of LAPIS. | Term | Definition | | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| AA | amino acid | +| AA | short for amino acid | | aligned | a nucleotide sequence is aligned, if it is arranged such that it has many similarities to a given reference genome. The aligned sequence has the same length as the reference genome. Gaps are marked in the aligned sequence. Insertions are stored separately. | -| Mutation | a divergence from the reference genome (see below). 
| +| Mutation | a divergence from the reference genome (see [mutation-filters](../concepts/mutation-filters)). | | Organism | The organism that the genomic data was extracted from. Each LAPIS instance serves data for a single organism. | | QC | quality control; in our case, it usually refers to the quality checks and metrics of the sequences, targeting how well the nucleotide sequence was determined from the probe. | | Segment | The genome of an organism may consist of multiple nucleotide sequence pieces. We call those pieces "segments". | | Variant | We follow a very open definition of variants. Every subset of sequences is considered a variant. A variant is specified by lineage/clade names and mutations. A variant does not need to be [monophyletic](https://en.wikipedia.org/wiki/Monophyly). | - -## Mutations - -Mutations can occur either on nucleotide level or on amino acid level. -For the nucleotides a single symbol can produce a mutation, whereas for the amino acids, -some nucleotide mutations still produce the same amino acid -([see also](https://en.wikipedia.org/wiki/DNA_and_RNA_codon_tables)). - -The following explains the notations for mutations. - -### Amino Acid Mutations - -The gene has to be provided for the AA mutation, since AAs only make sense within a gene. - -**Example ORF_1a\:G1234S**. This translates to - -- in Gene: ORF_1a -- AA mutation from "G" to "S" at position 1234 - -The origin AA symbol can be omitted, since it is clear from the reference genome. -**Example: ORF_1a:1234S** - -### Nucleotide Mutations - -**Example: C1234T**. This translates to - -- a nucleotide mutation from nucleotide "C" -- at position 1234 in the genome -- to nucleotide "T" - -The origin nucleotide symbol can be omitted, since it is clear from the reference genome. -**Example: 1234T** - -If the organism has multiple nucleotide sequence segments, the segment has to be provided. 
-**Example: segment_name\:C1234T** diff --git a/lapis2-docs/src/content/docs/concepts/ambiguous-symbols.mdx b/lapis2-docs/src/content/docs/concepts/ambiguous-symbols.mdx new file mode 100644 index 00000000..bf2853f6 --- /dev/null +++ b/lapis2-docs/src/content/docs/concepts/ambiguous-symbols.mdx @@ -0,0 +1,53 @@ +--- +title: Ambiguous symbols +description: Explanation of how ambiguous reads are handled in the data +--- + +The underlying sequence files in `.FASTA` format can contain any of the following symbols: + +| Symbol | Meaning | +| ------ | ----------------- | +| A | Adenine | +| C | Cytosine | +| G | Guanine | +| T | Thymine | +| - | Deletion | +| N | failed read / any | +| R | A or G | +| Y | C or T | +| S | C or G | +| W | A or T | +| K | G or T | +| M | A or C | +| B | not A | +| D | not C | +| H | not G | +| V | not T | + +The ambiguous symbols arise from imperfect reads in the sequencer. + +While one mostly queries for the symbols `A`, `C`, `G`, `T` and `-` to look for specific features and mutations of a sequence, +or `N` for quality control of the underlying data, +the ambiguous symbols `R` through `V` are often too cumbersome to consider in analyses. + +LAPIS supports the flexible consideration of these ambiguous symbols +through an extension of the boolean logic syntax in the variant queries. + +Here we introduce a new expression `MAYBE` to consider sequences that have an ambiguous code which **maybe** matches the queried value. + +#### Example + +Consider the following sequences: + +``` +12345 +AAACG +AARCG +AANCG +AAGCG +AAACG +``` + +A filter for the mutation `3G` returns only the sequence `AAGCG`, as it is the only sequence with the symbol `G` at position 3. +The filter `MAYBE(3G)` however also considers that the sequences `AARCG` and `AANCG` **may** have the symbol `G` at position 3, +because the symbols `R` and `N` can represent Guanine. 
diff --git a/lapis2-docs/src/content/docs/concepts/mutation-filters.mdx b/lapis2-docs/src/content/docs/concepts/mutation-filters.mdx index b5790e58..242ddaac 100644 --- a/lapis2-docs/src/content/docs/concepts/mutation-filters.mdx +++ b/lapis2-docs/src/content/docs/concepts/mutation-filters.mdx @@ -4,6 +4,7 @@ description: Mutation filters --- import AminoAcidMutationExample from '../../../components/MutationFilters/AminoAcidMutationExample.astro'; +import MaybeAminoAcidMutationExample from '../../../components/MutationFilters/MaybeAminoAcidMutationExample.astro'; import GeneNames from '../../../components/MutationFilters/GeneNames.astro'; import NucleotideMutations from '../../../components/MutationFilters/NucleotideMutations.astro'; @@ -19,3 +20,9 @@ It can also be `-` for deletion and `X` for unknown. **Example:** ` can be omitted to filter for any mutation. You can write a `.` for the `` to filter for sequences for which it is confirmed that no mutation occurred, i.e. has the same base as the reference genome at the specified position. + +:::note +Both nucleotide and amino acid mutation filters also support `MAYBE` queries. +Read more in [ambiguous symbols](/concepts/ambiguous-symbols). +**Example:** <MaybeAminoAcidMutationExample />. +::: diff --git a/lapis2-docs/src/content/docs/concepts/variant-query.mdx b/lapis2-docs/src/content/docs/concepts/variant-query.mdx index d8241676..4eda3f5d 100644 --- a/lapis2-docs/src/content/docs/concepts/variant-query.mdx +++ b/lapis2-docs/src/content/docs/concepts/variant-query.mdx @@ -16,8 +16,12 @@ query correctly (in JavaScript, this can be done with the function)! The formal specification of the query language is available -[here](https://github.com/cevo-public/LAPIS/blob/main/server/src/main/antlr/ch/ethz/lapis/api/parser/VariantQuery.g4) as -an ANTLR v4 grammar. In following, we provide an informal description and examples. 
+[here](https://github.com/GenSpectrum/LAPIS/blob/main/lapis2/src/main/antlr/org/genspectrum/lapis/model/variantqueryparser/VariantQuery.g4) +as an ANTLR v4 grammar. +In the following, we provide an informal description and examples. +The respective +[unit test](https://github.com/GenSpectrum/LAPIS/blob/main/lapis2/src/test/kotlin/org/genspectrum/lapis/model/VariantQueryFacadeTest.kt) +provides a full list of possible atomic queries. The query language understands Boolean logic. Expressions can be connected with `&` (and), `|` (or) and `!` (not). Parentheses `(` and `)` can be used to define the order of the operations. Further, there is a special syntax to match @@ -51,4 +55,10 @@ or by Nextclade) and filter by Nextstrain clades: BA.5* | nextcladePangoLineage:BA.5* | nextstrainClade:22B ``` +LAPIS supports a ternary logic to query [ambiguous nucleotide symbols](../ambiguous-symbols/). + +``` +MAYBE(123W) +``` + diff --git a/lapis2-docs/tests/docs.spec.ts b/lapis2-docs/tests/docs.spec.ts index 0eca1e20..6c2de6c3 100644 --- a/lapis2-docs/tests/docs.spec.ts +++ b/lapis2-docs/tests/docs.spec.ts @@ -16,6 +16,7 @@ const referencesPages = [ const conceptsPages = [ 'Data versions', 'Mutation filters', + 'Ambiguous symbols', 'Pango lineage query', 'Request methods: GET and POST', 'Response format', diff --git a/lapis2/src/main/kotlin/org/genspectrum/lapis/silo/SiloQuery.kt b/lapis2/src/main/kotlin/org/genspectrum/lapis/silo/SiloQuery.kt index c367597b..f48ab3b5 100644 --- a/lapis2/src/main/kotlin/org/genspectrum/lapis/silo/SiloQuery.kt +++ b/lapis2/src/main/kotlin/org/genspectrum/lapis/silo/SiloQuery.kt @@ -192,9 +192,13 @@ data class AminoAcidInsertionContains(val position: Int, val value: String, val data object True : SiloFilterExpression("True") -data class And(val children: List) : SiloFilterExpression("And") +data class And(val children: List) : SiloFilterExpression("And") { + constructor(vararg children: SiloFilterExpression) : this(children.toList()) +} -data class 
Or(val children: List) : SiloFilterExpression("Or") +data class Or(val children: List) : SiloFilterExpression("Or") { + constructor(vararg children: SiloFilterExpression) : this(children.toList()) +} data class Not(val child: SiloFilterExpression) : SiloFilterExpression("Not") diff --git a/lapis2/src/test/kotlin/org/genspectrum/lapis/model/SiloFilterExpressionMapperTest.kt b/lapis2/src/test/kotlin/org/genspectrum/lapis/model/SiloFilterExpressionMapperTest.kt index 29a8b635..156d0d08 100644 --- a/lapis2/src/test/kotlin/org/genspectrum/lapis/model/SiloFilterExpressionMapperTest.kt +++ b/lapis2/src/test/kotlin/org/genspectrum/lapis/model/SiloFilterExpressionMapperTest.kt @@ -66,7 +66,7 @@ class SiloFilterExpressionMapperTest { val result = underTest.map(filterParameter) val expected = - and(or(PangoLineageEquals(FIELD_WITH_UPPERCASE_LETTER, SOME_VALUE, includeSublineages = false))) + And(Or(PangoLineageEquals(FIELD_WITH_UPPERCASE_LETTER, SOME_VALUE, includeSublineages = false))) assertThat(result, equalTo(expected)) } @@ -77,7 +77,7 @@ class SiloFilterExpressionMapperTest { val result = underTest.map(filterParameter) val expected = - and(or(PangoLineageEquals(FIELD_WITH_UPPERCASE_LETTER, SOME_VALUE, includeSublineages = false))) + And(Or(PangoLineageEquals(FIELD_WITH_UPPERCASE_LETTER, SOME_VALUE, includeSublineages = false))) assertThat(result, equalTo(expected)) } @@ -286,7 +286,7 @@ class SiloFilterExpressionMapperTest { val result = underTest.map(filterParameter) val expected = - and(NucleotideSymbolEquals(null, 123, "B"), NucleotideSymbolEquals("sequenceName", 999, "A")) + And(NucleotideSymbolEquals(null, 123, "B"), NucleotideSymbolEquals("sequenceName", 999, "A")) assertThat(result, equalTo(expected)) } @@ -303,7 +303,7 @@ class SiloFilterExpressionMapperTest { val result = underTest.map(filterParameter) val expected = - and(HasNucleotideMutation(null, 123), HasNucleotideMutation("sequenceName", 999)) + And(HasNucleotideMutation(null, 123), 
HasNucleotideMutation("sequenceName", 999)) assertThat(result, equalTo(expected)) } @@ -320,7 +320,7 @@ class SiloFilterExpressionMapperTest { val result = underTest.map(filterParameter) val expected = - and(AminoAcidSymbolEquals("geneName1", 123, "B"), AminoAcidSymbolEquals("geneName2", 999, "A")) + And(AminoAcidSymbolEquals("geneName1", 123, "B"), AminoAcidSymbolEquals("geneName2", 999, "A")) assertThat(result, equalTo(expected)) } @@ -337,7 +337,7 @@ class SiloFilterExpressionMapperTest { val result = underTest.map(filterParameter) val expected = - and(HasAminoAcidMutation("geneName1", 123), HasAminoAcidMutation("geneName2", 999)) + And(HasAminoAcidMutation("geneName1", 123), HasAminoAcidMutation("geneName2", 999)) assertThat(result, equalTo(expected)) } @@ -354,7 +354,7 @@ class SiloFilterExpressionMapperTest { val result = underTest.map(filterParameter) val expected = - and(NucleotideInsertionContains(123, "ABCD"), NucleotideInsertionContains(999, "DEF")) + And(NucleotideInsertionContains(123, "ABCD"), NucleotideInsertionContains(999, "DEF")) assertThat(result, equalTo(expected)) } @@ -371,7 +371,7 @@ class SiloFilterExpressionMapperTest { val result = underTest.map(filterParameter) val expected = - and(AminoAcidInsertionContains(123, "ABCD", "gene"), AminoAcidInsertionContains(999, "DEF", "ORF1")) + And(AminoAcidInsertionContains(123, "ABCD", "gene"), AminoAcidInsertionContains(999, "DEF", "ORF1")) assertThat(result, equalTo(expected)) } @@ -496,22 +496,22 @@ class SiloFilterExpressionMapperTest { "some_metadata" to listOf("ABC"), "other_metadata" to listOf("def"), ), - and( - or(StringEquals("some_metadata", "ABC")), - or(StringEquals("other_metadata", "def")), + And( + Or(StringEquals("some_metadata", "ABC")), + Or(StringEquals("other_metadata", "def")), ), ), Arguments.of( mapOf("pangoLineage" to listOf("A.1.2.3")), - and(or(PangoLineageEquals("pangoLineage", "A.1.2.3", includeSublineages = false))), + And(Or(PangoLineageEquals("pangoLineage", "A.1.2.3", 
includeSublineages = false))), ), Arguments.of( mapOf("pangoLineage" to listOf("A.1.2.3*")), - and(or(PangoLineageEquals("pangoLineage", "A.1.2.3", includeSublineages = true))), + And(Or(PangoLineageEquals("pangoLineage", "A.1.2.3", includeSublineages = true))), ), Arguments.of( mapOf("pangoLineage" to listOf("A.1.2.3.*")), - and(or(PangoLineageEquals("pangoLineage", "A.1.2.3", includeSublineages = true))), + And(Or(PangoLineageEquals("pangoLineage", "A.1.2.3", includeSublineages = true))), ), Arguments.of( mapOf( @@ -521,9 +521,9 @@ class SiloFilterExpressionMapperTest { ), And( listOf( - or(PangoLineageEquals("pangoLineage", "A.1.2.3", includeSublineages = false)), - or(StringEquals("some_metadata", "ABC")), - or(StringEquals("other_metadata", "DEF")), + Or(PangoLineageEquals("pangoLineage", "A.1.2.3", includeSublineages = false)), + Or(StringEquals("some_metadata", "ABC")), + Or(StringEquals("other_metadata", "DEF")), ), ), ), @@ -531,43 +531,43 @@ class SiloFilterExpressionMapperTest { mapOf( "date" to listOf("2021-06-03"), ), - and(DateBetween("date", from = LocalDate.of(2021, 6, 3), to = LocalDate.of(2021, 6, 3))), + And(DateBetween("date", from = LocalDate.of(2021, 6, 3), to = LocalDate.of(2021, 6, 3))), ), Arguments.of( mapOf( "dateTo" to listOf("2021-06-03"), ), - and(DateBetween("date", from = null, to = LocalDate.of(2021, 6, 3))), + And(DateBetween("date", from = null, to = LocalDate.of(2021, 6, 3))), ), Arguments.of( mapOf( "dateFrom" to listOf("2021-03-28"), ), - and(DateBetween("date", from = LocalDate.of(2021, 3, 28), to = null)), + And(DateBetween("date", from = LocalDate.of(2021, 3, 28), to = null)), ), Arguments.of( mapOf( "dateFrom" to listOf("2021-03-28"), "dateTo" to listOf("2021-06-03"), ), - and(DateBetween("date", from = LocalDate.of(2021, 3, 28), to = LocalDate.of(2021, 6, 3))), + And(DateBetween("date", from = LocalDate.of(2021, 3, 28), to = LocalDate.of(2021, 6, 3))), ), Arguments.of( mapOf( "dateTo" to listOf("2021-06-03"), 
"some_metadata" to listOf("ABC"), ), - and( + And( DateBetween("date", from = null, to = LocalDate.of(2021, 6, 3)), - or(StringEquals("some_metadata", "ABC")), + Or(StringEquals("some_metadata", "ABC")), ), ), Arguments.of( mapOf( "variantQuery" to listOf("300G & 400A"), ), - and( - and( + And( + And( NucleotideSymbolEquals(null, 300, "G"), NucleotideSymbolEquals(null, 400, "A"), ), @@ -578,53 +578,53 @@ class SiloFilterExpressionMapperTest { "variantQuery" to listOf("300G"), "some_metadata" to listOf("ABC"), ), - and( + And( NucleotideSymbolEquals(null, 300, "G"), - or(StringEquals("some_metadata", "ABC")), + Or(StringEquals("some_metadata", "ABC")), ), ), Arguments.of( mapOf( "intField" to listOf("42"), ), - and(IntEquals("intField", 42)), + And(IntEquals("intField", 42)), ), Arguments.of( mapOf( "intFieldFrom" to listOf("42"), ), - and(IntBetween("intField", 42, null)), + And(IntBetween("intField", 42, null)), ), Arguments.of( mapOf( "intFieldTo" to listOf("42"), ), - and(IntBetween("intField", null, 42)), + And(IntBetween("intField", null, 42)), ), Arguments.of( mapOf( "floatField" to listOf("42.45"), ), - and(FloatEquals("floatField", 42.45)), + And(FloatEquals("floatField", 42.45)), ), Arguments.of( mapOf( "floatFieldFrom" to listOf("42.45"), ), - and(FloatBetween("floatField", 42.45, null)), + And(FloatBetween("floatField", 42.45, null)), ), Arguments.of( mapOf( "floatFieldTo" to listOf("42.45"), ), - and(FloatBetween("floatField", null, 42.45)), + And(FloatBetween("floatField", null, 42.45)), ), Arguments.of( mapOf( "some_metadata" to listOf("value1", "value2"), ), - and( - or( + And( + Or( StringEquals("some_metadata", "value1"), StringEquals("some_metadata", "value2"), ), @@ -634,8 +634,8 @@ class SiloFilterExpressionMapperTest { mapOf( "pangoLineage" to listOf("A.1.2.3", "B.1.2.3"), ), - and( - or( + And( + Or( PangoLineageEquals("pangoLineage", "A.1.2.3", includeSublineages = false), PangoLineageEquals("pangoLineage", "B.1.2.3", includeSublineages = 
false), ), @@ -660,7 +660,3 @@ class SiloFilterExpressionMapperTest { override val offset: Int? = null, ) : CommonSequenceFilters } - -private fun and(vararg expressions: SiloFilterExpression) = And(expressions.toList()) - -private fun or(vararg expressions: SiloFilterExpression) = Or(expressions.toList()) diff --git a/lapis2/src/test/kotlin/org/genspectrum/lapis/model/VariantQueryFacadeTest.kt b/lapis2/src/test/kotlin/org/genspectrum/lapis/model/VariantQueryFacadeTest.kt index cc74bf9b..77b3c35a 100644 --- a/lapis2/src/test/kotlin/org/genspectrum/lapis/model/VariantQueryFacadeTest.kt +++ b/lapis2/src/test/kotlin/org/genspectrum/lapis/model/VariantQueryFacadeTest.kt @@ -39,50 +39,36 @@ class VariantQueryFacadeTest { val expectedResult = And( - listOf( + And( And( - listOf( + And( And( - listOf( - And( - listOf( - And( - listOf( - NucleotideSymbolEquals(null, 300, "G"), - Or( - listOf( - NucleotideSymbolEquals(null, 400, "-"), - NucleotideSymbolEquals(null, 500, "B"), - ), - ), - ), - ), - Not(HasNucleotideMutation(null, 600)), - ), - ), - Maybe( - Or( - listOf( - NucleotideSymbolEquals(null, 700, "B"), - NucleotideSymbolEquals(null, 800, "-"), - ), - ), - ), + NucleotideSymbolEquals(null, 300, "G"), + Or( + NucleotideSymbolEquals(null, 400, "-"), + NucleotideSymbolEquals(null, 500, "B"), ), ), - NOf( - 3, - matchExactly = false, - listOf( - NucleotideSymbolEquals(null, 123, "A"), - NucleotideSymbolEquals(null, 234, "T"), - NucleotideSymbolEquals(null, 345, "G"), - ), + Not(HasNucleotideMutation(null, 600)), + ), + Maybe( + Or( + NucleotideSymbolEquals(null, 700, "B"), + NucleotideSymbolEquals(null, 800, "-"), ), ), ), - PangoLineageEquals(PANGO_LINEAGE_COLUMN, "A.1.2.3", true), + NOf( + 3, + matchExactly = false, + listOf( + NucleotideSymbolEquals(null, 123, "A"), + NucleotideSymbolEquals(null, 234, "T"), + NucleotideSymbolEquals(null, 345, "G"), + ), + ), ), + PangoLineageEquals(PANGO_LINEAGE_COLUMN, "A.1.2.3", true), ) assertThat(result, equalTo(expectedResult)) 
@@ -114,10 +100,8 @@ class VariantQueryFacadeTest { val result = underTest.map(variantQuery) val expectedResult = And( - listOf( - NucleotideSymbolEquals(null, 300, "G"), - NucleotideSymbolEquals(null, 400, "-"), - ), + NucleotideSymbolEquals(null, 300, "G"), + NucleotideSymbolEquals(null, 400, "-"), ) assertThat(result, equalTo(expectedResult)) } @@ -129,15 +113,11 @@ class VariantQueryFacadeTest { val result = underTest.map(variantQuery) val expectedResult = And( - listOf( - And( - listOf( - NucleotideSymbolEquals(null, 300, "G"), - NucleotideSymbolEquals(null, 400, "-"), - ), - ), - NucleotideSymbolEquals(null, 500, "B"), + And( + NucleotideSymbolEquals(null, 300, "G"), + NucleotideSymbolEquals(null, 400, "-"), ), + NucleotideSymbolEquals(null, 500, "B"), ) assertThat(result, equalTo(expectedResult)) } @@ -159,10 +139,8 @@ class VariantQueryFacadeTest { val result = underTest.map(variantQuery) val expectedResult = Or( - listOf( - NucleotideSymbolEquals(null, 300, "G"), - NucleotideSymbolEquals(null, 400, "-"), - ), + NucleotideSymbolEquals(null, 300, "G"), + NucleotideSymbolEquals(null, 400, "-"), ) assertThat(result, equalTo(expectedResult)) } @@ -174,14 +152,10 @@ class VariantQueryFacadeTest { val result = underTest.map(variantQuery) val expectedResult = And( - listOf( - NucleotideSymbolEquals(null, 300, "C"), - Or( - listOf( - NucleotideSymbolEquals(null, 400, "A"), - NucleotideSymbolEquals(null, 500, "G"), - ), - ), + NucleotideSymbolEquals(null, 300, "C"), + Or( + NucleotideSymbolEquals(null, 400, "A"), + NucleotideSymbolEquals(null, 500, "G"), ), ) assertThat(result, equalTo(expectedResult))