-
-
Notifications
You must be signed in to change notification settings - Fork 21
Readers
sacha schutz edited this page Sep 3, 2019
·
8 revisions
Cutevariant imports variant data through an abstract class called AbstractReader. You can inherit from it to support a custom file format that contains variants.
Three methods must be overridden:
get_fields: Yield fields as dictionaries with the following structure:
- name (str): the field name
- category (str): the table the field belongs to. It can be one of: variants, annotations, samples. #TODO rename
- description (str): The definition of the field
- type (str): Field type in Python format (str, int, float, bool)
- constraint (str:optional): SQL constraints
{
"name": "chr",
"category": "variants",
"description": "chromosom",
"type": "str",
"constraint": "NOT NULL",
}
get_variants: Yield variants as dictionaries with the following structure:
- chr (str): chromosome name
- pos (int): position
- ref (str): reference base
- alt (str): alternative base
- field n (type): any additional fields returned by get_fields with category variants
-
annotations (list):
- transcript (str): Transcript name
- gene (str): gene name
- field n (type): any additional fields returned by get_fields with category annotations
-
samples (list):
- name (str): name of sample
- gt (int): Genotype of the variant for a sample (0: homozygous wild type, 1: heterozygous, 2: homozygous mutant, -1: unknown)
{
"chr": "11",
"pos": 125010,
"ref": "T",
"alt": "A",
"dp": 32,
"annotations": [
{"transcript": "NM_234234", "gene": "CFTR", "in_exon": true, "pathogen_score": 0.2},
{"transcript": "NM_234235", "gene": "CFTR","in_exon": false, "pathogen_score": 0.5},
],
"samples": [{"name": "sacha", "gt": 1, "af": 0.4}]
}
get_samples: Return a list of sample names. If you have no samples, you do not need to override this method.
You can use the FakeReader below as a starting point:
from .abstractreader import AbstractReader
class FakeReader(AbstractReader):
    """Example reader that serves hard-coded variants, fields and samples.

    It overrides ``get_variants``, ``get_fields`` and ``get_samples`` with
    fixed data and can be used as a template for a custom reader.
    """

    def __init__(self):
        # The parent initializer receives None as its only argument.
        super().__init__(None)

    def get_variants(self):
        """Yield three example variants that differ only by chromosome.

        Every variant is a dict with the keys ``chr``, ``pos``, ``ref``,
        ``alt``, ``annotations`` (two transcript/gene entries) and
        ``samples`` (a single sample entry with its genotype).
        """
        for chromosome in ("11", "12", "13"):
            # A fresh dict (and fresh nested lists) is built per variant so
            # consumers never share mutable state between yielded items.
            yield {
                "chr": chromosome,
                "pos": 125010,
                "ref": "T",
                "alt": "A",
                "annotations": [
                    {"transcript": transcript, "gene": "CFTR"}
                    for transcript in ("NM_234234", "NM_234235")
                ],
                "samples": [{"name": "sacha", "gt": 1}],
            }

    def get_fields(self):
        """Yield the field descriptions exposed by this reader.

        .. note:: The four ``variants`` fields (chr, pos, ref, alt) carry
            the constraint NOT NULL. The remaining fields are yielded
            without a ``constraint`` key.
        """
        # (name, description, type) of the NOT NULL fields in "variants".
        constrained_fields = (
            ("chr", "chromosom", "str"),
            ("pos", "position", "int"),
            ("ref", "reference base", "str"),
            ("alt", "alternative base", "str"),
        )
        for field_name, description, field_type in constrained_fields:
            yield {
                "name": field_name,
                "category": "variants",
                "description": description,
                "type": field_type,
                "constraint": "NOT NULL",
            }

        # (name, category, description, type) of the unconstrained fields.
        unconstrained_fields = (
            ("gt", "samples", "genotype", "int"),
            ("af", "samples", "allele frequency", "float"),
            ("gene", "annotations", "gene name", "str"),
            ("transcript", "annotations", "gene transcripts", "str"),
        )
        for field_name, category, description, field_type in unconstrained_fields:
            yield {
                "name": field_name,
                "category": category,
                "description": description,
                "type": field_type,
            }

    def get_samples(self):
        """Return the list of sample names, here the single sample "sacha"."""
        sample_names = ["sacha"]
        return sample_names