From d2bb3e8d078f9eaf09f2eeb88efdf465f7ddb187 Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Mon, 13 Oct 2014 15:22:35 -0700 Subject: [PATCH] Adding sequence records, and cleaning up reference records. --- src/main/resources/avro/referencemethods.avdl | 63 --------- src/main/resources/avro/references.avdl | 44 +++--- src/main/resources/avro/sequencemethods.avdl | 133 ++++++++++++++++++ src/main/resources/avro/sequences.avdl | 61 ++++++++ 4 files changed, 214 insertions(+), 87 deletions(-) create mode 100644 src/main/resources/avro/sequencemethods.avdl create mode 100644 src/main/resources/avro/sequences.avdl diff --git a/src/main/resources/avro/referencemethods.avdl b/src/main/resources/avro/referencemethods.avdl index 33c4c446..3d495c20 100644 --- a/src/main/resources/avro/referencemethods.avdl +++ b/src/main/resources/avro/referencemethods.avdl @@ -161,67 +161,4 @@ GAReference getReference( */ string id) throws GAException; -/**************** /references/{id}/bases *******************/ -/** -The query parameters for a request to `GET /references/{id}/bases`, for -example: - -`GET /references/{id}/bases?start=100&end=200` -*/ -record GAListReferenceBasesRequest { - /** - The start position (0-based) of this query. Defaults to 0. - Genomic positions are non-negative integers less than reference length. - Requests spanning the join of circular genomes are represented as - two requests one on each side of the join (position 0). - */ - long start = 0; - - /** - The end position (0-based, exclusive) of this query. Defaults - to the length of this `GAReference`. - */ - union { null, long } end = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; -} - -/** The response from `GET /references/{id}/bases` expressed as JSON. 
*/ -record GAListReferenceBasesResponse { - /** - The offset position (0-based) of the given `sequence` from the start of this - `GAReference`. This value will differ for each page in a paginated request. - */ - long offset = 0; - - /** - A substring of the bases that make up this reference. Bases are represented - as IUPAC-IUB codes; this string matches the regexp `[ACGTMRWSYKVHDBN]*`. - */ - string sequence; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; -} - -/** -Lists `GAReference` bases by ID and optional range. -`GET /references/{id}/bases` will return a JSON version of -`GAListReferenceBasesResponse`. -*/ -GAListReferenceBasesResponse getReferenceBases( - /** The ID of the `GAReference`. */ - string id, - /** Additional request parameters to restrict the query. */ - GAListReferenceBasesRequest request) throws GAException; - } diff --git a/src/main/resources/avro/references.avdl b/src/main/resources/avro/references.avdl index 96d69b0b..544ee72a 100644 --- a/src/main/resources/avro/references.avdl +++ b/src/main/resources/avro/references.avdl @@ -11,9 +11,15 @@ record GAReference { /** The reference ID. Unique within the repository. */ string id; + /** The ID of the reference set that contains this reference contig. */ + string referenceSetId; + /** The length of this reference's sequence. */ long length; + /** The ID of the assembled sequence of this contig. */ + string sequenceId; + /** MD5 of the upper-case sequence excluding all whitespace characters (this is equivalent to SQ:M5 in SAM). @@ -21,8 +27,7 @@ record GAReference { string md5checksum; /** - The name of this reference. (e.g. '22') Also see the - `names` field on the parent `GAReferenceSet`. + The name of this reference. (e.g. '22'). 
*/ string name; @@ -44,17 +49,19 @@ record GAReference { is sufficiently small. Two sequences derived from the same official sequence share the same coordinates and annotations, and can be replaced with the official sequence for certain use cases. + + This field contains the IDs of the references that this reference is derived + from. If this reference sequence is not derived from any other reference sequences, + this array should be empty. */ - boolean isDerived = false; + array<string> derivedFrom = []; /** The `sourceDivergence` is the fraction of non-indel bases that do not match the - reference this record was derived from. + reference this record was derived from. This array should have the same length + as the derivedFrom array. */ - union { null, float } sourceDivergence = null; - - /** ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human). */ - union { null, int } ncbiTaxonId = null; + array<float> sourceDivergence = []; } /** @@ -66,17 +73,6 @@ record GAReferenceSet { /** The reference set ID. Unique in the repository. */ string id; - /** The IDs of the `GAReference` objects that are part of this set. */ - array<string> referenceIds = []; - - /** - Order-independent MD5 checksum which identifies this `GAReferenceSet`. The - checksum is computed by sorting all `reference.md5checksum` (for all - `reference` in this set) in ascending lexicographic order, concatenating, - and taking the MD5 of that value. - */ - string md5checksum; - /** ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human) indicating the species which this assembly is intended to model. Note that contained @@ -101,14 +97,14 @@ record GAReferenceSet { All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) ideally with a version number, e.g. `NC_000001.11`. 
*/ - array<string> sourceAccessions; + array<string> sourceAccessions = []; /** - A reference set may be derived from a source if it contains - additional sequences, or some of the sequences within it are derived - (see the definition of `isDerived` in `GAReference`). + The IDs of the reference sets that this reference set is derived from. + If this reference set is not derived from another reference set, this + field should be empty. */ - boolean isDerived = false; + array<string> derivedFrom = []; } } diff --git a/src/main/resources/avro/sequencemethods.avdl b/src/main/resources/avro/sequencemethods.avdl new file mode 100644 index 00000000..b1226f79 --- /dev/null +++ b/src/main/resources/avro/sequencemethods.avdl @@ -0,0 +1,133 @@ +@namespace("org.ga4gh") +protocol GASequenceMethods { + +import idl "common.avdl"; +import idl "sequences.avdl"; + +/**************** /sequences/search *******************/ +/** +This request maps to the body of `POST /sequences/search` +as JSON. +*/ +record GASearchSequencesRequest { + /** If present, the names of the sequences to search for. */ + array<string> names = []; + + /** + The continuation token, which is used to page through large result sets. + To get the next page of results, set this parameter to the value of + `nextPageToken` from the previous response. + */ + union { null, string } pageToken = null; +} + +/** +This is the response from `POST /sequences/search` expressed as JSON. +*/ +record GASearchSequencesResponse { + /** The list of matching sequences. */ + array<GASequence> sequences = []; + + /** + The continuation token, which is used to page through large result sets. + Provide this value in a subsequent request to return the next page of + results. This field will be empty if there aren't any additional results. + */ + union { null, string } nextPageToken = null; +} + +/** +Gets a list of `GASequence` matching the search criteria. 
+ +`POST /sequences/search` must accept a JSON version of +`GASearchSequencesRequest` as the post body and will return a JSON +version of `GASearchSequencesResponse`. +*/ +GASearchSequencesResponse searchSequences( + /** + This request maps to the body of `POST /sequences/search` as JSON. + */ + GASearchSequencesRequest request) throws GAException; + +/**************** /sequences/{id} *******************/ +/** +Gets a `GASequence` by ID. `GET /sequences/{id}` will return a +JSON version of `GASequence`. +*/ +GASequence getSequence( + /** + The ID of the `GASequence`. + */ + string id) throws GAException; + +/**************** /sequencefragments/search *******************/ +/** +This request maps to the body of `POST /sequencefragments/search` +as JSON. +*/ +record GASearchSequenceFragmentsRequest { + /** If present, the IDs of the sequences to search for. */ + array<string> sequenceIds = []; + + /** + The start position (0-based) of this query. Defaults to 0. + Genomic positions are non-negative integers less than sequence length. + Requests spanning the join of circular genomes are represented as + two requests one on each side of the join (position 0). + */ + long start = 0; + + /** + The end position (0-based, exclusive) of this query. Defaults + to the length of the queried `GASequence`. + */ + union { null, long } end = null; + + /** + The continuation token, which is used to page through large result sets. + To get the next page of results, set this parameter to the value of + `nextPageToken` from the previous response. + */ + union { null, string } pageToken = null; +} + +/** +This is the response from `POST /sequencefragments/search` expressed as JSON. +*/ +record GASearchSequenceFragmentsResponse { + /** The list of matching sequence fragments. */ + array<GASequenceFragment> sequenceFragments = []; + + /** + The continuation token, which is used to page through large result sets. + Provide this value in a subsequent request to return the next page of + results. 
This field will be empty if there aren't any additional results. + */ + union { null, string } nextPageToken = null; +} + +/** +Gets a list of `GASequenceFragment` matching the search criteria. + +`POST /sequencefragments/search` must accept a JSON version of +`GASearchSequenceFragmentsRequest` as the post body and will return a JSON +version of `GASearchSequenceFragmentsResponse`. +*/ +GASearchSequenceFragmentsResponse searchSequenceFragments( + /** + This request maps to the body of `POST /sequencefragments/search` as JSON. + */ + GASearchSequenceFragmentsRequest request) throws GAException; + +/**************** /sequencefragments/{id} *******************/ +/** +Gets a `GASequenceFragment` by ID. `GET /sequencefragments/{id}` will return a +JSON version of `GASequenceFragment`. +*/ +GASequenceFragment getSequenceFragment( + /** + The ID of the `GASequenceFragment`. + */ + string id) throws GAException; + +} \ No newline at end of file diff --git a/src/main/resources/avro/sequences.avdl b/src/main/resources/avro/sequences.avdl new file mode 100644 index 00000000..5fafea8d --- /dev/null +++ b/src/main/resources/avro/sequences.avdl @@ -0,0 +1,61 @@ +@namespace("org.ga4gh") + +protocol GASequences { + +/** +A `GASequence` is a contiguous sequence of bases, which is made up of a +set of `GASequenceFragment` records. A `GASequence` could correspond to a +reference contig, bases from a de novo assembly, etc. +*/ +record GASequence { + + /** The sequence ID. Unique within the repository. */ + string id; + + /** The name of the sequence. */ + union { null, string } name = null; + + /** A description of the sequence. */ + union { null, string } description = null; + + /** + The IDs of the data sources from which this sequence is generated. 
+ This can include IDs for any combination of the following (and more): + + * A GAReadSet (if the sequence was de novo assembled from reads) + * A GAReference (if the sequence was taken from a reference assembly) + * A GAVariantSet (if variants were spliced into a reference assembly) + */ + array<string> sourceIds = []; + + /** + The length of this sequence. Sequences are assumed to start at 0. + */ + long length; +} + +/** +A `GASequenceFragment` represents a subsection of a sequence. +*/ +record GASequenceFragment { + + /** The ID of this sequence fragment. Unique within the repository. */ + string id; + + /** + The bases in this fragment. Bases are represented as IUPAC-IUB codes; + this string matches the regexp `[ACGTMRWSYKVHDBN]*`. + */ + string bases; + + /** + The start position (0-based) of this fragment within the sequence. The fragment's + end position (exclusive) will be `start + len(bases)`. + */ + long start; + + /** The ID of the sequence which contains this fragment. */ + string sequenceId; +} + +} \ No newline at end of file