Skip to content
This repository has been archived by the owner on Oct 28, 2022. It is now read-only.

Adding sequence records, and cleaning up reference records. #162

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 0 additions & 63 deletions src/main/resources/avro/referencemethods.avdl
Original file line number Diff line number Diff line change
Expand Up @@ -161,67 +161,4 @@ GAReference getReference(
*/
string id) throws GAException;

/**************** /references/{id}/bases *******************/
/**
The query parameters for a request to `GET /references/{id}/bases`, for
example:

`GET /references/{id}/bases?start=100&end=200`

Every field in this record declares a default value.
*/
record GAListReferenceBasesRequest {
/**
The start position (0-based) of this query. Defaults to 0.
Genomic positions are non-negative integers less than the reference length.
Requests spanning the join of circular genomes are represented as
two requests, one on each side of the join (position 0).
*/
long start = 0;

/**
The end position (0-based, exclusive) of this query. Optional: when left
null (the default), it is taken to be the length of this `GAReference`.
*/
union { null, long } end = null;

/**
The continuation token, which is used to page through large result sets.
To get the next page of results, set this parameter to the value of
`nextPageToken` from the previous response. Defaults to null.
*/
union { null, string } pageToken = null;
}

/**
The response from `GET /references/{id}/bases` expressed as JSON: one page
of bases, where that page starts, and the token for the following page.
*/
record GAListReferenceBasesResponse {
/**
The offset position (0-based) of the given `sequence` from the start of this
`GAReference`. This value will differ for each page in a paginated request.
Defaults to 0.
*/
long offset = 0;

/**
A substring of the bases that make up this reference. Bases are represented
as IUPAC-IUB codes; this string matches the regexp `[ACGTMRWSYKVHDBN]*`.
This field declares no default.
*/
string sequence;

/**
The continuation token, which is used to page through large result sets.
Provide this value in a subsequent request to return the next page of
results. Defaults to null; this field will be empty if there aren't any
additional results.
*/
union { null, string } nextPageToken = null;
}

/**
Lists `GAReference` bases by ID and optional range.
`GET /references/{id}/bases` will return a JSON version of
`GAListReferenceBasesResponse`. Declared to throw `GAException`.
*/
GAListReferenceBasesResponse getReferenceBases(
/** The ID of the `GAReference` whose bases are requested. */
string id,
/**
Additional request parameters (`start`, `end`, `pageToken`) to restrict
the query.
*/
GAListReferenceBasesRequest request) throws GAException;

}
44 changes: 20 additions & 24 deletions src/main/resources/avro/references.avdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,23 @@ record GAReference {
/** The reference ID. Unique within the repository. */
string id;

/** The ID of the reference set that contains this reference contig. */
string referenceSetId;

/** The length of this reference's sequence. */
long length;

/** The ID of the assembled sequence of this contig. */
string sequenceId;

/**
MD5 of the upper-case sequence excluding all whitespace characters
(this is equivalent to SQ:M5 in SAM).
*/
string md5checksum;

/**
The name of this reference. (e.g. '22') Also see the
`names` field on the parent `GAReferenceSet`.
The name of this reference (e.g. '22').
*/
string name;

Expand All @@ -44,17 +49,19 @@ record GAReference {
is sufficiently small. Two sequences derived from the same official
sequence share the same coordinates and annotations, and
can be replaced with the official sequence for certain use cases.

This field contains the IDs of the references that this reference is derived
from. If this reference sequence is not derived from any other reference sequences,
this array should be empty.
*/
boolean isDerived = false;
array<string> derivedFrom = [];

/**
The `sourceDivergence` is the fraction of non-indel bases that do not match the
reference this record was derived from.
reference this record was derived from. This array should have the same length
as the derivedFrom array.
*/
union { null, float } sourceDivergence = null;

/** ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human). */
union { null, int } ncbiTaxonId = null;
array<float> sourceDivergence = [];
}

/**
Expand All @@ -66,17 +73,6 @@ record GAReferenceSet {
/** The reference set ID. Unique in the repository. */
string id;

/** The IDs of the `GAReference` objects that are part of this set. */
array<string> referenceIds = [];

/**
Order-independent MD5 checksum which identifies this `GAReferenceSet`. The
checksum is computed by sorting all `reference.md5checksum` (for all
`reference` in this set) in ascending lexicographic order, concatenating,
and taking the MD5 of that value.
*/
string md5checksum;

/**
ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human) indicating
the species which this assembly is intended to model. Note that contained
Expand All @@ -101,14 +97,14 @@ record GAReferenceSet {
All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) ideally
with a version number, e.g. `NC_000001.11`.
*/
array<string> sourceAccessions;
array<string> sourceAccessions = [];

/**
A reference set may be derived from a source if it contains
additional sequences, or some of the sequences within it are derived
(see the definition of `isDerived` in `GAReference`).
The IDs of the reference sets that this reference set is derived from.
If this reference set is not derived from another reference set, this
field should be empty.
*/
boolean isDerived = false;
array<string> derivedFrom = [];
}

}
133 changes: 133 additions & 0 deletions src/main/resources/avro/sequencemethods.avdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// NOTE(review): this protocol is named `GAReferenceMethods`, yet every
// record and method declared here operates on sequences and sequence
// fragments. The name looks carried over from the reference methods and may
// clash with that protocol in the `org.ga4gh` namespace. Presumably it
// should be `GASequenceMethods` -- confirm before renaming.
/**
Search and lookup methods for `GASequence` and `GASequenceFragment` records,
covering `POST /sequences/search`, `GET /sequences/{id}`,
`POST /sequencefragments/search` and `GET /sequencefragments/{id}`.
*/
@namespace("org.ga4gh")
protocol GAReferenceMethods {

import idl "common.avdl";
import idl "sequences.avdl";

/**************** /sequences/search *******************/
/**
This request maps to the body of `POST /sequences/search`
as JSON. Every field declares a default value.
*/
record GASearchSequencesRequest {
/**
If present, the names of the sequences to search for.
Defaults to an empty array.
*/
array<string> names = [];

/**
The continuation token, which is used to page through large result sets.
To get the next page of results, set this parameter to the value of
`nextPageToken` from the previous response. Defaults to null.
*/
union { null, string } pageToken = null;
}

/**
This is the response from `POST /sequences/search` expressed as JSON.
*/
record GASearchSequencesResponse {
/** The list of matching sequences. Defaults to an empty array. */
array<GASequence> sequences = [];

/**
The continuation token, which is used to page through large result sets.
Provide this value in a subsequent request to return the next page of
results. Defaults to null; this field will be empty if there aren't any
additional results.
*/
union { null, string } nextPageToken = null;
}

/**
Gets a list of `GASequence` matching the search criteria.

`POST /sequences/search` must accept a JSON version of
`GASearchSequencesRequest` as the post body and will return a JSON
version of `GASearchSequencesResponse`. Declared to throw `GAException`.
*/
GASearchSequencesResponse searchSequences(
/**
This request maps to the body of `POST /sequences/search` as JSON.
*/
GASearchSequencesRequest request) throws GAException;

/**************** /sequences/{id} *******************/
/**
Gets a `GASequence` by ID. `GET /sequences/{id}` will return a
JSON version of `GASequence`. Declared to throw `GAException`.
*/
GASequence getSequence(
/**
The ID of the `GASequence`.
*/
string id) throws GAException;

/**************** /sequencefragments/search *******************/
/**
This request maps to the body of `POST /sequencefragments/search`
as JSON. Every field declares a default value.
*/
record GASearchSequenceFragmentsRequest {
/**
If present, the IDs of the sequences whose fragments to search for.
Defaults to an empty array.
*/
array<string> sequenceIds = [];

/**
The start position (0-based) of this query. Defaults to 0.
Positions are non-negative integers less than the sequence length.
Requests spanning the join of circular genomes are represented as
two requests, one on each side of the join (position 0).
*/
long start = 0;

// NOTE(review): the text below originally read "the length of this
// `GAReference`", apparently carried over from the reference API. With
// several `sequenceIds` the default presumably applies per sequence -- confirm.
/**
The end position (0-based, exclusive) of this query. Optional: when left
null (the default), it is taken to be the length of the queried sequence.
*/
union { null, long } end = null;

/**
The continuation token, which is used to page through large result sets.
To get the next page of results, set this parameter to the value of
`nextPageToken` from the previous response. Defaults to null.
*/
union { null, string } pageToken = null;
}

/**
This is the response from `POST /sequencefragments/search` expressed as JSON.
*/
record GASearchSequenceFragmentsResponse {
/** The list of matching sequence fragments. Defaults to an empty array. */
array<GASequenceFragment> sequenceFragments = [];

/**
The continuation token, which is used to page through large result sets.
Provide this value in a subsequent request to return the next page of
results. Defaults to null; this field will be empty if there aren't any
additional results.
*/
union { null, string } nextPageToken = null;
}

/**
Gets a list of `GASequenceFragment` matching the search criteria.

`POST /sequencefragments/search` must accept a JSON version of
`GASearchSequenceFragmentsRequest` as the post body and will return a JSON
version of `GASearchSequenceFragmentsResponse`. Declared to throw
`GAException`.
*/
GASearchSequenceFragmentsResponse searchSequenceFragments(
/**
This request maps to the body of `POST /sequencefragments/search` as JSON.
*/
GASearchSequenceFragmentsRequest request) throws GAException;

/**************** /sequencefragments/{id} *******************/
/**
Gets a `GASequenceFragment` by ID. `GET /sequencefragments/{id}` will return a
JSON version of `GASequenceFragment`. Declared to throw `GAException`.
*/
GASequenceFragment getSequenceFragment(
/**
The ID of the `GASequenceFragment`.
*/
string id) throws GAException;

}
61 changes: 61 additions & 0 deletions src/main/resources/avro/sequences.avdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/**
Data model for sequences: the `GASequence` record and the
`GASequenceFragment` record that carries a sequence's bases.
*/
@namespace("org.ga4gh")

protocol GASequences {

/**
A `GASequence` is a contiguous sequence of bases, which is made up of a
set of `GASequenceFragment` records. A `GASequence` could correspond to a
reference contig, bases from a de novo assembly, etc.
*/
record GASequence {

/** The sequence ID. Unique within the repository. */
string id;

/** The name of the sequence. Optional; defaults to null. */
union { null, string } name = null;

/** A description of the sequence. Optional; defaults to null. */
union { null, string } description = null;

/**
The IDs of the data sources from which this sequence is generated.
This can include IDs for any combination of the following (and more):

* A GAReadSet (if the sequence was de novo assembled from reads)
* A GAReference (if the sequence was taken from a reference assembly)
* A GAVariantSet (if variants were spliced in to a reference assembly)

Defaults to an empty array.
*/
array<string> sourceIds = [];

/**
The length of this sequence. Sequences are assumed to start at 0.
This field declares no default.
*/
long length;
}

/**
A `GASequenceFragment` represents a subsection of a sequence.
None of its fields declares a default.
*/
record GASequenceFragment {

/** The ID of this sequence fragment. Unique within the repository. */
string id;

/**
The bases in this fragment. Bases are represented as IUPAC-IUB codes;
this string matches the regexp `[ACGTMRWSYKVHDBN]*`.
*/
string bases;

// NOTE(review): presumably `start` is 0-based and the computed end is
// exclusive, in line with "Sequences are assumed to start at 0" on
// `GASequence.length` -- confirm and state it in the doc comment below.
/**
The start position of this fragment, within the sequence. The fragment's
end position will be `start + len(bases)`.
*/
long start;

/** The ID of the sequence which contains this fragment. */
string sequenceId;
}

}