Skip to content

Commit

Permalink
Add optional column configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
garrettjstevens authored and cmdcolin committed Nov 8, 2024
1 parent 86c7400 commit c33489a
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 61 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,106 @@ import SyntenyFeature from '../SyntenyFeature'

// Blast output column names/descriptions taken from
// https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_Options_for_the_commandline_a_
interface BlastRecord {
/**
 * Columns that BLAST tabular output can contain, keyed by the column name
 * used when requesting custom output columns. Members marked `?` are
 * optional because a given file only contains the columns that were asked
 * for.
 *
 * NOTE(review): several members appear twice below, once required and once
 * optional (qseqid, sseqid, pident, length, mismatch, gapopen). This looks
 * like the removed and added lines of a rendered diff shown together —
 * confirm against the real file before relying on either form.
 */
interface BlastColumns {
/** Query Seq-id */
qseqid: string
qseqid?: string
/** Query GI */
qgi?: string
/** Query accession */
qacc?: string
/** Subject Seq-id */
sseqid: string
/** Percentage of identical matches */
pident: number
sseqid?: string
/** All subject Seq-id(s), separated by a ';' */
sallseqid?: string
/** Subject GI */
sgi?: string
/** All subject GIs */
sallgi?: string
/** Subject accession */
sacc?: string
/** All subject accessions */
sallacc?: string
/** Start of alignment in query */
qstart?: number
/** End of alignment in query */
qend?: number
/** Start of alignment in subject */
sstart?: number
/** End of alignment in subject */
send?: number
/** Aligned part of query sequence */
qseq?: string
/** Aligned part of subject sequence */
sseq?: string
/** Expect value */
evalue?: string
/** Bit score */
bitscore?: string
/** Raw score */
score?: string
/** Alignment length */
length: number
length?: string
/** Percentage of identical matches */
pident?: string
/** Number of identical matches */
nident?: string
/** Number of mismatches */
mismatch: number
mismatch?: string
/** Number of positive-scoring matches */
positive?: string
/** Number of gap openings */
gapopen: number
gapopen?: string
/** Total number of gaps */
gaps?: string
/** Percentage of positive-scoring matches */
ppos?: string
/** Query and subject frames separated by a '/' */
frames?: string
/** Query frame */
qframe?: string
/** Subject frame */
sframe?: string
/** Blast traceback operations (BTOP) */
btop?: string
/** Unique Subject Taxonomy ID(s), separated by a ';' (in numerical order) */
staxids?: string
/** Unique Subject Scientific Name(s), separated by a ';' */
sscinames?: string
/** Unique Subject Common Name(s), separated by a ';' */
scomnames?: string
/**
* Unique Subject Blast Name(s), separated by a ';' (in alphabetical order)
*/
sblastnames?: string
/**
* Unique Subject Super Kingdom(s), separated by a ';' (in alphabetical order)
*/
sskingdoms?: string
/** Subject Title */
stitle?: string
/** All Subject Title(s), separated by a '<>' */
salltitles?: string
/** Subject Strand */
sstrand?: string
/** Query Coverage Per Subject (for all HSPs) */
qcovs?: string
/** Query Coverage Per HSP */
qcovhsp?: string
/**
* A measure of Query Coverage that counts a position in a subject sequence
* for this measure only once. The second time the position is aligned to the
* query is not counted towards this measure.
*/
qcovus?: string
}

// Blast output column names/descriptions taken from
// https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_Options_for_the_commandline_a_
interface BlastRecord extends BlastColumns {
/** Query Seq-id */
qseqid: string
/** Subject Seq-id */
sseqid: string
/** Start of alignment in query */
qstart: number
/** End of alignment in query */
Expand All @@ -34,62 +121,75 @@ interface BlastRecord {
sstart: number
/** End of alignment in subject */
send: number
/** Expect value */
evalue: number
/** Bit score */
bitscore: number
}

export function parseBlastLine(line: string): BlastRecord | undefined {
const [
qseqid,
sseqid,
pident,
length,
mismatch,
gapopen,
qstart,
qend,
sstart,
send,
evalue,
bitscore,
] = line.split('\t')

if (
!(
qseqid &&
sseqid &&
pident &&
length &&
mismatch &&
gapopen &&
qstart &&
qend &&
sstart &&
send &&
evalue &&
bitscore
)
) {
console.warn('Invalid BLAST line')
console.warn(line)
return
/**
 * Builds a parser for single lines of BLAST tabular output whose columns are
 * laid out according to a caller-supplied column list.
 *
 * @param columns - Whitespace-separated BLAST column names, in the order the
 *   values appear in each row. Must include qseqid, sseqid, qstart, qend,
 *   sstart and send.
 * @returns A function that converts one tab-separated line into a
 *   `BlastRecord`. It returns `undefined` for comment lines (leading `#`) and,
 *   after logging a warning, for rows whose required fields are missing or
 *   whose coordinates are not numeric. Required coordinates are parsed as
 *   base-10 integers; every other listed column is copied through as a
 *   string when it has a non-empty value.
 * @throws Error when one of the six required column names is absent.
 */
function createBlastLineParser(columns: string) {
  // Split on any run of whitespace: splitting on a single space would turn
  // doubled spaces into empty column names and shift every later index away
  // from the matching field in the row.
  const columnNames = columns.trim().split(/\s+/)

  // Resolves a required column to its position, failing fast when absent.
  const requireColumn = (name: string): number => {
    const index = columnNames.indexOf(name)
    if (index === -1) {
      throw new Error(`Missing required column "${name}"`)
    }
    return index
  }
  const qseqidIndex = requireColumn('qseqid')
  const sseqidIndex = requireColumn('sseqid')
  const qstartIndex = requireColumn('qstart')
  const qendIndex = requireColumn('qend')
  const sstartIndex = requireColumn('sstart')
  const sendIndex = requireColumn('send')

  // Work out once, not per row, which columns are copied through verbatim.
  const requiredNames = new Set([
    'qseqid',
    'sseqid',
    'qstart',
    'qend',
    'sstart',
    'send',
  ])
  const optionalColumns: [number, string][] = []
  columnNames.forEach((name, index) => {
    if (!requiredNames.has(name)) {
      optionalColumns.push([index, name])
    }
  })

  const rejectLine = (line: string): undefined => {
    console.warn('Invalid BLAST line')
    console.warn(line)
    return undefined
  }

  return (line: string): BlastRecord | undefined => {
    if (line.startsWith('#')) {
      return undefined
    }
    const row = line.split('\t')
    const qseqid = row[qseqidIndex]
    const sseqid = row[sseqidIndex]
    const qstart = row[qstartIndex]
    const qend = row[qendIndex]
    const sstart = row[sstartIndex]
    const send = row[sendIndex]
    if (!(qseqid && sseqid && qstart && qend && sstart && send)) {
      return rejectLine(line)
    }

    // Explicit radix so a value such as "0x10" is never read as hexadecimal.
    const qstartNum = Number.parseInt(qstart, 10)
    const qendNum = Number.parseInt(qend, 10)
    const sstartNum = Number.parseInt(sstart, 10)
    const sendNum = Number.parseInt(send, 10)
    if (
      Number.isNaN(qstartNum) ||
      Number.isNaN(qendNum) ||
      Number.isNaN(sstartNum) ||
      Number.isNaN(sendNum)
    ) {
      // e.g. a header row without a leading '#': coordinates are unusable
      return rejectLine(line)
    }

    const record: BlastRecord = {
      qseqid,
      sseqid,
      qstart: qstartNum,
      qend: qendNum,
      sstart: sstartNum,
      send: sendNum,
    }

    // Collect the remaining columns as strings, then merge them in so the
    // required fields stay first and no type suppression is needed.
    const extras: Record<string, string> = {}
    for (const [index, name] of optionalColumns) {
      const value = row[index]
      if (value) {
        extras[name] = value
      }
    }
    Object.assign(record, extras)
    return record
  }
}

Expand Down Expand Up @@ -118,7 +218,8 @@ export default class BlastTabularAdapter extends BaseFeatureDataAdapter {
encoding: undefined,
})
const buf = isGzip(buffer) ? await unzip(buffer) : buffer
return parseLineByLine(buf, parseBlastLine)
const columns: string = readConfObject(this.config, 'columns')
return parseLineByLine(buf, createBlastLineParser(columns))
}

async hasDataForRefName() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,16 @@ const BlastTabularAdapter = ConfigurationSchema(
locationType: 'UriLocation',
},
},
/**
* #slot
*/
columns: {
type: 'string',
description:
'Optional space-separated column name list. If custom columns were used in outfmt, enter them here exactly as specified in the command. At least qseqid, sseqid, qstart, qend, sstart, and send are required',
defaultValue:
'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore',
},
},
{ explicitlyTyped: true },
)
Expand Down
14 changes: 14 additions & 0 deletions test_data/config_synteny_grape_peach.json
Original file line number Diff line number Diff line change
Expand Up @@ -1910,6 +1910,20 @@
}
}
}
},
{
"type": "SyntenyTrack",
"trackId": "grape_peach_synteny_tblastx",
"name": "Grape peach synteny (TBLASTX)",
"assemblyNames": ["peach", "grape"],
"category": ["Annotation"],
"adapter": {
"type": "BlastTabularAdapter",
"blastTableLocation": {
"uri": "grape_peach_synteny/peach_vs_grape.tsv.gz"
},
"assemblyNames": ["peach", "grape"]
}
}
]
}
Binary file modified test_data/grape_peach_synteny/peach_vs_grape.tsv.gz
100755 → 100644
Binary file not shown.

0 comments on commit c33489a

Please sign in to comment.