% SAMtags.tex -- LaTeX source from a fork of the samtools/hts-specs repository
\documentclass[10pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage{longtable}
\usepackage[pdfborder={0 0 0},hyperfootnotes=false]{hyperref}
\usepackage[title]{appendix}
\newcommand{\mailtourl}[1]{\href{mailto:#1}{\tt #1}}
\newcommand{\tagvalue}[1]{{\tt #1}}
\newcommand{\tagregex}[1]{{\tt #1}}
\newcommand{\metavar}[1]{{\rm\emph{#1}}}
\begin{document}
\input{SAMtags.ver}
\title{Sequence Alignment/Map Optional Fields Specification}
\author{The SAM/BAM Format Specification Working Group}
\date{\headdate}
\maketitle
\begin{quote}\small
The master version of this document can be found at
\url{https://github.com/samtools/hts-specs}.\\
This printing is version~\commitdesc\ from that repository,
last modified on the date shown above.
\end{quote}
\vspace*{1em}
\noindent
This document is a companion to the {\sl Sequence Alignment/Map Format
Specification} that defines the SAM and~BAM formats, and to the {\sl CRAM
Format Specification} that defines the CRAM format.\footnote{See
\href{http://samtools.github.io/hts-specs/SAMv1.pdf}{\tt SAMv1.pdf} and
\href{http://samtools.github.io/hts-specs/CRAMv3.pdf}{\tt CRAMv3.pdf}
at \url{https://github.com/samtools/hts-specs}.}
Alignment records in each of these formats may contain a number of optional
fields, each labelled with a {\it tag\/} identifying that field's data.
This document describes each of the predefined standard tags, and discusses
conventions around creating new tags.
\section{Standard tags}
Predefined standard tags are listed in the following table and described
in greater detail in later subsections.
Optional fields are usually displayed as {\tt TAG:TYPE:VALUE}; the {\it type\/}
may be one of
{\tt A} (character),
{\tt B} (general array),
{\tt f} (real number),
{\tt H} (hexadecimal array),
{\tt i} (integer),
or
{\tt Z} (string).
\begin{center}\small
% This table is sorted alphabetically
\begin{longtable}{ccp{12.5cm}}
\hline
{\bf Tag} & {\bf Type} & {\bf Description} \\
\hline
{\tt AM} & i & The smallest template-independent mapping quality in the template \\
{\tt AS} & i & Alignment score generated by aligner \\
{\tt BC} & Z & Barcode sequence identifying the sample \\
{\tt BQ} & Z & Offset to base alignment quality (BAQ) \\
{\tt BZ} & Z & Phred quality of the unique molecular barcode bases in the {\tt OX} tag \\
{\tt CB} & Z & Cell identifier \\
{\tt CC} & Z & Reference name of the next hit \\
{\tt CG} & B,I & BAM only: {\sf CIGAR} in BAM's binary encoding if (and only if) it consists of $>$65535 operators \\
{\tt CM} & i & Edit distance between the color sequence and the color reference (see also {\tt NM}) \\
{\tt CO} & Z & Free-text comments \\
{\tt CP} & i & Leftmost coordinate of the next hit \\
{\tt CQ} & Z & Color read base qualities \\
{\tt CR} & Z & Cellular barcode sequence bases (uncorrected) \\
{\tt CS} & Z & Color read sequence \\
{\tt CT} & Z & Complete read annotation tag, used for consensus annotation dummy features \\
{\tt CY} & Z & Phred quality of the cellular barcode sequence in the {\tt CR} tag \\
{\tt E2} & Z & The 2nd most likely base calls \\
{\tt FI} & i & The index of segment in the template \\
{\tt FS} & Z & Segment suffix \\
{\tt FZ} & B,S & Flow signal intensities \\
{\tt GC} & ? & Reserved for backwards compatibility reasons \\
{\tt GQ} & ? & Reserved for backwards compatibility reasons \\
{\tt GS} & ? & Reserved for backwards compatibility reasons \\
{\tt H0} & i & Number of perfect hits \\
{\tt H1} & i & Number of 1-difference hits (see also {\tt NM}) \\
{\tt H2} & i & Number of 2-difference hits \\
{\tt HI} & i & Query hit index \\
{\tt IH} & i & Query hit total count \\
{\tt LB} & Z & Library \\
{\tt MC} & Z & CIGAR string for mate/next segment \\
{\tt MD} & Z & String for mismatching positions \\
{\tt MF} & ? & Reserved for backwards compatibility reasons \\
{\tt MI} & Z & Molecular identifier; a string that uniquely identifies the molecule from which the record was derived \\
{\tt MQ} & i & Mapping quality of the mate/next segment \\
{\tt NH} & i & Number of reported alignments that contain the query in the current record \\
{\tt NM} & i & Edit distance to the reference \\
{\tt OC} & Z & Original CIGAR \\
{\tt OP} & i & Original mapping position \\
{\tt OQ} & Z & Original base quality \\
{\tt OX} & Z & Original unique molecular barcode bases \\
{\tt PG} & Z & Program \\
{\tt PQ} & i & Phred likelihood of the template \\
{\tt PT} & Z & Read annotations for parts of the padded read sequence \\
{\tt PU} & Z & Platform unit \\
{\tt Q2} & Z & Phred quality of the mate/next segment sequence in the {\tt R2} tag \\
{\tt QT} & Z & Phred quality of the sample barcode sequence in the {\tt BC} tag \\
{\tt QX} & Z & Quality score of the unique molecular identifier in the {\tt RX} tag \\
{\tt R2} & Z & Sequence of the mate/next segment in the template \\
{\tt RG} & Z & Read group \\
{\tt RT} & ? & Reserved for backwards compatibility reasons \\
{\tt RX} & Z & Sequence bases of the (possibly corrected) unique molecular identifier \\
{\tt S2} & ? & Reserved for backwards compatibility reasons \\
{\tt SA} & Z & Other canonical alignments in a chimeric alignment \\
{\tt SM} & i & Template-independent mapping quality \\
{\tt SQ} & ? & Reserved for backwards compatibility reasons \\
{\tt TC} & i & The number of segments in the template \\
{\tt U2} & Z & Phred probability of the 2nd call being wrong conditional on the best being wrong \\
{\tt UQ} & i & Phred likelihood of the segment, conditional on the mapping being correct \\
{\tt X?} & ? & Reserved for end users \\
{\tt Y?} & ? & Reserved for end users \\
{\tt Z?} & ? & Reserved for end users \\
\hline
\end{longtable}
\end{center}
\subsection{Additional Template and Mapping data}
\begin{description}
\item[AM:i:\tagvalue{score}]
The smallest template-independent mapping quality of any segment in the same template as this read.
(See also {\tt SM}.)
\item[AS:i:\tagvalue{score}]
Alignment score generated by aligner.
\item[BQ:Z:\tagvalue{qualities}]
Offset to base alignment quality (BAQ), of the same length as the read sequence.
At the $i$-th read base, ${\rm BAQ}_i=Q_i-({\rm BQ}_i-64)$ where $Q_i$ is the $i$-th base quality.
\item[CC:Z:\tagvalue{rname}]
Reference name of the next hit; `{\tt =}' for the same chromosome.
\item[CG:B:I,\tagvalue{encodedCigar}]
Real CIGAR in its binary form if (and only if) it contains $>$65535 operations. This is
a BAM-only tag, provided as a workaround for BAM's inability to store long CIGARs
in the standard way. SAM and CRAM files created with updated tools aware of the
workaround are not expected to contain this tag. See also the footnote in
Section 4.2 of the SAM spec for details.
\item[CP:i:\tagvalue{pos}]
Leftmost coordinate of the next hit.
\item[E2:Z:\tagvalue{bases}]
The 2nd most likely base calls. Same encoding and same length as {\sf SEQ}.
See also {\tt U2} for associated quality values.
\item[FI:i:\tagvalue{int}]
The index of segment in the template.
\item[FS:Z:\tagvalue{str}]
Segment suffix.
\item[H0:i:\tagvalue{count}]
Number of perfect hits.
\item[H1:i:\tagvalue{count}]
Number of 1-difference hits (see also {\tt NM}).
\item[H2:i:\tagvalue{count}]
Number of 2-difference hits.
\item[HI:i:\emph{i}]
Query hit index, indicating the alignment record is the $i$-th one stored
in SAM.
\item[IH:i:\tagvalue{count}]
Number of alignments stored in the file that contain the query in the current
record.
\item[MC:Z:\tagvalue{cigar}]
CIGAR string for mate/next segment.
\item[MD:Z:\tagregex{[0-9]+(([A-Z]|\char92\char94[A-Z]+)[0-9]+)*}]
String for mismatching positions.
The {\tt MD} field aims to achieve SNP/indel calling without
looking at the reference. For example, a string `{\tt 10A5\char94AC6}' means
from the leftmost reference base in the alignment, there are 10 matches
followed by an A on the reference which is different from the aligned read
base; the next 5 reference bases are matches followed by a 2~bp deletion from
the reference; the deleted sequence is AC; the last 6~bases are matches.
The {\tt MD} field ought to match the {\sf CIGAR} string.
\item[MQ:i:\tagvalue{score}]
Mapping quality of the mate/next segment.
\item[NH:i:\tagvalue{count}]
Number of reported alignments that contain the query in the current record.
\item[NM:i:\tagvalue{count}]
Number of differences (mismatches plus inserted and deleted bases) between the sequence and reference, counting only (case-insensitive) A, C, G and T bases in sequence and reference as potential matches, with everything else being a mismatch.
Note this means that ambiguity codes in both sequence and reference that match each other, such as `{\tt N}' in both, or compatible codes such as `{\tt A}' and `{\tt R}', are still counted as mismatches.
The special sequence base `{\tt =}' will always be considered to be a match, even if the reference is ambiguous at that point.
Alignment reference skips, padding, soft and hard clipping (`{\tt N}', `{\tt P}', `{\tt S}' and `{\tt H}' {\sf CIGAR} operations) do not count as mismatches, but insertions and deletions count as one mismatch per base.
Note that historically this has been ill-defined and both data and tools exist that disagree with this definition.
\item[PQ:i:\tagvalue{score}]
Phred likelihood of the template, conditional on the mapping locations of both/all segments being correct.
\item[Q2:Z:\tagvalue{qualities}]
Phred quality of the mate/next segment sequence in the {\tt R2} tag.
Same encoding as {\sf QUAL}.
\item[R2:Z:\tagvalue{bases}]
Sequence of the mate/next segment in the template. See also {\tt Q2}
for any associated quality values.
\item[SA:Z:\tagregex{{\tt (}\emph{rname}{\tt ,}\emph{pos}{\tt ,}\emph{strand}{\tt ,}\emph{CIGAR}{\tt ,}\emph{mapQ}{\tt ,}\emph{NM}{\tt ;)}+}]
Other canonical alignments in a chimeric alignment, formatted as a semicolon-delimited list.
Each element in the list represents a part of the chimeric alignment. Conventionally, at a supplementary line, the first element points to the primary line.
\emph{Strand} is either `{\tt +}' or `{\tt -}', indicating positive/negative strand, corresponding to FLAG bit 0x10.
\emph{Pos} is a 1-based coordinate.
\item[SM:i:\tagvalue{score}]
Template-independent mapping quality, i.e., the mapping quality if the read were mapped as a single read rather than as part of a read pair or template.
\item[TC:i:\tagvalue{}]
The number of segments in the template.
\item[U2:Z:\tagvalue{}]
Phred probability of the 2nd call being wrong conditional on the best being wrong.
The same encoding and length as {\sf QUAL}. See also {\tt E2} for associated base calls.
\item[UQ:i:\tagvalue{}]
Phred likelihood of the segment, conditional on the mapping being correct.
\end{description}
\subsection{Metadata}
\begin{description}
\item[RG:Z:\tagvalue{readgroup}]
The read group to which the read belongs.
If {\tt @RG} headers are present, then \emph{readgroup} must match the
{\tt RG-ID} field of one of the headers.
\item[LB:Z:\tagvalue{library}]
The library from which the read has been sequenced.
If {\tt @RG} headers are present, then \emph{library} must match the
{\tt RG-LB} field of one of the headers.
\item[PG:Z:\tagvalue{program\_id}]
Program. Value matches the header {\tt PG-ID} tag if {\tt @PG} is present.
\item[PU:Z:\tagvalue{platformunit}]
The platform unit in which the read was sequenced.
If {\tt @RG} headers are present, then \emph{platformunit} must match the
{\tt RG-PU} field of one of the headers.
\item[CO:Z:\tagvalue{text}]
Free-text comments.
\end{description}
\subsection{Barcodes}
DNA barcodes can be used to identify the provenance of the underlying reads.
There are currently three varieties of barcodes that may co-exist: Sample Barcode, Cell Barcode, and Unique Molecular Identifier (UMI).
\begin{itemize}
\item
Despite its name, the \emph{Sample Barcode} identifies the \emph{Library} and allows multiple libraries to be combined and sequenced together.
After sequencing, the reads can be separated according to this barcode and placed in different ``read groups'' each corresponding to a library.
Since the library was generated from a sample, knowing the library should also identify the sample.
The barcode itself can be included in the {\tt PU} field in the {\tt RG} header line.
Since the {\tt PU} field should be globally unique, it is advisable to include specific information such as flowcell barcode and lane.
It is not recommended to use the barcode as the {\tt ID} field of the {\tt RG} header line, as some tools modify this field (e.g., when merging files).
\item
The \emph{Cell Barcode} is similar to the sample barcode but there is (normally) no control over the assignment of cells to barcodes (whose sequence could be random or predetermined).
The Cell Barcode can help identify when reads come from different cells in a ``single-cell'' sequencing experiment.
\item
The \emph{UMI} is intended to identify the (single- or double-stranded) molecule at the time that the barcode was introduced.
This can be used to inform duplicate marking and to perform consensus calling in ultra-deep sequencing.
Additionally, the UMI can be used to (informatically) link reads that were generated from the same long molecule, enabling long-range phasing and better informed mapping.
In some experimental setups opposite strands of the same double-stranded DNA molecule get related barcodes.
These templates can also be considered duplicates even though technically they may have different UMIs.
Multiple UMIs can be added by a protocol, possibly at different time-points, which means that specific knowledge of the protocol may be needed in order to analyze the resulting data correctly.
\end{itemize}
\begin{description}
\item[BC:Z:\tagvalue{sequence}]
Barcode sequence (identifying the sample/library), with any quality scores (optionally) stored in the {\tt QT} tag.
The {\tt BC} tag should match the {\tt QT} tag in length.
In the case of multiple unique molecular identifiers (e.g., one on each end of the template) the recommended implementation concatenates all the barcodes and places a hyphen (`{\tt -}') between the barcodes from the same template.
\item[QT:Z:\tagvalue{qualities}]
Phred quality of the sample barcode sequence in the {\tt BC} tag.
Same encoding as {\sf QUAL}, i.e., Phred score + 33.
In the case of multiple unique molecular identifiers (e.g., one on each end of the template) the recommended implementation concatenates all the quality strings with spaces (`{\tt \textvisiblespace}') between the different strings from the same template.
\item[CB:Z:\tagvalue{str}]
Cell identifier, consisting of the optionally-corrected cellular barcode sequence and an optional suffix.
The sequence part is similar to the {\tt CR} tag, but may have had sequencing errors etc corrected.
This may be followed by a suffix consisting of a hyphen (`{\tt -}') and one or more alphanumeric characters to form an identifier.
In the case of the cellular barcode ({\tt CR}) being based on multiple barcode sequences the recommended implementation concatenates all the (corrected or uncorrected) barcodes with a hyphen (`{\tt -}') between the different barcodes.
Sequencing errors etc aside, all reads from a single cell are expected to have the same {\tt CB} tag.
\item[CR:Z:\tagvalue{sequence+}]
Cellular barcode. The uncorrected sequence bases of the cellular barcode as reported by the sequencing machine, with the corresponding base quality scores (optionally) stored in {\tt CY}.
Sequencing errors etc aside, all reads with the same {\tt CR} tag likely derive from the same cell.
In the case of the cellular barcode being based on multiple barcode sequences the recommended implementation concatenates all the barcodes with a hyphen (`{\tt -}') between the different barcodes.
\item[CY:Z:\tagvalue{qualities+}]
Phred quality of the cellular barcode sequence in the {\tt CR} tag.
Same encoding as {\sf QUAL}, i.e., Phred score + 33.
The lengths of the {\tt CY} and {\tt CR} tags must match.
In the case of the cellular barcode being based on multiple barcode sequences the recommended implementation concatenates all the quality strings with spaces (`{\tt \textvisiblespace}') between the different strings.
\item[MI:Z:\tagvalue{str}]
Molecular Identifier.
A unique ID within the SAM file for the source molecule from which this read is derived.
All reads with the same {\tt MI} tag represent the group of reads derived from the same source molecule.
\item[OX:Z:\tagvalue{sequence+}]
Raw (uncorrected) unique molecular identifier bases, with any quality scores (optionally) stored in the {\tt BZ} tag.
In the case of multiple unique molecular identifiers (e.g., one on each end of the template) the recommended implementation concatenates all the barcodes with a hyphen (`{\tt -}') between the different barcodes.
\item[BZ:Z:\tagvalue{qualities+}]
Phred quality of the (uncorrected) unique molecular identifier sequence in the {\tt OX} tag.
Same encoding as {\sf QUAL}, i.e., Phred score + 33.
The {\tt OX} tag should match the {\tt BZ} tag in length.
In the case of multiple unique molecular identifiers (e.g., one on each end of the template) the recommended implementation concatenates all the quality strings with a space (`{\tt \textvisiblespace}') between the different strings.
\item[RX:Z:\tagvalue{sequence+}]
Sequence bases from the unique molecular identifier.
These could be either corrected or uncorrected. Unlike {\tt MI}, the value may be non-unique in the file.
The value should consist of a sequence of bases.
In the case of multiple unique molecular identifiers (e.g., one on each end of the template) the recommended implementation concatenates all the barcodes with a hyphen (`{\tt -}') between the different barcodes.
If the bases represent corrected bases, the original sequence can be stored in {\tt OX} (similar to {\tt OQ} storing the original qualities of bases).
\item[QX:Z:\tagvalue{qualities+}]
Phred quality of the unique molecular identifier sequence in the {\tt RX} tag.
Same encoding as {\sf QUAL}, i.e., Phred score + 33.
The qualities here may have been corrected (raw bases and qualities can be stored in {\tt OX} and {\tt BZ} respectively).
The lengths of the {\tt QX} and the {\tt RX} tags must match.
In the case of multiple unique molecular identifiers (e.g., one on each end of the template) the recommended implementation concatenates all the quality strings with a space (`{\tt \textvisiblespace}') between the different strings.
\end{description}
\subsection{Original data}
\begin{description}
\item[OC:Z:\tagvalue{cigar}]
Original CIGAR, usually before realignment.
\item[OP:i:\tagvalue{pos}]
Original 1-based mapping position, usually before realignment.
\item[OQ:Z:\tagvalue{qualities}]
Original base quality, usually before recalibration.
Same encoding as {\sf QUAL}.
\end{description}
\subsection{Annotation and Padding}
The SAM format can be used to represent \emph{de novo} assemblies, generally by using padded reference sequences and the annotation tags described here.
See the \emph{Guide for Describing Assembly Sequences} in the \href{http://samtools.github.io/hts-specs/SAMv1.pdf}{\emph{SAM Format Specification}} for full details of this representation.
\begin{description}
\item[CT:Z:\tagregex{\metavar{strand};\metavar{type}(;\metavar{key}(=\metavar{value})?)*}]
\hfill\\
Complete read annotation tag, used for consensus annotation dummy features.
The {\tt CT} tag is intended primarily for annotation
dummy reads, and consists of a \emph{strand}, \emph{type} and zero or
more \emph{key}=\emph{value} pairs, each separated with semicolons.
The \emph{strand} field has four values as in GFF3,\footnote{The
Generic Feature Format version 3 (GFF3) specification can be found at
\href{http://www.sequenceontology.org/}{\tt http://sequenceontology.org}.}
and supplements FLAG
bit 0x10 to allow unstranded (`{\tt .}'), and stranded but unknown strand
(`{\tt ?}') annotation. For these and annotation on the forward strand
(\emph{strand} set to `{\tt +}'), do not set FLAG bit 0x10. For
annotation on the reverse strand, set the \emph{strand} to `{\tt -}'
and set FLAG bit 0x10.
The \emph{type} and any \emph{keys} and their
optional \emph{values} are all percent encoded according to
RFC3986 to escape meta-characters `{\tt =}', `{\tt \%}', `{\tt ;}',
`{\tt |}' or non-printable characters not matched by the isprint()
macro (with the C locale). For example a percent sign becomes
`{\tt \%25}'.
%NOTE - This leaves open the possibility of allowing multiple such
%entries for a single CT tag to be combined with | as in the PT tag.
\item[PT:Z:\tagregex{\metavar{annotag}(\char92|\metavar{annotag})*}]\enskip where each \metavar{annotag} matches\quad\tagregex{\metavar{start};\metavar{end};\metavar{strand};\metavar{type}(;\metavar{key}(=\metavar{value})?)*}
\hfill\\
Read annotations for parts of the padded read sequence.
The {\tt PT} tag value has the format of a series of annotation
tags separated by `{\tt |}', each annotating a sub-region of the read.
Each tag consists of \emph{start}, \emph{end}, \emph{strand},
\emph{type} and zero or more \emph{key}{\tt =}\emph{value} pairs, each
separated with semicolons. \emph{Start} and \emph{end} are 1-based
positions between one and the sum of the {\tt M/I/D/P/S/=/X}
{\sf CIGAR} operators, i.e., {\sf SEQ} length plus any pads. Note that
any editing of the CIGAR string may require updating the {\tt PT}
tag coordinates, or even invalidate them.
As in GFF3, \emph{strand} is one of `{\tt +}' for forward strand tags,
`{\tt -}' for reverse strand, `{\tt .}' for unstranded or `{\tt ?}'
for stranded but unknown strand.
The \emph{type} and any \emph{keys} and their optional \emph{values}
are all percent encoded as in the {\tt CT} tag.
\end{description}
\subsection{Technology-specific data}
\begin{description}
\item[FZ:B:S,\tagvalue{intensities}]
Flow signal intensities on the original strand of the read, stored as {\tt (uint16\_t) round(value * 100.0)}.
\end{description}
\subsubsection{Color space}
% TODO Describe color space and the encoding here.
\begin{description}
\item[CM:i:\tagvalue{distance}]
Edit distance between the color sequence and the color reference (see also {\tt NM}).
\item[CS:Z:\tagvalue{sequence}]
Color read sequence on the original strand of the read. The primer base must be included.
\item[CQ:Z:\tagvalue{qualities}]
Color read quality on the original strand of the read. Same encoding as {\sf QUAL}; same length as {\tt CS}.
\end{description}
\section{Locally-defined tags}
You can freely add new tags.
Note that tags starting with `{\tt X}', `{\tt Y}', or `{\tt Z}' and tags
containing lowercase letters in either position are reserved for local use
and will not be formally defined in any future version of this specification.
If a new tag may be of general interest, it may be useful to have it added
to this specification. Additions can be proposed by opening a new issue at
\url{https://github.com/samtools/hts-specs/issues} and/or by sending email
to \mailtourl{samtools-devel@lists.sourceforge.net}.
\begin{appendices}
\appendix
\section{Tag History}
This appendix lists when standard tags were initially defined or significantly changed, and other historical events that affect how tags are interpreted or what files they may appear in.
\setlength{\parindent}{0pt}
\newcommand*{\gap}{\vspace*{2ex}}
\subsubsection*{July 2018}
Clarified the calculation of NM score.
\subsubsection*{May 2018}
Cellular barcode tags CB, CR, and CY added.
Removed the RT:Z tag, which was a long-deprecated synonym for BC.
\subsubsection*{November 2017}
SAM version number {\tt VN:1.6} introduced, indicating the addition of the CG tag representation of very long CIGAR strings.
Files that contain records with more than 65,535 CIGAR operators should not declare a version number lower than~1.6 in their {\tt @HD} headers.
% Technically only BAM files containing records with CG tags need to avoid
% declaring VN<1.6, but recommending that SAM and CRAM files with long CIGAR
% strings also declare VN:1.6+ aids file format conversion.
\subsubsection*{August 2017}
Unique molecular identifier tags BZ, MI, OX, QX, and RX added.
Usage of sample barcode tag BC clarified.
\subsubsection*{June 2017}
Corrected the description of the E2 (second-most-likely bases) tag, which was previously unclear as to whether it contains bases or base qualities.
\subsubsection*{September 2016}
Predefined tags, previously listed as a brief table within the main SAM specification, have been split out into this new document.
There is now space for clearer and more complete tag descriptions.
\subsubsection*{February 2014}
MC tag added.
\subsubsection*{May 2013}
SAM version number {\tt VN:1.5} introduced, with limited impact for tags other than indicating that the CT/PT annotation tag definitions are considered finalised.
\gap
SA tag added.
\subsubsection*{March 2012}
Descriptions of CT and PT annotation tags significantly clarified.
\subsubsection*{October 2011}
Sample barcode tags QT and RT added, with RT being identified as a deprecated alternative to BC.
% These were actually added in late September as RT/PT, but RT was changed to
% CT (see samtools-devel, "Potential clash of RT tags (annotation vs barcode)",
% October 2011) before read-annotation-RT appeared in the wild.
Read annotation tags CT and PT added.
\subsubsection*{September 2011}
% This was actually August 29th, but let's call it September.
FZ tag's type changed from {\tt H} to {\tt B,S}-array.
BC and CO tags added.
\subsubsection*{April 2011}
SAM version number {\tt VN:1.4} introduced, indicating the addition of the {\tt B}-array tag type.
Files that contain records with {\tt B}-array fields should not declare a version number lower than~1.4 in their {\tt @HD} headers.
\gap
FZ tag added, with type {\tt H}.
MD tag description changed to allow IUPAC ambiguity codes in addition to {\tt ACGTN}.
\subsubsection*{March 2011}
CC and CP tags reinstated with their original meanings.
\subsubsection*{November 2010}
BQ tag added.
\subsubsection*{July 2010}
The specification was rewritten as a \LaTeX\ document specifying SAM version number {\tt VN:1.3}.
\gap
Tags FI, FS, OC, OP, OQ, and TC added.
Tags GC:Z, GQ:Z, and GS:Z, briefly proposed for representing repeatedly-sequenced reads, noted as reserved for backwards compatibility.
Existing tags MF:i (MAQ pair flag), SQ:H (suboptimal bases), and S2:H (mate's suboptimal bases) removed and noted as reserved for backwards compatibility.
CC and CP tags temporarily removed.
\subsubsection*{July 2009}
The original SAM ``0.1.2-draft'' specification specified version number {\tt VN:1.0} and defined a total of thirty standard tags (though SQ and S2 were already deprecated in favour of E2 and U2):
\begin{center}
\begin{tabular}{l@{\qquad}l@{\qquad}l@{\qquad}l@{\qquad}l}
AM & CS & IH & NM & RG \\
AS & E2 & LB & PG & S2 \\
CC & H0 & MD & PQ & SM \\
CM & H1 & MF & PU & SQ \\
CP & H2 & MQ & Q2 & U2 \\
CQ & HI & NH & R2 & UQ
\end{tabular}
\end{center}
\end{appendices}
\end{document}