Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADAM-1192] Correctly handle other whitespace in FASTA description. #1198

Merged
merged 1 commit into from
Oct 8, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ class Fasta2ADAMSuite extends ADAMFunSuite {
val contigFragments = sc.loadParquetContigFragments(convertPath)
assert(contigFragments.rdd.count() === 26)
val first = contigFragments.rdd.first()
assert(first.getContig.getContigName === "gi|224384749|gb|CM000682.1|")
assert(first.getDescription === "Homo sapiens chromosome 20, GRCh37 primary reference assembly")
assert(first.getContig.getContigName === null)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I understand; the string before the first space is typically the sequence name in FASTA format. You might want to compare this to the various open-bio projects.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, gi is the Entrez ID, and gb is the GenBank ID. I assure you that they are not the sequence name.

assert(first.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly")
assert(first.getFragmentNumber === 0)
assert(first.getFragmentSequence.length === 10000)
assert(first.getFragmentStartPosition === 0L)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,20 @@ private[adam] object FastaConverter {
require(id == -1L, "Cannot have a headerless line in a file with more than one fragment.")
(None: Option[String], None: Option[String])
} { (dL) =>
val splitIndex = dL.indexOf(' ')
// a FASTA description line is split at its first whitespace character (not only at a space)
val splitIndex = dL.indexWhere(c => c.isWhitespace)
if (splitIndex >= 0) {
val split = dL.splitAt(splitIndex)

val contigName: String = split._1.stripPrefix(">").trim
val contigDescription: String = split._2.trim

(Some(contigName), Some(contigDescription))
// is the text before the first whitespace metadata rather than a contig name? if it is metadata, it will contain "|"
if (split._1.contains('|')) {
(None, Some(dL.stripPrefix(">").trim))
} else {
val contigName: String = split._1.stripPrefix(">").trim
val contigDescription: String = split._2.trim

(Some(contigName), Some(contigDescription))
}
} else {
(Some(dL.stripPrefix(">").trim), None)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,14 @@ object NucleotideContigFragmentRDD extends Serializable {
private[rdd] def apply(rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = {

// get sequence dictionary
val sd = new SequenceDictionary(rdd.map(SequenceRecord.fromADAMContigFragment)
.distinct
val sd = new SequenceDictionary(rdd.flatMap(ncf => {
if (ncf.getContig != null &&
ncf.getContig.getContigName != null) {
Some(SequenceRecord.fromADAMContigFragment(ncf))
} else {
None
}
}).distinct
.collect
.toVector)

Expand Down
100 changes: 100 additions & 0 deletions adam-core/src/test/resources/HLA_DQB1_05_01_01_02.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
>HLA-DQB1*05:01:01:02 HLA06615 7090 bp
TTCTAAGACCTTTGCTCTTCTCCCCAGGACTTAAGGCTCTTCAGCGTGTCTAAGACAACAGCAGTAAAAATT
TCTGTGACAGCAATTTTCTCTCCCCTGAAATATGATCCCCACCTAATTTGCTTTATTAAAAATCCCAAGTAT
AATAACAACTGGTTTTTAACAATATTACAGAGATGTTTACTGTTGAATTGCATTTTTCTTTTGCCTCTCAAA
ATCCCTGAGGAATTTGTTCTTCAGCTCTTCTATAATCGAGAGGAAATTTTCACCTCAGATGTTCATCCAGTG
CAATTGGAAGACGTCACAGTGCCAGGCACTGGATTGAGAACCTTCACAAAAAAAAATGTCTGCCCAGAGACA
GATGAGGTCCTTCAGCTCCAGTGCTGATTGGTTCTTTTCCAAGCGACCATCCAATCCTGCCACGCACGGAAA
CATCCACAGGTTTTTATTCTTTCTGCCAGGTACATCAGATCCATCAGGTCTGAGCTGTGTTGACTACCACTA
CTTTTCCCTTCGTCTCAATTATGTCTTGGAAGAAGTCTTTGCGGATCCCCGGAGACCTTCGGGTAGCAACTG
TCACCTTGATGCTGGCGATCCTGAGCTCCTCACTGGCTGAGGGCAGAGACTCTCCCGGTAAGTGCAGGAAAG
CTGCTCTCCAGAGCCGCCACTCTGGGAACAGGCTTTCCTTGGGCTGGGGTATGGGGGATGGTGATCTCCATG
GTCTTGGACACAATCTTTCATCAACATTTCCTCTTTTTTGGGAAAGAAAGCTATGTTGCATTCCCATTTACC
TTTTAGTGATGAAATGAGGACAATCCAATCCCCATCCTACAGGTTTAAGCCTGGAAGAGGAGGAGAGAGGAG
AGGAAAGAGGAGATAAAGTGTACATTTACTACCAGTGATAGGACAAAGTGAGCATGGGGTTATTTTTGAAGA
TATTAATTTCTCCAAAGACACAGCAGGATTTGTCATTTCAGCGTGCCCCAAGACTTTGCCTGGACTAAATAT
TATGAGATTCTGCAGTGGGAAATGTAAGGCAGCGATGGTGTCTGTAGTCTCCGTATTTGAGGAAAAGTAGTC
TGTATTCCTGACTGACTGGAGCGTTTGTGGAGGCAAAATCTTGGTACTGAGGGAAGCTGACTGGCTGACCAC
AGACAGGGAGTCTTCAGGTTTCACTGATTTATGGGCAAATGGTGACTTGAGTGGGATTCAGAGACCCGAGTT
GTTGGTGGACTAAATTTAGTAGAAAGGAGGATGTAAAGAAGGGAAATAACACATATTATGAAACCACTCACT
CAGACACAGAACAATACTTTACATAAATTCTCTCTCACTCCTTCTAACATCCTGTGTGCAGATATCATGATT
TTCTTTTACACAATTATACTTGTGATACGGATATTCTGCCCAATATGGATATACACAACCTGGCCAAGCTGG
TAACTGCCACAGTTTAATTGGAATCTAGTTTATCAAATTCAAAAGCTTATGCTCTTTCCATGAATAAATATT
TCTATCTAGGACTCAGAGTTGTAGGTCCTTTCCAACATAGAAGGGAGTGAACCTCAACGGGTCTTGGGAGGG
TAAATCCAGGCATGGGAAGGAAGGTATTTTACCCAGGGACCAAGAGAATACGCGTATCAGAACGAGGACAAG
ATTAATTCCTGGACCTGTCGCATCATTCCCTTGAACTCACAGGTTTATGTGGATAAGTTTATCTCTGAGGTT
TCCAGGAGCTGCATGAAAAATGGGATTTCATGCGAGAACGCCCCGATCCCTCTAAGTGCAGAGGTCCATGTA
AAATCAGCCCGACTGCCTCTTCGCTGGGTTCACTGGCTCAGGCAGGGACAGGGCTTTCCGCCCTTTCCTGCC
TGTAGGAAGGCGGATTCCCGAAGCCCCCAGAGAGGGCGGGCAGGGCTGGGCAGAGCCGCCGGGAGGATCCCA
GGTCTGGGGCGCCAGGCACGGGCTGGCGGGAACTGGAGGTCGCGCGGGCGGTTCCACAGCTCCGGTCCGGGT
CAGGGCGGCGGCTGGGGGCGCAGCCGGGCTAGGGCCAGGCTGGGGCCTGACTGACTGGCCCGTGATTCCCCG
CAGAGGATTTCGTGTACCAGTTTAAGGGCCTGTGCTACTTCACCAACGGGACGGAGCGCGTGCGGGGTGTGA
CCAGACACATCTATAACCGAGAGGAGTACGTGCGCTTCGACAGCGACGTGGGGGTGTACCGGGCAGTGACGC
CGCAGGGGCGGCCTGTTGCCGAGTACTGGAACAGCCAGAAGGAAGTCCTGGAGGGGGCCCGGGCGTCGGTGG
ACAGGGTGTGCAGACACAACTACGAGGTGGCGTACCGCGGGATCCTGCAGAGGAGAGGTGAGCGTCGTCGCC
CCTCCGTGAGGCCCACCGTTGGCCGGGACCTCTAGTCTCTGTGCTGGGAGGGGCGATGGGGTTGCGGCCTCT
GAAACCTGAGCCCCGTTTATTCCACCCCAGGGGACAGGAGTTGGCGGCGTGAGTGGTGGGGCAGGTGCATCA
GATGGGCGGGGACCTAGGGCAGAGCAGGGAGACAAGCAGAGTTGGCCAGGCTGCCTAGTGTCCCCCCAGGGT
CCTCGTCTGTTGGCCTCGTCCTCCGCTCTGCACGTTTCTCGCCTCGTGCCTTATGTGTTTGCCTCCTCGTGC
CTTACCTTTGCTAAGCAGTTCTCTCTGCCCCCAATGCCCACCCTCTTCCCCTGCCCGTCCGCCCCACTAGCA
CTGCCCCATTCAGCAAGGCCCACGTGCGCAGCTCGCGCCGCAGGAAGCTTCAGGCTTGGCCTGGTGGAGTTA
GGGCTGCCCCACAACTGCGCCCAGGGCATCCAGCAATTACAGTTGCAAAATAAGATATTTTGACTTTTTGGC
TTCAAATCATTATTCATCGTAATTCTGTTTTCCTAAATGGCTATCACTAATGGCGGAGATTTTTGAGGTGGG
AGACTGTTTAAATTATTGCATGCTTGGTACCTGACACTTTGACTGGTATGTGGTATGAGCTCAATGATCTTC
TGTTAAATTCATGAATAAATGTACTCAGCTGCCCATCCACTTAGGCTCAAGAAAAAAACAGAGGCTTACAAA
TGGACTTTGTTAATTATTTTCTATCATTTTGCCTAATGCTTTAAAGTAAACTCTTATTGACTAGGATCTTAA
TAGAATTTGTGAATACAAAGTCTGAGAAAAAAAGTGTTTGCTAAAAATAAAAACAATGCTTGAATGACGTTG
TAAGGCAGAGTTTTAATTTCTTAGACAAGCTGAACAAATGGCACAATGCAAAGGGCAGAAGTTTGGGAATAA
ATAGGTTGAAGCCATTAAATTATTAGATAAAAATAGTTTCAGGTTGCTTTTGGCCTGGGTTCTCCCCTCCCT
CCATCACTATCCACTTCAGGAATAAACATTCTGAAAGTCAATTTTACCCATTCAGGTAGCACTTATTTCTAG
ACAGTTGCCTTATCAAATACCATATATGTTGTGCCATTTAATCTCACAGTTACCTGTGCATTAGAGATTAGC
ATCACCACTTGATATATCCTAATATTGGTACAGGATAAACACTTTAAGTAATCAGCCCACAATTACTCACCA
AGACCTTAAGCCTCCCAAAGTATAAACCATTCTTATGTTCCTCAGTGTACATCCATAGAGTCTAAGGGATGT
AAGGCCTTGTTGAAGCCAGTTTTGACCAGAAGCAGCAATGAGCCTATTCCTGTTTGTTCTCCATGTTAATGG
GGCATTGAAAATTCACTGATTAATCAATCCCTAGTCTGACCCCAGTGTTATCTATGCAGGTTCACAAAACTT
TTAGATTACTTTACACCCCCTTGCCTTCTTTTGACTCACATTCTAATGCCAGCAAGTACTTATATTTTTGCT
ATTTCAGTTCTATTTCCATAAAATTTATTTTATCATCTTTTCTCATAAAATTGTGCCCTCTATTTTACTCCC
AGTCTGTTTAAGATGAACAAATCTTACAAGGTCACATAGCTGACTGTGATATCAGTTGGACTCCAGGAAGGA
GAACCTAAAGAAAAGTTCAAGTCCAAGCAGAAACCGTGATTCCTTCCGGATGATGGCTCAAGAGTGATGTTT
AACTGGGATGCAACCTGCTGACCTCAGCAAATCCTAGTTATATGTATGTGTTCACATTACAGGCTCATTAGC
CCAGGCCGACCTCTGCATGGATCTCAGAATATTTTCTATGGAGAACATACATGATAATGTCTGATTTCAGAA
CAAGAAAGTAATTCTCAATAGCAAGGAAATGGAGTAGGGTAGACAGCTAGTAACTAAACTCACTTGTGCGTT
AAGAAGAAATTAAGGAAAAAAGAAAATAAGAGAACATATTACTAAATAAAGAAACACACATTAAATATTTGC
TATAGTTTCACACTAAGAGAATAAAGGAAATGCAATAAAGTGGCCTGAAAGGTAAAGGATGAGATGTGTAAA
AGAGGCAGGGAAAGATGTATAATAATTTTTTACTATGAGCAGCAATCTGAGAAGATAAAGGAATTGAGCTGT
GGGCAAACATGATGTTTGATCAGTGTTATTTGTTTTCAAGGCCTGCCTACTTTTTTTTTCAAATATTACAAA
CTTTTGAAATAACATTCTTTTTGTTTTTTACTGTCTGTTACTAGATTGCATATTCTATAAATGAAGGGACCA
TGGTATGTTGTTTATCTTTGGATTCTCAGTGATTGTCAAATTTATATTTGTTGAAGGAACCTTAATCCAAGA
CTTGGACTCCAGGTATCTTTCCACTCTGGTTCCAAGGAGGGACCCTTCCTCATGGTGGACGTGCTGTGTGGT
CTCACGTCTCACTCCTGTGTCTTTTCCTGTCTGTTACTGCCCTTAGTGGAGCCCACAGTGACCATCTCCCCA
TCCAGGACAGAGGCCCTCAACCACCACAACCTGCTGATCTGCTCGGTGACAGATTTCTATCCAAGCCAGATC
AAAGTCCGGTGGTTTCGGAATGATCAGGAGGAGACAGCCGGCGTTGTGTCCACCCCCCTCATTAGGAACGGT
GACTGGACCTTCCAGATCCTGGTGATGCTGGAAATGACTCCCCAGCGTGGAGATGTCTACACCTGCCACGTG
GAGCACCCCAGCCTCCAGAGCCCCATCACCGTGGAGTGGCGTAAGGAGATATTGAGTTTCTGTTATTATGGG
CCCCACAAGACAAAGAGCTCCTTCTGACCCATTCCTTCCCATCTCTTATCCCTGATGTCACTACTGAGCTGG
GAATCACAGGAGACTAGAGCACCTCTAGTTCCATGGCGAGTGCATCAGAAGAATCCTGATCTCATCACCTTT
CCAGATGCTAGGGAAATTACTCTACATACTGTTGCTCTGGATCCCAGTCCTGATAGCTCTGACGGACTGATT
CTTAGGGCTGGTGATTGGGATCTTAGGGTCTAAGGTTATGGATGAGTTCCTGAGGAGCAGAGATTTGCTTCC
CCACTCTCTCACCTACCCACTGTATCCAAGGACCTATTGTCTGGCCTTTCCCCTCCTTAGGGGTGGTCTGAA
TGGAGAGCTAGGTTCCTTTGATGCCTTCACCTCCTGCACCTCAGACTGGACTTCAACTCCTCAGCAGGGATG
CTATGGGGTGTGGGGACAAACACTGACACTCAGGTTCTGCTTTTTAGGGGCTCAGTCTGAATCTGCCCAGAG
CAAGATGCTGAGTGGCGTTGGAGGCTTCGTGCTGGGGCTGATCTTCCTTGGGCTTGGCCTTATCATCCGTCA
AAGGAGTCGGAAAGGTGAGGAACCCCAGGGGAAAAGGGGAAGATGGGCTGTGACCCAGACCCTCTGTTCAGA
GTGGTCCTGTCTGTAGATTAGCTCTTTCCTCCTGACCCTGAGAGGAAGTGCGAGGAGACAGGACAAGATGGG
AGGAGGCATTGGAATCTGATTTTACTGGTTGAAAGGTAGCGCTGTCACAGAGCTGACTGATAGAGCTTATTC
CAGGGCATCCTTACTATTCATCATTGTCTCACTGGCTCCTTTCCAAAAGCTTCCTCCATTAAGAGGGTCAGA
GCCTCAGCCTCCTTTCTTTCTGGTGACAATTTCCTTTGTTTTAGGGGATTTTAAATTAGGGTGCTGAAGGCC
TGGAAGAACATGGGTGGGAAGAGAATGTAACTCTAAGTCATGTGTGTCATTTTCCTTTGGGGTGAGAGAGTG
GCTGTTTGTGTAATGAGACCTTTCTCTGCATAACTTCCTTTTGTAAGACCTCAAGGGCCTCCACCAGCAGGT
AATATTTCAGCCATGATCCAGTGTGGGGAGGCACAGGTATAAGAGGGAAGAGCATGAGCTGAGTGTACCTGA
CCACAGTGGTCCATGTTCGTGGCCTATTTGCTGCTATGAGGATCAAGACTTAGGGGAGAAGTTTGCCAGTTT
CTAGGAATCTCCAGACATTGTTCCCCAGAACCAAGCCTTAACTTTGGTGGCATCTTCTTGTGAAATGTGGAG
CCAGAACCACAGCTTAAATGTTAGACACTAGGATGATGCCCACTTTGTGCCACATGATGGTGGCTACTGCCT
GTAGGCATTTTCCAGTGACTGAAAGAGGCTGCTAGTGGTCGGGATGAGATATCATCCAATTTCCTAAAAAGA
CTGAACCCTTCATATTCCCCAGAAGAATAACAGCTGTTCCCCACCTCCCACACATCTGCATCAAGCTGAAGT
TCTGTGTCCTCGTGAGCTGATTTCACCTTTGCACAGATCTTGGGGGAGGTGATGACAATACACCCTGGACCT
CAACTTTCTCTGTCTGAAGCTGCAGGGGGCCCCTGAAGGGTGGGGGAGATGGCAGGCCCACCAGGATACCCT
GTGCTGATCAATCCTCTTCTCTCTTCTCCAGGGCTTCTGCACTGACTCCTGAGACTGTTTTAACTAAGACTG
GTTATCACTCTTCTGTGATGCCTGCTTGTCCCTGCCCAGAATTCCCAGCTGCCTGTGTCAGCTTGTCCCCCT
GAGATCAAAGTCCTACAGTGGCTGTCACGCAACCACCAGGTCATCTCCTTTCATCCCCACCCCAAGGCGCTG
GCTGTGACTCTGCTTCCTGCACTGACCCAGAGCC
10 changes: 10 additions & 0 deletions adam-core/src/test/resources/hs38DH_chr1_10.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>chr1 AC:CM000663.2 gi:568336023 LN:248956422 rl:Chromosome M5:6aef897c3d6ff0c78aff06ac189178dd AS:GRCh38
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ class FastaConverterSuite extends ADAMFunSuite {
assert(0 === FastaConverter.findContigIndex(252366300L, headerIndices))
assert(892647244L === FastaConverter.findContigIndex(892647249L, headerIndices))
assert(252366306L === FastaConverter.findContigIndex(498605720L, headerIndices))

}

test("convert a single record without naming information") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -293,15 +293,25 @@ class ADAMContextSuite extends ADAMFunSuite {
assert(reads.rdd.count === reloadedReads.rdd.count)
}

// Loads the HLA_DQB1_05_01_01_02.fa test resource and verifies that both the
// sequence dictionary and the single resulting fragment carry the contig name
// "HLA-DQB1*05:01:01:02".
sparkTest("read a HLA fasta from GRCh38") {
  val hlaFastaPath = resourcePath("HLA_DQB1_05_01_01_02.fa")
  val hlaContigs = sc.loadFasta(hlaFastaPath, 10000L)

  // the sequence dictionary should contain exactly one, correctly named, record
  val dictionaryRecords = hlaContigs.sequences.records
  assert(dictionaryRecords.size === 1)
  assert(dictionaryRecords.head.name === "HLA-DQB1*05:01:01:02")

  // the file should load as a single fragment attached to that same contig
  val collectedFragments = hlaContigs.rdd.collect
  assert(collectedFragments.size === 1)
  assert(collectedFragments.head.getContig.getContigName === "HLA-DQB1*05:01:01:02")
}

sparkTest("read a gzipped fasta file") {
val inputPath = resourcePath("chr20.250k.fa.gz")
val contigFragments: RDD[NucleotideContigFragment] = sc.loadFasta(inputPath, 10000L)
.rdd
.sortBy(_.getFragmentNumber.toInt)
assert(contigFragments.rdd.count() === 26)
val first: NucleotideContigFragment = contigFragments.first()
assert(first.getContig.getContigName === "gi|224384749|gb|CM000682.1|")
assert(first.getDescription === "Homo sapiens chromosome 20, GRCh37 primary reference assembly")
assert(first.getContig.getContigName === null)
assert(first.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly")
assert(first.getFragmentNumber === 0)
assert(first.getFragmentSequence.length === 10000)
assert(first.getFragmentStartPosition === 0L)
Expand Down