Skip to content

Commit

Permalink
YLAB2-2755: gb parser fix for spaces in locus name
Browse files Browse the repository at this point in the history
  • Loading branch information
abychkova committed Dec 17, 2024
1 parent e45a279 commit 95b519e
Show file tree
Hide file tree
Showing 3 changed files with 290 additions and 50 deletions.
36 changes: 20 additions & 16 deletions src/Bio/GB/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import Data.Functor (($>))
import Data.Text (Text, intercalate, pack, splitOn, unpack)
import qualified Data.Text as T
import Text.Megaparsec (notFollowedBy, option, satisfy, sepBy1, takeWhile1P,
takeWhileP, try, (<?>))
takeWhileP, try, (<?>), chunk)
import Text.Megaparsec.Char (char, digitChar, eol, letterChar, string)
import Text.Megaparsec.Char.Lexer (decimal)

Expand All @@ -26,7 +26,7 @@ genBankP :: Parser GenBankSequence
-- Top-level parser for one GenBank record: runs the meta-information parser,
-- then the sequence parser, and finally requires the "//" record terminator
-- followed by whatever 'eolSpaceP' accepts. The two sub-parsers are labelled
-- via '<?>' so parse errors name the section that failed.
genBankP = GenBankSequence
<$> (metaP <?> "Meta parser")
<*> (gbSeqP <?> "GB sequence parser")
<* string "//" <* eolSpaceP
<* string "//" <* eolSpaceP

--------------------------------------------------------------------------------
-- Block with meta-information.
Expand All @@ -46,18 +46,22 @@ metaP = do

pure $ Meta locus' definitionM accessionM versionM keywordsM sourceM referencesL commentsL

-- Parser for the LOCUS header line. After the "LOCUS" keyword the fields are
-- read in this order: name, sequence length (a decimal followed by "bp"),
-- molecule type, optional form (see 'formP'), optional GenBank division
-- (a run of upper-case characters) and modification date.
-- Example line:
-- LOCUS AB32-36 pIntA_BC 6260 bp DNA circular SYN 11-SEP-2024
-- NOTE(review): this listing shows some field lines twice (e.g. the name field
-- once with textP and once with nameP); presumably these are the pre- and
-- post-change versions of the same line and only one belongs in the compiled
-- source -- confirm against the repository.
locusP :: Parser Locus
locusP = string "LOCUS" *> space *> (Locus
<$> textP <* space -- name
<$> nameP <* space -- name
<*> decimal <* space <* string "bp" <* space -- sequence length
<*> textP <* space -- molecule type
<*> optional formP <* space -- form of sequence
<*> optional (pack <$> some (satisfy isUpper)) <* space -- GenBank division
<*> optional formP <* space -- form of sequence
<*> optional (pack <$> some (satisfy isUpper)) <* space -- GenBank division
<*> textP -- modification date
<* eolSpaceP)
where
-- One or more consecutive non-whitespace characters (a single "word").
textP = takeWhile1P Nothing $ not . isSpace

-- Locus name: one or more non-space words joined by single spaces
-- (e.g. "AB32-36 pIntA_BC"). A single space only counts as part of the name
-- when it is not immediately followed by the "<length> bp" field; otherwise a
-- name separated from the sequence length by exactly one space would swallow
-- the length (and the rest of the line) and make the LOCUS parser fail.
nameP :: Parser Text
nameP = textP <> (try (nameGapP <> nameP) <|> "")

-- A single space inside a locus name. Fails without consuming input when the
-- sequence-length field ("<decimal> bp") starts right after the space, so the
-- enclosing 'try' in nameP backtracks and the name ends before that space.
nameGapP :: Parser Text
nameGapP = string " " <* notFollowedBy ((decimal :: Parser Int) *> some (char ' ') *> string "bp")

-- Sequence form keyword of the LOCUS line: "linear" or "circular".
formP :: Parser Form
formP = (Linear <$ try (string "linear")) <|> (Circular <$ string "circular")

Expand Down Expand Up @@ -108,7 +112,7 @@ commentP = string "COMMENT" *> (try emptyP <|> (many (char ' ') *> someLinesP))
--------------------------------------------------------------------------------

featuresP :: Parser [(Feature, Range)]
featuresP = -- skip unknown fields and stop on line with "FEATURES"
featuresP = -- skip unknown fields and stop on line with "FEATURES"
manyTill (textWithSpacesP <* eolSpaceP) (string "FEATURES") *> space
*> textWithSpacesP <* eolSpaceP
*> some (featureP <?> "Single feature parser")
Expand All @@ -128,8 +132,8 @@ featureP = do
pure (Feature featureName' props, shiftRange (-1) range)

rangeP :: Parser Range
rangeP = try spanP
<|> try betweenP
rangeP = try spanP
<|> try betweenP
<|> try pointP
<|> try joinP
<|> complementP
Expand All @@ -141,8 +145,8 @@ rangeP = try spanP
_ <- string ".."
upperBorderType <- option Precise (try $ char '>' *> pure Exceeded)
upperBorderLocation <- decimal
pure $ Span (RangeBorder lowerBorderType lowerBorderLocation) (RangeBorder upperBorderType upperBorderLocation)
pure $ Span (RangeBorder lowerBorderType lowerBorderLocation) (RangeBorder upperBorderType upperBorderLocation)

betweenP :: Parser Range
betweenP = do
before <- decimal
Expand All @@ -152,13 +156,13 @@ rangeP = try spanP

-- A location made of a single decimal position, wrapped in 'Point'.
pointP :: Parser Range
pointP = Point <$> decimal

-- "join(r1,r2,...)": one or more comma-separated ranges wrapped in 'Join'.
joinP :: Parser Range
joinP = Join <$> (string "join(" *> sepBy1 rangeP (char ',') <* char ')')

-- "complement(r)": a single nested range wrapped in 'Complement'.
complementP :: Parser Range
complementP = Complement <$> (string "complement(" *> rangeP <* char ')')


propsP :: Parser (Text, Text)
propsP = do
Expand All @@ -178,17 +182,17 @@ propsP = do
-- One indented continuation line: the line must begin with 'featureIndent2'
-- and the character right after the indent must not be '/'. Returns the text
-- read by 'textWithSpacesP'; the line ending is consumed by 'eolSpaceP'.
indLine = do
_ <- string featureIndent2
-- presumably a '/' here starts the next "/key=value" qualifier rather than
-- continuing the current value -- confirm against propsP
notFollowedBy (char '/')
text <- textWithSpacesP
text <- textWithSpacesP
eolSpaceP
pure text

-- Property value that may span several lines: reads the first line, then zero
-- or more continuation lines accepted by 'indLine', and concatenates all
-- pieces with 'T.concat' (no separator is inserted between lines).
multiLineProp :: Parser Text
multiLineProp = do
fstText <- textWithSpacesP <* eolSpaceP
fstText <- textWithSpacesP <* eolSpaceP
-- 'try' lets a failed continuation attempt backtrack so 'many' stops cleanly
rest <- many (try indLine)
pure $ T.concat (fstText : rest)
pure $ T.concat (fstText : rest)




-- | First level of indentation in FEATURES table file.
--
Expand Down
228 changes: 228 additions & 0 deletions test/GB/spaces_in_locus.gb
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
LOCUS AB32-36 pIntA_BC 6260 bp DNA circular SYN 11-SEP-2024
DEFINITION synthetic circular DNA.
ACCESSION .
VERSION .
KEYWORDS .
SOURCE synthetic DNA construct
ORGANISM synthetic DNA construct
REFERENCE 1 (bases 1 to 6260)
AUTHORS Solov
TITLE Direct Submission
JOURNAL Exported Sep 12, 2024 from SnapGene 6.1.0
https://www.snapgene.com
FEATURES Location/Qualifiers
source 1..6260
/mol_type="other DNA"
/organism="recombinant plasmid"
source 2008..2013
/mol_type="other DNA"
/organism="recombinant plasmid"
source 2014..2385
/mol_type="other DNA"
/organism="recombinant plasmid"
source 2014..2385
/mol_type="other DNA"
/organism="recombinant plasmid"
enhancer 449..858
/label=cmv enhanser
/label=cmv\enhanser
/note="/vntifkey=9"
misc_feature 859..984
/label=hCMV promoter
/label=hCMV\promoter
/note="/vntifkey=21"
intron 1012..1919
/label=IntronA
/note="/vntifkey=15"
primer_bind 1181..1205
/label=1_30 intronA_seqF1
primer_bind 1801..1821
/label=1_14 pIntAseqF1
misc_feature 1945..1953
/label=Kozak
/note="/vntifkey=21"
CDS 1954..2013
/codon_start=1
/label=Leader IgK
/label=Leader\IgK
/note="Leader IgK"
/note="/ugene_name=Leader\ IgK"
/note="/vntifkey=21"
/translation="METDTLLLWVLLLWVPGSTG"
misc_feature 1954..1956
/label=START
/note="START"
/note="/ugene_name=START"
/note="/vntifkey=21"
primer_bind 1989..2013
/label=1_73 1fVHplic
primer_bind 1989..2009
/label=1_70 VL_VH_F_for_plic
CDS 2014..2385
/codon_start=1
/label=VH
/translation="EVQLLESGGGLVQPGGSLRLSCAASGRTVSRRAMAWFRQAPGKGR
EWVSAIGWNGDGPYVADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCALALSLAY
TLRPGDYDQWGQGTLVTVSS"
primer_bind complement(2131..2190)
/label=R2_F47W
primer_bind 2131..2179
/label=F2_F47W
primer_bind 2168..2213
/label=F1_Y60V
primer_bind complement(2171..2211)
/label=R1_Y60V
CDS 2386..3369
/codon_start=1
/label=K97_DEL_delGK_HC
/label=HC_DEL_delGK
/translation="ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGA
LTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCD
KTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVD
GVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKA
KGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVL
DSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSP"
primer_bind complement(2386..2406)
/label=1_71 VH_R_for_plic
misc_feature 2674..2676
/label=K97
misc_feature 3370..3375
/label=STOP
/note="STOP"
/note="/ugene_name=STOP"
/note="/vntifkey=21"
misc_feature 3388..3626
/gene="SV40_PA term"
/label=SV40_PA term
/label=SV40_PA\term
/note="/vntifkey=21"
rep_origin 3891..4284
/label=EBV ori
/label=EBV\ori
/note="/vntifkey=33"
promoter 4747..4851
/gene="bla"
/label=AmpR promoter
CDS 4852..5712
/codon_start=1
/label=AmpR
/note="/vntifkey=4"
/translation="MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYI
ELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYS
PVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRW
EPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSA
LPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGAS
LIKHW"
rep_origin join(5883..6260,1..211)
/direction=RIGHT
/label=pUCorigin
/label=ori
/note="high-copy-number ColE1/pMB1/pBR322/pUC origin of
replication"
ORIGIN
1 cctacagcgt gagctatgag aaagcgccac gcttcccgaa gggagaaagg cggacaggta
61 tccggtaagc ggcagggtcg gaacaggaga gcgcacgagg gagcttccag ggggaaacgc
121 ctggtatctt tatagtcctg tcgggtttcg ccacctctga cttgagcgtc gatttttgtg
181 atgctcgtca ggggggcgga gcctatggaa aaacgccagc aacgcggcct ttttacggtt
241 cctggccttt tgctggcctt ttgctcacat gttctttcct gcgttatccc ctgattctgt
301 ggataaccgt attaccgcct ttgagtgagc tgataccgct cgccgcagcc gaacgaccga
361 gcgcagcgag tcagtgagcg aggaagcgta catttatatt ggctcatgtc caatatgacc
421 gccatgttga cattgattat tgactagacc gcgttacata acttacggta aatggcccgc
481 ctggctgacc gcccaacgac ccccgcccat tgacgtcaat aatgacgtat gttcccatag
541 taacgccaat agggactttc cattgacgtc aatgggtgga gtatttacgg taaactgccc
601 acttggcagt acatcaagtg tatcatatgc caagtacgcc ccctattgac gtcaatgacg
661 gtaaatggcc cgcctggcat tatgcccagt acatgacctt atgggacttt cctacttggc
721 agtacatcta cgtattagtc atcgctatta ccatggtgat gcggttttgg cagtacatca
781 atgggcgtgg atagcggttt gactcacggg gatttccaag tctccacccc attgacgtca
841 atgggagttt gttttggcac caaaatcaac gggactttcc aaaatgtcgt aacaactccg
901 ccccattgac gcaaatgggc ggtaggcgtg tacggtggga ggtctatata agcagagctc
961 gtttagtgaa ccgtcagatc gcctggagac gccatccacg ctgttttgac ctccatagaa
1021 gacaccggga ccgatccagc ctccgcggcc gggaacggtg cattggaacg cggattcccc
1081 gtgccaagag tgacgtaagt accgcctata gagtctatag gcccaccccc ttggcttctt
1141 atgcatgcta tactgttttt ggcttggggt ctatacaccc ccgcttcctc atgttatagg
1201 tgatggtata gcttagccta taggtgtggg ttattgacca ttattgacca ctcccctatt
1261 ggtgacgata ctttccatta ctaatccata acatggctct ttgccacaac tctctttatt
1321 ggctatatgc caatacactg tccttcagag actgacacgg actctgtatt tttacaggat
1381 ggggtctcat ttattattta caaattcaca tatacaacac caccgtcccc agtgcccgca
1441 gtttttatta aacataacgt gggatctcca cgcgaatctc gggtacgtgt tccggacatg
1501 ggctcatctc cggtagcggc ggagcttcta catccgagcc ctgctcccat gcctccagcg
1561 actcatggtc gctcggcagc tccttgctcc taacagtgga ggccagactt aggcacagca
1621 cgatgcccac caccaccagt gtgccgcaca aggccgtggc ggtagggtat gtgtctgaaa
1681 atgagctcgg ggagcgggct tgcaccgctg acgcatttgg aagacttaag gcagcggcag
1741 aagaagatgc aggcagctga gttgttgtgt tctgataaga gtcagaggta actcccgttg
1801 cggtgctgtt aacggtggag ggcagtgtag tctgagcagt actcgttgct gccgcgcgcg
1861 ccaccagaca taatagctga cagactaaca gactgttcct ttccatgggt cttttctgca
1921 gtcaccgtcc ttgacacgaa gcttgccgcc accatggaga ccgacaccct gctgctgtgg
1981 gtgctgctgc tgtgggtgcc cgggtcgacc ggtgaggtgc agcttctgga atctgggggc
2041 gggttagtgc agcctggagg atctctgcgt ttgagctgcg ccgctagcgg gaggacagtg
2101 agcagaagag ctatggcatg gttccgacag gcccctggga agggcagaga gtgggtgagt
2161 gccattggct ggaatggaga cggcccctac gtcgcggact ccgtcaaagg ccgcttcacc
2221 atcagccggg acaacagcaa gaacaccctg tacctgcaga tgaacagcct ccgcgccgaa
2281 gataccgcag tgtactattg cgcactggca ctgagcctgg cctacactct gagacccggg
2341 gactacgatc agtggggaca gggtacactg gtgaccgtga gctctgctag caccaagggc
2401 ccatcggtct tccccctggc accctcctcc aagagcacct ctgggggcac agcggccctg
2461 ggctgcctgg tcaaggacta cttccccgaa ccggtgacgg tgtcgtggaa ctcaggcgcc
2521 ctgaccagcg gcgtgcacac cttcccggct gtcctacagt cctcaggact ctactccctc
2581 agcagcgtgg tgaccgtgcc ctccagcagc ttgggcaccc agacctacat ctgcaacgtg
2641 aatcacaagc ccagcaacac caaggtggac aagaaagttg agcccaaatc ttgtgacaaa
2701 actcacacat gcccaccgtg cccagcacct gaactcctgg ggggaccgtc agtcttcctc
2761 ttccccccaa aacccaagga caccctcatg atctcccgga cccctgaggt cacatgcgtg
2821 gtggtggacg tgagccacga agaccctgag gtcaagttca actggtacgt ggacggcgtg
2881 gaggtgcata atgccaagac aaagccgcgg gaggagcagt acaacagcac gtaccgtgtg
2941 gtcagcgtcc tcaccgtcct gcaccaggac tggctgaatg gcaaggagta caagtgcaag
3001 gtctccaaca aagccctccc agcccccatc gagaaaacca tctccaaagc caaagggcag
3061 ccccgagaac cacaggtgta caccctgccc ccatcccggg acgagctgac caagaaccag
3121 gtcagcctga cctgcctggt caaaggcttc tatcccagcg acatcgccgt ggagtgggag
3181 agcaatgggc agccggagaa caactacaag accacgcctc ccgtgctgga ctccgacggc
3241 tccttcttcc tctatagcaa gctcaccgtg gacaagagca ggtggcagca ggggaacgtc
3301 ttctcatgct ccgtgatgca tgaggctctg cacaaccact acacgcagaa aagcctctcc
3361 ctgtccccgt aatagtctag acctaggtga tcataatcag ccataccaca tttgtagagg
3421 ttttacttgc tttaaaaaac ctcccacacc tccccctgaa cctgaaacat aaaatgaatg
3481 caattgttgt tgttaacttg tttattgcag cttataatgg ttacaaataa agcaatagca
3541 tcacaaattt cacaaataaa gcattttttt cactgcattc tagttgtggt ttgtccaaac
3601 tcatcaatgt atcttatcat gtctggagat ctctagctag aggatcgatc cccgccccgg
3661 acgaactaaa cctgactacg acatctctgc cccttcttcg cggggcagtg catgtaatcc
3721 cttcagttgg ttggtacaac ttgccaactg aaccctaaac gggtagcata tgcttcccgg
3781 gtagtagtat atactatcca gactaaccct aattcaatag catatgttac ccaacgggaa
3841 gcatatgcta tcgaattagg gttagtaaaa gggtcctaag gaacagcgat gtaggtgggc
3901 gggccaagat aggggcgcga ttgctgcgat ctggaggaca aattacacac acttgcgcct
3961 gagcgccaag cacagggttg ttggtcctca tattcacgag gtcgctgaga gcacggtggg
4021 ctaatgttgc catgggtagc atatactacc caaatatctg gatagcatat gctatcctaa
4081 tctatatctg ggtagcatag gctatcctaa tctatatctg ggtagcatat gctatcctaa
4141 tctatatctg ggtagtatat gctatcctaa tttatatctg ggtagcatag gctatcctaa
4201 tctatatctg ggtagcatat gctatcctaa tctatatctg ggtagtatat gctatcctaa
4261 tctgtatccg ggtagcatat gctatcctaa tagagattag ggtagtatat gctatcctaa
4321 tttatatctg ggtagcatat actacccaaa tatctggata gcatatgcta tcctaatcta
4381 tatctgggta gcatatgcta tcctaatcta tatctgggta gcataggcta tcctaatcta
4441 tatctgggta gcatatgcta tcctaatcta tatctgggta gtatatgcta tcctaattta
4501 tatctgggta gcataggcta tcctaatcta tatctgggta gcatatgcta tcctaatcta
4561 tatctgggta gtatatgcta tcctaatctg tatccgggta gcatatgcta tcctcatgat
4621 aagctgtcaa acatgagaat taattcttga agacgaaagg gcctcgtgat acgcctattt
4681 ttataggtta atgtcatgat aataatggtt tcttagacgt caggtggcac ttttcgggga
4741 aatgtgcgcg gaacccctat ttgtttattt ttctaaatac attcaaatat gtatccgctc
4801 atgagacaat aaccctgata aatgcttcaa taatattgaa aaaggaagag tatgagtatt
4861 caacatttcc gtgtcgccct tattcccttt tttgcggcat tttgccttcc tgtttttgct
4921 cacccagaaa cgctggtgaa agtaaaagat gctgaagatc agttgggtgc acgagtgggt
4981 tacatcgaac tggatctcaa cagcggtaag atccttgaga gttttcgccc cgaagaacgt
5041 tttccaatga tgagcacttt taaagttctg ctatgtggcg cggtattatc ccgtgttgac
5101 gccgggcaag agcaactcgg tcgccgcata cactattctc agaatgactt ggttgagtac
5161 tcaccagtca cagaaaagca tcttacggat ggcatgacag taagagaatt atgcagtgct
5221 gccataacca tgagtgataa cactgcggcc aacttacttc tgacaacgat cggaggaccg
5281 aaggagctaa ccgctttttt gcacaacatg ggggatcatg taactcgcct tgatcgttgg
5341 gaaccggagc tgaatgaagc cataccaaac gacgagcgtg acaccacgat gcctgcagca
5401 atggcaacaa cgttgcgcaa actattaact ggcgaactac ttactctagc ttcccggcaa
5461 caattaatag actggatgga ggcggataaa gttgcaggac cacttctgcg ctcggccctt
5521 ccggctggct ggtttattgc tgataaatct ggagccggtg agcgtgggtc tcgcggtatc
5581 attgcagcac tggggccaga tggtaagccc tcccgtatcg tagttatcta cacgacgggg
5641 agtcaggcaa ctatggatga acgaaataga cagatcgctg agataggtgc ctcactgatt
5701 aagcattggt aactgtcaga ccaagtttac tcatatatac tttagattga tttaaaactt
5761 catttttaat ttaaaaggat ctaggtgaag atcctttttg ataatctcat gaccaaaatc
5821 ccttaacgtg agttttcgtt ccactgagcg tcagaccccg tagaaaagat caaaggatct
5881 tcttgagatc ctttttttct gcgcgtaatc tgctgcttgc aaacaaaaaa accaccgcta
5941 ccagcggtgg tttgtttgcc ggatcaagag ctaccaactc tttttccgaa ggtaactggc
6001 ttcagcagag cgcagatacc aaatactgtt cttctagtgt agccgtagtt aggccaccac
6061 ttcaagaact ctgtagcacc gcctacatac ctcgctctgc taatcctgtt accagtggct
6121 gctgccagtg gcgataagtc gtgtcttacc gggttggact caagacgata gttaccggat
6181 aaggcgcagc ggtcgggctg aacggggggt tcgtgcacac agcccagctt ggagcgaacg
6241 acctacaccg aactgagata
//
Loading

0 comments on commit 95b519e

Please sign in to comment.