Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite FASTA parser to Megaparsec #67

Merged
merged 23 commits into from
Oct 4, 2022
1 change: 0 additions & 1 deletion src/Bio/FASTA.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ module Bio.FASTA
, fromFile
, toFile
, fastaP
, fastaPGeneric
, fastaLine
, modificationP
) where
Expand Down
14 changes: 5 additions & 9 deletions src/Bio/FASTA/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

module Bio.FASTA.Parser
( fastaP
, fastaPGeneric
, fastaLine
, modificationP
) where
Expand All @@ -18,10 +17,10 @@ import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L

instance ParsableFastaToken Char where
parseToken = alphaNumChar
parseToken = letterChar

instance ParsableFastaToken ModItem where
parseToken = (Mod <$> modificationP <?> "fasta item modification") <|> Letter <$> alphaNumChar
parseToken = (Mod <$> modificationP <?> "fasta item modification") <|> Letter <$> letterChar

-- | Parser type used throughout this module: a Megaparsec 'Parsec' over
-- 'Text' input whose custom error component is 'Void' (no custom errors).
type Parser = Parsec Void Text

Expand All @@ -38,10 +37,7 @@ symbol :: Text -> Parser Text
symbol = L.symbol sc

fastaP :: ParsableFastaToken a => Parser (Fasta a)
fastaP = fastaPGeneric

fastaPGeneric :: ParsableFastaToken a => Parser (Fasta a)
fastaPGeneric = many item
fastaP = many item <* hidden space <* eof

item :: ParsableFastaToken a => Parser (FastaItem a)
item =
Expand All @@ -56,10 +52,10 @@ fastaSeq :: ParsableFastaToken a => Parser (BareSequence a)
fastaSeq = bareSequence . concat <$> many fastaLine <* hidden space
maksbotan marked this conversation as resolved.
Show resolved Hide resolved

fastaLine :: ParsableFastaToken a => Parser [a]
fastaLine = concat <$> some (some (parseToken <* hidden hspace)) <* myEnd
fastaLine = concat <$> some (some parseToken <* hidden hspace) <* myEnd

myEnd :: Parser ()
myEnd = void (some eol) <|> eof
myEnd = void eol <|> eof

modificationP :: Parser Modification
modificationP
Expand Down
14 changes: 14 additions & 0 deletions test/FASTA/order1.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
>3HMX:A|PDBID|CHAIN|SEQUENCE
IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSE
VLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL

>7HMX:A|PDBID|CHAIN|SEQUENCE
EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE
VLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL

> With_spaces
MDFFDLDIEI KQERLPAECS LNSPLNYSLS AQLTDRMTPR TENVRRQRER
MDFFDLDIEI KQERLPAECS LNSPLNYSLS AQLTDRMTPR TENVRRQRER
MDFFDLDIEI KQERLPAECS LNSPLNYSLS AQLTDRMTPR TENVRRQRER

> Empty_ha_ha_ha
5 changes: 5 additions & 0 deletions test/FASTA/order2.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>Sample_name1
ACGT....TCG

>Sample_name2
GTCA....TGC
2 changes: 2 additions & 0 deletions test/FASTA/order3.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>N-His-E4Orf6-7-R2(115)
TGATGGTGATGGTGATGcatGTGGTAAACTCGACTTTCACTTTTCTCTATCACTGATAGGGAGTGGTAAACTCGACTTTCACTTTTCTCTATCACTGATAGGGAaacagtcagcc
27 changes: 27 additions & 0 deletions test/FASTA/order4.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
>CMV-Lox2272-HindIII_R
aatcatAAGCTTataacttcgtataaagtatcctatacgaagttatagctctgcttatatagacctcccacc


HindIII-BFP_F
aagttatAAGCTTatgagcgagctgattaaggagaacatgc

>sPA-lox2272_R
taatTGGCTAGCATataacttcgtataaagtatcctatacgaagttatgctgcatcacacaaaaaaccaacac

>NheI-GFP_F
ttatATGCTAGCCAatggtgagcaagggcgagg

>NotI-pMP_F
tataatGCGGCCGCAGGTGGCac

>CMV-LoxP-HindIII_R
attaatAAGCTTataacttcgtataatgtatgctatacgaagttatagctctgcttatatagacctcccacc

>sPA-NotI_R
aattaaTGCGGCCGCgctgcatcacacaaaaaaccaacacac

sPA-LoxP-NheI_R
taatTGGCTAGCATataacttcgtataatgtatgctatacgaagttatgctgcatcacacaaaaaaccaacac



9 changes: 9 additions & 0 deletions test/FASTA/order5.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>qCHO49 F
TGGAGAGATGGCTCGAGGTT





qCHO R
TGGTTGCTGGGAATTGAACTC
26 changes: 26 additions & 0 deletions test/FASTA/order6.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
>CMV-Lox2272-HindIII_R
aatcatAAGCTTataacttcgtataaagtatcctatacgaagttatagctctgcttatatagacctcccacc

>HindIII-BFP_F
aagttatAAGCTTatgagcgagctgattaaggagaacatgc

>sPA-lox2272_R
taatTGGCTAGCATataacttcgtataaagtatcctatacgaagttatgctgcatcacacaaaaaaccaacac

>NheI-GFP_F
ttatATGCTAGCCAatggtgagcaagggcgagg

>NotI-pMP_F
tataatGCGGCCGCAGGTGGCac

>CMV-LoxP-HindIII_R
attaatAAGCTTataacttcgtataatgtatgctatacgaagttatagctctgcttatatagacctcccacc

>sPA-NotI_R
aattaaTGCGGCCGCgctgcatcacacaaaaaaccaacacac

sPA-LoxP-NheI_R
taatTGGCTAGCATataacttcgtataatgtatgctatacgaagttatgctgcatcacacaaaaaaccaacac



11 changes: 11 additions & 0 deletions test/FASTA/order7.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
>GB_F
5’-CTTCAAGAGAGAGACCTGCGT-3’

>GB_R
5’-GATGTTGTTGGCCACCTCG-3’

>F8_GB20_F
GCTACACCTTCAAGCACA

>F8_GB20_R
GGGTTCTCCATGCTCA
22 changes: 22 additions & 0 deletions test/FASTA/Ампликон_28_07_22.FASTA
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
>Ampl_prcTnT_del
tttttACGCGTtaatagtaatcaattacggggtcattagttcatagcccatatatggagttccggctgccttatcagcgtctcgggcactcacgtatctccgtccgacgggtttaaaatagcaaaactctgagcgctgctgccaaaatagcagctcacaagtgttgcattcctctctgggcgccgggcacattcctgctggctctgcccgccccccatatatggagttccgcgttacataacttacggtaaatgg
>Ampl_MHCK7-1
tttttACGCGTtaatagtaatcaattacggggtcattagttcatagcccatatatggagttccgccttcagattaaaaataactgaggtaagggcctgggtaggggaggtggtgtgagacgctcctgtctctcctctatctgcccatcggccctttggggaggaggaatgtgcccaaggactaaaaaaaggccatggagccagaggggcgagggcaacagacctttcatgggcaaaccttggggccctgctgtctagcatgcccc
>Ampl_MHCK7-2
accttggggccctgctgtctagcatgccccactacgggtctaggctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaacccagacatgtggctgcccccccccccccaacacctgctgcctctaaaaataaccctgtccctggtggatcccctgcatgcgaagatcttcgaaccatatatggagttccgcgttacataacttacggtaaatgg
>Amplicon1_MH
tttttACGCGTtaatagtaatcaattacggggtcattagttcatagcccatatatggagttccgGTGCTGTCAGCCTTCCTTGACACCTCTGTCTCCTCAGGTGCCTGGCTCCCAGTCCCCAGAACGCCTCTCCTGTACCTTGCTTCCTAGCTGGGCCTTTCCTTCTCCTCTATAAATACCAGCTCTGGTATTTCGCCTTGGCAGCTGTagcagccactacgggtctaggctgcccatgtaaggaggcaaggcctgggga
>Amplicon2_MH
gctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaacccagacatgtggctgcccccccccccccaacacctgctgcctgagcctcacccccaccccggtgcctgggtcttaggctctgtacaccatggaggagaagctcgctctaaaaataaccctgtcccccatatatggagttccgcgttacataacttacggtaaatgg
>Ampl_MHCK7-1
tttttACGCGTtaatagtaatcaattacggggtcattagttcatagcccatatatggagttccgccttcagattaaaaataactgaggtaagggcctgggtaggggaggtggtgtgagacgctcctgtctctcctctatctgcccatcggccctttggggaggaggaatgtgcccaaggactaaaaaaaggccatggagccagaggggcgagggcaacagacctttcatgggcaaaccttggggccctgctgtctagcatgcccc
>Ampl_MHCK7-2
accttggggccctgctgtctagcatgccccactacgggtctaggctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaacccagacatgtggctgcccccccccccccaacacctgctgcctctaaaaataaccctgtccctggtggatcccctgcatgcgaagatcttcgaaccatatatggagttccgcgttacataacttacggtaaatgg
>CMV + enhMH-1
GTGCTGTCAGCCTTCCTTGACACCTCTGTCTCCTCAGGTGCCTGGCTCCCAGTCCCCAGAACGCCTCTCCTGTACCTTGCTTCCTAGCTGGGCCTTTCCTTCTCCTCTATAAATACCAGCTCTGGTATTTCGCCTTGGCAGCTGTagcagccactacgggtctaggctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaacccagacatgtggctgcccccccccccccaacacctgctgcctgagcctcacccccaccccggtgcctgggtcttaggctctgtacac
>CMV + enhMH-2
cccggtgcctgggtcttaggctctgtacaccatggaggagaagctcgctctaaaaataaccctgtcccgtgatgcggttttggcagtacatcaatgggcgtggatagcggtttgactcacggggatttccaagtctccaccccattgacgtcaatgggagtttgttttggcaccaaaatcaacgggactttccaaaatgtcgtaacaactccgccccattgacgcaaatgggcggtaggcgtgtacggtgggaggtctatataagcagagct
>CMV + enhMCK + prcTnT-1
ccactacgggtctaggctgcccatgtaaggaggcaaggcctggggacacccgagatgcctggttataattaaccccaacacctgctgcccccccccccccaacacctgctgcctgagcctgagcggttaccccaccccggtgcctgggtcttaggctctgtacaccatggaggagaagctcgctctaaaaataaccctgtccctggtgggtgccttatcagcgtccccagccctgggaggtgacagctggctggcttgtgtcagcccctcgggcactcacgtatctccgt
CMV + enhMCK + prcTnT-2
tcagcccctcgggcactcacgtatctccgtccgacgggtttaaaatagcaaaactgtgatgcggttttggcagtacatcaatgggcgtggatagcggtttgactcacggggatttccaagtctccaccccattgacgtcaatgggagtttgttttggcaccaaaatcaacgggactttccaaaatgtcgtaacaactccgccccattgacgcaaatgggcggtaggcgtgtacggtgggaggtctatataagcagagct
78 changes: 60 additions & 18 deletions test/FASTASpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,79 @@

module FASTASpec where

import Bio.FASTA (fromFile, toFile)
import Bio.FASTA (fromFile, toFile, fastaP)
import Bio.FASTA.Type (Fasta, FastaItem (..))
import Bio.Sequence (bareSequence)
import Data.Text.IO (readFile)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Prelude hiding (readFile, writeFile)
import System.Directory (removeFile)
import Test.Hspec
import Data.Text (Text)
import Data.Bifunctor
import Text.Megaparsec (Parsec, errorBundlePretty, parse, MonadParsec (eof))
import Data.Void (Void)

correctFasta :: Fasta Char
correctFasta = [ FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")
, FastaItem "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")
, FastaItem "With_spaces" (bareSequence "MDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRERMDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRERMDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRER")
, FastaItem "Empty_ha_ha_ha" (bareSequence "")
]
-- | Run a FASTA parser over an entire 'Text' input.
--
-- The parser is required to reach end of input; the source name reported in
-- error messages is always @test.fasta@. On failure the Megaparsec error
-- bundle is rendered with 'errorBundlePretty' and returned in 'Left'.
parseOnly :: Parsec Void Text (Fasta a) -> Text -> Either String (Fasta a)
parseOnly parser input = first errorBundlePretty outcome
  where
    -- Raw Megaparsec result, before the error bundle is pretty-printed.
    outcome = parse (parser <* eof) "test.fasta" input

-- | Expected parse result for @test/FASTA/order1.fasta@: two records whose
-- sequences span several lines, one record whose sequence lines contain
-- spaces (absent from the expected sequence), and a final record with a
-- name but an empty sequence.
correctFasta1 :: Fasta Char
correctFasta1 = [ FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")
, FastaItem "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")
, FastaItem "With_spaces" (bareSequence "MDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRERMDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRERMDFFDLDIEIKQERLPAECSLNSPLNYSLSAQLTDRMTPRTENVRRQRER")
, FastaItem "Empty_ha_ha_ha" (bareSequence "")
]

correctFasta3 :: Fasta Char
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

а где correctFasta2?))))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

забыл(

correctFasta3 = [ FastaItem "N-His-E4Orf6-7-R2(115)" (bareSequence "TGATGGTGATGGTGATGcatGTGGTAAACTCGACTTTCACTTTTCTCTATCACTGATAGGGAGTGGTAAACTCGACTTTCACTTTTCTCTATCACTGATAGGGAaacagtcagcc")
]

-- | Expected failure for @test/FASTA/order4.fasta@: line 5 holds a record
-- name (@HindIII-BFP_F@) without the leading @>@, so parsing stops at column 1
-- where only @>@ or end of input is accepted.
badFasta4 :: Either String (Fasta Char)
badFasta4 = Left "test.fasta:5:1:\n |\n5 | HindIII-BFP_F \r\n | ^\nunexpected 'H'\nexpecting '>' or end of input\n"

-- | Expected failure for @test/FASTA/order5.fasta@: after a run of blank
-- lines, line 8 holds a record name (@qCHO R@) without the leading @>@, so
-- parsing stops at column 1 where only @>@ or end of input is accepted.
badFasta5 :: Either String (Fasta Char)
badFasta5 = Left "test.fasta:8:1:\n |\n8 | qCHO R \r\n | ^\nunexpected 'q'\nexpecting '>' or end of input\n"

-- | Expected failure for @test/FASTA/order6.fasta@: every record is well
-- formed until line 22, whose name (@sPA-LoxP-NheI_R@) lacks the leading @>@;
-- parsing stops at column 1 where only @>@ or end of input is accepted.
badFasta6 :: Either String (Fasta Char)
badFasta6 = Left "test.fasta:22:1:\n |\n22 | sPA-LoxP-NheI_R \r\n | ^\nunexpected 's'\nexpecting '>' or end of input\n"

-- | Expected failure for @test/FASTA/order7.fasta@: the sequence on line 2 is
-- written with @5’-…-3’@ decorations, and the leading digit @5@ is rejected
-- where @>@, end of input, or a sequence is expected.
badFasta7 :: Either String (Fasta Char)
badFasta7 = Left "test.fasta:2:1:\n |\n2 | 5\8217-CTTCAAGAGAGAGACCTGCGT-3\8217\r\n | ^\nunexpected '5'\nexpecting '>', end of input, or sequence\n"

-- | Expected failure for @test/FASTA/Ампликон_28_07_22.FASTA@: line 21
-- (@CMV + enhMCK + prcTnT-2@) is a record name missing its leading @>@; the
-- error is reported at column 5, on the @"+ "@ that follows the letters @CMV@,
-- where only end of input, end of line, or a letter is accepted.
badFasta8 :: Either String (Fasta Char)
badFasta8 = Left "test.fasta:21:5:\n |\n21 | CMV + enhMCK + prcTnT-2\r\n | ^^\nunexpected \"+ \"\nexpecting end of input, end of line, or letter\n"

fastaSpec :: Spec
fastaSpec = describe "Fasta file parser." $ do
parseFile "test/FASTA/correct.fasta"
writeFile "test/FASTA/test.fasta"
fastaSpec = describe "Fasta files parser." $ do
parseFile "test/FASTA/order1.fasta" correctFasta1
writeFile "test/FASTA/test.fasta" correctFasta1
parseFile "test/FASTA/order3.fasta" correctFasta3
writeFile "test/FASTA/test.fasta" correctFasta3
parseBadFile "test/FASTA/order4.fasta" badFasta4
parseBadFile "test/FASTA/order5.fasta" badFasta5
parseBadFile "test/FASTA/order6.fasta" badFasta6
parseBadFile "test/FASTA/order7.fasta" badFasta7
parseBadFile "test/FASTA/Ампликон_28_07_22.FASTA" badFasta8

parseFile :: FilePath -> Spec
parseFile path = do
parseFile :: FilePath -> Fasta Char -> Spec
parseFile path cf = do
describe "fromFile" $ do
it "correctly parses fasta from file" $ do
fasta <- fromFile path
fasta `shouldBe` correctFasta
fasta `shouldBe` cf

writeFile :: FilePath -> Spec
writeFile path = describe "writeFile" $ do
-- | Build a spec asserting that the FASTA file at @path@ is rejected.
--
-- The file is read as 'Text' and run through 'fastaP' via 'parseOnly' (not
-- through 'fromFile'), so the pretty-printed parse error can be compared with
-- @expected@ — normally a 'Left' carrying the exact error text.
--
-- The labels name what is actually exercised and include the file path, so
-- each malformed fixture shows up as its own entry in the hspec report instead
-- of sharing the "fromFile / correctly parses fasta from file" label used for
-- well-formed files.
parseBadFile :: FilePath -> Either String (Fasta Char) -> Spec
parseBadFile path expected =
  describe "fastaP" $
    it ("correctly fails to parse malformed fasta from " ++ path) $ do
      contents <- liftIO (readFile path)
      parseOnly fastaP contents `shouldBe` expected

writeFile :: FilePath -> Fasta Char -> Spec
writeFile path cf = describe "writeFile" $ do
it "correctly write fasta into file" $ do
toFile correctFasta path
toFile cf path
fasta <- fromFile path
removeFile path
fasta `shouldBe` correctFasta

fasta `shouldBe` cf
18 changes: 9 additions & 9 deletions test/FastaParserSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ twoSequences = describe "twoSequences" $ do
sequenceWithDigit :: Spec
sequenceWithDigit = describe "sequenceWithDigit" $ do
it "correctly parses incorrect sequence with digit" $ do
let res = parseOnly fastaP ">123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEE4GITWTLDQSSE"
res `shouldBe` Right [FastaItem @Char "123" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEE4GITWTLDQSSE")]
let res = parseOnly (fastaP @Char) ">123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEE4GITWTLDQSSE"
res `shouldBe` Left "test.fasta:2:34:\n |\n2 | IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEE4GITWTLDQSSE\n | ^^\nunexpected \"4G\"\nexpecting end of input, end of line, or letter\n"

sequenceWithWrongName :: Spec
sequenceWithWrongName = describe "sequenceWithWrongName" $ do
Expand All @@ -80,15 +80,15 @@ sequenceWithSpacesInName = describe "sequenceWithSpacesInName" $ do

sequenceWithSeveralEndOfLine :: Spec
sequenceWithSeveralEndOfLine = describe "sequenceWithSeveralEndOfLine" $ do
it "correctly parses sequence with several \\n after name" $ do
let res = parseOnly fastaP ">this is my sequence\n\n\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE"
res `shouldBe` Right [FastaItem @Char "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE")]
it "correctly parses incorrect sequence with several \\n after name" $ do
let res = parseOnly (fastaP @Char) ">this is my sequence\n\n\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE"
res `shouldBe` Left "test.fasta:4:1:\n |\n4 | IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE\n | ^\nunexpected 'I'\nexpecting '>' or end of input\n"

sequenceWithSeveralEndOfLineInSequence :: Spec
sequenceWithSeveralEndOfLineInSequence = describe "sequenceWithSeveralEndOfLineInSequence" $ do
it "correctly parses sequence with several \\n between sequence parts" $ do
let res = parseOnly fastaP ">this is my sequence\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE\n\n\nYYYYYYYYYYYYYYYYYYYYYYYY"
res `shouldBe` Right [FastaItem @Char "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSEYYYYYYYYYYYYYYYYYYYYYYYY")]
it "correctly parses incorrect sequence with several \\n between sequence parts" $ do
let res = parseOnly (fastaP @Char) ">this is my sequence\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE\n\n\nYYYYYYYYYYYYYYYYYYYYYYYY"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

хм
надо будет уточнить у Насти и остальных насколько это incorrect

можешь пожалуйста в общий чат abscan+ylab2 написать вопрос? покажи пример фасты с такой дыркой, скажи что сейчас наш парсер не будет читать, скажи почему так хотим сделать и спроси норм ли

потому что старый читал, значит это будет регрессия

res `shouldBe` Left "test.fasta:5:1:\n |\n5 | YYYYYYYYYYYYYYYYYYYYYYYY\n | ^\nunexpected 'Y'\nexpecting '>' or end of input\n"

sequenceWithTabsInName :: Spec
sequenceWithTabsInName = describe "sequenceWithTabsInName" $ do
Expand Down Expand Up @@ -123,7 +123,7 @@ toughParserTests = describe "various parser tests" $ do
it "correctly fails to parse a name without >" $ checkParser incorrectTest1
(Left "test.fasta:1:1:\n |\n1 | test1\n | ^\nunexpected 't'\nexpecting '>' or end of input\n")
it "correctly fails to parse a new sequence at the same line" $ checkParser incorrectTest2
(Left "test.fasta:3:8:\n |\n3 | GHIJKL >test2\n | ^^\nunexpected \">t\"\nexpecting alphanumeric character, end of input, or end of line\n")
(Left "test.fasta:3:8:\n |\n3 | GHIJKL >test2\n | ^^\nunexpected \">t\"\nexpecting end of input, end of line, or letter\n")

correctTest1 :: Text
correctTest1 = T.unlines
Expand Down