From 5ea53847ddad94e8dd3b5c52772178b566f0caba Mon Sep 17 00:00:00 2001 From: abychkova <7148571+abychkova@users.noreply.github.com> Date: Tue, 3 Sep 2019 12:18:11 +0300 Subject: [PATCH] fasta parser and writer + spec (#15) * wrike-385288488: fasta parser and writer + spec --- ChangeLog.md | 5 +++ package.yaml | 2 +- src/Bio/FASTA.hs | 24 +++++++++++ src/Bio/FASTA/Parser.hs | 32 +++++++++++++-- src/Bio/FASTA/Type.hs | 2 +- src/Bio/FASTA/Writer.hs | 20 +++++++-- test/FASTA/correct.fasta | 7 ++++ test/FASTASpec.hs | 30 ++++++++++++++ test/FastaParserSpec.hs | 89 ++++++++++++++++++++++++++++++++++++++++ test/FastaWriterSpec.hs | 37 +++++++++++++++++ test/Spec.hs | 7 ++++ 11 files changed, 247 insertions(+), 8 deletions(-) create mode 100644 src/Bio/FASTA.hs create mode 100644 test/FASTA/correct.fasta create mode 100644 test/FASTASpec.hs create mode 100644 test/FastaParserSpec.hs create mode 100644 test/FastaWriterSpec.hs diff --git a/ChangeLog.md b/ChangeLog.md index 5b0a6b6..3ae9737 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,6 +2,11 @@ ## [Unreleased] +## [0.1.2.0] - 2019-09-03 +### Added +- Parser for `FASTA`. +- Writer for `FASTA`. + ## [0.1.1.1] - 2019-06-05 ### Changed - `length` on `Sequence` now works in O(1). 
diff --git a/package.yaml b/package.yaml index 59447f1..d4a0042 100644 --- a/package.yaml +++ b/package.yaml @@ -1,5 +1,5 @@ name: cobot-io -version: 0.1.1.1 +version: 0.1.2.0 github: "less-wrong/cobot-io" license: BSD3 category: Bio diff --git a/src/Bio/FASTA.hs b/src/Bio/FASTA.hs new file mode 100644 index 0000000..140b028 --- /dev/null +++ b/src/Bio/FASTA.hs @@ -0,0 +1,24 @@ +module Bio.FASTA + ( module T + , fromFile + , toFile + , fastaP + ) where + +import Bio.FASTA.Parser +import Bio.FASTA.Type as T +import Bio.FASTA.Writer (fastaToText) +import Control.Monad.IO.Class (MonadIO, liftIO) +import Data.Attoparsec.Text (parseOnly) +import Data.Text.IO (readFile, writeFile) +import Prelude hiding (writeFile, readFile) + +-- | Reads a 'Fasta' from the given file; a 'fastaP' parse error is raised in 'IO' via 'fail', so no 'MonadFail' constraint on @m@ is needed. +-- +fromFile :: MonadIO m => FilePath -> m (Fasta Char) +fromFile f = liftIO (readFile f) >>= either (liftIO . fail) pure . parseOnly fastaP + +-- | Writes a 'Fasta' to the given file, rendered with 'fastaToText'. +-- +toFile :: MonadIO m => Fasta Char -> FilePath -> m () +toFile s f = liftIO $ writeFile f $ fastaToText s diff --git a/src/Bio/FASTA/Parser.hs b/src/Bio/FASTA/Parser.hs index ee026c1..a2f8c9a 100644 --- a/src/Bio/FASTA/Parser.hs +++ b/src/Bio/FASTA/Parser.hs @@ -2,10 +2,36 @@ module Bio.FASTA.Parser ( fastaP ) where -import Bio.FASTA.Type (Fasta) -import Data.Attoparsec.Text (Parser) +import Bio.FASTA.Type (Fasta, FastaItem(..)) +import Bio.Sequence (BareSequence, bareSequence) +import Data.Attoparsec.Text (Parser, many', many1', char, endOfLine, letter, + takeWhile, choice, endOfInput) +import Data.Text (Text, strip) +import Prelude hiding (takeWhile) -- | Parser of .fasta file. -- fastaP :: Parser (Fasta Char) -fastaP = undefined +fastaP = many' item + +item :: Parser (FastaItem Char) +item = FastaItem <$> seqName <*> fastaSeq + +seqName :: Parser (Text) +seqName = strip <$> (char '>' *> tabs *> takeWhile (`notElem` ['\n', '\r']) <* tabs <* eol) + +fastaSeq :: Parser (BareSequence Char) +fastaSeq = bareSequence . 
mconcat <$> many' line + +line :: Parser (String) +line = many1' letter <* eol + +eol :: Parser () +eol = tabs *> choice [slashN, endOfInput] + +slashN :: Parser () +slashN = () <$ many1' endOfLine + +tabs :: Parser () +tabs = () <$ many' (char '\t') + diff --git a/src/Bio/FASTA/Type.hs b/src/Bio/FASTA/Type.hs index 0f93c75..6630d6c 100644 --- a/src/Bio/FASTA/Type.hs +++ b/src/Bio/FASTA/Type.hs @@ -7,7 +7,7 @@ import Bio.Sequence (BareSequence) import Data.Text (Text) -- | Type alias for FASTA file. --- +-- Each record satisfies the following format: >(\s|\t)*[^\n\r]+(\s|\t)*(\n|\r)*(\w(\n|\r)*)* type Fasta a = [FastaItem a] -- | One record in FASTA file. diff --git a/src/Bio/FASTA/Writer.hs b/src/Bio/FASTA/Writer.hs index 21c97ff..383abe4 100644 --- a/src/Bio/FASTA/Writer.hs +++ b/src/Bio/FASTA/Writer.hs @@ -2,8 +2,22 @@ module Bio.FASTA.Writer ( fastaToText ) where -import Bio.FASTA.Type (Fasta) -import Data.Text (Text) +import Bio.FASTA.Type (Fasta, FastaItem(..)) +import Bio.Sequence (BareSequence, sequ) +import Control.Lens ((^.)) +import Data.Text (Text, pack) +import Data.List.Split (chunksOf) +import Data.Vector (Vector, toList) +import Prelude hiding (drop) fastaToText :: Fasta Char -> Text -fastaToText = undefined +fastaToText f = mconcat $ map writeItem f + +writeItem :: FastaItem Char -> Text +writeItem (FastaItem name s) = ">" <> name <> "\n" <> seq2Text s + +seq2Text :: BareSequence Char -> Text +seq2Text s = pack $ vector2Text $ s ^. 
Bio.Sequence.sequ + +vector2Text :: Vector Char -> String +vector2Text v = concatMap (++ "\n") $ chunksOf 80 $ toList v diff --git a/test/FASTA/correct.fasta b/test/FASTA/correct.fasta new file mode 100644 index 0000000..10607ab --- /dev/null +++ b/test/FASTA/correct.fasta @@ -0,0 +1,7 @@ +>3HMX:A|PDBID|CHAIN|SEQUENCE +IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSE +VLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL + +>7HMX:A|PDBID|CHAIN|SEQUENCE +EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +VLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL \ No newline at end of file diff --git a/test/FASTASpec.hs b/test/FASTASpec.hs new file mode 100644 index 0000000..14a073b --- /dev/null +++ b/test/FASTASpec.hs @@ -0,0 +1,30 @@ +{-# LANGUAGE OverloadedStrings #-} + +module FASTASpec where + +import Bio.FASTA (fromFile, toFile) +import Bio.FASTA.Type (FastaItem(..), Fasta) +import Bio.Sequence (bareSequence) +import Prelude hiding (writeFile, readFile) +import Test.Hspec + +correctFasta :: Fasta Char +correctFasta = [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL"), FastaItem "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")] + +fastaSpec :: Spec +fastaSpec = describe "Fasta file parser." 
$ do + parseFile "test/FASTA/correct.fasta" + writeFile "test/FASTA/test.fasta" + +parseFile :: FilePath -> Spec +parseFile path = describe "fromFile" $ do + it "correctly parses fasta from file" $ do + fasta <- fromFile path + fasta `shouldBe` correctFasta + +writeFile :: FilePath -> Spec +writeFile path = describe "writeFile" $ do + it "correctly write fasta into file" $ do + toFile correctFasta path + fasta <- fromFile path + fasta `shouldBe` correctFasta diff --git a/test/FastaParserSpec.hs b/test/FastaParserSpec.hs new file mode 100644 index 0000000..04e5d3a --- /dev/null +++ b/test/FastaParserSpec.hs @@ -0,0 +1,89 @@ +{-# LANGUAGE OverloadedStrings #-} + +module FastaParserSpec where + +import Bio.FASTA.Parser (fastaP) +import Bio.FASTA.Type (FastaItem(..)) +import Bio.Sequence (bareSequence) +import Data.Attoparsec.Text (parseOnly) +import Test.Hspec + +fastaParserSpec :: Spec +fastaParserSpec = describe "Fasta format parser." $ do + emptyFasta + onlyName + oneSequence + twoSequences + sequenceWithDigit + sequenceWithWrongName + sequenceWithSpacesInName + sequenceWithSeveralEndOfLine + sequenceWithSeveralEndOfLineInSequence + sequenceWithTabsInName + sequenceWithTabsInSequence + +emptyFasta :: Spec +emptyFasta = describe "emptyFasta" $ do + it "correctly parses empty fasta" $ do + let res = parseOnly fastaP "" + res `shouldBe` Right [] + +onlyName :: Spec +onlyName = describe "onlyName" $ do + it "correctly parses fasta without sequence" $ do + let res = parseOnly fastaP ">3HMX:A|PDBID|CHAIN|SEQUENCE" + res `shouldBe` Right [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "")] + +oneSequence :: Spec +oneSequence = describe "oneSequence" $ do + it "correctly parses one correct sequence" $ do + let res = parseOnly fastaP ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSE\nVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\n" + res `shouldBe` Right [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence 
"IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")] + +twoSequences :: Spec +twoSequences = describe "twoSequences" $ do + it "correctly parses two correct sequences" $ do + let res = parseOnly fastaP ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSE\nVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\n>7HMX:A|PDBID|CHAIN|SEQUENCE\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE\nVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL" + res `shouldBe` Right [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL"), FastaItem "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")] + +sequenceWithDigit :: Spec +sequenceWithDigit = describe "sequenceWithDigit" $ do + it "correctly parses incorrect sequence with digit" $ do + let res = parseOnly fastaP ">123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEE4GITWTLDQSSE" + res `shouldBe` Right [FastaItem "123" (bareSequence "")] + +sequenceWithWrongName :: Spec +sequenceWithWrongName = describe "sequenceWithWrongName" $ do + it "correctly parses incorrect sequence with wrong name" $ do + let res = parseOnly fastaP "123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE" + res `shouldBe` Right [] + +sequenceWithSpacesInName :: Spec +sequenceWithSpacesInName = describe "sequenceWithSpacesInName" $ do + it "correctly parses sequence with spaces in name" $ do + let res = parseOnly fastaP "> this is my sequence \nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE" + res `shouldBe` Right [FastaItem "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE")] + +sequenceWithSeveralEndOfLine :: Spec +sequenceWithSeveralEndOfLine = describe "sequenceWithSeveralEndOfLine" $ do + it "correctly parses sequence with several \n after name" $ do + let res = parseOnly fastaP ">this is my sequence\n\n\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE" + res `shouldBe` Right [FastaItem "this 
is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE")] + +sequenceWithSeveralEndOfLineInSequence :: Spec +sequenceWithSeveralEndOfLineInSequence = describe "sequenceWithSeveralEndOfLineInSequence" $ do + it "correctly parses sequence with several \n between sequence parts" $ do + let res = parseOnly fastaP ">this is my sequence\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE\n\n\nYYYYYYYYYYYYYYYYYYYYYYYY" + res `shouldBe` Right [FastaItem "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSEYYYYYYYYYYYYYYYYYYYYYYYY")] + +sequenceWithTabsInName :: Spec +sequenceWithTabsInName = describe "sequenceWithTabsInName" $ do + it "correctly parses sequence with tabs in name" $ do + let res = parseOnly fastaP ">\tthis\tis\tmy\tsequence\t\t\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE" + res `shouldBe` Right [FastaItem "this\tis\tmy\tsequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE")] + +sequenceWithTabsInSequence :: Spec +sequenceWithTabsInSequence = describe "sequenceWithTabsInSequence" $ do + it "correctly parses sequence with tabs between sequence parts" $ do + let res = parseOnly fastaP ">this is my sequence\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE\t\t\nYYYYYYYYYYYYYYYYYYYYYYYY\t\n" + res `shouldBe` Right [FastaItem "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSEYYYYYYYYYYYYYYYYYYYYYYYY")] \ No newline at end of file diff --git a/test/FastaWriterSpec.hs b/test/FastaWriterSpec.hs new file mode 100644 index 0000000..c6a5e50 --- /dev/null +++ b/test/FastaWriterSpec.hs @@ -0,0 +1,37 @@ +module FastaWriterSpec where + +import Bio.FASTA.Writer (fastaToText) +import Bio.FASTA.Type (FastaItem(..)) +import Bio.Sequence (bareSequence) +import Test.Hspec + +fastaWriterSpec :: Spec +fastaWriterSpec = describe "Fasta format writer." 
$ do + emptyFasta + oneShortSequence + oneLongSequence + twoSequences + +emptyFasta :: Spec +emptyFasta = describe "emptyFasta" $ do + it "correctly write empty fasta" $ do + let res = fastaToText [] + res `shouldBe` "" + +oneShortSequence :: Spec +oneShortSequence = describe "oneShortSequence" $ do + it "correctly write one correct short (less than 80 chars) sequence" $ do + let res = fastaToText [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")] + res `shouldBe` ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\n" + +oneLongSequence :: Spec +oneLongSequence = describe "oneLongSequence" $ do + it "correctly write one correct long (more than 80 chars) sequence" $ do + let res = fastaToText [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLLLLHKKEDGIWSTDILKDQKEPKNKTFLRCEAKNYSGRFTCWWLTTISTDLTFSVKSSRGSSDPQGVTCGAATLSAERVRGDNKEYEYSVECQEDSACPAAEESLPIEVMVDAVHKLKYENYTSSFFIRDIIKPDPPKNLQLKPLKNSRQVEVSWEYPDTWSTPHSYFSLTFCVQVQGKSKREKKDRVFTDKTSATVICRKNASISVRAQDRYYSSSWSEWASVPCS")] + res `shouldBe` ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\nLLHKKEDGIWSTDILKDQKEPKNKTFLRCEAKNYSGRFTCWWLTTISTDLTFSVKSSRGSSDPQGVTCGAATLSAERVRG\nDNKEYEYSVECQEDSACPAAEESLPIEVMVDAVHKLKYENYTSSFFIRDIIKPDPPKNLQLKPLKNSRQVEVSWEYPDTW\nSTPHSYFSLTFCVQVQGKSKREKKDRVFTDKTSATVICRKNASISVRAQDRYYSSSWSEWASVPCS\n" + +twoSequences :: Spec +twoSequences = describe "twoSequences" $ do + it "correctly write two correct sequences" $ do + let res = fastaToText [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL"), FastaItem "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence 
"IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLLLLHKKEDGIWSTDILKDQKEPKNKTFLRCEAKNYSGRFTCWWLTTISTDLTFSVKSSRGSSDPQGVTCGAATLSAERVRGDNKEYEYSVECQEDSACPAAEESLPIEVMVDAVHKLKYENYTSSFFIRDIIKPDPPKNLQLKPLKNSRQVEVSWEYPDTWSTPHSYFSLTFCVQVQGKSKREKKDRVFTDKTSATVICRKNASISVRAQDRYYSSSWSEWASVPCS")] + res `shouldBe` ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\n>7HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\nLLHKKEDGIWSTDILKDQKEPKNKTFLRCEAKNYSGRFTCWWLTTISTDLTFSVKSSRGSSDPQGVTCGAATLSAERVRG\nDNKEYEYSVECQEDSACPAAEESLPIEVMVDAVHKLKYENYTSSFFIRDIIKPDPPKNLQLKPLKNSRQVEVSWEYPDTW\nSTPHSYFSLTFCVQVQGKSKREKKDRVFTDKTSATVICRKNASISVRAQDRYYSSSWSEWASVPCS\n" \ No newline at end of file diff --git a/test/Spec.hs b/test/Spec.hs index 491ed50..02ac986 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -6,6 +6,9 @@ import SequenceSpec import System.IO import Test.Hspec import UniprotSpec +import FastaParserSpec +import FastaWriterSpec +import FASTASpec main :: IO () main = do @@ -28,3 +31,7 @@ main = do -- GB gbParserSpec gbWriterSpec + -- Fasta + fastaParserSpec + fastaSpec + fastaWriterSpec