Skip to content

Commit

Permalink
fasta parser and writer + spec (#15)
Browse files Browse the repository at this point in the history
* wrike-385288488: fasta parser and writer + spec
  • Loading branch information
abychkova authored and ozzzzz committed Sep 3, 2019
1 parent 741e222 commit 5ea5384
Show file tree
Hide file tree
Showing 11 changed files with 247 additions and 8 deletions.
5 changes: 5 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## [Unreleased]

## [0.1.2.0] - 2019-09-03
### Added
- Parser for `FASTA`.
- Writer for `FASTA`.

## [0.1.1.1] - 2019-06-05
### Changed
- `length` on `Sequence` now works in O(1).
Expand Down
2 changes: 1 addition & 1 deletion package.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cobot-io
version: 0.1.1.1
version: 0.1.2.0
github: "less-wrong/cobot-io"
license: BSD3
category: Bio
Expand Down
24 changes: 24 additions & 0 deletions src/Bio/FASTA.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
module Bio.FASTA
( module T
, fromFile
, toFile
, fastaP
) where

import Bio.FASTA.Parser
import Bio.FASTA.Type as T
import Bio.FASTA.Writer (fastaToText)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Attoparsec.Text (parseOnly)
import Data.Text.IO (readFile, writeFile)
import Prelude hiding (writeFile, readFile)

-- | Reads a 'Fasta' from the given file.
--
-- The file contents are read as text and run through 'fastaP' with
-- 'parseOnly'. If the parser fails, its error message is raised as an
-- 'IOError' (built with 'userError') inside 'IO' and lifted into @m@, so no
-- @MonadFail@ instance is required of @m@.
--
fromFile :: MonadIO m => FilePath -> m (Fasta Char)
fromFile f = liftIO $ readFile f >>= either (ioError . userError) pure . parseOnly fastaP

-- | Renders the given 'Fasta' with 'fastaToText' and writes the text to a file.
--
toFile :: MonadIO m => Fasta Char -> FilePath -> m ()
toFile fasta path = liftIO (writeFile path (fastaToText fasta))
32 changes: 29 additions & 3 deletions src/Bio/FASTA/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,36 @@ module Bio.FASTA.Parser
( fastaP
) where

import Bio.FASTA.Type (Fasta)
import Data.Attoparsec.Text (Parser)
import Bio.FASTA.Type (Fasta, FastaItem(..))
import Bio.Sequence (BareSequence, bareSequence)
import Data.Attoparsec.Text (Parser, many', many1', char, endOfLine, letter,
takeWhile, choice, endOfInput)
import Data.Text (Text, strip)
import Prelude hiding (takeWhile)

-- | Parser of .fasta file.
--
-- Parses zero or more records. Each record is a name line starting with @>@
-- followed by zero or more lines consisting of letters. Parsing stops at the
-- first position where no further record can be read; it does not demand
-- end of input, so anything after that position is left unconsumed.
--
fastaP :: Parser (Fasta Char)
fastaP = undefined
fastaP = many' item

-- | Parses a single record: the name line first, then the sequence lines.
item :: Parser (FastaItem Char)
item = do
  name <- seqName
  FastaItem name <$> fastaSeq

-- | Parses the name line of a record: a @>@, optional tabs, then everything
-- up to the end of the line. The name is returned with 'strip' applied.
seqName :: Parser Text
seqName = do
  _ <- char '>' *> tabs
  rawName <- takeWhile (\c -> c /= '\n' && c /= '\r')
  tabs *> eol
  pure (strip rawName)

-- | Parses zero or more sequence lines and joins them into one sequence.
fastaSeq :: Parser (BareSequence Char)
fastaSeq = do
  chunks <- many' line
  pure (bareSequence (concat chunks))

-- | Parses one sequence line: at least one letter, then the line terminator.
line :: Parser String
line = do
  residues <- many1' letter
  residues <$ eol

-- | Skips optional trailing tabs, then requires either one or more line
-- breaks or the end of input.
eol :: Parser ()
eol = do
  tabs
  choice [slashN, endOfInput]

-- | Consumes one or more consecutive line breaks.
slashN :: Parser ()
slashN = many1' endOfLine *> pure ()

-- | Consumes zero or more tab characters.
tabs :: Parser ()
tabs = many' (char '\t') *> pure ()

2 changes: 1 addition & 1 deletion src/Bio/FASTA/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import Bio.Sequence (BareSequence)
import Data.Text (Text)

-- | Type alias for FASTA file: a list of 'FastaItem' records.
--
-- Each record satisfies the following format:
-- >(\s|\t)*[^\n\r]+(\s|\t)*(\n|\r)*(\w(\n|\r)*)*
type Fasta a = [FastaItem a]

-- | One record in FASTA file.
Expand Down
20 changes: 17 additions & 3 deletions src/Bio/FASTA/Writer.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,22 @@ module Bio.FASTA.Writer
( fastaToText
) where

import Bio.FASTA.Type (Fasta)
import Data.Text (Text)
import Bio.FASTA.Type (Fasta, FastaItem(..))
import Bio.Sequence (BareSequence, sequ)
import Control.Lens ((^.))
import Data.Text (Text, pack)
import Data.List.Split (chunksOf)
import Data.Vector (Vector, toList)
import Prelude hiding (drop)

-- | Renders a whole 'Fasta' as text by rendering every record with
-- 'writeItem' and concatenating the results in list order. An empty list
-- yields empty text.
fastaToText :: Fasta Char -> Text
fastaToText = undefined
fastaToText f = mconcat $ map writeItem f

-- | Renders one record: a @>@-prefixed name line followed by the sequence lines.
writeItem :: FastaItem Char -> Text
writeItem (FastaItem header residues) = mconcat [">", header, "\n", seq2Text residues]

-- | Extracts the underlying vector of a sequence and renders it as text lines.
seq2Text :: BareSequence Char -> Text
seq2Text s = pack (vector2Text (s ^. sequ))

-- | Splits the characters into chunks of at most 80 and terminates every
-- chunk with a newline.
vector2Text :: Vector Char -> String
vector2Text = unlines . chunksOf 80 . toList
7 changes: 7 additions & 0 deletions test/FASTA/correct.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
>3HMX:A|PDBID|CHAIN|SEQUENCE
IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSE
VLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL

>7HMX:A|PDBID|CHAIN|SEQUENCE
EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE
VLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL
30 changes: 30 additions & 0 deletions test/FASTASpec.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{-# LANGUAGE OverloadedStrings #-}

module FASTASpec where

import Bio.FASTA (fromFile, toFile)
import Bio.FASTA.Type (FastaItem(..), Fasta)
import Bio.Sequence (bareSequence)
import Prelude hiding (writeFile, readFile)
import Test.Hspec

-- | Expected value for both the file-parsing and the write/read-back tests.
correctFasta :: Fasta Char
correctFasta =
  [ FastaItem
      "3HMX:A|PDBID|CHAIN|SEQUENCE"
      (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")
  , FastaItem
      "7HMX:A|PDBID|CHAIN|SEQUENCE"
      (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")
  ]

-- | File-level tests: parse a fixture file, then write a file and read it back.
fastaSpec :: Spec
fastaSpec =
  describe "Fasta file parser." $
    parseFile "test/FASTA/correct.fasta" >> writeFile "test/FASTA/test.fasta"

-- | Checks that reading the file at the given path yields 'correctFasta'.
parseFile :: FilePath -> Spec
parseFile path =
  describe "fromFile" $
    it "correctly parses fasta from file" $
      fromFile path >>= (`shouldBe` correctFasta)

-- | Writes 'correctFasta' to the given path and checks that reading the
-- file back yields the same value.
writeFile :: FilePath -> Spec
writeFile path =
  describe "writeFile" $
    it "correctly write fasta into file" $ do
      toFile correctFasta path
      roundTripped <- fromFile path
      roundTripped `shouldBe` correctFasta
89 changes: 89 additions & 0 deletions test/FastaParserSpec.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
{-# LANGUAGE OverloadedStrings #-}

module FastaParserSpec where

import Bio.FASTA.Parser (fastaP)
import Bio.FASTA.Type (FastaItem(..))
import Bio.Sequence (bareSequence)
import Data.Attoparsec.Text (parseOnly)
import Test.Hspec

-- | All 'fastaP' test cases, run in the order listed.
fastaParserSpec :: Spec
fastaParserSpec =
  describe "Fasta format parser." $
    sequence_
      [ emptyFasta
      , onlyName
      , oneSequence
      , twoSequences
      , sequenceWithDigit
      , sequenceWithWrongName
      , sequenceWithSpacesInName
      , sequenceWithSeveralEndOfLine
      , sequenceWithSeveralEndOfLineInSequence
      , sequenceWithTabsInName
      , sequenceWithTabsInSequence
      ]

-- | Empty input parses to an empty list of records.
emptyFasta :: Spec
emptyFasta =
  describe "emptyFasta" $
    it "correctly parses empty fasta" $
      parseOnly fastaP "" `shouldBe` Right []

-- | A name line with no sequence lines yields a record with an empty sequence.
onlyName :: Spec
onlyName =
  describe "onlyName" $
    it "correctly parses fasta without sequence" $
      parseOnly fastaP ">3HMX:A|PDBID|CHAIN|SEQUENCE"
        `shouldBe` Right [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "")]

-- | One record whose sequence spans two lines; the lines are joined.
oneSequence :: Spec
oneSequence =
  describe "oneSequence" $
    it "correctly parses one correct sequence" $
      parseOnly fastaP ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSE\nVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\n"
        `shouldBe` Right [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")]

-- | Two consecutive records, each with a two-line sequence.
twoSequences :: Spec
twoSequences =
  describe "twoSequences" $
    it "correctly parses two correct sequences" $
      parseOnly fastaP ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSE\nVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\n>7HMX:A|PDBID|CHAIN|SEQUENCE\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE\nVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL"
        `shouldBe` Right
          [ FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")
          , FastaItem "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")
          ]

-- | A digit inside the sequence line makes that line unparseable, so the
-- record is returned with an empty sequence.
sequenceWithDigit :: Spec
sequenceWithDigit =
  describe "sequenceWithDigit" $
    it "correctly parses incorrect sequence with digit" $
      parseOnly fastaP ">123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEE4GITWTLDQSSE"
        `shouldBe` Right [FastaItem "123" (bareSequence "")]

-- | Input whose first line lacks the leading @>@ yields no records.
sequenceWithWrongName :: Spec
sequenceWithWrongName =
  describe "sequenceWithWrongName" $
    it "correctly parses incorrect sequence with wrong name" $
      parseOnly fastaP "123\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE"
        `shouldBe` Right []

-- | Spaces inside the name are kept; leading and trailing ones are dropped.
sequenceWithSpacesInName :: Spec
sequenceWithSpacesInName =
  describe "sequenceWithSpacesInName" $
    it "correctly parses sequence with spaces in name" $
      parseOnly fastaP "> this is my sequence \nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE"
        `shouldBe` Right [FastaItem "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE")]

-- | Several consecutive line breaks after the name line are skipped.
sequenceWithSeveralEndOfLine :: Spec
sequenceWithSeveralEndOfLine =
  describe "sequenceWithSeveralEndOfLine" $
    -- The backslash is escaped so the label shows a literal "\n" instead of
    -- containing an actual line break.
    it "correctly parses sequence with several \\n after name" $
      parseOnly fastaP ">this is my sequence\n\n\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE"
        `shouldBe` Right [FastaItem "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE")]

-- | Several consecutive line breaks between sequence lines are skipped and
-- the lines are joined into one sequence.
sequenceWithSeveralEndOfLineInSequence :: Spec
sequenceWithSeveralEndOfLineInSequence =
  describe "sequenceWithSeveralEndOfLineInSequence" $
    -- The backslash is escaped so the label shows a literal "\n" instead of
    -- containing an actual line break.
    it "correctly parses sequence with several \\n between sequence parts" $
      parseOnly fastaP ">this is my sequence\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE\n\n\nYYYYYYYYYYYYYYYYYYYYYYYY"
        `shouldBe` Right [FastaItem "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSEYYYYYYYYYYYYYYYYYYYYYYYY")]

-- | Tabs inside the name are kept; leading and trailing tabs are dropped.
sequenceWithTabsInName :: Spec
sequenceWithTabsInName =
  describe "sequenceWithTabsInName" $
    it "correctly parses sequence with tabs in name" $
      parseOnly fastaP ">\tthis\tis\tmy\tsequence\t\t\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE"
        `shouldBe` Right [FastaItem "this\tis\tmy\tsequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE")]

-- | Trailing tabs at the end of sequence lines are ignored.
sequenceWithTabsInSequence :: Spec
sequenceWithTabsInSequence =
  describe "sequenceWithTabsInSequence" $
    it "correctly parses sequence with tabs between sequence parts" $
      parseOnly fastaP ">this is my sequence\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSE\t\t\nYYYYYYYYYYYYYYYYYYYYYYYY\t\n"
        `shouldBe` Right [FastaItem "this is my sequence" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEGITWTLDQSSEYYYYYYYYYYYYYYYYYYYYYYYY")]
37 changes: 37 additions & 0 deletions test/FastaWriterSpec.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
module FastaWriterSpec where

import Bio.FASTA.Writer (fastaToText)
import Bio.FASTA.Type (FastaItem(..))
import Bio.Sequence (bareSequence)
import Test.Hspec

-- | All 'fastaToText' test cases, run in the order listed.
fastaWriterSpec :: Spec
-- The group is labelled as the writer suite; it exercises 'fastaToText' only.
fastaWriterSpec = describe "Fasta format writer." $ do
  emptyFasta
  oneShortSequence
  oneLongSequence
  twoSequences

-- | An empty list of records renders as empty text.
emptyFasta :: Spec
emptyFasta =
  describe "emptyFasta" $
    it "correctly write empty fasta" $
      fastaToText [] `shouldBe` ""

-- | One record whose sequence fits on a single output line.
oneShortSequence :: Spec
oneShortSequence =
  describe "oneShortSequence" $
    it "correctly write one correct short (less than 80 chars) sequence" $
      -- The sequence below is exactly 80 characters long, so it is written
      -- as one line.
      fastaToText [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")]
        `shouldBe` ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\n"

-- | One record whose sequence is split across several output lines.
oneLongSequence :: Spec
oneLongSequence =
  describe "oneLongSequence" $
    it "correctly write one correct long (more than 80 chars) sequence" $
      fastaToText [FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLLLLHKKEDGIWSTDILKDQKEPKNKTFLRCEAKNYSGRFTCWWLTTISTDLTFSVKSSRGSSDPQGVTCGAATLSAERVRGDNKEYEYSVECQEDSACPAAEESLPIEVMVDAVHKLKYENYTSSFFIRDIIKPDPPKNLQLKPLKNSRQVEVSWEYPDTWSTPHSYFSLTFCVQVQGKSKREKKDRVFTDKTSATVICRKNASISVRAQDRYYSSSWSEWASVPCS")]
        `shouldBe` ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\nLLHKKEDGIWSTDILKDQKEPKNKTFLRCEAKNYSGRFTCWWLTTISTDLTFSVKSSRGSSDPQGVTCGAATLSAERVRG\nDNKEYEYSVECQEDSACPAAEESLPIEVMVDAVHKLKYENYTSSFFIRDIIKPDPPKNLQLKPLKNSRQVEVSWEYPDTW\nSTPHSYFSLTFCVQVQGKSKREKKDRVFTDKTSATVICRKNASISVRAQDRYYSSSWSEWASVPCS\n"

-- | Two records, a short one followed by a long one, rendered back to back.
twoSequences :: Spec
twoSequences =
  describe "twoSequences" $
    it "correctly write two correct sequences" $
      fastaToText
        [ FastaItem "3HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL")
        , FastaItem "7HMX:A|PDBID|CHAIN|SEQUENCE" (bareSequence "IWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLLLLHKKEDGIWSTDILKDQKEPKNKTFLRCEAKNYSGRFTCWWLTTISTDLTFSVKSSRGSSDPQGVTCGAATLSAERVRGDNKEYEYSVECQEDSACPAAEESLPIEVMVDAVHKLKYENYTSSFFIRDIIKPDPPKNLQLKPLKNSRQVEVSWEYPDTWSTPHSYFSLTFCVQVQGKSKREKKDRVFTDKTSATVICRKNASISVRAQDRYYSSSWSEWASVPCS")
        ]
        `shouldBe` ">3HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\n>7HMX:A|PDBID|CHAIN|SEQUENCE\nIWELKKDVYVVELDWYPDAPGEMVVLTCDTPEEDGITWTLDQSSEVLGSGKTLTIQVKEFGDAGQYTCHKGGEVLSHSLL\nLLHKKEDGIWSTDILKDQKEPKNKTFLRCEAKNYSGRFTCWWLTTISTDLTFSVKSSRGSSDPQGVTCGAATLSAERVRG\nDNKEYEYSVECQEDSACPAAEESLPIEVMVDAVHKLKYENYTSSFFIRDIIKPDPPKNLQLKPLKNSRQVEVSWEYPDTW\nSTPHSYFSLTFCVQVQGKSKREKKDRVFTDKTSATVICRKNASISVRAQDRYYSSSWSEWASVPCS\n"
7 changes: 7 additions & 0 deletions test/Spec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ import SequenceSpec
import System.IO
import Test.Hspec
import UniprotSpec
import FastaParserSpec
import FastaWriterSpec
import FASTASpec

main :: IO ()
main = do
Expand All @@ -28,3 +31,7 @@ main = do
-- GB
gbParserSpec
gbWriterSpec
-- Fasta
fastaParserSpec
fastaSpec
fastaWriterSpec

0 comments on commit 5ea5384

Please sign in to comment.