Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite FASTA parser to Megaparsec #67

Merged
merged 23 commits into from
Oct 4, 2022
Merged
4 changes: 4 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## [Unreleased]

## [0.1.5.0] - 2022-09-30
### Changed
- Update FASTA parser to megaparsec.

## [0.1.4.4] - 2022-06-02
### Changed
- Update more dependencies;
Expand Down
12 changes: 12 additions & 0 deletions default.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
let
bcd-lts = import (builtins.fetchGit {
url = "git@github.com:biocad/nix-lts.git";
ref = "master";
});
in
bcd-lts.mkBiocadProject {
src = bcd-lts.pkgs.haskell-nix.haskellLib.cleanGit { name = "cobot-io"; src = ./.; };
shellArgs = {
buildInputs = [ bcd-lts.pkgs.RNA ];
};
}
3 changes: 2 additions & 1 deletion package.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cobot-io
version: 0.1.4.4
version: 0.1.5.0
github: "biocad/cobot-io"
license: BSD3
category: Bio
Expand Down Expand Up @@ -41,6 +41,7 @@ dependencies:
- containers >= 0.5.7.1 && < 0.7
- data-msgpack >= 0.0.9 && < 0.1
- deepseq >= 1.4 && < 1.5
- filepath
- http-conduit >= 2.3 && < 2.4
- hyraxAbif >= 0.2.3.27 && < 0.2.4.0
- lens >= 4.16 && < 5.2
Expand Down
1 change: 1 addition & 0 deletions shell.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(import ./default.nix).shellFor
7 changes: 4 additions & 3 deletions src/Bio/FASTA.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@ module Bio.FASTA
, fromFile
, toFile
, fastaP
, fastaPGeneric
, fastaLine
, modificationP
, Parser
) where

import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Attoparsec.Text (parseOnly)
import Data.Text.IO (readFile, writeFile)
import System.FilePath (takeBaseName)
import Text.Megaparsec (errorBundlePretty, parse)
#if !MIN_VERSION_base(4,13,0)
import Control.Monad.Fail (MonadFail (..))
import Prelude hiding (fail, readFile, writeFile)
Expand All @@ -28,7 +29,7 @@ import Bio.FASTA.Writer (WritableFastaToken (..), fastaToText)
-- | Reads 'FastaSequence' from given file.
--
fromFile :: (MonadFail m, MonadIO m) => FilePath -> m (Fasta Char)
fromFile f = liftIO (readFile f) >>= either fail pure . parseOnly fastaP
fromFile f = liftIO (readFile f) >>= either (fail . errorBundlePretty) pure . parse fastaP (takeBaseName f)

-- | Writes 'FastaSequence' to file.
--
Expand Down
259 changes: 138 additions & 121 deletions src/Bio/FASTA/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,151 +2,168 @@

module Bio.FASTA.Parser
( fastaP
, fastaPGeneric
, fastaLine
, fastaLine
, parseOnly
, modificationP
, fastaPGeneric
, Parser
) where

import Bio.FASTA.Type (Fasta, FastaItem (..), ModItem (..), Modification (..),
ParsableFastaToken (..))
import Bio.Sequence (BareSequence, bareSequence)
import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser, char, choice, endOfInput, endOfLine, many', many1', satisfy,
skipWhile, space, string, takeWhile, try)
import Data.Char (isAlphaNum, isLetter, isSpace)
import Data.Text (Text, strip)
import Prelude hiding (takeWhile)
import Bio.FASTA.Type (Fasta, FastaItem (..),
ModItem (..), Modification (..),
ParsableFastaToken (..))
import Bio.Sequence (BareSequence, bareSequence)
import Data.Bifunctor (first)
import Data.Char (isLetter)
import Data.Functor (void, ($>))
import Data.Text (Text, pack, strip)
import Data.Void (Void)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L

instance ParsableFastaToken Char where
parseToken = satisfy
parseToken p = satisfy p <?> "letter"

instance ParsableFastaToken ModItem where
parseToken predicate = (Mod <$> modificationP) <|> (Letter <$> satisfy predicate)
parseToken p = (Mod <$> modificationP <?> "fasta item modification") <|> (Letter <$> satisfy p <?> "letter")

type Parser = Parsec Void Text

-- | Parser of .fasta file.
--

parseOnly :: Parsec Void Text a -> Text -> Either String a
parseOnly p s = first errorBundlePretty $ parse p "input.fasta" s

sc :: Parser ()
sc = L.space space1 empty empty

lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc

symbol :: Text -> Parser Text
symbol = L.symbol sc

fastaP :: ParsableFastaToken a => Parser (Fasta a)
fastaP = many' space *> fastaPGeneric isLetter
fastaP = many (item isLetter) <* hidden space <* eof

fastaPGeneric :: ParsableFastaToken a => (Char -> Bool) -> Parser (Fasta a)
fastaPGeneric = many' . item
fastaPGeneric p = many (item p) <* hidden space <* eof

item :: ParsableFastaToken a => (Char -> Bool) -> Parser (FastaItem a)
item predicate = (FastaItem <$> seqName <*> fastaSeq predicate) <* skipWhile isSpace
item p =
FastaItem
<$> seqName
<*> (fastaSeq p <?> "sequence")

seqName :: Parser Text
seqName = strip <$> (char '>' *> tabs *> takeWhile (`notElem` ['\n', '\r']) <* tabs <* eol)
seqName = strip . pack <$> (symbol ">" *> (manyTill anySingle myEnd <?> "sequence name"))

fastaSeq :: ParsableFastaToken a => (Char -> Bool) -> Parser (BareSequence a)
fastaSeq predicate = bareSequence . mconcat <$> many' (fastaLine predicate)
fastaSeq p = bareSequence . concat <$> many (fastaLine p) <* hidden space

fastaLine :: ParsableFastaToken a => (Char -> Bool) -> Parser [a]
fastaLine predicate = concat <$> many1' (many1' (parseToken predicate) <* many' (char ' ')) <* eol

eol :: Parser ()
eol = tabs *> choice [slashN, endOfInput]

slashN :: Parser ()
slashN = () <$ many1' endOfLine
fastaLine p = concat <$> some (some (parseToken p) <* hidden hspace) <* myEnd

tabs :: Parser ()
tabs = () <$ many' (char '\t')
myEnd :: Parser ()
myEnd = void (some eol) <|> eof

modificationP :: Parser Modification
modificationP
= string "[A*]" *> pure Mod_A_Star
<|> string "[C*]" *> pure Mod_C_Star
<|> string "[G*]" *> pure Mod_G_Star
<|> string "[T*]" *> pure Mod_T_Star
<|> string "[rA]" *> pure Mod_rA
<|> string "[rC]" *> pure Mod_rC
<|> string "[rG]" *> pure Mod_rG
<|> string "[rU]" *> pure Mod_rU
<|> string "[+A]" *> pure Mod_Plus_A
<|> string "[+C]" *> pure Mod_Plus_C
<|> string "[+G]" *> pure Mod_Plus_G
<|> string "[+T]" *> pure Mod_Plus_T
<|> string "[rAf]" *> pure Mod_rAf
<|> string "[rCf]" *> pure Mod_rCf
<|> string "[rGf]" *> pure Mod_rGf
<|> string "[rUf]" *> pure Mod_rUf
<|> string "[mA]" *> pure Mod_mA
<|> string "[mC]" *> pure Mod_mC
<|> string "[mG]" *> pure Mod_mG
<|> string "[mU]" *> pure Mod_mU
<|> string "[mA*]" *> pure Mod_mA_Star
<|> string "[mC*]" *> pure Mod_mC_Star
<|> string "[mG*]" *> pure Mod_mG_Star
<|> string "[mU*]" *> pure Mod_mU_Star
<|> string "[dU]" *> pure Mod_dU
<|> string "[5Bio]" *> pure Mod_5Bio
<|> string "[iBio]" *> pure Mod_iBio
<|> string "[56FAM]" *> pure Mod_56FAM
<|> string "[36FAM]" *> pure Mod_36FAM
<|> string "[5HEX]" *> pure Mod_5HEX
<|> string "[5TMR]" *> pure Mod_5TMR
<|> string "[3BHQ1]" *> pure Mod_3BHQ1
<|> string "[3BHQ2]" *> pure Mod_3BHQ2
<|> string "[5NH2]" *> pure Mod_5NH2
<|> string "[3NH2]" *> pure Mod_3NH2
<|> string "[5PO4]" *> pure Mod_5PO4
<|> string "[3PO4]" *> pure Mod_3PO4
<|> string "[3BioTEG]" *> pure Mod_3BioTEG
<|> string "[C12]" *> pure Mod_C12
<|> string "[NHSdT]" *> pure Mod_NHSdT
<|> string "[5Mal]" *> pure Mod_5Mal
<|> string "[5thio]" *> pure Mod_5thio
<|> string "[3thio]" *> pure Mod_3thio
<|> string "[3azide]" *> pure Mod_3azide
<|> string "[3alkine]" *> pure Mod_3alkine
<|> string "[5CholTEG]" *> pure Mod_5CholTEG
<|> string "[3CholTEG]" *> pure Mod_3CholTEG
<|> string "[5C10]" *> pure Mod_5C10
<|> string "[5Alk]" *> pure Mod_5Alk
<|> string "[GC]" *> pure Mod_GC
<|> string "[GT]" *> pure Mod_GT
<|> string "[AT]" *> pure Mod_AT
<|> string "[TG]" *> pure Mod_TG
<|> string "[AC]" *> pure Mod_AC
<|> string "[CC]" *> pure Mod_CC
<|> string "[AA]" *> pure Mod_AA
<|> string "[TC]" *> pure Mod_TC
<|> string "[TT]" *> pure Mod_TT
<|> string "[CG]" *> pure Mod_CG
<|> string "[GG]" *> pure Mod_GG
<|> string "[AG]" *> pure Mod_AG
<|> string "[GA]" *> pure Mod_GA
<|> string "[CA]" *> pure Mod_CA
<|> string "[CT]" *> pure Mod_CT
<|> string "[TA]" *> pure Mod_TA
<|> string "[AAA]" *> pure Mod_AAA
<|> string "[AAC]" *> pure Mod_AAC
<|> string "[ACT]" *> pure Mod_ACT
<|> string "[ATC]" *> pure Mod_ATC
<|> string "[ATG]" *> pure Mod_ATG
<|> string "[CAG]" *> pure Mod_CAG
<|> string "[AGA]" *> pure Mod_AGA
<|> string "[CAT]" *> pure Mod_CAT
<|> string "[CCG]" *> pure Mod_CCG
<|> string "[CGT]" *> pure Mod_CGT
<|> string "[CTG]" *> pure Mod_CTG
<|> string "[GAA]" *> pure Mod_GAA
<|> string "[GAC]" *> pure Mod_GAC
<|> string "[GCT]" *> pure Mod_GCT
<|> string "[GGT]" *> pure Mod_GGT
<|> string "[GTT]" *> pure Mod_GTT
<|> string "[TAC]" *> pure Mod_TAC
<|> string "[TCT]" *> pure Mod_TCT
<|> string "[TGC]" *> pure Mod_TGC
<|> string "[TGG]" *> pure Mod_TGG
<|> string "[TTC]" *> pure Mod_TTC
<|> string "[TTT]" *> pure Mod_TTT
<|> unknownP
modificationP
= choice
[ string "[A*]" $> Mod_A_Star
, string "[C*]" $> Mod_C_Star
, string "[G*]" $> Mod_G_Star
, string "[T*]" $> Mod_T_Star
, string "[rA]" $> Mod_rA
, string "[rC]" $> Mod_rC
, string "[rG]" $> Mod_rG
, string "[rU]" $> Mod_rU
, string "[+A]" $> Mod_Plus_A
, string "[+C]" $> Mod_Plus_C
, string "[+G]" $> Mod_Plus_G
, string "[+T]" $> Mod_Plus_T
, string "[rAf]" $> Mod_rAf
, string "[rCf]" $> Mod_rCf
, string "[rGf]" $> Mod_rGf
, string "[rUf]" $> Mod_rUf
, string "[mA]" $> Mod_mA
, string "[mC]" $> Mod_mC
, string "[mG]" $> Mod_mG
, string "[mU]" $> Mod_mU
, string "[mA*]" $> Mod_mA_Star
, string "[mC*]" $> Mod_mC_Star
, string "[mG*]" $> Mod_mG_Star
, string "[mU*]" $> Mod_mU_Star
, string "[dU]" $> Mod_dU
, string "[5Bio]" $> Mod_5Bio
, string "[iBio]" $> Mod_iBio
, string "[56FAM]" $> Mod_56FAM
, string "[36FAM]" $> Mod_36FAM
, string "[5HEX]" $> Mod_5HEX
, string "[5TMR]" $> Mod_5TMR
, string "[3BHQ1]" $> Mod_3BHQ1
, string "[3BHQ2]" $> Mod_3BHQ2
, string "[5NH2]" $> Mod_5NH2
, string "[3NH2]" $> Mod_3NH2
, string "[5PO4]" $> Mod_5PO4
, string "[3PO4]" $> Mod_3PO4
, string "[3BioTEG]" $> Mod_3BioTEG
, string "[C12]" $> Mod_C12
, string "[NHSdT]" $> Mod_NHSdT
, string "[5Mal]" $> Mod_5Mal
, string "[5thio]" $> Mod_5thio
, string "[3thio]" $> Mod_3thio
, string "[3azide]" $> Mod_3azide
, string "[3alkine]" $> Mod_3alkine
, string "[5CholTEG]" $> Mod_5CholTEG
, string "[3CholTEG]" $> Mod_3CholTEG
, string "[5C10]" $> Mod_5C10
, string "[5Alk]" $> Mod_5Alk
, string "[GC]" $> Mod_GC
, string "[GT]" $> Mod_GT
, string "[AT]" $> Mod_AT
, string "[TG]" $> Mod_TG
, string "[AC]" $> Mod_AC
, string "[CC]" $> Mod_CC
, string "[AA]" $> Mod_AA
, string "[TC]" $> Mod_TC
, string "[TT]" $> Mod_TT
, string "[CG]" $> Mod_CG
, string "[GG]" $> Mod_GG
, string "[AG]" $> Mod_AG
, string "[GA]" $> Mod_GA
, string "[CA]" $> Mod_CA
, string "[CT]" $> Mod_CT
, string "[TA]" $> Mod_TA
, string "[AAA]" $> Mod_AAA
, string "[AAC]" $> Mod_AAC
, string "[ACT]" $> Mod_ACT
, string "[ATC]" $> Mod_ATC
, string "[ATG]" $> Mod_ATG
, string "[CAG]" $> Mod_CAG
, string "[AGA]" $> Mod_AGA
, string "[CAT]" $> Mod_CAT
, string "[CCG]" $> Mod_CCG
, string "[CGT]" $> Mod_CGT
, string "[CTG]" $> Mod_CTG
, string "[GAA]" $> Mod_GAA
, string "[GAC]" $> Mod_GAC
, string "[GCT]" $> Mod_GCT
, string "[GGT]" $> Mod_GGT
, string "[GTT]" $> Mod_GTT
, string "[TAC]" $> Mod_TAC
, string "[TCT]" $> Mod_TCT
, string "[TGC]" $> Mod_TGC
, string "[TGG]" $> Mod_TGG
, string "[TTC]" $> Mod_TTC
, string "[TTT]" $> Mod_TTT
, unknownP
]

unknownP :: Parser Modification
unknownP = try $ do
_ <- char '['
m <- many1' $ satisfy (\c -> isAlphaNum c || c `elem` ['+', '-', '*', '_'])
_ <- char ']'
pure $ Unknown ("[" <> m <> "]")
unknownP = do
res <- between (symbol "[") (symbol "]")
(lexeme (some (alphaNumChar <|> choice (char <$> ['+', '-', '*', '_'])) <?> "modification name"))
pure $ Unknown ("[" <> res <> "]")
12 changes: 6 additions & 6 deletions src/Bio/FASTA/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ module Bio.FASTA.Type
, modificationToString
) where

import Bio.Sequence (BareSequence)
import Data.Attoparsec.Text (Parser)
import Data.Text (Text)
import GHC.Generics (Generic)
import Bio.Sequence (BareSequence)
import Data.Text (Text)
import Data.Void
import GHC.Generics (Generic)
import Text.Megaparsec

-- | Type alias for FASTA file.
-- satisfies the following format : >(\s|\t)*[^\n\r]+(\s|\t)*(\n|\r)*((\w|\s)(\n|\r)*)*
Expand All @@ -29,7 +30,7 @@ data FastaItem a
deriving (Eq, Show, Functor)

class ParsableFastaToken a where
parseToken :: (Char -> Bool) -> Parser a
parseToken :: (Char -> Bool) -> Parsec Void Text a

data ModItem
= Mod Modification
Expand Down Expand Up @@ -216,4 +217,3 @@ modificationToString Mod_TGG = "[TGG]"
modificationToString Mod_TTC = "[TTC]"
modificationToString Mod_TTT = "[TTT]"
modificationToString (Unknown s) = s

Loading