Skip to content

Commit

Permalink
version 0.1.5.6: YLAB2-2755: spaces in gb locus name (#76)
Browse files Browse the repository at this point in the history
  • Loading branch information
abychkova authored Dec 18, 2024
1 parent e45a279 commit 71fc907
Show file tree
Hide file tree
Showing 14 changed files with 149 additions and 99 deletions.
13 changes: 8 additions & 5 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## [Unreleased]

## [0.1.5.6] - 2024-12-18
- GenBank parser: a name in the `LOCUS` line may now contain spaces.

## [0.1.5.5] - 2024-05-16
- Add `5ROX` modification to Fasta parser.

Expand Down Expand Up @@ -64,12 +67,12 @@ Added ASN hydrogen names sometimes set by Scho
- Update dependency versions.

## [0.1.3.20] - 2021-06-04
### Changed
- YLAB2-629: Fasta parser is now able to parse empty lines in the beginning.
### Changed
- YLAB2-629: Fasta parser is now able to parse empty lines in the beginning.

## [0.1.3.19] - 2021-04-30
### Changed
- Exports and instances for Biosset.
### Changed
- Exports and instances for Biosset.

## [0.1.3.18] - 2021-03-09
### Fixed
Expand Down Expand Up @@ -110,7 +113,7 @@ Added ASN hydrogen names sometimes set by Scho

## [0.1.3.9] - 2020-10-27
### Fixed
- FASTA parser can now parse empty lines with spaces.
- FASTA parser can now parse empty lines with spaces.

## [0.1.3.8] - 2020-10-22
### Fixed
Expand Down
2 changes: 1 addition & 1 deletion package.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cobot-io
version: 0.1.5.5
version: 0.1.5.6
github: "biocad/cobot-io"
license: BSD3
category: Bio
Expand Down
33 changes: 18 additions & 15 deletions src/Bio/GB/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ genBankP :: Parser GenBankSequence
genBankP = GenBankSequence
<$> (metaP <?> "Meta parser")
<*> (gbSeqP <?> "GB sequence parser")
<* string "//" <* eolSpaceP
<* string "//" <* eolSpaceP

--------------------------------------------------------------------------------
-- Block with meta-information.
Expand All @@ -48,16 +48,19 @@ metaP = do

locusP :: Parser Locus
locusP = string "LOCUS" *> space *> (Locus
<$> textP <* space -- name
<$> nameP <* space -- name
<*> decimal <* space <* string "bp" <* space -- sequence length
<*> textP <* space -- molecule type
<*> optional formP <* space -- form of sequence
<*> optional (pack <$> some (satisfy isUpper)) <* space -- GenBank division
<*> optional formP <* space -- form of sequence
<*> optional (pack <$> some (satisfy isUpper)) <* space -- GenBank division
<*> textP -- modification date
<* eolSpaceP)
where
-- One or more consecutive non-whitespace characters.
textP = takeWhile1P Nothing (not . isSpace)

-- Locus name. It may consist of several words separated by single spaces.
-- A following word is appended only when it does not start the
-- sequence-length field ("<digits> bp"); otherwise a LOCUS line whose fields
-- are separated by single spaces would be swallowed whole by the name and
-- the length parser after it would fail.
-- NOTE(review): a name that itself contains "<digits> bp " is cut at that
-- point; this is assumed not to occur in real files.
nameP :: Parser Text
nameP = textP <> (try ((string " " <* notFollowedBy lengthAheadP) <> nameP) <|> "")

-- Lookahead helper for 'nameP': matches the "<digits> bp" field followed by
-- a whitespace character, mirroring how 'locusP' parses the sequence length.
-- It is only ever run under 'notFollowedBy', so it never consumes input.
lengthAheadP :: Parser ()
lengthAheadP = (decimal :: Parser Int) *> space *> string "bp" *> satisfy isSpace $> ()

-- Form of the sequence: the literal "linear" yields 'Linear',
-- the literal "circular" yields 'Circular'.
formP :: Parser Form
formP = try (Linear <$ string "linear") <|> (Circular <$ string "circular")

Expand Down Expand Up @@ -108,7 +111,7 @@ commentP = string "COMMENT" *> (try emptyP <|> (many (char ' ') *> someLinesP))
--------------------------------------------------------------------------------

featuresP :: Parser [(Feature, Range)]
featuresP = -- skip unknown fields and stop on line with "FEATURES"
featuresP = -- skip unknown fields and stop on line with "FEATURES"
manyTill (textWithSpacesP <* eolSpaceP) (string "FEATURES") *> space
*> textWithSpacesP <* eolSpaceP
*> some (featureP <?> "Single feature parser")
Expand All @@ -128,8 +131,8 @@ featureP = do
pure (Feature featureName' props, shiftRange (-1) range)

rangeP :: Parser Range
rangeP = try spanP
<|> try betweenP
rangeP = try spanP
<|> try betweenP
<|> try pointP
<|> try joinP
<|> complementP
Expand All @@ -141,8 +144,8 @@ rangeP = try spanP
_ <- string ".."
upperBorderType <- option Precise (try $ char '>' *> pure Exceeded)
upperBorderLocation <- decimal
pure $ Span (RangeBorder lowerBorderType lowerBorderLocation) (RangeBorder upperBorderType upperBorderLocation)
pure $ Span (RangeBorder lowerBorderType lowerBorderLocation) (RangeBorder upperBorderType upperBorderLocation)

betweenP :: Parser Range
betweenP = do
before <- decimal
Expand All @@ -152,13 +155,13 @@ rangeP = try spanP

-- A single position: a bare decimal number wrapped in 'Point'.
pointP :: Parser Range
pointP = Point <$> decimal

-- Several ranges combined: "join(" followed by one or more comma-separated
-- ranges and a closing ")". The ranges are wrapped in 'Join'.
joinP :: Parser Range
joinP = Join <$> (string "join(" *> sepBy1 rangeP (char ',') <* char ')')

-- A range enclosed in "complement(" ... ")", wrapped in 'Complement'.
complementP :: Parser Range
complementP = Complement <$> (string "complement(" *> rangeP <* char ')')


propsP :: Parser (Text, Text)
propsP = do
Expand All @@ -178,17 +181,17 @@ propsP = do
indLine = do
_ <- string featureIndent2
notFollowedBy (char '/')
text <- textWithSpacesP
text <- textWithSpacesP
eolSpaceP
pure text

multiLineProp :: Parser Text
multiLineProp = do
fstText <- textWithSpacesP <* eolSpaceP
fstText <- textWithSpacesP <* eolSpaceP
rest <- many (try indLine)
pure $ T.concat (fstText : rest)
pure $ T.concat (fstText : rest)




-- | First level of indentation in FEATURES table file.
--
Expand Down
6 changes: 5 additions & 1 deletion src/Bio/MMTF/Decode/MessagePack.hs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@ asStr _ (ObjectStr s) = pure s
asStr m _ = fail $ T.unpack m <> ": not a string data"

asChar :: MonadFail m => Text -> Object -> m Char
asChar m = (head . T.unpack <$>) . asStr m
asChar txt obj = do
str <- asStr txt obj
case T.unpack str of
[] -> return ' '
(c : _) -> return c

asInt :: (MonadFail m, Integral a) => Text -> Object -> m a
asInt _ (ObjectInt i) = pure (fromIntegral i)
Expand Down
10 changes: 4 additions & 6 deletions src/Bio/PDB.hs
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,13 @@ instance StructureModels PDB.PDB where

mkResidue :: Map Text (Vector (Bond LocalID)) -> [PDB.Atom] -> Residue
mkResidue _ [] = error "Cound not make residue from empty list"
mkResidue localBondsMap atoms' = Residue (T.strip $ PDB.atomResName firstResidueAtom)
(PDB.atomResSeq firstResidueAtom)
(PDB.atomICode firstResidueAtom)
mkResidue localBondsMap atoms'@(firstAtom : _) = Residue (T.strip $ PDB.atomResName firstAtom)
(PDB.atomResSeq firstAtom)
(PDB.atomICode firstAtom)
(V.fromList $ mkAtom <$> atoms')
(localBondsMap M.!?! residueID firstResidueAtom)
(localBondsMap M.!?! residueID firstAtom)
Undefined -- now we do not read secondary structure
"" -- chemical component type?!
where
firstResidueAtom = head atoms'

mkAtom :: PDB.Atom -> Atom
mkAtom atom@PDB.Atom{..} = Atom (GlobalID $ atomToNilBasedIndex M.!?! atom)
Expand Down
40 changes: 22 additions & 18 deletions src/Bio/PDB/BondRestoring.hs
Original file line number Diff line number Diff line change
Expand Up @@ -42,25 +42,28 @@ restoreChainLocalBonds' chainAtoms = residueIDToLocalBonds
residueIDToLocalBonds = do
(residueAtoms, residueBonds) <- zip chainAtomsGroupedByResidue intraResidueGlobalBonds
let localBonds = V.fromList $ convertGlobalsToLocals residueAtoms residueBonds
let _residueID = residueID $ head residueAtoms
let _residueID =
case residueAtoms of
[] -> ""
(atom : _) -> residueID atom
pure (_residueID, localBonds)

-- Bonds restored inside each residue; one inner list per element of
-- 'chainAtomsGroupedByResidue', in the same order.
intraResidueGlobalBonds :: [[Bond PDB.Atom]]
intraResidueGlobalBonds = map restoreIntraResidueBonds chainAtomsGroupedByResidue

-- Atoms of the chain split into per-residue groups by 'groupChainByResidue'.
chainAtomsGroupedByResidue :: [[PDB.Atom]]
chainAtomsGroupedByResidue = groupChainByResidue chainAtoms

convertGlobalsToLocals :: [PDB.Atom] -> [Bond PDB.Atom] -> [Bond LocalID]
convertGlobalsToLocals residueAtoms = map convertGlobalToLocal
where
convertGlobalToLocal :: Bond PDB.Atom -> Bond LocalID
convertGlobalToLocal (Bond from to order) =
convertGlobalToLocal (Bond from to order) =
Bond (LocalID $ atomToLocalIdMap ! from) (LocalID $ atomToLocalIdMap ! to) order

atomToLocalIdMap :: Map PDB.Atom Int
atomToLocalIdMap = M.fromList $ zip sortedAtoms [0..]

sortedAtoms :: [PDB.Atom]
sortedAtoms = sort residueAtoms

Expand All @@ -70,19 +73,19 @@ restoreModelGlobalBonds atomToNilBasedIndex chains = convertToGlobalIDs atomToNi
where
-- Re-expresses bonds between atoms as bonds between 'GlobalID's, looking
-- every atom up in the supplied map.
convertToGlobalIDs :: Map PDB.Atom Int -> Vector (Bond PDB.Atom) -> Vector (Bond GlobalID)
convertToGlobalIDs mapping = reindexBonds (GlobalID . (mapping !))

-- Applies an ID conversion to both ends of every bond; the bond order is
-- carried over unchanged.
reindexBonds :: (a -> b) -> Vector (Bond a) -> Vector (Bond b)
reindexBonds convertID bonds = reindexBond <$> bonds
  where
    reindexBond (Bond start end bondOrder) = Bond (convertID start) (convertID end) bondOrder

-- Every chain split into per-residue groups of atoms.
chainAtomsGroupedByResidue :: Vector [[PDB.Atom]]
chainAtomsGroupedByResidue = groupChainByResidue <$> chains

-- Intra-residue bonds of all chains, flattened into one list.
_intraResidueBonds :: [Bond PDB.Atom]
_intraResidueBonds = concatMap restoreChainIntraResidueBonds chainAtomsGroupedByResidue

-- Peptide bonds of all chains, flattened into one list.
peptideBonds :: [Bond PDB.Atom]
peptideBonds = concatMap restoreChainPeptideBonds chainAtomsGroupedByResidue

-- Disulfide bonds, restored over the residues of all chains taken together.
disulfideBonds :: [Bond PDB.Atom]
disulfideBonds = restoreDisulfideBonds (concat (V.toList chainAtomsGroupedByResidue))

Expand Down Expand Up @@ -116,7 +119,7 @@ restoreChainPeptideBonds atomsGroupedByResidue = catMaybes $ restoreChainPeptide
restoreChainPeptideBonds' :: [[PDB.Atom]] -> [Maybe (Bond PDB.Atom)] -> [Maybe (Bond PDB.Atom)]
restoreChainPeptideBonds' [] acc = acc
restoreChainPeptideBonds' [_] acc = acc
restoreChainPeptideBonds' (residue1:residue2:residues) acc =
restoreChainPeptideBonds' (residue1:residue2:residues) acc =
restoreChainPeptideBonds' (residue2:residues) (constructBond residue1 residue2 : acc)

constructBond :: [PDB.Atom] -> [PDB.Atom] -> Maybe (Bond PDB.Atom)
Expand All @@ -129,28 +132,29 @@ restoreChainPeptideBonds atomsGroupedByResidue = catMaybes $ restoreChainPeptide
guard $ distance (coords carbonAtom1) (coords nitrogenAtom2) < peptideBondMaxLength

pure $ Bond carbonAtom1 nitrogenAtom2 1

-- Finds the first atom in the list whose name, after 'T.strip', equals the
-- requested one; 'Nothing' when there is no such atom.
getAtomByName :: [PDB.Atom] -> Text -> Maybe PDB.Atom
getAtomByName atoms atomNameToFind =
  find (\atom -> atomNameToFind == T.strip (PDB.atomName atom)) atoms

-- | Restores intra-residue bonds for every residue of a chain and returns
-- them as one flat list, residue by residue.
restoreChainIntraResidueBonds :: [[PDB.Atom]] -> [Bond PDB.Atom]
restoreChainIntraResidueBonds residues =
  [ bond
  | residueAtoms <- residues
  , bond <- restoreIntraResidueBonds residueAtoms
  ]

restoreIntraResidueBonds :: [PDB.Atom] -> [Bond PDB.Atom]
restoreIntraResidueBonds residueAtoms = catMaybes $ constructBond <$> residueBonds
restoreIntraResidueBonds [] = []
restoreIntraResidueBonds residueAtoms@(firstAtom : _) = catMaybes $ constructBond <$> residueBonds
where
-- TODO: support bond order somehow
constructBond :: (Text, Text) -> Maybe (Bond PDB.Atom)
constructBond (fromAtomName, toAtomName) = Bond <$> constructAtom fromAtomName <*> constructAtom toAtomName <*> Just 1

constructAtom :: Text -> Maybe PDB.Atom
constructAtom atomName = atomNameToAtom !? atomName

atomNameToAtom :: Map Text PDB.Atom
atomNameToAtom = M.fromList $ (\atom@PDB.Atom{..} -> (T.strip atomName, atom)) <$> residueAtoms

residueBonds :: [(Text, Text)]
residueBonds = intraResidueBonds . T.strip . PDB.atomResName $ head residueAtoms
residueBonds = intraResidueBonds . T.strip . PDB.atomResName $ firstAtom

intraResidueBonds :: Text -> [(Text, Text)]
intraResidueBonds "NMA" = [("CA", "N")]
Expand Down
15 changes: 8 additions & 7 deletions src/Bio/PDB/Functions.hs
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,21 @@ import qualified Bio.Utils.Map as M ((!?!))

import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M (fromList)
import Data.List (groupBy,
import Data.List (groupBy,
sortOn)
import Data.Vector (Vector)
import qualified Data.Vector as V (toList)
import Data.Char (toUpper)

groupChainByResidue :: Vector PDB.Atom -> [[PDB.Atom]]
groupChainByResidue = sortOn (sortOnResidue . head) . groupBy atomsFromSameResidue . V.toList
where
groupChainByResidue = sortOn sortOnResidue . groupBy atomsFromSameResidue . V.toList
where
atomsFromSameResidue :: PDB.Atom -> PDB.Atom -> Bool
atomsFromSameResidue atom1 atom2 = PDB.atomResSeq atom1 == PDB.atomResSeq atom2 && PDB.atomICode atom1 == PDB.atomICode atom2

sortOnResidue :: PDB.Atom -> Int
sortOnResidue PDB.Atom{..} = atomSerial * 100 + (insertionCodeSortingCorrections M.!?! toUpper atomICode)


sortOnResidue :: [PDB.Atom] -> Int
sortOnResidue [] = -1000000
sortOnResidue (PDB.Atom{..} : _) = atomSerial * 100 + (insertionCodeSortingCorrections M.!?! toUpper atomICode)

-- | Sorting weight of an insertion code: a blank code maps to 0 and the
-- letters @'A'..'Z'@ map to 1..26.
insertionCodeSortingCorrections :: Map Char Int
insertionCodeSortingCorrections = M.fromList (zip insertionCodes [0 ..])
  where
    insertionCodes = ' ' : ['A' .. 'Z']
2 changes: 1 addition & 1 deletion src/Bio/PDB/Reader.hs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ isMdlLine line = elem (T.take 6 line) modelStrings || elem (T.take 5 line) model

checkRow :: [Int] -> Bool
checkRow [] = True
checkRow xs = last xs - head xs + 1 == L.length xs
checkRow row@(x : _) = last row - x + 1 == L.length row

checkMdlLines :: ([PDBWarnings], Text) -> Bool
checkMdlLines warnings'n'text = checkRow mdlLineNumbers
Expand Down
2 changes: 1 addition & 1 deletion src/Bio/Sequence/Functions/Marking.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import Data.List (nub)
import Data.List.NonEmpty (NonEmpty (..))
import Data.Text (Text)
import qualified Data.Vector as V (toList)
import Prelude hiding (drop, head, length, null, reverse, tail, take, (!!))
import Prelude hiding (length)

import Bio.NucleicAcid.Nucleotide (Complementary (..))
import Bio.Sequence.Class (ContainsMarking, IsBareSequence, IsMarkedSequence,
Expand Down
4 changes: 1 addition & 3 deletions src/Bio/Sequence/Functions/Weight.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@ import Data.Text (Text)
import Data.Vector (Vector)
import qualified Data.Vector as V (drop, length, take,
toList, (!))
import Prelude hiding (drop, head, length,
null, reverse, tail, take,
(!!))
import Prelude hiding (drop, length, take, (!!))

-- | Range of form [a, b].
--
Expand Down
Loading

0 comments on commit 71fc907

Please sign in to comment.