Skip to content

Commit

Permalink
IO fixes, bond restoring (#32)
Browse files Browse the repository at this point in the history
* Residue index supported in PDB

* Residue number and insertion code: the proper way

* HETATMs now parsed from PDB

* Insertion code support in mae

* PDBSpec renamed to PDBParserSpec

* Bond restoring first version. Doesn't work correctly for now

* Global bond restoring work correctly

* Fixed text

* Type

* Bond restoring works correctly. Input data must be slightly fixed

* Added missing disulfide bridges to mae structures

* Local bond for PDB, with no tests yet

* Fuck, nothing works

* Everything works!

* pedantation

* Fetching from REST API avoided in MMTF test

* Changelog, version

* Removed some strange comments in a test

* Changes after the review

* Added atomInputIndex. GlobalID is 0-based now

* Changelog and formatting
  • Loading branch information
tiermak authored Apr 1, 2020
1 parent 94796ca commit c521fd1
Show file tree
Hide file tree
Showing 71 changed files with 45,491 additions and 606 deletions.
11 changes: 11 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@

## [Unreleased]

## [0.1.3.0] - 2020-03-27
### Added
- Residue index in `Structure`.
- Atom input index in `Structure`.
- Bond restoring for PDB.
- Tests for PDB -> Model conversion.
### Changed
- GlobalID now 0-based in mae, PDB, and MMTF.
### Fixed
- A lot of things.

## [0.1.2.10] - 2020-03-27
### Added
- Lenses for `Structure`.
Expand Down
2 changes: 1 addition & 1 deletion package.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cobot-io
version: 0.1.2.10
version: 0.1.3.0
github: "less-wrong/cobot-io"
license: BSD3
category: Bio
Expand Down
30 changes: 22 additions & 8 deletions src/Bio/MAE.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{-# LANGUAGE AllowAmbiguousTypes #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE AllowAmbiguousTypes #-}
{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

module Bio.MAE
Expand All @@ -8,13 +8,15 @@ module Bio.MAE
, Table (..)
, fromFile
, fromText
, modelsFromMaeText
, modelsFromMaeFile
, maeP
) where

import Bio.MAE.Parser
import Bio.MAE.Type (Block (..), FromMaeValue (..),
Mae (..), MaeValue (..), Table (..))
import Bio.Structure (Atom (..), Bond (..), Chain (..),
import Bio.Structure (Atom (..), Bond (..), Chain (..), Model (..),
GlobalID (..), LocalID (..),
Model (..), Residue (..),
SecondaryStructure (..),
Expand Down Expand Up @@ -50,6 +52,12 @@ fromFile f = liftIO (TIO.readFile f) >>= either fail pure . parseOnly maeP
fromText :: Text -> Either Text Mae
fromText = first T.pack . parseOnly maeP

modelsFromMaeFile :: (MonadIO m) => FilePath -> m (Either Text (Vector Model))
modelsFromMaeFile = liftIO . fmap modelsFromMaeText . TIO.readFile

modelsFromMaeText :: Text -> Either Text (Vector Model)
modelsFromMaeText maeText = modelsOf <$> fromText maeText

instance StructureModels Mae where
modelsOf Mae{..} = V.fromList $ fmap blockToModel blocks
where
Expand Down Expand Up @@ -117,18 +125,23 @@ instance StructureModels Mae where
groupedByResidues = toGroupsOn by group
residues = V.fromList $ fmap groupToResidue groupedByResidues

by :: Int -> (Int, Text)
by i = (unsafeGetFromContents "i_m_residue_number" i, getFromContents defaultChainName "s_m_insertion_code" i)
by :: Int -> (Int, Char)
by i = (unsafeGetFromContents "i_m_residue_number" i, getFromContents defaultInsertionCode "s_m_insertion_code" i)

defaultChainName :: Text
defaultChainName = "A"

defaultInsertionCode :: Char
defaultInsertionCode = ' '

groupToResidue :: [Int] -> Residue
groupToResidue [] = error "Group that is result of List.groupBy can't be empty."
groupToResidue group@(h : _) = Residue name atoms (V.fromList localBonds) secondary chemCompType
groupToResidue group@(h : _) = Residue name residueNumber insertionCode atoms (V.fromList localBonds) secondary chemCompType
where
name = stripQuotes $ unsafeGetFromContents "s_m_pdb_residue_name" h
atoms = V.fromList $ fmap indexToAtom group
name = stripQuotes $ unsafeGetFromContents "s_m_pdb_residue_name" h
residueNumber = unsafeGetFromContents "i_m_residue_number" h
insertionCode = unsafeGetFromContents "s_m_insertion_code" h
atoms = V.fromList $ fmap indexToAtom group

localInds = [0 .. length group - 1]
globalToLocal = M.fromList $ zip group localInds
Expand All @@ -146,6 +159,7 @@ instance StructureModels Mae where

indexToAtom :: Int -> Atom
indexToAtom i = Atom (GlobalID i)
(i + 1)
(stripQuotes $ getFromContentsI "s_m_pdb_atom_name")
(elIndToElement M.! getFromContentsI "i_m_atomic_number")
coords
Expand Down
9 changes: 9 additions & 0 deletions src/Bio/MAE/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ module Bio.MAE.Type
import Data.Map.Strict (Map)
import Data.Maybe (fromJust)
import Data.Text (Text)
import qualified Data.Text as T (head, null, dropAround)

data Mae = Mae { version :: Text
, blocks :: [Block]
Expand Down Expand Up @@ -58,3 +59,11 @@ instance FromMaeValue Text where
fromMaeValue :: MaeValue -> Maybe Text
fromMaeValue (StringMaeValue t) = Just t
fromMaeValue _ = Nothing

instance FromMaeValue Char where
fromMaeValue :: MaeValue -> Maybe Char
fromMaeValue (StringMaeValue t) = Just $ if T.null t then ' ' else T.head $ stripQuotes t
fromMaeValue _ = Nothing

stripQuotes :: Text -> Text
stripQuotes = T.dropAround (== '"')
19 changes: 11 additions & 8 deletions src/Bio/MMTF.hs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ fetch pdbid = do let url = fromString $ "https://mmtf.rcsb.org/v1.0/full/" <> pd
decode (getResponseBody resp)

instance StructureModels MMTF where
-- TODO: add global bonds
modelsOf m = l2v (flip Model empty . l2v <$> zipWith (zipWith Chain) chainNames chainResis)
where
chainsCnts = fromIntegral <$> toList (chainsPerModel (model m))
Expand Down Expand Up @@ -70,7 +71,8 @@ instance StructureModels MMTF where
in (end, mkAtom <$> zip4 cl nl el ics)

mkResidue :: (GroupType, SecondaryStructure, [Atom]) -> Residue
mkResidue (gt, ss, atoms') = Residue (gtGroupName gt) (l2v atoms')
-- TODO: support residue number here
mkResidue (gt, ss, atoms') = Residue (gtGroupName gt) (-1) ' ' (l2v atoms')
(mkBonds (gtBondAtomList gt) (gtBondOrderList gt))
ss (gtChemCompType gt)

Expand All @@ -87,13 +89,14 @@ instance StructureModels MMTF where
z = zCoordList (atom m)
o = occupancyList (atom m)
b = bFactorList (atom m)
in Atom (GlobalID $ fromIntegral (i ! idx))
n
e
(V3 (x ! idx) (y ! idx) (z ! idx))
fc
(b ! idx)
(o ! idx)
in Atom (GlobalID idx)
(fromIntegral $ i ! idx)
n
e
(V3 (x ! idx) (y ! idx) (z ! idx))
fc
(b ! idx)
(o ! idx)

cutter :: [Int] -> [a] -> [[a]]
cutter [] [] = []
Expand Down
121 changes: 71 additions & 50 deletions src/Bio/PDB.hs
Original file line number Diff line number Diff line change
@@ -1,61 +1,82 @@
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Bio.PDB
(
( modelsFromPDBText
, modelsFromPDBFile
) where

import qualified Bio.PDB.Type as PDB
import qualified Bio.PDB.Type as PDB
import Bio.PDB.Reader (fromTextPDB, PDBWarnings)
import Bio.PDB.BondRestoring (restoreModelGlobalBonds, restoreChainLocalBonds, residueID)
import Bio.PDB.Functions (groupChainByResidue)
import Bio.Structure

import Control.Arrow ((&&&))
import Data.Coerce (coerce)
import Data.Foldable (Foldable (..))
import Data.Text as T (Text, singleton, unpack)
import qualified Data.Vector as V
import Linear.V3 (V3 (..))
import Control.Arrow ((&&&))
import Control.Monad.IO.Class (MonadIO, liftIO)

import Data.Text as T (Text, singleton, unpack, strip)
import Data.Text.IO as TIO (readFile)
import Data.Map (Map)
import qualified Data.Map as M ((!), fromList)
import qualified Data.Vector as V
import Data.List (sort)
import Data.Maybe (fromMaybe)

import Text.Read (readMaybe)

import Linear.V3 (V3 (..))

instance StructureModels PDB.PDB where
modelsOf PDB.PDB {..} = fmap mkModel models
where
mkModel :: PDB.Model -> Model
mkModel = flip Model V.empty . fmap mkChain

mkChain :: PDB.Chain -> Chain
mkChain = uncurry Chain . (mkChainName &&& mkChainResidues)

mkChainName :: PDB.Chain -> Text
mkChainName = T.singleton . PDB.atomChainID . safeFirstAtom

mkChainResidues :: PDB.Chain -> V.Vector Residue
mkChainResidues = V.fromList . fmap mkResidue . flip groupByResidue [] . pure . toList

-- can be rewritten with sortOn and groupBy
groupByResidue :: [[PDB.Atom]] -> [PDB.Atom] -> [[PDB.Atom]]
groupByResidue res [] = res
groupByResidue [] (x : xs) = groupByResidue [[x]] xs
groupByResidue res@(lastList : resultTail) (x : xs)
| (PDB.atomResSeq x, PDB.atomICode x) == (PDB.atomResSeq (head lastList), PDB.atomICode (head lastList))
= groupByResidue ((x : lastList) : resultTail) xs
| otherwise = groupByResidue ([x] : res) xs

safeFirstAtom :: V.Vector PDB.Atom -> PDB.Atom
safeFirstAtom arr | V.length arr > 0 = arr V.! 0
| otherwise = error "Could not pick first atom"


mkResidue :: [PDB.Atom] -> Residue
mkResidue [] = error "Cound not make residue from empty list"
mkResidue atoms' = Residue (PDB.atomResName . head $ atoms')
(V.fromList $ mkAtom <$> atoms')
V.empty -- now we do not read bonds
Undefined -- now we do not read secondary structure
"" -- chemical component type?!


mkAtom :: PDB.Atom -> Atom
mkAtom PDB.Atom{..} = Atom (coerce atomSerial)
atomName
atomElement
(V3 atomX atomY atomZ)
(read $ T.unpack atomCharge)
atomTempFactor
atomOccupancy
mkModel model = Model (fmap mkChain model) (restoreModelGlobalBonds atomSerialToNilBasedIndex model)
where
atomSerialToNilBasedIndex :: Map Int Int
atomSerialToNilBasedIndex = M.fromList $ allModelAtomSerials `zip` [0..]

allModelAtomSerials :: [Int]
allModelAtomSerials = sort . V.toList . fmap PDB.atomSerial . V.concat $ V.toList model

mkChain :: PDB.Chain -> Chain
mkChain = uncurry Chain . (mkChainName &&& mkChainResidues)

mkChainName :: PDB.Chain -> Text
mkChainName = T.singleton . PDB.atomChainID . safeFirstAtom

mkChainResidues :: PDB.Chain -> V.Vector Residue
mkChainResidues chain = V.fromList . fmap (mkResidue (restoreChainLocalBonds chain)) $ groupChainByResidue chain

safeFirstAtom :: V.Vector PDB.Atom -> PDB.Atom
safeFirstAtom arr | V.length arr > 0 = arr V.! 0
| otherwise = error "Could not pick first atom"

mkResidue :: Map Text (V.Vector (Bond LocalID)) -> [PDB.Atom] -> Residue
mkResidue _ [] = error "Cound not make residue from empty list"
mkResidue localBondsMap atoms' = Residue (T.strip $ PDB.atomResName firstResidueAtom)
(PDB.atomResSeq firstResidueAtom)
(PDB.atomICode firstResidueAtom)
(V.fromList $ mkAtom <$> atoms')
(localBondsMap M.! residueID firstResidueAtom)
Undefined -- now we do not read secondary structure
"" -- chemical component type?!
where
firstResidueAtom = head atoms'

mkAtom :: PDB.Atom -> Atom
mkAtom PDB.Atom{..} = Atom (GlobalID $ atomSerialToNilBasedIndex M.! atomSerial)
atomSerial
(T.strip atomName)
atomElement
(V3 atomX atomY atomZ)
(fromMaybe 0 . readMaybe $ T.unpack atomCharge)
atomTempFactor
atomOccupancy

modelsFromPDBFile :: (MonadIO m) => FilePath -> m (Either Text ([PDBWarnings], V.Vector Model))
modelsFromPDBFile = liftIO . fmap modelsFromPDBText . TIO.readFile

modelsFromPDBText :: Text -> Either Text ([PDBWarnings], V.Vector Model)
modelsFromPDBText pdbText = do
(warnings, parsedPDB) <- fromTextPDB pdbText
let models = modelsOf parsedPDB
pure (warnings, models)
Loading

0 comments on commit c521fd1

Please sign in to comment.