Add request timings and count histograms to telemetry. Closes hasura#3552

We upload a set of accumulating timers and counters to track service
time for different types of operations, across several dimensions (e.g.
did we hit the plan cache, was a remote involved, etc.).
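
Concretely, the accumulating counters look roughly like this (a sketch; the real
definitions live in Hasura.Server.Telemetry.Counters, which is not shown in this
excerpt, and the names here are illustrative only):

    {-# LANGUAGE DeriveGeneric #-}
    module TelemetryCountersSketch (recordRequest) where

    import           Data.Hashable       (Hashable)
    import qualified Data.HashMap.Strict as HM
    import           Data.IORef          (IORef, atomicModifyIORef')
    import           Data.Time.Clock     (DiffTime)
    import           GHC.Generics        (Generic)

    -- | The dimensions we bucket on (e.g. plan-cache hit, remote involved).
    data RequestDimensions
      = RequestDimensions
      { rdCacheHit :: !Bool
      , rdRemote   :: !Bool
      } deriving (Eq, Generic)
    instance Hashable RequestDimensions

    -- | Accumulated service time and request count for one bucket.
    data RequestTimings
      = RequestTimings
      { rtServiceTime :: !DiffTime
      , rtCount       :: !Int
      }

    -- | Fold one request into the in-memory counters.
    recordRequest
      :: IORef (HM.HashMap RequestDimensions RequestTimings)
      -> RequestDimensions -> DiffTime -> IO ()
    recordRequest ref dims elapsed =
      atomicModifyIORef' ref $ \m ->
        (HM.insertWith addTimings dims (RequestTimings elapsed 1) m, ())
      where
        addTimings (RequestTimings t n) (RequestTimings t' n') =
          RequestTimings (t + t') (n + n')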

Also...

Standardize on DiffTime as the duration type, and try to use it
consistently.

See discussion here:
hasura#3584 (review)
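
Concretely: a bare numeric literal at type DiffTime means seconds, and the unit
wrappers in Data.Time.Clock.Units (changed below) are used only at the edges,
for literals and conversions. A few GHCi interactions, assuming the module as it
stands after this commit:

    >>> 2 :: DiffTime                        -- bare DiffTime literals are seconds
    2s
    >>> milliseconds 500                     -- a unit selector converts to DiffTime
    0.5s
    >>> fromUnits (2 :: Minutes) :: Seconds  -- explicit unit-to-unit conversion
    Seconds {seconds = 120s}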

It should be possible to overwrite that module so the new threadDelay
sticks, per the pattern in hasura#3705 (blocked on hasura#3558).

Rename Control.Concurrent.Extended.threadDelay to `sleep`, since a
naive use with a literal argument would be very bad!

We catch a bug in 'computeTimeDiff'.

Add convenient 'Read' instances to the time unit utility types. Make
'Seconds' a newtype to support this.
jberryman committed Jan 30, 2020
1 parent 5bd5a54 commit 2350069
Showing 24 changed files with 615 additions and 188 deletions.
6 changes: 5 additions & 1 deletion server/graphql-engine.cabal
@@ -196,6 +196,7 @@ library
, Control.Arrow.Trans
, Control.Monad.Stateless
, Control.Monad.Unique
, Data.Time.Clock.Units

-- exposed for tests
, Data.Parser.CacheControl
@@ -222,6 +223,8 @@ library
, Hasura.Server.Migrate
, Hasura.Server.Compression
, Hasura.Server.PGDump
-- Exposed for testing:
, Hasura.Server.Telemetry.Counters

, Hasura.RQL.Types
, Hasura.RQL.Types.Run
@@ -358,7 +361,6 @@ library
, Data.Sequence.NonEmpty
, Data.TByteString
, Data.Text.Extended
, Data.Time.Clock.Units

, Hasura.SQL.DML
, Hasura.SQL.Error
@@ -407,9 +409,11 @@ test-suite graphql-engine-tests
main-is: Main.hs
other-modules:
Data.Parser.CacheControlSpec
Data.TimeSpec
Hasura.IncrementalSpec
Hasura.RQL.MetadataSpec
Hasura.Server.MigrateSpec
Hasura.Server.TelemetrySpec

-- Benchmarks related to caching (e.g. the plan cache).
--
18 changes: 13 additions & 5 deletions server/src-lib/Control/Concurrent/Extended.hs
@@ -1,5 +1,7 @@
module Control.Concurrent.Extended
( module Control.Concurrent
, sleep
-- * Deprecated
, threadDelay
) where

@@ -8,9 +10,15 @@ import Prelude
import qualified Control.Concurrent as Base

import Control.Concurrent hiding (threadDelay)
import Data.Time.Clock (DiffTime)
import Data.Time.Clock.Units (Microseconds (..))
import Data.Time.Clock.Units (Microseconds (..), DiffTime)

-- | Like 'Base.threadDelay', but takes a 'DiffTime' instead of an 'Int'.
threadDelay :: DiffTime -> IO ()
threadDelay = Base.threadDelay . round . Microseconds
-- | Like 'Base.threadDelay', but takes a 'DiffTime' instead of an 'Int' microseconds.
--
-- NOTE: you cannot simply replace e.g. @threadDelay 1000@ with @sleep 1000@ since those literals
-- have different meanings!
sleep :: DiffTime -> IO ()
sleep = Base.threadDelay . round . Microseconds

{-# DEPRECATED threadDelay "Please use `sleep` instead (and read the docs!)" #-}
threadDelay :: Int -> IO ()
threadDelay = Base.threadDelay
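
To illustrate why the rename matters (a sketch using only the modules in this
diff): with the old API `threadDelay 1000` meant a thousand microseconds, while
under the new one a bare literal denotes seconds.

    import qualified Control.Concurrent as Base
    import Control.Concurrent.Extended (sleep)
    import Data.Time.Clock.Units (milliseconds)

    main :: IO ()
    main = do
      Base.threadDelay 1000     -- 1000 microseconds, i.e. one millisecond
      sleep (milliseconds 1)    -- the equivalent with the new API
      sleep 1                   -- a bare literal is one *second*, not one microsecond
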
102 changes: 82 additions & 20 deletions server/src-lib/Data/Time/Clock/Units.hs
@@ -25,55 +25,91 @@ You can also go the other way using the constructors rather than the selectors:
0.5
@
Generally, it doesn’t make sense to pass these wrappers around or put them inside data structures,
since any function that needs a duration should just accept a 'DiffTime', but they’re useful for
literals and conversions to/from other types. -}
NOTE: the 'Real' and 'Fractional' instances just essentially add or strip the unit label (as
above), so you can't use 'realToFrac' to convert between the units types here. Instead try
'fromUnits' which is less of a foot-gun.
The 'Read' instances for these types mirror the behavior of the 'RealFrac' instance wrt numeric
literals for convenient serialization (e.g. when working with env vars):
@
>>> read "1.2" :: Milliseconds
Milliseconds {milliseconds = 0.0012s}
@
Generally, if you need to pass around a duration between functions you should use 'DiffTime'
directly. However if storing a duration in a type that will be serialized, e.g. one having
a 'ToJSON' instance, it is better to use one of these explicit wrapper types so that it's
obvious what units will be used. -}
module Data.Time.Clock.Units
( Days(..)
, Hours(..)
, Minutes(..)
, Seconds
, seconds
, Seconds(..)
, Milliseconds(..)
, Microseconds(..)
, Nanoseconds(..)
-- * Converting between units
, Duration(..)
, fromUnits
-- * Reexports
-- | We use 'DiffTime' as the standard type for unit-agnostic duration in our
-- code. You'll need to convert to a 'NominalDiffTime' (with 'fromUnits') in
-- order to do anything useful with 'UTCTime' with these durations.
--
-- NOTE: some care must be taken especially when 'NominalDiffTime' interacts
-- with 'UTCTime':
--
-- - a 'DiffTime' or 'NominalDiffTime' may be negative
-- - 'addUTCTime' and 'diffUTCTime' do not attempt to handle leap seconds
, DiffTime
) where

import Prelude

import Control.Arrow (first)
import Data.Aeson
import Data.Hashable
import Data.Proxy
import Data.Time.Clock
import GHC.TypeLits
import Numeric (readFloat)

type Seconds = DiffTime

seconds :: DiffTime -> DiffTime
seconds = id
newtype Seconds = Seconds { seconds :: DiffTime }
-- NOTE: we want Show to give a pastable data structure string, even
-- though Read is custom.
deriving (Duration, Show, Eq, Ord, ToJSON, FromJSON)
deriving (Read, Num, Fractional, Real, Hashable, RealFrac) via (TimeUnit (SecondsP 1))

-- TODO if needed: deriving (ToJSON, FromJSON) via (TimeUnit ..) making sure
-- to copy Aeson instances (with withBoundedScientific), and e.g.
-- toJSON (5 :: Minutes) == Number 5
newtype Days = Days { days :: DiffTime }
deriving (Show, Eq, Ord)
deriving (Num, Fractional, Real, RealFrac) via (TimeUnit (SecondsP 86400))
deriving (Duration, Show, Eq, Ord)
deriving (Read, Num, Fractional, Real, Hashable, RealFrac) via (TimeUnit (SecondsP 86400))

newtype Hours = Hours { hours :: DiffTime }
deriving (Show, Eq, Ord)
deriving (Num, Fractional, Real, RealFrac) via (TimeUnit (SecondsP 3600))
deriving (Duration, Show, Eq, Ord)
deriving (Read, Num, Fractional, Real, Hashable, RealFrac) via (TimeUnit (SecondsP 3600))

newtype Minutes = Minutes { minutes :: DiffTime }
deriving (Show, Eq, Ord)
deriving (Num, Fractional, Real, RealFrac) via (TimeUnit (SecondsP 60))
deriving (Duration, Show, Eq, Ord)
deriving (Read, Num, Fractional, Real, Hashable, RealFrac) via (TimeUnit (SecondsP 60))

newtype Milliseconds = Milliseconds { milliseconds :: DiffTime }
deriving (Show, Eq, Ord)
deriving (Num, Fractional, Real, RealFrac) via (TimeUnit 1000000000)
deriving (Duration, Show, Eq, Ord)
deriving (Read, Num, Fractional, Real, Hashable, RealFrac) via (TimeUnit 1000000000)

newtype Microseconds = Microseconds { microseconds :: DiffTime }
deriving (Show, Eq, Ord)
deriving (Num, Fractional, Real, RealFrac) via (TimeUnit 1000000)
deriving (Duration, Show, Eq, Ord)
deriving (Read, Num, Fractional, Real, Hashable, RealFrac) via (TimeUnit 1000000)

newtype Nanoseconds = Nanoseconds { nanoseconds :: DiffTime }
deriving (Show, Eq, Ord)
deriving (Num, Fractional, Real, RealFrac) via (TimeUnit 1000)
deriving (Duration, Show, Eq, Ord)
deriving (Read, Num, Fractional, Real, Hashable, RealFrac) via (TimeUnit 1000)

-- Internal for deriving via
newtype TimeUnit (picosPerUnit :: Nat) = TimeUnit DiffTime
deriving (Show, Eq, Ord)

@@ -92,6 +128,9 @@ instance (KnownNat picosPerUnit) => Num (TimeUnit picosPerUnit) where
signum (TimeUnit a) = TimeUnit $ signum a
fromInteger a = TimeUnit . picosecondsToDiffTime $ a * natNum @picosPerUnit

instance (KnownNat picosPerUnit) => Read (TimeUnit picosPerUnit) where
readsPrec _ = map (first fromRational) . readFloat

instance (KnownNat picosPerUnit) => Fractional (TimeUnit picosPerUnit) where
TimeUnit a / TimeUnit b = TimeUnit . picosecondsToDiffTime $
diffTimeToPicoseconds a * natNum @picosPerUnit `div` diffTimeToPicoseconds b
@@ -107,3 +146,26 @@ instance (KnownNat picosPerUnit) => RealFrac (TimeUnit picosPerUnit) where
round = round . toRational
ceiling = ceiling . toRational
floor = floor . toRational

-- we can ignore unit:
instance Hashable (TimeUnit a) where
hashWithSalt salt (TimeUnit dt) = hashWithSalt salt $
(realToFrac :: DiffTime -> Double) dt


-- | Duration types isomorphic to 'DiffTime', powering 'fromUnits'.
class Duration d where
fromDiffTime :: DiffTime -> d
toDiffTime :: d -> DiffTime

instance Duration DiffTime where
fromDiffTime = id
toDiffTime = id

instance Duration NominalDiffTime where
fromDiffTime = realToFrac
toDiffTime = realToFrac

-- | Safe conversion between duration units.
fromUnits :: (Duration x, Duration y)=> x -> y
fromUnits = fromDiffTime . toDiffTime
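
The cabal file above also registers a Data.TimeSpec test module, which is not
shown in this excerpt; a sketch of the kind of properties it might check against
this module:

    module Main where

    import Data.Time.Clock.Units
    import Test.Hspec

    main :: IO ()
    main = hspec $ do
      it "interprets numeric literals in the named unit" $
        seconds 1 `shouldBe` milliseconds 1000
      it "converts between units with fromUnits" $
        (fromUnits (90 :: Seconds) :: Minutes) `shouldBe` 1.5
      it "reads bare numerals in the named unit" $
        (read "1.2" :: Milliseconds) `shouldBe` 1.2
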
6 changes: 4 additions & 2 deletions server/src-lib/Hasura/App.hs
@@ -241,19 +241,21 @@ runHGEServer ServeOptions{..} InitCtx{..} initTime = do
$ Warp.defaultSettings

maxEvThrds <- liftIO $ getFromEnv defaultMaxEventThreads "HASURA_GRAPHQL_EVENTS_HTTP_POOL_SIZE"
evFetchMilliSec <- liftIO $ getFromEnv defaultFetchIntervalMilliSec "HASURA_GRAPHQL_EVENTS_FETCH_INTERVAL"
fetchI <- fmap milliseconds $ liftIO $
getFromEnv defaultFetchIntervalMilliSec "HASURA_GRAPHQL_EVENTS_FETCH_INTERVAL"
logEnvHeaders <- liftIO $ getFromEnv False "LOG_HEADERS_FROM_ENV"

-- prepare event triggers data
prepareEvents _icPgPool logger
eventEngineCtx <- liftIO $ atomically $ initEventEngineCtx maxEvThrds evFetchMilliSec
eventEngineCtx <- liftIO $ atomically $ initEventEngineCtx maxEvThrds fetchI
unLogger logger $ mkGenericStrLog LevelInfo "event_triggers" "starting workers"
void $ liftIO $ C.forkIO $ processEventQueue logger logEnvHeaders
_icHttpManager _icPgPool (getSCFromRef cacheRef) eventEngineCtx

-- start a background thread to check for updates
void $ liftIO $ C.forkIO $ checkForUpdates loggerCtx _icHttpManager

-- TODO async/immortal:
-- start a background thread for telemetry
when soEnableTelemetry $ do
unLogger logger $ mkGenericStrLog LevelInfo "telemetry" telemetryNotice
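
The `fmap milliseconds` in runHGEServer above relies on the new 'Read' instance
for Milliseconds: the env var is read in milliseconds and then unwrapped to a
plain DiffTime for initEventEngineCtx. A small standalone illustration
(getFromEnv itself is not shown in this excerpt):

    import Data.Time.Clock.Units (DiffTime, Milliseconds, milliseconds)
    import Text.Read (readMaybe)

    -- | "1000" parses as 1000 milliseconds, i.e. a DiffTime of one second.
    parseFetchInterval :: String -> Maybe DiffTime
    parseFetchInterval s = milliseconds <$> (readMaybe s :: Maybe Milliseconds)

    -- >>> parseFetchInterval "1000"
    -- Just 1s
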
20 changes: 8 additions & 12 deletions server/src-lib/Hasura/Events/Lib.hs
@@ -7,7 +7,7 @@ module Hasura.Events.Lib
, Event(..)
) where

import Control.Concurrent (threadDelay)
import Control.Concurrent.Extended (sleep)
import Control.Concurrent.Async (async, waitAny)
import Control.Concurrent.STM.TVar
import Control.Exception (try)
@@ -149,19 +149,19 @@ data EventEngineCtx
{ _eeCtxEventQueue :: TQ.TQueue Event
, _eeCtxEventThreads :: TVar Int
, _eeCtxMaxEventThreads :: Int
, _eeCtxFetchIntervalMilliSec :: Int
, _eeCtxFetchInterval :: DiffTime
}

defaultMaxEventThreads :: Int
defaultMaxEventThreads = 100

defaultFetchIntervalMilliSec :: Int
defaultFetchIntervalMilliSec :: Milliseconds
defaultFetchIntervalMilliSec = 1000

retryAfterHeader :: CI.CI T.Text
retryAfterHeader = "Retry-After"

initEventEngineCtx :: Int -> Int -> STM EventEngineCtx
initEventEngineCtx :: Int -> DiffTime -> STM EventEngineCtx
initEventEngineCtx maxT fetchI = do
q <- TQ.newTQueue
c <- newTVar 0
@@ -185,7 +185,7 @@ pushEvents logger pool eectx = forever $ do
case eventsOrError of
Left err -> L.unLogger logger $ EventInternalErr err
Right events -> atomically $ mapM_ (TQ.writeTQueue q) events
threadDelay (fetchI * 1000)
sleep fetchI

consumeEvents
:: (HasVersion) => L.Logger L.Hasura -> LogEnvHeaders -> HTTP.Manager -> Q.PGPool -> IO SchemaCache
@@ -285,7 +285,7 @@ retryOrSetError :: Event -> RetryConf -> HTTPErr -> Q.TxE QErr ()
retryOrSetError e retryConf err = do
let mretryHeader = getRetryAfterHeaderFromError err
tries = eTries e
mretryHeaderSeconds = parseRetryHeader mretryHeader
mretryHeaderSeconds = mretryHeader >>= parseRetryHeader
triesExhausted = tries >= rcNumRetries retryConf
noRetryHeader = isNothing mretryHeaderSeconds
-- current_try = tries + 1 , allowed_total_tries = rcNumRetries retryConf + 1
@@ -308,12 +308,8 @@ retryOrSetError e retryConf err = do
in case mHeader of
Just (HeaderConf _ (HVValue value)) -> Just value
_ -> Nothing
parseRetryHeader Nothing = Nothing
parseRetryHeader (Just hValue)
= let seconds = readMaybe $ T.unpack hValue
in case seconds of
Nothing -> Nothing
Just sec -> if sec > 0 then Just sec else Nothing

parseRetryHeader = mfilter (> 0) . readMaybe . T.unpack

encodeHeader :: EventHeaderInfo -> HTTP.Header
encodeHeader (EventHeaderInfo hconf cache) =
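The point-free parseRetryHeader above is equivalent to the case analysis it
replaces: readMaybe parses the header text and mfilter (> 0) rejects
non-positive values. A standalone illustration (the concrete numeric type in the
real code may differ):

    import Control.Monad (mfilter)
    import qualified Data.Text as T
    import Text.Read (readMaybe)

    parseRetryHeader :: T.Text -> Maybe Int
    parseRetryHeader = mfilter (> 0) . readMaybe . T.unpack

    -- >>> map (parseRetryHeader . T.pack) ["10", "0", "-5", "soon"]
    -- [Just 10,Nothing,Nothing,Nothing]
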
15 changes: 9 additions & 6 deletions server/src-lib/Hasura/GraphQL/Execute.hs
@@ -57,6 +57,7 @@ import qualified Hasura.GraphQL.Resolve as GR
import qualified Hasura.GraphQL.Validate as VQ
import qualified Hasura.GraphQL.Validate.Types as VT
import qualified Hasura.Logging as L
import qualified Hasura.Server.Telemetry.Counters as Telem

-- The current execution plan of a graphql operation, it is
-- currently, either local pg execution or a remote execution
@@ -184,21 +185,21 @@ getResolvedExecPlan
-> SchemaCache
-> SchemaCacheVer
-> GQLReqUnparsed
-> m ExecPlanResolved
-> m (Telem.CacheHit, ExecPlanResolved)
getResolvedExecPlan pgExecCtx planCache userInfo sqlGenCtx
enableAL sc scVer reqUnparsed = do
planM <- liftIO $ EP.getPlan scVer (userRole userInfo)
opNameM queryStr planCache
let usrVars = userVars userInfo
case planM of
-- plans are only for queries and subscriptions
Just plan -> GExPHasura <$> case plan of
Just plan -> (Telem.Hit,) . GExPHasura <$> case plan of
EP.RPQuery queryPlan -> do
(tx, genSql) <- EQ.queryOpFromPlan usrVars queryVars queryPlan
return $ ExOpQuery tx (Just genSql)
EP.RPSubs subsPlan ->
ExOpSubs <$> EL.reuseLiveQueryPlan pgExecCtx usrVars queryVars subsPlan
Nothing -> noExistingPlan
Nothing -> (Telem.Miss,) <$> noExistingPlan
where
GQLReq opNameM queryStr queryVars = reqUnparsed
addPlanToCache plan =
@@ -357,7 +358,8 @@ execRemoteGQ
-> GQLReqUnparsed
-> RemoteSchemaInfo
-> G.TypedOperationDefinition
-> m (HttpResponse EncJSON)
-> m (DiffTime, HttpResponse EncJSON)
-- ^ Also returns time spent in http request, for telemetry.
execRemoteGQ reqId userInfo reqHdrs q rsi opDef = do
execCtx <- ask
let logger = _ecxLogger execCtx
Expand Down Expand Up @@ -387,11 +389,12 @@ execRemoteGQ reqId userInfo reqHdrs q rsi opDef = do
}

L.unLogger logger $ QueryLog q Nothing reqId
res <- liftIO $ try $ HTTP.httpLbs req manager
(time, res) <- withElapsedTime $ liftIO $ try $ HTTP.httpLbs req manager
resp <- either httpThrow return res
let cookieHdrs = getCookieHdr (resp ^.. Wreq.responseHeader "Set-Cookie")
respHdrs = Just $ mkRespHeaders cookieHdrs
return $ HttpResponse (encJFromLBS $ resp ^. Wreq.responseBody) respHdrs
!httpResp = HttpResponse (encJFromLBS $ resp ^. Wreq.responseBody) respHdrs
return (time, httpResp)

where
RemoteSchemaInfo url hdrConf fwdClientHdrs timeout = rsi
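
execRemoteGQ now also reports the time spent in the HTTP request, for
telemetry. withElapsedTime itself is not part of this excerpt; a minimal sketch
of such a helper over the monotonic clock, returning a DiffTime per the
convention above (the real implementation may differ):

    import Control.Monad.IO.Class (MonadIO, liftIO)
    import Data.Time.Clock (DiffTime, picosecondsToDiffTime)
    import GHC.Clock (getMonotonicTimeNSec)

    -- | Run an action, returning its result along with the elapsed
    -- (monotonic-clock) time it took.
    withElapsedTime :: MonadIO m => m a -> m (DiffTime, a)
    withElapsedTime action = do
      start  <- liftIO getMonotonicTimeNSec
      result <- action
      end    <- liftIO getMonotonicTimeNSec
      let elapsed = picosecondsToDiffTime $ fromIntegral (end - start) * 1000
      return (elapsed, result)
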
7 changes: 3 additions & 4 deletions server/src-lib/Hasura/GraphQL/Execute/LiveQuery/Options.hs
@@ -9,9 +9,6 @@ import Hasura.Prelude

import qualified Data.Aeson as J

import Data.Time.Clock (DiffTime)
import Data.Time.Clock.Units (seconds)

data LiveQueriesOptions
= LiveQueriesOptions
{ _lqoBatchSize :: !BatchSize
@@ -21,7 +18,7 @@ mkLiveQueriesOptions :: Maybe BatchSize -> Maybe RefetchInterval -> LiveQueriesOptions
mkLiveQueriesOptions :: Maybe BatchSize -> Maybe RefetchInterval -> LiveQueriesOptions
mkLiveQueriesOptions batchSize refetchInterval = LiveQueriesOptions
{ _lqoBatchSize = fromMaybe (BatchSize 100) batchSize
, _lqoRefetchInterval = fromMaybe (RefetchInterval $ seconds 1) refetchInterval
, _lqoRefetchInterval = fromMaybe (RefetchInterval 1) refetchInterval
}

instance J.ToJSON LiveQueriesOptions where
@@ -33,5 +30,7 @@ instance J.ToJSON LiveQueriesOptions where
newtype BatchSize = BatchSize { unBatchSize :: Int }
deriving (Show, Eq, J.ToJSON)

-- TODO this is treated as milliseconds in fromEnv and as seconds in ToJSON.
-- ideally this would have e.g. ... unRefetchInterval :: Milliseconds
newtype RefetchInterval = RefetchInterval { unRefetchInterval :: DiffTime }
deriving (Show, Eq, J.ToJSON)
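
Since RefetchInterval now wraps a DiffTime, the bare literal in
`RefetchInterval 1` above means one second, matching the `seconds 1` it
replaces. A couple of GHCi-style checks, assuming the definitions in this diff:

    >>> unRefetchInterval (RefetchInterval 1)
    1s
    >>> RefetchInterval (milliseconds 1000) == RefetchInterval 1
    True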
