SQLITE Contention: Add retries and enable WAL mode (#355)
* Using SQLite's busy timeout didn't fix the issue with busy errors
  preventing sessions from being updated and learning from occurring.
* Enabling WAL mode seems to have made a huge difference.
* WAL uses a separate file to log writes and allows concurrent reads and
  writes (a minimal sketch of the setup is shown below).
* Add logic at the application layer to retry the updates.
* Use an atomic counter to track the number of concurrent updates, to try
  to confirm we don't have multiple concurrent writes.
* To better track issues with Analyzer lag, start tracking the timestamp
  of the last processed log message.
* Add a Gauge metric to report Analyzer lag.
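
For context, here is a minimal sketch of the kind of SQLite setup this commit applies. The two PRAGMA statements mirror the diff below; the driver import, the `openSessionsDB` helper, and the database path are assumptions for illustration, since the excerpt doesn't show how the database is opened.

```go
package main

import (
	"database/sql"
	"log"

	_ "github.com/mattn/go-sqlite3" // assumed driver; the repo may use a different SQLite driver
)

// openSessionsDB is a hypothetical helper illustrating the pragmas set in this commit.
func openSessionsDB(path string) (*sql.DB, error) {
	db, err := sql.Open("sqlite3", path)
	if err != nil {
		return nil, err
	}

	// Wait up to 10s for a lock instead of failing immediately with SQLITE_BUSY.
	if _, err := db.Exec("PRAGMA busy_timeout = 10000;"); err != nil {
		return nil, err
	}

	// Switch to write-ahead logging so readers don't block the single writer.
	// WAL is persistent: subsequent opens of the same database keep using it.
	if _, err := db.Exec("PRAGMA journal_mode=WAL;"); err != nil {
		return nil, err
	}
	return db, nil
}

func main() {
	db, err := openSessionsDB("sessions.db")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
}
```

With WAL enabled, readers use the write-ahead log rather than taking the lock writers need, which is consistent with the commit's observation that it made a large difference for contention.
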
jlewi authored Dec 11, 2024
1 parent 2ddd974 commit 4add79b
Showing 18 changed files with 533 additions and 1,265 deletions.
32 changes: 26 additions & 6 deletions app/pkg/analyze/analyzer.go
@@ -11,6 +11,9 @@ import (
"sync"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/jlewi/foyle/app/pkg/logs/matchers"

"github.com/jlewi/foyle/app/pkg/runme/converters"
@@ -34,6 +37,13 @@ import (
"k8s.io/client-go/util/workqueue"
)

var (
lagGauge = promauto.NewGauge(prometheus.GaugeOpts{
Name: "analyzer_logs_lag_seconds", // Metric name
Help: "Lag in seconds between the current time and last processed log entry", // Metric description
})
)

const (
// traceField is the field that contains the traceId in a log entry. We use this to identify processing related
// to a particular trace. We don't use the field "traceId" because these log entries aren't actually part of the
@@ -264,6 +274,8 @@ func (a *Analyzer) processLogFile(ctx context.Context, path string) error {

traceIDs := make(map[string]bool)

lastLogTime := time.Time{}

// We read the lines line by line. We keep track of all the traceIDs mentioned in those lines. We
// Then do a combineAndCheckpoint for all the traceIDs mentioned. Lines are also persisted in a KV store
// keyed by traceID. So if on the next iteration we get a new line for a given traceId and need to reprocess
@@ -275,6 +287,8 @@ func (a *Analyzer) processLogFile(ctx context.Context, path string) error {
continue
}

lastLogTime = entry.Time()

// Add the entry to a session if it should be.
a.sessBuilder.processLogEntry(entry, a.learnNotifier)

@@ -312,7 +326,7 @@ func (a *Analyzer) processLogFile(ctx context.Context, path string) error {
}

// Now run a combineAndCheckpoint
a.combineAndCheckpoint(ctx, path, offset, traceIDs)
a.combineAndCheckpoint(ctx, path, offset, lastLogTime, traceIDs)

// If we are shutting down we don't want to keep processing the file.
// By aborting shutdown here as opposed to here we are blocking shutdown for at least as long as it takes
@@ -326,7 +340,7 @@ func (a *Analyzer) processLogFile(ctx context.Context, path string) error {

// combineAndCheckpoint runs a combine operation for all the traceIDs listed in the map.
// Progress is then checkpointed.
func (a *Analyzer) combineAndCheckpoint(ctx context.Context, path string, offset int64, traceIDs map[string]bool) {
func (a *Analyzer) combineAndCheckpoint(ctx context.Context, path string, offset int64, lastLogTime time.Time, traceIDs map[string]bool) {
log := logs.FromContext(ctx)
// Combine the entries for each trace that we saw.
// N.B. We could potentially make this more efficient by checking if the log message is the final message
@@ -337,7 +351,11 @@ func (a *Analyzer) combineAndCheckpoint(ctx context.Context, path string, offset
}
}
// Update the offset
a.setLogFileOffset(path, offset)
a.setLogFileOffset(path, offset, lastLogTime)

// Update the lag
lag := time.Since(lastLogTime)
lagGauge.Set(lag.Seconds())
}

func (a *Analyzer) GetWatermark() *logspb.LogsWaterMark {
@@ -362,13 +380,15 @@ func (a *Analyzer) getLogFileOffset(path string) int64 {
return a.logFileOffsets.Offset
}

func (a *Analyzer) setLogFileOffset(path string, offset int64) {
func (a *Analyzer) setLogFileOffset(path string, offset int64, lastTimestamp time.Time) {
a.mu.Lock()
defer a.mu.Unlock()
lastTime := timestamppb.New(lastTimestamp)
oldWatermark := a.logFileOffsets
a.logFileOffsets = &logspb.LogsWaterMark{
File: path,
Offset: offset,
File: path,
Offset: offset,
LastLogTimestamp: lastTime,
}

log := logs.NewLogger()
12 changes: 12 additions & 0 deletions app/pkg/analyze/crud.go
@@ -3,6 +3,7 @@ package analyze
import (
"context"
"sort"
"time"

"connectrpc.com/connect"
"github.com/jlewi/foyle/app/pkg/logs"
@@ -22,14 +23,21 @@ type CrudHandler struct {
blocksDB *pebble.DB
tracesDB *pebble.DB
analyzer *Analyzer
location *time.Location
}

func NewCrudHandler(cfg config.Config, blocksDB *pebble.DB, tracesDB *pebble.DB, analyzer *Analyzer) (*CrudHandler, error) {
// Load the PST time zone
location, err := time.LoadLocation("America/Los_Angeles")
if err != nil {
return nil, errors.Wrap(err, "Failed to load location America/Los_Angeles")
}
return &CrudHandler{
cfg: cfg,
blocksDB: blocksDB,
tracesDB: tracesDB,
analyzer: analyzer,
location: location,
}, nil
}

@@ -112,5 +120,9 @@ func (h *CrudHandler) Status(ctx context.Context, request *connect.Request[logsp
Watermark: h.analyzer.GetWatermark(),
}

lastTime := response.GetWatermark().GetLastLogTimestamp().AsTime()
lastTime = lastTime.In(h.location)
formattedTime := lastTime.Format("2006-01-02 3:04:05 PM MST")
response.LastLogTimestampHuman = formattedTime
return connect.NewResponse(response), nil
}
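
The layout string above uses Go's reference time (Mon Jan 2 15:04:05 MST 2006). A small illustration of what the human-readable watermark looks like; the sample instant is made up, and real values come from the analyzer's LogsWaterMark:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	loc, err := time.LoadLocation("America/Los_Angeles")
	if err != nil {
		panic(err)
	}
	// Hypothetical watermark timestamp, converted to Pacific time before formatting.
	last := time.Date(2024, 12, 11, 22, 30, 5, 0, time.UTC).In(loc)
	fmt.Println(last.Format("2006-01-02 3:04:05 PM MST")) // 2024-12-11 2:30:05 PM PST
}
```
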
197 changes: 126 additions & 71 deletions app/pkg/analyze/session_manager.go
@@ -7,6 +7,10 @@ import (
"fmt"
"os"
"path/filepath"
"sync/atomic"
"time"

"github.com/cenkalti/backoff/v4"

"github.com/go-logr/zapr"
"go.uber.org/zap"
@@ -47,6 +51,11 @@ var (
Name: "sqlite_busy",
Help: "Number of operations that failed because sqlite was busy",
})

activeUpdatesGauge = promauto.NewGauge(prometheus.GaugeOpts{
Name: "session_manager_active_updates", // Metric name
Help: "Number of active updates", // Metric description
})
)

// GetDDL return the DDL for the database.
@@ -63,6 +72,9 @@ type SessionUpdater func(session *logspb.Session) error
type SessionsManager struct {
queries *fsql.Queries
db *sql.DB
// Keep track of the number of concurrent calls to Update.
// This is intended to try to track down why SQLITE_BUSY errors are so frequent.
activeUpdates atomic.Int32
}

func NewSessionsManager(db *sql.DB) (*SessionsManager, error) {
@@ -81,6 +93,20 @@ func NewSessionsManager(db *sql.DB) (*SessionsManager, error) {
log := zapr.NewLogger(zap.L())
log.Info("sqlite busy_timeout set", "timeout", 5000)

if _, err := db.Exec("PRAGMA busy_timeout = 10000;"); err != nil {
return nil, errors.Wrapf(err, "Failed to set busy timeout for the database")
}

// Activate WAL mode. This hopefully helps with SQLITE_BUSY errors and contention by using a separate file
// to log writes.
// https://www.sqlite.org/wal.html#:~:text=One%20has%20merely%20to%20run,set%20on%20any%20one%20connection.
// This mode is persistent: the next time the application opens the database it will still be in WAL mode.
output, err := db.Exec("PRAGMA journal_mode=WAL;")
log.Info("Set journal mode to WAL", "output", output)
if err != nil {
return nil, errors.Wrapf(err, "Failed to set journal mode to WAL")
}

// Create the dbtx from the actual database
queries := fsql.New(db)

@@ -117,98 +143,127 @@ func (db *SessionsManager) Get(ctx context.Context, contextID string) (*logspb.S
// inserted if the updateFunc returns nil. If the session already exists then the session is passed to updateFunc
// and the updated value is then written to the database
func (db *SessionsManager) Update(ctx context.Context, contextID string, updateFunc SessionUpdater) error {
// Increment the counter when entering the function
numActive := db.activeUpdates.Add(1)
defer func() {
// Decrement the counter when leaving the function
value := db.activeUpdates.Add(-1)
activeUpdatesGauge.Set(float64(value))
}()

log := logs.FromContext(ctx)
if contextID == "" {
return errors.WithStack(errors.New("contextID must be non-empty"))
}
log = log.WithValues("contextId", contextID)

sessCounter.WithLabelValues("start").Inc()

tx, err := db.db.BeginTx(ctx, &sql.TxOptions{})
if err != nil {
sessCounter.WithLabelValues("failedstart").Inc()
return errors.Wrapf(err, "Failed to start transaction")
// Intended to track whether SQLITE_BUSY errors are correlated with the number of concurrent calls to Update.
if numActive > 1 {
log.Info("Concurrent Session Updates", "numActive", numActive)
}

err = func() error {
queries := db.queries.WithTx(tx)
// Read the record
sessRow, err := queries.GetSession(ctx, contextID)
activeUpdatesGauge.Set(float64(numActive))
sessCounter.WithLabelValues("start").Inc()

// If the session doesn't exist then we do nothing because session is initialized to an empty session
session := &logspb.Session{
ContextId: contextID,
}
if err != nil {
logDBErrors(ctx, err)
if err != sql.ErrNoRows {
sessCounter.WithLabelValues("failedget").Inc()
return errors.Wrapf(err, "Failed to get session with id %v", contextID)
// Wrap the updates in a retry loop. This is intended to deal with SQLITE_BUSY errors and other possible sources
// of contention
b := backoff.NewExponentialBackOff(backoff.WithMaxElapsedTime(5*time.Minute), backoff.WithMaxInterval(30*time.Second))
for {
err := func() error {
tx, err := db.db.BeginTx(ctx, &sql.TxOptions{})
if err != nil {
// See https://go.dev/doc/database/execute-transactions. We do not need to issue a rollback if BeginTx fails
sessCounter.WithLabelValues("failedstart").Inc()
return errors.Wrapf(err, "Failed to start transaction")
}
// ErrNoRows means the session doesn't exist so we just continue with the empty session
} else {
// Deserialize the proto
if err := proto.Unmarshal(sessRow.Proto, session); err != nil {
return errors.Wrapf(err, "Failed to deserialize session")

// Ensure Rollback gets called.
// This is a null op if the transaction has already been committed or rolled back.
defer func() {
if err := tx.Rollback(); err != nil {
log.Error(err, "Failed to rollback transaction")
}
}()

queries := db.queries.WithTx(tx)
// Read the record
sessRow, err := queries.GetSession(ctx, contextID)

// If the session doesn't exist then we do nothing because session is initialized to an empty session
session := &logspb.Session{
ContextId: contextID,
}
if err != nil {
logDBErrors(ctx, err)
if err != sql.ErrNoRows {
sessCounter.WithLabelValues("failedget").Inc()
return errors.Wrapf(err, "Failed to get session with id %v", contextID)
}
// ErrNoRows means the session doesn't exist so we just continue with the empty session
} else {
// Deserialize the proto
if err := proto.Unmarshal(sessRow.Proto, session); err != nil {
return errors.Wrapf(err, "Failed to deserialize session")
}
}
}

sessCounter.WithLabelValues("callupdatefunc").Inc()
sessCounter.WithLabelValues("callupdatefunc").Inc()

if err := updateFunc(session); err != nil {
return errors.Wrapf(err, "Failed to update session")
}
if err := updateFunc(session); err != nil {
return errors.Wrapf(err, "Failed to update session")
}

newRow, err := protoToRow(session)
if err != nil {
return errors.Wrapf(err, "Failed to convert session proto to table row")
}
newRow, err := protoToRow(session)
if err != nil {
return errors.Wrapf(err, "Failed to convert session proto to table row")
}

if newRow.Contextid != contextID {
return errors.WithStack(errors.Errorf("contextID in session doesn't match contextID. Update was called with contextID: %v but session has contextID: %v", contextID, newRow.Contextid))
}
if newRow.Contextid != contextID {
return errors.WithStack(errors.Errorf("contextID in session doesn't match contextID. Update was called with contextID: %v but session has contextID: %v", contextID, newRow.Contextid))
}

update := fsql.UpdateSessionParams{
Contextid: contextID,
Proto: newRow.Proto,
Starttime: newRow.Starttime,
Endtime: newRow.Endtime,
Selectedid: newRow.Selectedid,
Selectedkind: newRow.Selectedkind,
TotalInputTokens: newRow.TotalInputTokens,
TotalOutputTokens: newRow.TotalOutputTokens,
NumGenerateTraces: newRow.NumGenerateTraces,
}
update := fsql.UpdateSessionParams{
Contextid: contextID,
Proto: newRow.Proto,
Starttime: newRow.Starttime,
Endtime: newRow.Endtime,
Selectedid: newRow.Selectedid,
Selectedkind: newRow.Selectedkind,
TotalInputTokens: newRow.TotalInputTokens,
TotalOutputTokens: newRow.TotalOutputTokens,
NumGenerateTraces: newRow.NumGenerateTraces,
}

sessCounter.WithLabelValues("callupdatesession").Inc()
if err := queries.UpdateSession(ctx, update); err != nil {
logDBErrors(ctx, err)
return errors.Wrapf(err, "Failed to update session")
}
return nil
}()
sessCounter.WithLabelValues("callupdatesession").Inc()
if err := queries.UpdateSession(ctx, update); err != nil {
logDBErrors(ctx, err)
return errors.Wrapf(err, "Failed to update session")
}

if err == nil {
if err := tx.Commit(); err != nil {
logDBErrors(ctx, err)
log.Error(err, "Failed to commit transaction")
sessCounter.WithLabelValues("commitfail").Inc()
return errors.Wrapf(err, "Failed to commit transaction")
if err := tx.Commit(); err != nil {
logDBErrors(ctx, err)
log.Error(err, "Failed to commit transaction")
sessCounter.WithLabelValues("commitfail").Inc()
return errors.Wrapf(err, "Failed to commit transaction")
}
sessCounter.WithLabelValues("success").Inc()
return nil
}()

if err == nil {
sessCounter.WithLabelValues("done").Inc()
return nil
}
sessCounter.WithLabelValues("success").Inc()
} else {
logDBErrors(ctx, err)
sessCounter.WithLabelValues("fail").Inc()
log.Error(err, "Failed to update session")
if txErr := tx.Rollback(); txErr != nil {
log.Error(txErr, "Failed to rollback transaction")

wait := b.NextBackOff()
if wait == backoff.Stop {
sessCounter.WithLabelValues("done").Inc()
err := errors.Errorf("Failed to update session for contextId %s", contextID)
log.Error(err, "Failed to update session")
return err
}
return err
time.Sleep(wait)
}

sessCounter.WithLabelValues("done").Inc()
return nil
}

func (m *SessionsManager) GetSession(ctx context.Context, request *connect.Request[logspb.GetSessionRequest]) (*connect.Response[logspb.GetSessionResponse], error) {
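
Pulled out of the interleaved diff above, the retry structure of Update looks roughly like the following sketch. It uses the same backoff/v4 calls the commit uses; the `doUpdate` placeholder is hypothetical and stands in for the transactional read-modify-write that the real Update performs.

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

// doUpdate is a stand-in for the transactional session update; here it fails
// twice to simulate SQLITE_BUSY before succeeding.
func doUpdate(attempt int) error {
	if attempt < 3 {
		return errors.New("database is locked")
	}
	return nil
}

func updateWithRetry() error {
	// Same backoff parameters as the commit: give up after 5 minutes overall,
	// never wait more than 30 seconds between attempts.
	b := backoff.NewExponentialBackOff(
		backoff.WithMaxElapsedTime(5*time.Minute),
		backoff.WithMaxInterval(30*time.Second),
	)
	attempt := 0
	for {
		attempt++
		if err := doUpdate(attempt); err == nil {
			return nil
		}
		wait := b.NextBackOff()
		if wait == backoff.Stop {
			return fmt.Errorf("update did not succeed before the backoff gave up")
		}
		time.Sleep(wait)
	}
}

func main() {
	if err := updateWithRetry(); err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println("update succeeded after retries")
}
```
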
