diff --git a/app/pkg/analyze/analyzer.go b/app/pkg/analyze/analyzer.go index 08fb4abe..cca2730b 100644 --- a/app/pkg/analyze/analyzer.go +++ b/app/pkg/analyze/analyzer.go @@ -67,7 +67,7 @@ type Analyzer struct { watcher *fsnotify.Watcher - blockNotifier PostBlockEvent + learnNotifier PostBlockEvent handleLogFileIsDone sync.WaitGroup handleBlocksIsDone sync.WaitGroup @@ -140,10 +140,10 @@ type blockItem struct { type PostBlockEvent func(id string) error // Run runs the analyzer; continually processing logs. -// blockNotifier is an optional function that will be called when a block is updated. +// learnNotifier is an optional function that will be called when a block is updated. // This should be non blocking. -func (a *Analyzer) Run(ctx context.Context, logDirs []string, blockNotifier PostBlockEvent) error { - a.blockNotifier = blockNotifier +func (a *Analyzer) Run(ctx context.Context, logDirs []string, learnNotifier PostBlockEvent) error { + a.learnNotifier = learnNotifier // Find all the current files jsonFiles, err := findLogFilesInDirs(ctx, logDirs) if err != nil { @@ -447,6 +447,12 @@ func (a *Analyzer) processLogEvent(ctx context.Context, entry *api.LogEntry) { }); err != nil { log.Error(err, "Failed to update block with execution", "blockId", bid) } + // We need to enqueue the block for processing since it was executed. + if a.learnNotifier != nil { + if err := a.learnNotifier(bid); err != nil { + log.Error(err, "Error notifying block event", "blockId", bid) + } + } case v1alpha1.LogEventType_ACCEPTED: fallthrough case v1alpha1.LogEventType_REJECTED: @@ -637,12 +643,7 @@ func (a *Analyzer) handleBlockEvents(ctx context.Context) { return buildBlockLog(ctx, block, a.tracesDB) }) if err != nil { - log.Error(err, "Error processing block", "block", blockItem.id) - } - if a.blockNotifier != nil { - if err := a.blockNotifier(blockItem.id); err != nil { - log.Error(err, "Error notifying block event", "block", blockItem.id) - } + log.Error(err, "Error processing block", "blockId", blockItem.id) } if a.signalBlockDone != nil { a.signalBlockDone <- blockItem.id diff --git a/app/pkg/learn/learner.go b/app/pkg/learn/learner.go index 8c89d604..6665f88e 100644 --- a/app/pkg/learn/learner.go +++ b/app/pkg/learn/learner.go @@ -8,6 +8,9 @@ import ( "sync" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/jlewi/monogo/files" "github.com/jlewi/monogo/helpers" @@ -30,6 +33,21 @@ const ( fileSuffix = ".example.binpb" ) +var ( + enqueuedCounter = promauto.NewCounter(prometheus.CounterOpts{ + Name: "learner_enqueued_total", + Help: "Total number of enqueued blocks", + }) + + cellsProcessed = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "learner_blocks_processed", + Help: "Number of blocks processed by the learner", + }, + []string{"status"}, + ) +) + // Learner handles the learn loop to learn from past mistakes. // // TODO(jeremy): Should we call this a trainer? @@ -74,6 +92,7 @@ func (l *Learner) Enqueue(id string) error { return errors.New("Queue is shutting down; can't enqueue anymore items") } l.queue.Add(id) + enqueuedCounter.Inc() return nil } @@ -130,16 +149,19 @@ func (l *Learner) Reconcile(ctx context.Context, id string) error { if b.ExecutedBlock == nil { // Skip unexecuted block + cellsProcessed.WithLabelValues("unexecuted").Inc() return nil } if b.GeneratedBlock == nil { // Block wasn't the result of AI generation + cellsProcessed.WithLabelValues("notgenerated").Inc() return nil } if b.EvalMode { log.V(logs.Debug).Info("Skipping block which was created as part of an eval", "id", b.GetId()) + cellsProcessed.WithLabelValues("eval").Inc() return nil } @@ -147,9 +169,11 @@ func (l *Learner) Reconcile(ctx context.Context, id string) error { // Use the metric used for eval. if strings.TrimSpace(b.ExecutedBlock.GetContents()) == strings.TrimSpace(b.GeneratedBlock.GetContents()) { log.V(logs.Debug).Info("Skipping executed block which matches generated block", "id", b.GetId()) + cellsProcessed.WithLabelValues("nochange").Inc() return nil } + cellsProcessed.WithLabelValues("learn").Inc() expectedFiles := l.getExampleFiles(b.GetId()) log.Info("Found new training example", "blockId", b.GetId()) diff --git a/app/pkg/server/metrics.go b/app/pkg/server/metrics.go new file mode 100644 index 00000000..cfc1121b --- /dev/null +++ b/app/pkg/server/metrics.go @@ -0,0 +1,14 @@ +package server + +import ( + "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +// prometheusHandler wraps the promhttp.Handler() and returns a Gin handler +func prometheusHandler() gin.HandlerFunc { + h := promhttp.Handler() + return func(c *gin.Context) { + h.ServeHTTP(c.Writer, c.Request) + } +} diff --git a/app/pkg/server/server.go b/app/pkg/server/server.go index c1b231b9..95b92078 100644 --- a/app/pkg/server/server.go +++ b/app/pkg/server/server.go @@ -104,6 +104,8 @@ func (s *Server) createGinEngine() error { // TODO(jeremy): Should we turn this into a protobuf service and use connect? router.GET("/healthz", s.healthCheck) + // Add the Prometheus metrics endpoint + router.GET("/metrics", prometheusHandler()) router.NoRoute(func(c *gin.Context) { log.Info("Request for not found path", "path", c.Request.URL.Path) c.JSON(http.StatusNotFound, gin.H{"message": "Not found", "path": c.Request.URL.Path}) diff --git a/docs/content/en/docs/learning/troubleshoot_learning.md b/docs/content/en/docs/learning/troubleshoot_learning.md index f6f85d51..640d26a4 100644 --- a/docs/content/en/docs/learning/troubleshoot_learning.md +++ b/docs/content/en/docs/learning/troubleshoot_learning.md @@ -47,12 +47,11 @@ To get the cell ID for a given cell 2. Find code block containing your cell 3. Your cell will contain metadata which contains the ID e.g. -```` -```bash {"id":"01J6DG428ER427GJNTKC15G6JM"} -CELLID=01J6DFZ9V3R29TCDEY2AYA8XTV -curl http://localhost:8080/api/blocklogs/${CELLID} | jq . +```sh +` ` `bash {"id":"01J6DG428ER427GJNTKC15G6JM"} +echo hello world +` ` ` ``` -```` ## Did Block Logs Get Created @@ -61,14 +60,14 @@ curl http://localhost:8080/api/blocklogs/${CELLID} | jq . * The cell should be one that was generated by the AI and you think learning should have occurred on ```bash -CELLID=01J7KQPBYCT9VM2KFBY48JC7J0 +CELLID=01J7S3QZMS5F742JFPWZDCTVRG curl -X POST http://localhost:8877/api/foyle.logs.LogsService/GetBlockLog -H "Content-Type: application/json" -d "{\"id\": \"${CELLID}\"}" | jq . ``` * If this returns not found then no log was created for this cell and there is a problem with Log Processing * The correct output should look like the following -``` +```json { "blockLog": { "id": "01J7KQPBYCT9VM2KFBY48JC7J0", @@ -94,12 +93,14 @@ curl -X POST http://localhost:8877/api/foyle.logs.LogsService/GetBlockLog -H "Co * Notably the output should include the following fields - * **generatedBlock** - This is the block that was generated by the AI - * **executedBlock** - This is the block that the user actually executed + * **generatedBlock** - This is the block that was generated by the AI + * **executedBlock** - This is the block that the user actually executed * If the **generatedBlock** and **executedBlock** are the same then no learning occured * If the **generatedBlock** is missing then this means the block wasn't generated by Foyle and learning won't occur - * This can happen if you insert a blank cell and manually enter a command + + * This can happen if you insert a blank cell and manually enter a command + * If the **executedBlock** is missing then this means the block wasn't executed and learning won't occur ## Was a cell executed? @@ -167,3 +168,22 @@ jq -c 'select(.message == "Building block log")' ${LASTLOG} ```bash jq -c 'select(.severity == "error")' ${LASTLOG} ``` + +# Check Prometheus counters + +Check to make sure blocks are being enqueued for learner processing + +```bash +curl -s http://localhost:8877/metrics | grep learner_enqueued_total +``` + +* If the number is 0 please open an issue in GitHub because there is most likely a bug + +Check the metrics for post processing of blocks + +```bash +curl -s http://localhost:8877/metrics | grep learner_blocks_processed +``` + +* The value of `learner_blocks_processed{status="learn"}` is the number of blocks that contributed to learning +* The value of `learner_blocks_processed{status="unexecuted"}` is the number of blocks that were ignored because they were not executed