Remove assessment collection #378

Merged · 3 commits · Jan 8, 2025
7 changes: 1 addition & 6 deletions cmd/eval-dev-quality/cmd/evaluate.go
@@ -524,7 +524,7 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err
 		command.logger.Panicf("ERROR: %s", err)
 	}
 
-	assessments := evaluate.Evaluate(evaluationContext)
+	evaluate.Evaluate(evaluationContext)
 
 	if err := (report.Markdown{
 		DateTime: command.timestamp,
@@ -534,11 +534,6 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err
 		command.logger.Panicf("ERROR: %s", err)
 	}
 
-	assessmentsPerModel := assessments.CollapseByModel()
-	for _, modelID := range maps.Keys(assessmentsPerModel) {
-		command.logger.Printf("Evaluation assessments for %q: %s", modelID, assessmentsPerModel[modelID])
-	}
-
 	return nil
 }
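With the in-memory `AssessmentStore` gone, `evaluateLocal` no longer prints a per-model summary at the end of a run; the per-task records appended to `evaluation.csv` remain the durable record of results. A minimal sketch of deriving a per-model view from that file instead (the file name `evaluation.csv` comes from the diff of `evaluate/evaluate.go` below; the result-directory name and the column layout are assumptions for illustration):

```go
package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"
	"path/filepath"
)

func main() {
	// Open the evaluation CSV that Evaluate appends to (directory name is a placeholder).
	file, err := os.Open(filepath.Join("result-directory", "evaluation.csv"))
	if err != nil {
		log.Fatalf("ERROR: %s", err)
	}
	defer file.Close()

	records, err := csv.NewReader(file).ReadAll()
	if err != nil {
		log.Fatalf("ERROR: %s", err)
	}
	if len(records) < 2 {
		log.Fatalf("ERROR: no evaluation records found")
	}

	// Group the per-task records by their model column (assumed to be column 0).
	recordsPerModel := map[string]int{}
	for _, record := range records[1:] { // Skip the header row.
		recordsPerModel[record[0]]++
	}
	for modelID, count := range recordsPerModel {
		fmt.Printf("Model %q has %d evaluation records\n", modelID, count)
	}
}
```

Grouping by the model column recovers the kind of per-model overview the deleted `CollapseByModel` loop logged, without keeping assessments in memory during the evaluation itself.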

215 changes: 40 additions & 175 deletions cmd/eval-dev-quality/cmd/evaluate_test.go

Large diffs are not rendered by default.

10 changes: 3 additions & 7 deletions evaluate/evaluate.go
@@ -67,12 +67,11 @@ func (ctx *Context) runsAtModelLevel() uint
 const RepositoryPlainName = "plain"
 
 // Evaluate runs an evaluation on the given context and returns its results.
-func Evaluate(ctx *Context) (assessments *report.AssessmentStore) {
+func Evaluate(ctx *Context) {
 	// Check that models and languages can be evaluated by executing the "plain" repositories.
 	modelSucceededBasicChecksOfLanguage := map[evalmodel.Model]map[evallanguage.Language]bool{}
 	ctx.Log.Printf("Checking that models and languages can be used for evaluation")
-	// Ensure we report metrics for every model even if they are excluded.
-	assessments = report.NewAssessmentStore()
+
 	problemsPerModel := map[string][]error{}
 	// Write the evaluation CSV header so it's only written once.
 	evaluationCSVFile, err := os.OpenFile(filepath.Join(ctx.ResultPath, "evaluation.csv"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
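The context above opens `evaluation.csv` with `os.O_APPEND|os.O_CREATE|os.O_WRONLY` so that the header row can be written exactly once and all later writes only append. A minimal standalone sketch of that header-once append pattern (the header columns and the example record are placeholders, not the project's actual schema):

```go
package main

import (
	"log"
	"os"
)

func main() {
	// Append-or-create, mirroring the flags in the diff above.
	file, err := os.OpenFile("evaluation.csv", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		log.Fatalf("ERROR: %s", err)
	}
	defer file.Close()

	// Write the header only when the file is freshly created and still empty.
	info, err := file.Stat()
	if err != nil {
		log.Fatalf("ERROR: %s", err)
	}
	if info.Size() == 0 {
		if _, err := file.WriteString("model,language,repository,run,assessment\n"); err != nil {
			log.Fatalf("ERROR: %s", err)
		}
	}

	// Every later call appends records below the existing header.
	if _, err := file.WriteString("some-model,golang,plain,1,files-executed=1\n"); err != nil {
		log.Fatalf("ERROR: %s", err)
	}
}
```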
@@ -178,7 +177,6 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore) {
 			logger.Printf("Model %q was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
 		}
 
-		assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment)
 		// Write the task assessment to the evaluation CSV file.
 		if err := evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), runCount, assessment); err != nil {
 			logger.Panicf("ERROR: cannot write evaluation record: %s", err)
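With `AddAssessmentPerTask` removed here and below, `WriteEvaluationRecord` is the single remaining sink for per-task assessments. A hypothetical sketch of such a record writer built on `encoding/csv` (the real method takes typed model and language values and a real assessment type; the plain-string fields and the `EvaluationFile` type here are simplified stand-ins that only mirror the call shape in the diff):

```go
package main

import (
	"encoding/csv"
	"log"
	"os"
	"strconv"
)

// EvaluationFile is a simplified stand-in for the project's evaluation-file type.
type EvaluationFile struct {
	writer *csv.Writer
}

// WriteEvaluationRecord appends one per-task assessment as a CSV row.
func (file *EvaluationFile) WriteEvaluationRecord(model string, language string, repository string, runCount uint, assessment string) error {
	record := []string{model, language, repository, strconv.FormatUint(uint64(runCount), 10), assessment}
	if err := file.writer.Write(record); err != nil {
		return err
	}
	file.writer.Flush()

	return file.writer.Error()
}

func main() {
	evaluationFile := &EvaluationFile{writer: csv.NewWriter(os.Stdout)}
	if err := evaluationFile.WriteEvaluationRecord("some-model", "golang", "plain", 1, "files-executed=1"); err != nil {
		log.Panicf("ERROR: cannot write evaluation record: %s", err)
	}
}
```

Flushing after every record keeps the CSV usable even if the evaluation later panics, which matters once the file is the only place results live.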
@@ -295,7 +293,7 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore) {
 			if err != nil {
 				logger.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
 			}
-			assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment)
+
 			// Write the task assessment to the evaluation CSV file.
 			if err := evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), runCount, assessment); err != nil {
 				logger.Panicf("ERROR: cannot write evaluation record: %s", err)
@@ -307,8 +305,6 @@
 			}
 		}
 	}
-
-	return assessments
 }
 
 // withLoadedModel loads the model for the duration of the given task if supported by the model's provider.