Skip to content

Commit

Permalink
Allow the repository to specify a list of evaluation tasks
Browse files Browse the repository at this point in the history
Part of #165
  • Loading branch information
ahumenberger committed Jun 10, 2024
1 parent ed4af74 commit 0bbc24d
Show file tree
Hide file tree
Showing 10 changed files with 513 additions and 239 deletions.
2 changes: 1 addition & 1 deletion cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
}

// writeCSVs writes the various CSV reports to disk.
func writeCSVs(resultPath string, assessments report.AssessmentPerModelPerLanguagePerRepository) (err error) {
func writeCSVs(resultPath string, assessments report.AssessmentPerModelPerLanguagePerRepositoryPerTask) (err error) {
// Write the "evaluation.csv" containing all data.
csv, err := report.GenerateCSV(assessments)
if err != nil {
Expand Down
86 changes: 45 additions & 41 deletions evaluate/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ func (ctx *Context) runsAtModelLevel() uint {
const RepositoryPlainName = "plain"

// Evaluate runs an evaluation on the given context and returns its results.
func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePerRepository, totalScore uint64) {
func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePerRepositoryPerTask, totalScore uint64) {
// Check that models and languages can be evaluated by executing the "plain" repositories.
modelSucceededBasicChecksOfLanguage := map[evalmodel.Model]map[evallanguage.Language]bool{}
ctx.Log.Printf("Checking that models and languages can be used for evaluation")
// Ensure we report metrics for every model even if they are excluded.
assessments = report.NewAssessmentPerModelPerLanguagePerRepository(ctx.Models, ctx.Languages, ctx.RepositoryPaths)
assessments = report.NewAssessmentPerModelPerLanguagePerRepositoryPerTask(ctx.Models, ctx.Languages, ctx.RepositoryPaths)
problemsPerModel := map[string][]error{}

{
Expand Down Expand Up @@ -106,29 +106,31 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer
r.SetQueryAttempts(ctx.QueryAttempts)
}

withLoadedModel(ctx.Log, model, ctx.ProviderForModel[model], func() {
for rm := uint(0); rm < ctx.runsAtModelLevel(); rm++ {
if ctx.Runs > 1 && ctx.RunsSequential {
ctx.Log.Printf("Run %d/%d for model %q", rm+1, ctx.Runs, modelID)
for _, taskIdentifier := range temporaryRepository.Tasks {
withLoadedModel(ctx.Log, model, ctx.ProviderForModel[model], func() {
for rm := uint(0); rm < ctx.runsAtModelLevel(); rm++ {
if ctx.Runs > 1 && ctx.RunsSequential {
ctx.Log.Printf("Run %d/%d for model %q", rm+1, ctx.Runs, modelID)
}

if err := temporaryRepository.Reset(ctx.Log); err != nil {
ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

assessment, ps, err := temporaryRepository.Evaluate(ctx.Log, ctx.ResultPath, model, language, taskIdentifier)
assessments.Add(model, language, repositoryPath, taskIdentifier, assessment)
if err != nil {
ps = append(ps, err)
}
if len(ps) > 0 {
ctx.Log.Printf("Model %q was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
} else {
modelSucceededBasicChecksOfLanguage[model][language] = true
}
}

if err := temporaryRepository.Reset(ctx.Log); err != nil {
ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

assessment, ps, err := temporaryRepository.Evaluate(ctx.Log, ctx.ResultPath, model, language)
assessments[model][language][repositoryPath].Add(assessment)
if err != nil {
ps = append(ps, err)
}
if len(ps) > 0 {
ctx.Log.Printf("Model %q was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
} else {
modelSucceededBasicChecksOfLanguage[model][language] = true
}
}
})
})
}
}
}
}
Expand Down Expand Up @@ -196,24 +198,26 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer

continue
}
withLoadedModel(ctx.Log, model, ctx.ProviderForModel[model], func() {
for rm := uint(0); rm < ctx.runsAtModelLevel(); rm++ {
if ctx.Runs > 1 && ctx.RunsSequential {
ctx.Log.Printf("Run %d/%d for model %q", rm+1, ctx.Runs, modelID)
}

if err := temporaryRepository.Reset(ctx.Log); err != nil {
ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

assessment, ps, err := temporaryRepository.Evaluate(ctx.Log, ctx.ResultPath, model, language)
assessments[model][language][repositoryPath].Add(assessment)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
if err != nil {
ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
for _, taskIdentifier := range temporaryRepository.Tasks {
withLoadedModel(ctx.Log, model, ctx.ProviderForModel[model], func() {
for rm := uint(0); rm < ctx.runsAtModelLevel(); rm++ {
if ctx.Runs > 1 && ctx.RunsSequential {
ctx.Log.Printf("Run %d/%d for model %q", rm+1, ctx.Runs, modelID)
}

if err := temporaryRepository.Reset(ctx.Log); err != nil {
ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

assessment, ps, err := temporaryRepository.Evaluate(ctx.Log, ctx.ResultPath, model, language, taskIdentifier)
assessments.Add(model, language, repositoryPath, taskIdentifier, assessment)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
if err != nil {
ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
}
}
}
})
})
}
}
}
}
Expand Down
Loading

0 comments on commit 0bbc24d

Please sign in to comment.