Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve maintainability of assessments by abstracting away details of how assessments are stored #178

Merged
merged 1 commit into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
}

// writeCSVs writes the various CSV reports to disk.
func writeCSVs(resultPath string, assessments report.AssessmentPerModelPerLanguagePerRepositoryPerTask) (err error) {
func writeCSVs(resultPath string, assessments *report.AssessmentStore) (err error) {
ahumenberger marked this conversation as resolved.
Show resolved Hide resolved
// Write the "evaluation.csv" containing all data.
csv, err := report.GenerateCSV(assessments)
if err != nil {
Expand Down
4 changes: 2 additions & 2 deletions evaluate/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ func (ctx *Context) runsAtModelLevel() uint {
const RepositoryPlainName = "plain"

// Evaluate runs an evaluation on the given context and returns its results.
func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePerRepositoryPerTask, totalScore uint64) {
func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uint64) {
// Check that models and languages can be evaluated by executing the "plain" repositories.
modelSucceededBasicChecksOfLanguage := map[evalmodel.Model]map[evallanguage.Language]bool{}
ctx.Log.Printf("Checking that models and languages can be used for evaluation")
// Ensure we report metrics for every model even if they are excluded.
assessments = report.NewAssessmentPerModelPerLanguagePerRepositoryPerTask()
assessments = report.NewAssessmentStore()
problemsPerModel := map[string][]error{}

{
Expand Down
294 changes: 153 additions & 141 deletions evaluate/evaluate_test.go

Large diffs are not rendered by default.

47 changes: 47 additions & 0 deletions evaluate/metrics/testing/assessments.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import (
"github.com/stretchr/testify/assert"

"github.com/symflower/eval-dev-quality/evaluate/metrics"
"github.com/symflower/eval-dev-quality/language"
"github.com/symflower/eval-dev-quality/model"
"github.com/symflower/eval-dev-quality/task"
)

// AssertAssessmentsEqual checks if the given assessments are equal ignoring default and nondeterministic values.
Expand All @@ -29,3 +32,47 @@ func AssertAssessmentsEqual(t *testing.T, expected metrics.Assessments, actual m
var AssessmentsWithProcessingTime = metrics.Assessments{
metrics.AssessmentKeyProcessingTime: 1,
}

// AssessmentTuple bundles an assessment with the model, language, repository path and task that identify the run it belongs to.
type AssessmentTuple struct {
// Model is the model the assessment was recorded for.
Model model.Model
// Language is the language the assessment was recorded for.
Language language.Language
// RepositoryPath is the path of the repository the assessment was recorded for.
RepositoryPath string
// Task identifies the task the assessment was recorded for.
Task task.Identifier
// Assessment holds the metrics of this run.
Assessment metrics.Assessments
}
bauersimon marked this conversation as resolved.
Show resolved Hide resolved

// AssessmentTuples is a list of assessment tuples.
type AssessmentTuples []*AssessmentTuple

// ToMap converts the tuples into a nested lookup keyed by model, language, repository path and task identifier.
// Tuples sharing all four keys end up in the same entry, combined through "Assessments.Add".
func (at AssessmentTuples) ToMap() (lookup map[model.Model]map[language.Language]map[string]map[task.Identifier]metrics.Assessments) {
	lookup = map[model.Model]map[language.Language]map[string]map[task.Identifier]metrics.Assessments{}
	for _, tuple := range at {
		// Intermediate levels are only ever stored as non-nil maps, so a nil result means "not created yet".
		byLanguage := lookup[tuple.Model]
		if byLanguage == nil {
			byLanguage = map[language.Language]map[string]map[task.Identifier]metrics.Assessments{}
			lookup[tuple.Model] = byLanguage
		}

		byRepository := byLanguage[tuple.Language]
		if byRepository == nil {
			byRepository = map[string]map[task.Identifier]metrics.Assessments{}
			byLanguage[tuple.Language] = byRepository
		}

		byTask := byRepository[tuple.RepositoryPath]
		if byTask == nil {
			byTask = map[task.Identifier]metrics.Assessments{}
			byRepository[tuple.RepositoryPath] = byTask
		}

		// Create the entry on first sight of this key combination, then accumulate into it.
		entry, exists := byTask[tuple.Task]
		if !exists {
			entry = metrics.NewAssessments()
			byTask[tuple.Task] = entry
		}
		entry.Add(tuple.Assessment)
	}

	return lookup
}
46 changes: 25 additions & 21 deletions evaluate/report/collection.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,24 @@ func (a AssessmentPerModel) WalkByScore(function func(model model.Model, assessm
return nil
}

// AssessmentPerModelPerLanguagePerRepositoryPerTask holds a collection of assessments per model per language and per repository.
type AssessmentPerModelPerLanguagePerRepositoryPerTask map[model.Model]map[language.Language]map[string]map[task.Identifier]metrics.Assessments
// AssessmentStore holds a collection of assessments per model, per language, per repository and per task.
type AssessmentStore struct {
// store maps model -> language -> repository path -> task identifier to the collected assessments.
store map[model.Model]map[language.Language]map[string]map[task.Identifier]metrics.Assessments
}

// NewAssessmentPerModelPerLanguagePerRepositoryPerTask returns a new AssessmentPerModelPerLanguagePerRepository initialized with an empty assessment for each combination.
func NewAssessmentPerModelPerLanguagePerRepositoryPerTask() (assessments AssessmentPerModelPerLanguagePerRepositoryPerTask) {
return AssessmentPerModelPerLanguagePerRepositoryPerTask{}
// NewAssessmentStore creates an empty store that is ready to collect assessments.
func NewAssessmentStore() (assessments *AssessmentStore) {
	assessments = new(AssessmentStore)
	// Initialize the top-level map up front so that adding entries never writes to a nil map.
	assessments.store = make(map[model.Model]map[language.Language]map[string]map[task.Identifier]metrics.Assessments)

	return assessments
}

// Add adds a new assessment.
func (a AssessmentPerModelPerLanguagePerRepositoryPerTask) Add(model model.Model, l language.Language, repositoryPath string, taskIdentifier task.Identifier, assessment metrics.Assessments) {
perLanguageLookup, ok := a[model]
func (a *AssessmentStore) Add(model model.Model, l language.Language, repositoryPath string, taskIdentifier task.Identifier, assessment metrics.Assessments) {
perLanguageLookup, ok := a.store[model]
if !ok {
perLanguageLookup = map[language.Language]map[string]map[task.Identifier]metrics.Assessments{}
a[model] = perLanguageLookup
a.store[model] = perLanguageLookup
}

perRepositoryLookup, ok := perLanguageLookup[l]
Expand All @@ -65,39 +69,39 @@ func (a AssessmentPerModelPerLanguagePerRepositoryPerTask) Add(model model.Model
perLanguageLookup[l] = perRepositoryLookup
}

perTaskLookup, ok := a[model][l][repositoryPath]
perTaskLookup, ok := perRepositoryLookup[repositoryPath]
if !ok {
perTaskLookup = map[task.Identifier]metrics.Assessments{}
a[model][l][repositoryPath] = perTaskLookup
perRepositoryLookup[repositoryPath] = perTaskLookup
}

assessments, ok := perTaskLookup[taskIdentifier]
if !ok {
assessments = metrics.NewAssessments()
a[model][l][repositoryPath][taskIdentifier] = assessments
perTaskLookup[taskIdentifier] = assessments
}

assessments.Add(assessment)
}

// Walk walks over all entries.
func (a AssessmentPerModelPerLanguagePerRepositoryPerTask) Walk(function func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) error) (err error) {
models := maps.Keys(a)
func (a *AssessmentStore) Walk(function func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) error) (err error) {
models := maps.Keys(a.store)
slices.SortStableFunc(models, func(a, b model.Model) int {
return cmp.Compare(a.ID(), b.ID())
})
for _, m := range models {
languages := maps.Keys(a[m])
languages := maps.Keys(a.store[m])
slices.SortStableFunc(languages, func(a, b language.Language) int {
return cmp.Compare(a.ID(), b.ID())
})
for _, l := range languages {
repositories := maps.Keys(a[m][l])
repositories := maps.Keys(a.store[m][l])
sort.Strings(repositories)
for _, r := range repositories {
taskIdentifiers := maps.Keys(a[m][l][r])
taskIdentifiers := maps.Keys(a.store[m][l][r])
for _, t := range taskIdentifiers {
if err := function(m, l, r, t, a[m][l][r][t]); err != nil {
if err := function(m, l, r, t, a.store[m][l][r][t]); err != nil {
return err
}
}
Expand All @@ -109,9 +113,9 @@ func (a AssessmentPerModelPerLanguagePerRepositoryPerTask) Walk(function func(m
}

// CollapseByModel returns all assessments aggregated per model ID.
func (a AssessmentPerModelPerLanguagePerRepositoryPerTask) CollapseByModel() AssessmentPerModel {
perModel := make(AssessmentPerModel, len(a))
for _, m := range maps.Keys(a) {
func (a *AssessmentStore) CollapseByModel() AssessmentPerModel {
perModel := make(AssessmentPerModel, len(a.store))
for _, m := range maps.Keys(a.store) {
perModel[m] = metrics.NewAssessments()
}
_ = a.Walk(func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) (err error) {
Expand All @@ -124,7 +128,7 @@ func (a AssessmentPerModelPerLanguagePerRepositoryPerTask) CollapseByModel() Ass
}

// CollapseByLanguage returns all assessments aggregated per language and model.
func (a AssessmentPerModelPerLanguagePerRepositoryPerTask) CollapseByLanguage() AssessmentPerLanguagePerModel {
func (a *AssessmentStore) CollapseByLanguage() AssessmentPerLanguagePerModel {
assessments := AssessmentPerLanguagePerModel{}
_ = a.Walk(func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) (err error) {
if _, ok := assessments[l]; !ok {
Expand Down
Loading
Loading