Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Always apply "symflower fix" and refactor "write test" task and LLM prompting to prepare for templates #354

Merged
merged 3 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 20 additions & 36 deletions evaluate/task/task-transpile.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package task

import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
Expand Down Expand Up @@ -75,7 +73,7 @@ func (t *TaskTranspile) Run(ctx evaltask.Context) (repositoryAssessment map[eval
}
for originFilePath, originLanguage := range originFilePathsWithLanguage {
modelAssessmentsForFile := metrics.NewAssessments()
withSymflowerAssessmentsForFile := modelAssessmentsForFile // The symflower assessment tracks how the model result can be improved in case of a failure, so just link to the model assessment until a failure actually happens.
withSymflowerAssessmentsForFile := modelAssessmentsForFile // The symflower assessment tracks how the model result can be improved in case of a failure, so just link to the model assessment until we successfully applied "symflower fix".

if err := ctx.Repository.Reset(ctx.Logger); err != nil {
ctx.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err)
Expand Down Expand Up @@ -110,46 +108,32 @@ func (t *TaskTranspile) Run(ctx evaltask.Context) (repositoryAssessment map[eval
problems = append(problems, ps...)
if err != nil {
problems = append(problems, pkgerrors.WithMessage(err, originFilePath))

// If there is an execution timeout do not run "symflower fix" because the code itself is correct.
if errors.Is(err, context.DeadlineExceeded) {
modelAssessments.Add(modelAssessmentsForFile)
withSymflowerAssessments.Add(withSymflowerAssessmentsForFile)

continue
}

// Run "symflower fix" if the model response fails to execute.
if ctx.Language.ID() == "golang" { // Currently we only support Go for "symflower fix".
withSymflowerFixTestResult, processingTime, ps, err := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, filepath.Join(ctx.Repository.DataPath(), packagePath))
problems = append(problems, ps...)
if err != nil {
problems = append(problems, err)

modelAssessments.Add(modelAssessmentsForFile)
withSymflowerAssessments.Add(withSymflowerAssessmentsForFile)

continue
} else {
testsPassing := withSymflowerFixTestResult.TestsPass
taskLogger.Printf("with symflower repair: Executes tests with %d tests passing", testsPassing)

// Symflower was able to fix a failure so now update the assessment with the improved results.
withSymflowerFixAssessments := metrics.NewAssessments()
withSymflowerFixAssessments[metrics.AssessmentKeyProcessingTime] = processingTime
withSymflowerFixAssessments.Award(metrics.AssessmentKeyFilesExecuted)
withSymflowerFixAssessments.AwardPoints(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))

withSymflowerAssessmentsForFile = metrics.CombineWithSymflowerFixAssessments(modelAssessmentsForFile, withSymflowerFixAssessments)
}
}
} else {
testsPassing := testResult.TestsPass
taskLogger.Printf("Executes tests with %d tests passing", testsPassing)
modelAssessmentsForFile.Award(metrics.AssessmentKeyFilesExecuted)
modelAssessmentsForFile.AwardPoints(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))
}

if ctx.Language.ID() == "golang" { // Currently we only support Go for "symflower fix".
withSymflowerFixTestResult, processingTime, ps, err := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, filepath.Join(ctx.Repository.DataPath(), packagePath))
problems = append(problems, ps...)
if err != nil {
problems = append(problems, err)
} else {
testsPassing := withSymflowerFixTestResult.TestsPass
taskLogger.Printf("with symflower repair: Executes tests with %d tests passing", testsPassing)

// Symflower was able to fix a failure so now update the assessment with the improved results.
withSymflowerFixAssessments := metrics.NewAssessments()
withSymflowerFixAssessments[metrics.AssessmentKeyProcessingTime] = processingTime
withSymflowerFixAssessments.Award(metrics.AssessmentKeyFilesExecuted)
withSymflowerFixAssessments.AwardPoints(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))

withSymflowerAssessmentsForFile = metrics.CombineWithSymflowerFixAssessments(modelAssessmentsForFile, withSymflowerFixAssessments)
}
}

modelAssessments.Add(modelAssessmentsForFile)
withSymflowerAssessments.Add(withSymflowerAssessmentsForFile)
}
Expand Down
123 changes: 57 additions & 66 deletions evaluate/task/task-write-test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package task

import (
"context"
"errors"
"fmt"
"strings"

Expand Down Expand Up @@ -54,77 +52,17 @@ func (t *TaskWriteTests) Run(ctx evaltask.Context) (repositoryAssessment map[eva
withSymflowerFixAssessment[metrics.AssessmentKeyFilesExecutedMaximumReachable] = maximumReachableFiles

for _, filePath := range filePaths {
modelAssessmentForFile := metrics.NewAssessments()
withSymflowerFixAssessmentForFile := modelAssessmentForFile // The symflower assessment tracks how the model result can be improved in case of a failure, so just link to the model assessment until a failure actually happens.

if err := ctx.Repository.Reset(ctx.Logger); err != nil {
ctx.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

modelContext := model.Context{
Language: ctx.Language,

RepositoryPath: dataPath,
FilePath: filePath,

Logger: taskLogger.Logger,
}
assessments, err := modelCapability.WriteTests(modelContext)
if err != nil {
problems = append(problems, pkgerrors.WithMessage(err, filePath))

continue
}
if assessments[metrics.AssessmentKeyProcessingTime] == 0 {
return nil, nil, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", ctx.Model.ID(), ctx.Repository.Name())
}
modelAssessmentForFile.Add(assessments)
modelAssessmentForFile.Award(metrics.AssessmentKeyResponseNoError)

testResult, ps, err := ctx.Language.ExecuteTests(taskLogger.Logger, dataPath)
modelAssessmentFile, withSymflowerFixAssessmentFile, ps, err := runModelAndSymflowerFix(ctx, taskLogger, modelCapability, dataPath, filePath)
problems = append(problems, ps...)
if err != nil {
problems = append(problems, pkgerrors.WithMessage(err, filePath))

// If there is an execution timeout do not run "symflower fix" because the code itself is correct.
if errors.Is(err, context.DeadlineExceeded) {
modelAssessment.Add(modelAssessmentForFile)
withSymflowerFixAssessment.Add(withSymflowerFixAssessmentForFile)

continue
}

// Run "symflower fix" if the model response fails to execute.
if ctx.Language.ID() == "golang" { // Currently we only support Go for "symflower fix".
withSymflowerFixTestResult, processingTime, ps, err := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, ctx.Repository.DataPath())
problems = append(problems, ps...)
if err != nil {
problems = append(problems, err)

modelAssessment.Add(modelAssessmentForFile)
withSymflowerFixAssessment.Add(withSymflowerFixAssessmentForFile)

continue
} else {
ctx.Logger.Printf("with symflower repair: Executes tests with %d coverage objects", withSymflowerFixTestResult.Coverage)

// Symflower was able to fix a failure so now update the assessment with the improved results.
withSymflowerFixAssessments := metrics.NewAssessments()
withSymflowerFixAssessments[metrics.AssessmentKeyProcessingTime] = processingTime
withSymflowerFixAssessments.Award(metrics.AssessmentKeyFilesExecuted)
withSymflowerFixAssessments.AwardPoints(metrics.AssessmentKeyCoverage, withSymflowerFixTestResult.Coverage)

withSymflowerFixAssessmentForFile = metrics.CombineWithSymflowerFixAssessments(modelAssessmentForFile, withSymflowerFixAssessments)
}
}
} else {
taskLogger.Printf("Executes tests with %d coverage objects", testResult.Coverage)
modelAssessmentForFile.Award(metrics.AssessmentKeyFilesExecuted)
modelAssessmentForFile.AwardPoints(metrics.AssessmentKeyCoverage, testResult.Coverage)
return nil, problems, err
}

modelAssessment.Add(modelAssessmentForFile)
withSymflowerFixAssessment.Add(withSymflowerFixAssessmentForFile)
modelAssessment.Add(modelAssessmentFile)
withSymflowerFixAssessment.Add(withSymflowerFixAssessmentFile)
}

repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{
Expand All @@ -135,6 +73,59 @@ func (t *TaskWriteTests) Run(ctx evaltask.Context) (repositoryAssessment map[eva
return repositoryAssessment, problems, nil
}

// runModelAndSymflowerFix queries the model to write tests for the given file, executes these tests, and for Go additionally executes them with "symflower fix" applied.
// Recoverable failures are collected in "problems", while "err" is only set if the model response carries no processing time measurement.
func runModelAndSymflowerFix(ctx evaltask.Context, taskLogger *taskLogger, modelCapability model.CapabilityWriteTests, dataPath string, filePath string) (modelAssessment metrics.Assessments, withSymflowerFixAssessment metrics.Assessments, problems []error, err error) {
	modelAssessment = metrics.NewAssessments()
	// Both results refer to the very same assessment until "symflower fix" has been applied successfully, because only then there is a separate result to report.
	withSymflowerFixAssessment = modelAssessment

	response, queryErr := modelCapability.WriteTests(model.Context{
		Language: ctx.Language,

		RepositoryPath: dataPath,
		FilePath:       filePath,

		Logger: taskLogger.Logger,
	})
	if queryErr != nil {
		// A failing model query is reported as a problem of this file and not as a fatal error.
		problems = append(problems, pkgerrors.WithMessage(queryErr, filePath))

		return nil, nil, problems, nil
	}
	if response[metrics.AssessmentKeyProcessingTime] == 0 {
		return nil, nil, problems, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", ctx.Model.ID(), ctx.Repository.Name())
	}
	modelAssessment.Add(response)
	modelAssessment.Award(metrics.AssessmentKeyResponseNoError)

	// Execute the tests of the model response as they are.
	testResult, executionProblems, executionErr := ctx.Language.ExecuteTests(taskLogger.Logger, dataPath)
	problems = append(problems, executionProblems...)
	if executionErr == nil {
		taskLogger.Printf("Executes tests with %d coverage objects", testResult.Coverage)
		modelAssessment.Award(metrics.AssessmentKeyFilesExecuted)
		modelAssessment.AwardPoints(metrics.AssessmentKeyCoverage, testResult.Coverage)
	} else {
		problems = append(problems, pkgerrors.WithMessage(executionErr, filePath))
	}

	// Currently "symflower fix" is only supported for Go, so every other language is done here.
	if ctx.Language.ID() != "golang" {
		return modelAssessment, withSymflowerFixAssessment, problems, nil
	}

	fixedTestResult, fixProcessingTime, fixProblems, fixErr := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, ctx.Repository.DataPath())
	problems = append(problems, fixProblems...)
	if fixErr != nil {
		// Without a successful fix run, the fix assessment remains the model assessment.
		problems = append(problems, fixErr)

		return modelAssessment, withSymflowerFixAssessment, problems, nil
	}
	ctx.Logger.Printf("with symflower repair: Executes tests with %d coverage objects", fixedTestResult.Coverage)

	// The fix run succeeded, so combine its results with the assessment of the model.
	fixAssessment := metrics.NewAssessments()
	fixAssessment[metrics.AssessmentKeyProcessingTime] = fixProcessingTime
	fixAssessment.Award(metrics.AssessmentKeyFilesExecuted)
	fixAssessment.AwardPoints(metrics.AssessmentKeyCoverage, fixedTestResult.Coverage)

	return modelAssessment, metrics.CombineWithSymflowerFixAssessments(modelAssessment, fixAssessment), problems, nil
}

// validateWriteTestsRepository checks if the repository for the "write-tests" task is well-formed.
func validateWriteTestsRepository(logger *log.Logger, repositoryPath string, language language.Language) (err error) {
logger.Printf("validating repository %q", repositoryPath)
Expand Down
24 changes: 0 additions & 24 deletions evaluate/task/task-write-test_test.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
package task

import (
"context"
"fmt"
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
metricstesting "github.com/symflower/eval-dev-quality/evaluate/metrics/testing"
Expand All @@ -17,7 +15,6 @@ import (
"github.com/symflower/eval-dev-quality/language/golang"
"github.com/symflower/eval-dev-quality/language/java"
"github.com/symflower/eval-dev-quality/language/ruby"
languagetesting "github.com/symflower/eval-dev-quality/language/testing"
"github.com/symflower/eval-dev-quality/log"
modeltesting "github.com/symflower/eval-dev-quality/model/testing"
"github.com/symflower/eval-dev-quality/task"
Expand Down Expand Up @@ -191,27 +188,6 @@ func TestTaskWriteTestsRun(t *testing.T) {
this is not valid go code
`), expectedAssessments, expectedProblems, false)
}
{
expectedAssessments := map[task.Identifier]metrics.Assessments{
IdentifierWriteTests: metrics.Assessments{
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
metrics.AssessmentKeyResponseNoError: 1,
},
IdentifierWriteTestsSymflowerFix: metrics.Assessments{
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
metrics.AssessmentKeyResponseNoError: 1,
},
}
expectedProblems := []string{
"context deadline exceeded",
}

languageMock := languagetesting.NewMockLanguageNamed(t, "golang")
languageMock.On("Files", mock.Anything, mock.Anything).Return([]string{filepath.Join("golang", "plain")}, nil).Once()
languageMock.On("ExecuteTests", mock.Anything, mock.Anything).Return(nil, nil, context.DeadlineExceeded).Once()

validateGo(t, "Execution timeout", languageMock, "", expectedAssessments, expectedProblems, false)
}
})
})

Expand Down
Loading