Skip to content

Commit

Permalink
Merge pull request #354 from symflower/350-early-merger-3
Browse files: browse the repository at this point in the history
Always apply "symflower fix" and refactor "write test" task and LLM prompting to prepare for templates
  • Loading branch information
ruiAzevedo19 authored Oct 3, 2024
2 parents 2bff5d3 + fcb2104 commit da013ed
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 217 deletions.
56 changes: 20 additions & 36 deletions evaluate/task/task-transpile.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package task

import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
Expand Down Expand Up @@ -75,7 +73,7 @@ func (t *TaskTranspile) Run(ctx evaltask.Context) (repositoryAssessment map[eval
}
for originFilePath, originLanguage := range originFilePathsWithLanguage {
modelAssessmentsForFile := metrics.NewAssessments()
withSymflowerAssessmentsForFile := modelAssessmentsForFile // The symflower assessment tracks how the model result can be improved in case of a failure, so just link to the model assessment until a failure actually happens.
withSymflowerAssessmentsForFile := modelAssessmentsForFile // The symflower assessment tracks how the model result can be improved in case of a failure, so just link to the model assessment until we successfully applied "symflower fix".

if err := ctx.Repository.Reset(ctx.Logger); err != nil {
ctx.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err)
Expand Down Expand Up @@ -110,46 +108,32 @@ func (t *TaskTranspile) Run(ctx evaltask.Context) (repositoryAssessment map[eval
problems = append(problems, ps...)
if err != nil {
problems = append(problems, pkgerrors.WithMessage(err, originFilePath))

// If there is an execution timeout do not run "symflower fix" because the code itself is correct.
if errors.Is(err, context.DeadlineExceeded) {
modelAssessments.Add(modelAssessmentsForFile)
withSymflowerAssessments.Add(withSymflowerAssessmentsForFile)

continue
}

// Run "symflower fix" if the model response fails to execute.
if ctx.Language.ID() == "golang" { // Currently we only support Go for "symflower fix".
withSymflowerFixTestResult, processingTime, ps, err := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, filepath.Join(ctx.Repository.DataPath(), packagePath))
problems = append(problems, ps...)
if err != nil {
problems = append(problems, err)

modelAssessments.Add(modelAssessmentsForFile)
withSymflowerAssessments.Add(withSymflowerAssessmentsForFile)

continue
} else {
testsPassing := withSymflowerFixTestResult.TestsPass
taskLogger.Printf("with symflower repair: Executes tests with %d tests passing", testsPassing)

// Symflower was able to fix a failure so now update the assessment with the improved results.
withSymflowerFixAssessments := metrics.NewAssessments()
withSymflowerFixAssessments[metrics.AssessmentKeyProcessingTime] = processingTime
withSymflowerFixAssessments.Award(metrics.AssessmentKeyFilesExecuted)
withSymflowerFixAssessments.AwardPoints(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))

withSymflowerAssessmentsForFile = metrics.CombineWithSymflowerFixAssessments(modelAssessmentsForFile, withSymflowerFixAssessments)
}
}
} else {
testsPassing := testResult.TestsPass
taskLogger.Printf("Executes tests with %d tests passing", testsPassing)
modelAssessmentsForFile.Award(metrics.AssessmentKeyFilesExecuted)
modelAssessmentsForFile.AwardPoints(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))
}

if ctx.Language.ID() == "golang" { // Currently we only support Go for "symflower fix".
withSymflowerFixTestResult, processingTime, ps, err := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, filepath.Join(ctx.Repository.DataPath(), packagePath))
problems = append(problems, ps...)
if err != nil {
problems = append(problems, err)
} else {
testsPassing := withSymflowerFixTestResult.TestsPass
taskLogger.Printf("with symflower repair: Executes tests with %d tests passing", testsPassing)

// Symflower was able to fix a failure so now update the assessment with the improved results.
withSymflowerFixAssessments := metrics.NewAssessments()
withSymflowerFixAssessments[metrics.AssessmentKeyProcessingTime] = processingTime
withSymflowerFixAssessments.Award(metrics.AssessmentKeyFilesExecuted)
withSymflowerFixAssessments.AwardPoints(metrics.AssessmentKeyTestsPassing, uint64(testsPassing))

withSymflowerAssessmentsForFile = metrics.CombineWithSymflowerFixAssessments(modelAssessmentsForFile, withSymflowerFixAssessments)
}
}

modelAssessments.Add(modelAssessmentsForFile)
withSymflowerAssessments.Add(withSymflowerAssessmentsForFile)
}
Expand Down
123 changes: 57 additions & 66 deletions evaluate/task/task-write-test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package task

import (
"context"
"errors"
"fmt"
"strings"

Expand Down Expand Up @@ -54,77 +52,17 @@ func (t *TaskWriteTests) Run(ctx evaltask.Context) (repositoryAssessment map[eva
withSymflowerFixAssessment[metrics.AssessmentKeyFilesExecutedMaximumReachable] = maximumReachableFiles

for _, filePath := range filePaths {
modelAssessmentForFile := metrics.NewAssessments()
withSymflowerFixAssessmentForFile := modelAssessmentForFile // The symflower assessment tracks how the model result can be improved in case of a failure, so just link to the model assessment until a failure actually happens.

if err := ctx.Repository.Reset(ctx.Logger); err != nil {
ctx.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

modelContext := model.Context{
Language: ctx.Language,

RepositoryPath: dataPath,
FilePath: filePath,

Logger: taskLogger.Logger,
}
assessments, err := modelCapability.WriteTests(modelContext)
if err != nil {
problems = append(problems, pkgerrors.WithMessage(err, filePath))

continue
}
if assessments[metrics.AssessmentKeyProcessingTime] == 0 {
return nil, nil, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", ctx.Model.ID(), ctx.Repository.Name())
}
modelAssessmentForFile.Add(assessments)
modelAssessmentForFile.Award(metrics.AssessmentKeyResponseNoError)

testResult, ps, err := ctx.Language.ExecuteTests(taskLogger.Logger, dataPath)
modelAssessmentFile, withSymflowerFixAssessmentFile, ps, err := runModelAndSymflowerFix(ctx, taskLogger, modelCapability, dataPath, filePath)
problems = append(problems, ps...)
if err != nil {
problems = append(problems, pkgerrors.WithMessage(err, filePath))

// If there is an execution timeout do not run "symflower fix" because the code itself is correct.
if errors.Is(err, context.DeadlineExceeded) {
modelAssessment.Add(modelAssessmentForFile)
withSymflowerFixAssessment.Add(withSymflowerFixAssessmentForFile)

continue
}

// Run "symflower fix" if the model response fails to execute.
if ctx.Language.ID() == "golang" { // Currently we only support Go for "symflower fix".
withSymflowerFixTestResult, processingTime, ps, err := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, ctx.Repository.DataPath())
problems = append(problems, ps...)
if err != nil {
problems = append(problems, err)

modelAssessment.Add(modelAssessmentForFile)
withSymflowerFixAssessment.Add(withSymflowerFixAssessmentForFile)

continue
} else {
ctx.Logger.Printf("with symflower repair: Executes tests with %d coverage objects", withSymflowerFixTestResult.Coverage)

// Symflower was able to fix a failure so now update the assessment with the improved results.
withSymflowerFixAssessments := metrics.NewAssessments()
withSymflowerFixAssessments[metrics.AssessmentKeyProcessingTime] = processingTime
withSymflowerFixAssessments.Award(metrics.AssessmentKeyFilesExecuted)
withSymflowerFixAssessments.AwardPoints(metrics.AssessmentKeyCoverage, withSymflowerFixTestResult.Coverage)

withSymflowerFixAssessmentForFile = metrics.CombineWithSymflowerFixAssessments(modelAssessmentForFile, withSymflowerFixAssessments)
}
}
} else {
taskLogger.Printf("Executes tests with %d coverage objects", testResult.Coverage)
modelAssessmentForFile.Award(metrics.AssessmentKeyFilesExecuted)
modelAssessmentForFile.AwardPoints(metrics.AssessmentKeyCoverage, testResult.Coverage)
return nil, problems, err
}

modelAssessment.Add(modelAssessmentForFile)
withSymflowerFixAssessment.Add(withSymflowerFixAssessmentForFile)
modelAssessment.Add(modelAssessmentFile)
withSymflowerFixAssessment.Add(withSymflowerFixAssessmentFile)
}

repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{
Expand All @@ -135,6 +73,59 @@ func (t *TaskWriteTests) Run(ctx evaltask.Context) (repositoryAssessment map[eva
return repositoryAssessment, problems, nil
}

// runModelAndSymflowerFix queries the model to write tests for the given file, executes the tests and afterwards runs "symflower fix" on top of the model result.
// Failures of the model query, of the test execution and of "symflower fix" are collected as problems, while a missing model response time measurement is returned as an error.
// If the model query fails, no assessments are returned.
func runModelAndSymflowerFix(ctx evaltask.Context, taskLogger *taskLogger, modelCapability model.CapabilityWriteTests, dataPath string, filePath string) (modelAssessment metrics.Assessments, withSymflowerFixAssessment metrics.Assessments, problems []error, err error) {
	modelAssessment = metrics.NewAssessments()
	// Both results refer to the very same assessment until "symflower fix" has been applied successfully.
	withSymflowerFixAssessment = modelAssessment

	// Query the model.
	assessments, err := modelCapability.WriteTests(model.Context{
		Language: ctx.Language,

		RepositoryPath: dataPath,
		FilePath:       filePath,

		Logger: taskLogger.Logger,
	})
	if err != nil {
		// A failing model query is reported as a problem for this file and not as an error.
		problems = append(problems, pkgerrors.WithMessage(err, filePath))

		return nil, nil, problems, nil
	}
	if processingTime := assessments[metrics.AssessmentKeyProcessingTime]; processingTime == 0 {
		return nil, nil, problems, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", ctx.Model.ID(), ctx.Repository.Name())
	}
	modelAssessment.Add(assessments)
	modelAssessment.Award(metrics.AssessmentKeyResponseNoError)

	// Execute the tests of the model response.
	testResult, executionProblems, err := ctx.Language.ExecuteTests(taskLogger.Logger, dataPath)
	problems = append(problems, executionProblems...)
	if err == nil {
		taskLogger.Printf("Executes tests with %d coverage objects", testResult.Coverage)
		modelAssessment.Award(metrics.AssessmentKeyFilesExecuted)
		modelAssessment.AwardPoints(metrics.AssessmentKeyCoverage, testResult.Coverage)
	} else {
		problems = append(problems, pkgerrors.WithMessage(err, filePath))
	}

	// Currently we only support Go for "symflower fix".
	if ctx.Language.ID() != "golang" {
		return modelAssessment, withSymflowerFixAssessment, problems, nil
	}

	// Run "symflower fix" regardless of the outcome of the test execution above.
	fixTestResult, fixProcessingTime, fixProblems, fixErr := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, ctx.Repository.DataPath())
	problems = append(problems, fixProblems...)
	if fixErr != nil {
		problems = append(problems, fixErr)

		return modelAssessment, withSymflowerFixAssessment, problems, nil
	}
	ctx.Logger.Printf("with symflower repair: Executes tests with %d coverage objects", fixTestResult.Coverage)

	// "symflower fix" succeeded, so combine its results with the model assessment.
	fixAssessment := metrics.NewAssessments()
	fixAssessment[metrics.AssessmentKeyProcessingTime] = fixProcessingTime
	fixAssessment.Award(metrics.AssessmentKeyFilesExecuted)
	fixAssessment.AwardPoints(metrics.AssessmentKeyCoverage, fixTestResult.Coverage)
	withSymflowerFixAssessment = metrics.CombineWithSymflowerFixAssessments(modelAssessment, fixAssessment)

	return modelAssessment, withSymflowerFixAssessment, problems, nil
}

// validateWriteTestsRepository checks if the repository for the "write-tests" task is well-formed.
func validateWriteTestsRepository(logger *log.Logger, repositoryPath string, language language.Language) (err error) {
logger.Printf("validating repository %q", repositoryPath)
Expand Down
24 changes: 0 additions & 24 deletions evaluate/task/task-write-test_test.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
package task

import (
"context"
"fmt"
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
metricstesting "github.com/symflower/eval-dev-quality/evaluate/metrics/testing"
Expand All @@ -17,7 +15,6 @@ import (
"github.com/symflower/eval-dev-quality/language/golang"
"github.com/symflower/eval-dev-quality/language/java"
"github.com/symflower/eval-dev-quality/language/ruby"
languagetesting "github.com/symflower/eval-dev-quality/language/testing"
"github.com/symflower/eval-dev-quality/log"
modeltesting "github.com/symflower/eval-dev-quality/model/testing"
"github.com/symflower/eval-dev-quality/task"
Expand Down Expand Up @@ -191,27 +188,6 @@ func TestTaskWriteTestsRun(t *testing.T) {
this is not valid go code
`), expectedAssessments, expectedProblems, false)
}
{
expectedAssessments := map[task.Identifier]metrics.Assessments{
IdentifierWriteTests: metrics.Assessments{
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
metrics.AssessmentKeyResponseNoError: 1,
},
IdentifierWriteTestsSymflowerFix: metrics.Assessments{
metrics.AssessmentKeyFilesExecutedMaximumReachable: 1,
metrics.AssessmentKeyResponseNoError: 1,
},
}
expectedProblems := []string{
"context deadline exceeded",
}

languageMock := languagetesting.NewMockLanguageNamed(t, "golang")
languageMock.On("Files", mock.Anything, mock.Anything).Return([]string{filepath.Join("golang", "plain")}, nil).Once()
languageMock.On("ExecuteTests", mock.Anything, mock.Anything).Return(nil, nil, context.DeadlineExceeded).Once()

validateGo(t, "Execution timeout", languageMock, "", expectedAssessments, expectedProblems, false)
}
})
})

Expand Down
Loading

0 comments on commit da013ed

Please sign in to comment.