Skip to content

Commit

Permalink
internal/labels: clean issue body
Browse files Browse the repository at this point in the history
Remove HTML comments and some headings that come
from the issue template.

This fixes the classification of 69040, an empty issue (see
internal/devtools/cmd/labeleval). Previously, the LLM was
confused by the comments and the presence of some headers.

For #64.

Change-Id: I82a68bba3f3aac9365b4d14035ed976b405651dc
Reviewed-on: https://go-review.googlesource.com/c/oscar/+/635875
Reviewed-by: Tatiana Bradley <tatianabradley@google.com>
Reviewed-by: Hyang-Ah Hana Kim <hyangah@gmail.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
  • Loading branch information
jba committed Dec 13, 2024
1 parent 7ee572b commit 6dd16b5
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 3 deletions.
61 changes: 58 additions & 3 deletions internal/labels/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@ import (
"fmt"
"html/template"
"log"
"regexp"
"strings"

"golang.org/x/oscar/internal/github"
"golang.org/x/oscar/internal/llm"
"rsc.io/markdown"
)

// A Category is a classification for an issue.
Expand All @@ -38,11 +41,12 @@ func IssueCategoryFromList(ctx context.Context, cgen llm.ContentGenerator, iss *
return Category{}, "", errors.New("issue is a pull request")
}

body := cleanIssueBody(iss.Body)
// Extract issue text into a string.
var issueText bytes.Buffer
err = template.Must(template.New("body").Parse(body)).Execute(&issueText, bodyArgs{
err = template.Must(template.New("body").Parse(bodyTemplate)).Execute(&issueText, bodyArgs{
Title: iss.Title,
Body: iss.Body,
Body: body,
})
if err != nil {
return Category{}, "", err
Expand Down Expand Up @@ -73,6 +77,57 @@ func IssueCategoryFromList(ctx context.Context, cgen llm.ContentGenerator, iss *
return Category{}, "", fmt.Errorf("no category matches LLM response %q", jsonRes)
}

// htmlCommentRegexp matches an HTML comment: "<!--", followed by the shortest
// possible run of characters (newlines included), followed by "-->".
// Because the match is non-greedy it ends at the first "-->", so each comment
// in a string is matched separately.
// TODO(jba): this is approximate.
// See https://developer.mozilla.org/en-US/docs/Web/HTML/Comments for the exact syntax.
var htmlCommentRegexp = regexp.MustCompile(`<!--(\n|.)*?-->`)

// cleanIssueBody rewrites an issue body so that it is more likely to be
// labeled correctly.
//
// It parses text as Markdown, walks the resulting block tree, deletes every
// match of htmlCommentRegexp from the HTML blocks it finds, and returns the
// document formatted back to Markdown.
func cleanIssueBody(text string) string {
	// TODO(jba): These settings are also used in fix.go to parse bodies. Factor out.
	parser := &markdown.Parser{
		AutoLinkText:  true,
		Strikethrough: true,
		HeadingIDs:    true,
		Emoji:         true,
	}
	doc := parser.Parse(text)

	// walk visits a single block; walkAll visits each block of a slice in order.
	var walk func(markdown.Block)
	walkAll := func(blocks []markdown.Block) {
		for _, b := range blocks {
			walk(b)
		}
	}
	walk = func(blk markdown.Block) {
		switch b := blk.(type) {
		case *markdown.HTMLBlock:
			// Text holds one entry per line. Join the lines so that a comment
			// spanning several lines can be matched, strip the comments, and
			// split the result back into lines.
			joined := strings.Join(b.Text, "\n")
			stripped := htmlCommentRegexp.ReplaceAllString(joined, "")
			b.Text = strings.Split(stripped, "\n")

		// Container blocks: descend into their children.
		case *markdown.Document:
			walkAll(b.Blocks)
		case *markdown.Quote:
			walkAll(b.Blocks)
		case *markdown.List:
			walkAll(b.Items)
		case *markdown.Item:
			walkAll(b.Blocks)

		// Blocks that carry text: descend into the text block.
		case *markdown.Heading:
			walk(b.Text)
		case *markdown.Paragraph:
			walk(b.Text)
		}
	}

	walk(doc)
	return markdown.Format(doc)
}

// response is the response that should be generated by the LLM.
// It must match [responseSchema].
type response struct {
Expand Down Expand Up @@ -102,7 +157,7 @@ Report the category of the issue and an explanation of your decision.
Each category and its description are listed below.
`
const body = `
const bodyTemplate = `
The title of the issue is: {{.Title}}
The body of the issue is: {{.Body}}
`
Expand Down
27 changes: 27 additions & 0 deletions internal/labels/labels_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,30 @@ func kindTestGenerator() llm.ContentGenerator {
return `{"CategoryName":"other","Explanation":"whatever"}`, nil
})
}

// TestCleanIssueBody checks the output of cleanIssueBody on a table of inputs:
// an empty body, plain Markdown without comments, single-line and multi-line
// HTML comments around a heading, and a line with a stray "-->" after a
// comment.
func TestCleanIssueBody(t *testing.T) {
	tests := []struct {
		in   string
		want string
	}{
		{in: "", want: ""},
		{in: "# H\nword\nword2\n", want: "# H\n\nword\nword2\n"},
		{
			in:   "<!-- comment -->\n### H3\n<!-- another --> done",
			want: "\n\n### H3\n\n done\n",
		},
		{
			in:   "<!--\ncomment\n-->\n### H3\n<!-- another -->\ndone",
			want: "\n\n### H3\n\n\n\ndone\n",
		},
		{
			// Only the first "-->" ends the comment; the rest of the line is kept.
			in:   "<!-- a --> b -->",
			want: " b -->\n",
		},
	}
	for _, test := range tests {
		if got := cleanIssueBody(test.in); got != test.want {
			t.Errorf("%q:\ngot %q\nwant %q", test.in, got, test.want)
		}
	}
}

0 comments on commit 6dd16b5

Please sign in to comment.