From b9194cd65fdf329ee2ca16fdfa6da3da4096681c Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Mon, 8 Apr 2024 08:01:51 -0700 Subject: [PATCH] Converters to/from markdown (#31) * Use the AST to parse markdown documents and turn them into blocks so they can potentially be rendered as notebooks. * Add some test utilities to define common comparers --- app/go.mod | 8 +- app/go.sum | 8 ++ app/pkg/docs/const.go | 10 ++ app/pkg/docs/converters.go | 147 ++++++++++++++++++++++++++++++ app/pkg/docs/converters_test.go | 136 +++++++++++++++++++++++++++ app/pkg/docs/docs.go | 2 + app/pkg/docs/test_data/testdoc.md | 23 +++++ app/pkg/executor/executor_test.go | 6 +- app/pkg/testutil/comparers.go | 10 ++ 9 files changed, 346 insertions(+), 4 deletions(-) create mode 100644 app/pkg/docs/const.go create mode 100644 app/pkg/docs/converters.go create mode 100644 app/pkg/docs/converters_test.go create mode 100644 app/pkg/docs/docs.go create mode 100644 app/pkg/docs/test_data/testdoc.md create mode 100644 app/pkg/testutil/comparers.go diff --git a/app/go.mod b/app/go.mod index 357ed7e6..3ccfbffd 100644 --- a/app/go.mod +++ b/app/go.mod @@ -5,10 +5,13 @@ go 1.22.1 replace github.com/jlewi/foyle/protos/go => ../protos/go require ( + github.com/Kunde21/markdownfmt/v3 v3.1.0 github.com/gin-contrib/cors v1.7.1 github.com/gin-gonic/gin v1.9.1 + github.com/go-cmd/cmd v1.4.1 github.com/go-logr/logr v1.3.0 github.com/go-logr/zapr v1.3.0 + github.com/google/go-cmp v0.6.0 github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1 github.com/jlewi/foyle/protos/go v0.0.0-00010101000000-000000000000 github.com/jlewi/hydros v0.0.6 @@ -17,6 +20,7 @@ require ( github.com/spf13/cobra v1.8.0 github.com/spf13/viper v1.18.2 github.com/timtadh/lexmachine v0.2.3 + github.com/yuin/goldmark v1.4.13 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1 go.uber.org/zap v1.27.0 google.golang.org/grpc v1.62.1 @@ -55,7 +59,6 @@ require ( github.com/gabriel-vasile/mimetype v1.4.3 // indirect github.com/ghodss/yaml v1.0.0 // indirect github.com/gin-contrib/sse v0.1.0 // indirect - github.com/go-cmd/cmd v1.4.1 // indirect github.com/go-errors/errors v1.0.1 // indirect github.com/go-git/gcfg v1.5.0 // indirect github.com/go-git/go-billy/v5 v5.4.1 // indirect @@ -72,7 +75,6 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/gnostic v0.6.9 // indirect - github.com/google/go-cmp v0.6.0 // indirect github.com/google/go-containerregistry v0.18.0 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/s2a-go v0.1.7 // indirect @@ -93,6 +95,7 @@ require ( github.com/mailru/easyjson v0.7.7 // indirect github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2 // indirect github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-runewidth v0.0.13 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect @@ -102,6 +105,7 @@ require ( github.com/opencontainers/image-spec v1.1.0-rc3 // indirect github.com/pelletier/go-toml/v2 v2.2.0 // indirect github.com/pjbgf/sha1cd v0.3.0 // indirect + github.com/rivo/uniseg v0.4.2 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect github.com/sergi/go-diff v1.2.0 // indirect diff --git a/app/go.sum b/app/go.sum index cb4d411d..007e3bfa 100644 --- a/app/go.sum +++ b/app/go.sum @@ -20,6 +20,8 @@ cloud.google.com/go/storage v1.36.0 h1:P0mOkAcaJxhCTvAkMhxMfrTKiNcub4YmmPBtlhAyT cloud.google.com/go/storage v1.36.0/go.mod h1:M6M/3V/D3KpzMTJyPOR/HU6n2Si5QdaXYEsng2xgOs8= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/Kunde21/markdownfmt/v3 v3.1.0 h1:KiZu9LKs+wFFBQKhrZJrFZwtLnCCWJahL+S+E/3VnM0= +github.com/Kunde21/markdownfmt/v3 v3.1.0/go.mod h1:tPXN1RTyOzJwhfHoon9wUr4HGYmWgVxSQN6VBJDkrVc= github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY= github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= @@ -253,6 +255,8 @@ github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2 h1:JAEbJn3j/FrhdWA9jW8 github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2/go.mod h1:0KeJpeMD6o+O4hW7qJOT7vyQPKrWmj26uf5wMc/IiIs= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU= +github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= @@ -280,6 +284,9 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.4.2 h1:YwD0ulJSJytLpiaWua0sBDusfsCZohxjxzVTYjwxfV8= +github.com/rivo/uniseg v0.4.2/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= @@ -350,6 +357,7 @@ github.com/xlab/treeprint v1.1.0 h1:G/1DjNkPpfZCFt9CSh6b5/nY4VimlbHF3Rh4obvtzDk= github.com/xlab/treeprint v1.1.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= diff --git a/app/pkg/docs/const.go b/app/pkg/docs/const.go new file mode 100644 index 00000000..08cc5d6d --- /dev/null +++ b/app/pkg/docs/const.go @@ -0,0 +1,10 @@ +package docs + +const ( + BASHLANG = "bash" + // OUTPUTLANG is the language to give to output code blocks. + // We want to potentially distinguish output from code blocks because output blocks are nested inside blocks + // in notebooks. Therefore if we want to be able to convert a markdown document into a document with blocks + // then having a unique language for output blocks helps us identify them and properly reencode them. + OUTPUTLANG = "output" +) diff --git a/app/pkg/docs/converters.go b/app/pkg/docs/converters.go new file mode 100644 index 00000000..30438c98 --- /dev/null +++ b/app/pkg/docs/converters.go @@ -0,0 +1,147 @@ +package docs + +import ( + "strings" + + markdownfmt "github.com/Kunde21/markdownfmt/v3/markdown" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/text" +) + +// BlockToMarkdown converts a block to markdown +func BlockToMarkdown(block *v1alpha1.Block) string { + sb := strings.Builder{} + + switch block.GetKind() { + case v1alpha1.BlockKind_CODE: + // Code just gets written as a code block + sb.WriteString("```" + BASHLANG + "\n") + sb.WriteString(block.GetContents()) + sb.WriteString("\n```\n") + default: + // Otherwise assume its a markdown block + sb.WriteString(block.GetContents() + "\n") + } + + // Handle the outputs + for _, output := range block.GetOutputs() { + for _, oi := range output.Items { + sb.WriteString("```" + OUTPUTLANG + "\n") + sb.WriteString(oi.GetTextData()) + sb.WriteString("\n```\n") + } + } + + return sb.String() +} + +// MarkdownToBlocks converts a markdown string into a sequence of blocks. +// This function relies on the goldmark library to parse the markdown into an AST. +func MarkdownToBlocks(mdText string) ([]*v1alpha1.Block, error) { + gm := goldmark.New() + source := []byte(mdText) + reader := text.NewReader(source) + root := gm.Parser().Parse(reader) + + renderer := markdownfmt.NewRenderer() + + blocks := make([]*v1alpha1.Block, 0, 20) + + err := ast.Walk(root, func(node ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + // Do nothing on leaving the block; just continue the walk + return ast.WalkContinue, nil + } + + if node.Kind() == ast.KindDocument { + // Ignore the document node + return ast.WalkContinue, nil + } + + if node.Kind() != ast.KindFencedCodeBlock { + // Since we aren't in a code block render the node and its children to markdown + // so we can add them as a block + var sb strings.Builder + if err := renderer.Render(&sb, source, node); err != nil { + return ast.WalkStop, err + } + newBlock := &v1alpha1.Block{ + Kind: v1alpha1.BlockKind_MARKUP, + Contents: sb.String(), + } + blocks = append(blocks, newBlock) + // Skip the children because we've already rendered the children to markdown so there's no need + // to visit the children nodes + return ast.WalkSkipChildren, nil + + } + + // Since we encountered a fenced code block we need to extract the code block + fenced := node.(*ast.FencedCodeBlock) + lang := string(fenced.Language(source)) + textData := getBlockText(fenced, source) + + lastBlock := len(blocks) - 1 + lastWasCode := false + if lastBlock >= 0 && blocks[lastBlock].Kind == v1alpha1.BlockKind_CODE { + lastWasCode = true + } + + if lang == OUTPUTLANG && lastWasCode { + // Since its an output block and the last block was a code block we should append the output to the last block + if blocks[lastBlock].Outputs == nil { + blocks[lastBlock].Outputs = make([]*v1alpha1.BlockOutput, 0, 1) + } + blocks[lastBlock].Outputs = append(blocks[lastBlock].Outputs, &v1alpha1.BlockOutput{ + Items: []*v1alpha1.BlockOutputItem{ + { + TextData: textData, + }, + }, + }) + } else { + block := &v1alpha1.Block{ + Kind: v1alpha1.BlockKind_CODE, + Contents: textData, + Language: lang, + } + blocks = append(blocks, block) + } + + // We can skip walking the children of the code block since we've already ingested the code block + return ast.WalkSkipChildren, nil + }) + + // The way we walk the AST above we potentially end up segmenting continuous markdown without code blocks + // into more than one block. So we merge these blocks. + final := make([]*v1alpha1.Block, 0, len(blocks)) + i := 0 + for _, block := range blocks { + lastBlock := i - 1 + addToLastBlock := false + if lastBlock >= 0 && block.Kind == v1alpha1.BlockKind_MARKUP && final[lastBlock].Kind == v1alpha1.BlockKind_MARKUP { + addToLastBlock = true + } + + if addToLastBlock { + final[lastBlock].Contents += block.Contents + } else { + final = append(final, block) + i++ + } + } + + return final, err +} + +func getBlockText(fenced *ast.FencedCodeBlock, source []byte) string { + var sb strings.Builder + for i := 0; i < fenced.Lines().Len(); i++ { + // Get the i'th line + line := fenced.Lines().At(i) + sb.WriteString(string(line.Value(source))) + } + return sb.String() +} diff --git a/app/pkg/docs/converters_test.go b/app/pkg/docs/converters_test.go new file mode 100644 index 00000000..92ebcce1 --- /dev/null +++ b/app/pkg/docs/converters_test.go @@ -0,0 +1,136 @@ +package docs + +import ( + "os" + "path/filepath" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/jlewi/foyle/app/pkg/testutil" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" +) + +func Test_BlockToMarkdown(t *testing.T) { + type testCase struct { + name string + block *v1alpha1.Block + expected string + } + + testCases := []testCase{ + { + name: "markup", + block: &v1alpha1.Block{ + Kind: v1alpha1.BlockKind_MARKUP, + Contents: "This is a test", + }, + expected: "This is a test\n", + }, + { + name: "code", + block: &v1alpha1.Block{ + Kind: v1alpha1.BlockKind_CODE, + Contents: "echo \"something something\"", + Outputs: []*v1alpha1.BlockOutput{ + { + Items: []*v1alpha1.BlockOutputItem{ + { + TextData: "something something", + }, + }, + }, + }, + }, + expected: "```bash\necho \"something something\"\n```\n```output\nsomething something\n```\n", + }, + } + for _, c := range testCases { + t.Run(c.name, func(t *testing.T) { + actual := BlockToMarkdown(c.block) + if d := cmp.Diff(c.expected, actual); d != "" { + t.Errorf("Unexpected diff:\n%s", d) + } + }) + } +} + +func Test_MarkdownToBlocks(t *testing.T) { + type testCase struct { + name string + inFile string + expected []*v1alpha1.Block + } + + cases := []testCase{ + { + name: "simple", + inFile: "testdoc.md", + expected: []*v1alpha1.Block{ + { + Kind: v1alpha1.BlockKind_MARKUP, + Contents: "# Section 1\n\nThis is section 1", + }, + { + Kind: v1alpha1.BlockKind_CODE, + Language: "go", + Contents: "package main\n\nfunc main() {\n...\n}\n", + }, + { + Kind: v1alpha1.BlockKind_MARKUP, + Contents: "\n\nBreaking text", + }, + { + Kind: v1alpha1.BlockKind_CODE, + Language: "bash", + Contents: "echo \"Hello, World!\"\n", + Outputs: []*v1alpha1.BlockOutput{ + { + Items: []*v1alpha1.BlockOutputItem{ + { + TextData: "hello, world!\n", + }}, + }, + }, + }, + { + Kind: v1alpha1.BlockKind_MARKUP, + Contents: "\n\n## Subsection", + }, + }, + }, + } + + cwd, err := os.Getwd() + if err != nil { + t.Fatalf("Failed to get working directory: %v", err) + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + fPath := filepath.Join(cwd, "test_data", c.inFile) + raw, err := os.ReadFile(fPath) + if err != nil { + t.Fatalf("Failed to read raw file: %v", err) + } + actual, err := MarkdownToBlocks(string(raw)) + if err != nil { + t.Fatalf("MarkdownToBlocks(%v) returned error %v", c.inFile, err) + } + if len(actual) != len(c.expected) { + t.Errorf("Expected %v blocks got %v", len(c.expected), len(actual)) + } + + for i, eBlock := range c.expected { + if i >= len(actual) { + break + } + + aBlock := actual[i] + + if d := cmp.Diff(eBlock, aBlock, testutil.BlockComparer); d != "" { + t.Errorf("Unexpected diff block %d:\n%s", i, d) + } + } + }) + } +} diff --git a/app/pkg/docs/docs.go b/app/pkg/docs/docs.go new file mode 100644 index 00000000..2f5d70f7 --- /dev/null +++ b/app/pkg/docs/docs.go @@ -0,0 +1,2 @@ +// Package docs contains routines for working with documents. +package docs diff --git a/app/pkg/docs/test_data/testdoc.md b/app/pkg/docs/test_data/testdoc.md new file mode 100644 index 00000000..14025002 --- /dev/null +++ b/app/pkg/docs/test_data/testdoc.md @@ -0,0 +1,23 @@ +# Section 1 + +This is section 1 + +```go +package main + +func main() { +... +} +``` + +Breaking text + +```bash +echo "Hello, World!" +``` + +```output +hello, world! +``` + +## Subsection diff --git a/app/pkg/executor/executor_test.go b/app/pkg/executor/executor_test.go index 28dcb801..45093c10 100644 --- a/app/pkg/executor/executor_test.go +++ b/app/pkg/executor/executor_test.go @@ -5,8 +5,10 @@ import ( "fmt" "testing" - "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" + + "github.com/google/go-cmp/cmp" + "github.com/jlewi/foyle/app/pkg/testutil" "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" ) @@ -54,7 +56,7 @@ func Test_Executor(t *testing.T) { if err != nil { t.Fatalf("Failed to execute: %v", err) } - if d := cmp.Diff(c.expected, resp, cmpopts.IgnoreUnexported(v1alpha1.ExecuteResponse{}), cmpopts.IgnoreUnexported(v1alpha1.BlockOutput{}), cmpopts.IgnoreUnexported(v1alpha1.BlockOutputItem{})); d != "" { + if d := cmp.Diff(c.expected, resp, testutil.BlockComparer, cmpopts.IgnoreUnexported(v1alpha1.ExecuteResponse{})); d != "" { t.Errorf("Unexpected response (-want +got):\n%v", d) } }) diff --git a/app/pkg/testutil/comparers.go b/app/pkg/testutil/comparers.go new file mode 100644 index 00000000..08db0892 --- /dev/null +++ b/app/pkg/testutil/comparers.go @@ -0,0 +1,10 @@ +package testutil + +import ( + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/jlewi/foyle/protos/go/foyle/v1alpha1" +) + +var ( + BlockComparer = cmpopts.IgnoreUnexported(v1alpha1.Block{}, v1alpha1.BlockOutput{}, v1alpha1.BlockOutputItem{}) +)