From 36c283f9a8e4bf9e704864caae86e106892068c6 Mon Sep 17 00:00:00 2001 From: Matthew Sladescu Date: Tue, 11 Mar 2025 15:58:57 +1100 Subject: [PATCH] cue/encoding/koala: add the koala XML encoding This commit adds an XML encoding for CUE, called koala, as described in proposal #3776 (https://github.com/cue-lang/cue/discussions/3776). Signed-off-by: Matthew Sladescu --- encoding/koala/decode.go | 456 +++++++++++++++++++++++++++++ encoding/koala/decode_test.go | 523 ++++++++++++++++++++++++++++++++++ encoding/koala/encode.go | 234 +++++++++++++++ encoding/koala/encode_test.go | 377 ++++++++++++++++++++++++ go.mod | 2 + go.sum | 4 + 6 files changed, 1596 insertions(+) create mode 100644 encoding/koala/decode.go create mode 100644 encoding/koala/decode_test.go create mode 100644 encoding/koala/encode.go create mode 100644 encoding/koala/encode_test.go diff --git a/encoding/koala/decode.go b/encoding/koala/decode.go new file mode 100644 index 00000000000..c4f268efa8a --- /dev/null +++ b/encoding/koala/decode.go @@ -0,0 +1,456 @@ +// Copyright 2025 The CUE Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package koala converts XML to and from CUE as described here: https://github.com/cue-lang/cue/discussions/3776 +// +// WARNING: THIS PACKAGE IS EXPERIMENTAL. +// ITS API MAY CHANGE AT ANY TIME. 
+package koala + +import ( + "bytes" + "encoding/xml" + "fmt" + "io" + "regexp" + "strings" + + "cuelang.org/go/cue/ast" + "cuelang.org/go/cue/token" +) + +// koala is an XML encoding for CUE described here: https://github.com/cue-lang/cue/discussions/3776 +// Decoder implements the decoding state. +type Decoder struct { + reader io.Reader + //required to find attribute and content offsets + xmlText string + fileName string + tokenFile *token.File + + // current XML element being processed + currXmlElement *XMLElement + + // Properties below relate to ast representation of XML document + // a description of this model can be found at https://github.com/cue-lang/cue/discussions/3776 + astRoot *ast.StructLit + //CUE model of ancestors of current XML element being processed + ancestors []*ast.Field + //CUE model of current XML element + currField *ast.Field + //CUE model of current XML element's inner content ($$ attribute) + currInnerText *ast.Field +} + +// models an XML Element hierarchy +// used for tracking namespace prefixes +type XMLElement struct { + xmlName xml.Name + attr []xml.Attr + parent *XMLElement + children []*XMLElement + textContentIsWhiteSpace bool +} + +// the prefix used to model the inner text content within an XML element +const ContentAttribute string = "$$" + +// the prefix used to model each attribute of an XML element +const AttributeSymbol string = "$" + +// NewDecoder creates a decoder from a stream of XML input. +func NewDecoder(fileName string, reader io.Reader) *Decoder { + return &Decoder{reader: reader, fileName: fileName} +} + +// Decode parses the input stream as XML and converts it to a CUE [ast.Expr]. +// The input stream is taken from the [Decoder] and consumed. +// If an error is encountered in the decoding process, this function returns it. 
+func (dec *Decoder) Decode() (ast.Expr, error) { + + data, reader, err := bytesFromReader(dec.reader) + if err != nil { + return nil, err + } + //required to find attribute and content offsets + dec.xmlText = string(data) + dec.reader = reader + + //create a token file to track the position of the XML content in the CUE file + dec.tokenFile = token.NewFile(dec.fileName, 0, len(data)) + dec.tokenFile.SetLinesForContent(data) + + xmlDec := xml.NewDecoder(dec.reader) + + for { + t, err := xmlDec.Token() + if err != nil && err != io.EOF { + return nil, err + } + if t == nil { + break + } + switch xmlToken := t.(type) { + case xml.StartElement: + err = dec.decodeStartElement(xmlToken, xmlDec) + case xml.CharData: + err = dec.decoderInnerText(xmlToken, xmlDec) + case xml.EndElement: + err = dec.decodeEndElement(xmlToken) + } + if err != nil { + return nil, err + } + } + return dec.astRoot, nil +} + +func (dec *Decoder) decoderInnerText(xmlToken xml.CharData, xmlDec *xml.Decoder) error { + //if this is text content within an XML element + textContent := string(xml.CharData(xmlToken)) + if dec.currField != nil { + contentOffset := dec.contentOffset(int(xmlDec.InputOffset())) + txtContentPosition := dec.tokenFile.Pos(contentOffset, token.NoRelPos) + txtLabel := ast.NewString(ContentAttribute) + txtLabel.ValuePos = txtContentPosition + val := convertToBasicLit(textContent) + val.ValuePos = txtContentPosition + textContentNode := &ast.Field{ + Label: txtLabel, + Value: val, + TokenPos: dec.tokenFile.Pos(contentOffset, token.NoRelPos), + } + dec.currInnerText = textContentNode + dec.currXmlElement.textContentIsWhiteSpace = isWhiteSpace(textContent) + return nil + } else { + if isWhiteSpace(textContent) { + return nil + } + return fmt.Errorf("text content outside of an XML element is not supported") + } +} + +func (dec *Decoder) decodeEndElement(xmlToken xml.EndElement) error { + //should match the start element name + if dec.currXmlElement.xmlName.Local != xmlToken.Name.Local 
{ + return fmt.Errorf("mismatched start and end element names: %s and %s", dec.currXmlElement.xmlName.Local, xmlToken.Name.Local) + } + //if there is text content within the element, add it to the element's value + if dec.currXmlElement != nil && dec.currInnerText != nil { + //only support text content within an element that has no sub-elements + if len(dec.currXmlElement.children) == 0 { + err := dec.addFieldToCurrElement(dec.currInnerText) + if err != nil { + return err + } + } else { + //if there is text content within an element that has sub-elements, return an error + err := dec.checkCurrXmlNodeForMixedContentError() + if err != nil { + return err + } + } + } + //XMLElement: step back up the XML hierarchy + if dec.currXmlElement != nil { + dec.currXmlElement = dec.currXmlElement.parent + } + //CUE ast: end current element, and step back up the XML hierarchy + if len(dec.ancestors) > 0 { + dec.currField = dec.ancestors[len(dec.ancestors)-1] + dec.ancestors = dec.ancestors[:len(dec.ancestors)-1] + } + return nil +} + +func (dec *Decoder) decodeStartElement(xmlToken xml.StartElement, xmlDec *xml.Decoder) error { + //if this is the root node + if dec.currField == nil { + dec.currXmlElement = &XMLElement{xmlName: xmlToken.Name, attr: xmlToken.Attr, children: []*XMLElement{}} + cueElement, err := dec.cueFieldFromXmlElement(xmlToken, int(xmlDec.InputOffset()), dec.currXmlElement) + if err != nil { + return err + } + dec.currField = cueElement + dec.astRoot = ast.NewStruct(dec.currField) + ast.SetPos(dec.astRoot, dec.tokenFile.Pos(0, token.NoRelPos)) + } else { + err := dec.checkCurrXmlNodeForMixedContentError() + if err != nil { + return err + } + + //XMLElement: step down the XML hierarchy + parentXmlNode := dec.currXmlElement + dec.currXmlElement = &XMLElement{xmlName: xmlToken.Name, attr: xmlToken.Attr, parent: parentXmlNode, children: []*XMLElement{}} + parentXmlNode.children = append(parentXmlNode.children, dec.currXmlElement) + + //CUE ast: step down the CUE 
hierarchy + dec.ancestors = append(dec.ancestors, dec.currField) + newElement, err := dec.cueFieldFromXmlElement(xmlToken, int(xmlDec.InputOffset()), dec.currXmlElement) + if err != nil { + return err + } + + //check if this new XML element has a name that has seen before at the current level + xmlElementProperties, err := elementProperties(dec.currField) + if err != nil { + return err + } + for _, elt := range xmlElementProperties { + prefixedXmlElementName, err := prefixedElementName(xmlToken, dec.currXmlElement) + if err != nil { + return err + } + fieldElementName, err := elementNameFromField(elt) + if err != nil { + return err + } + //if the new element has the same name as an existing element at this level add it to a list for that element name + if fieldElementName == ast.NewString(prefixedXmlElementName).Value { + //if the field's value is not a ListLit, create a new ListLit and append the existing field + if _, ok := elt.(*ast.Field).Value.(*ast.ListLit); !ok { + elt.(*ast.Field).Value = &ast.ListLit{Elts: []ast.Expr{elt.(*ast.Field).Value}} + } + //append the new element to the ListLit, which we now know exists + elt.(*ast.Field).Value.(*ast.ListLit).Elts = append(elt.(*ast.Field).Value.(*ast.ListLit).Elts, newElement.Value) + dec.currField = newElement + return nil + } + } + dec.currField.Value.(*ast.StructLit).Elts = append(xmlElementProperties, newElement) + dec.currField = newElement + } + return nil +} + +func elementProperties(field *ast.Field) ([]ast.Decl, error) { + err := fmt.Errorf("could not find element properties") + if field == nil || field.Value == nil { + return nil, err + } + structLit, ok := field.Value.(*ast.StructLit) + if !ok { + return nil, err + } + return structLit.Elts, nil +} + +func elementNameFromField(elt ast.Decl) (string, error) { + err := fmt.Errorf("could not find element name") + field, ok := elt.(*ast.Field) + if !ok || field.Label == nil { + return "", err + } + basicLit, ok := field.Label.(*ast.BasicLit) + if !ok || 
basicLit.Value == "" { + return "", err + } + return basicLit.Value, nil +} + +// return an error if the current XML element has non-whitespace text content and sub-elements +func (dec *Decoder) checkCurrXmlNodeForMixedContentError() error { + xmlNode := dec.currXmlElement + if len(xmlNode.children) > 0 { + if !xmlNode.textContentIsWhiteSpace { + return fmt.Errorf("text content within an XML element that has sub-elements is not supported") + } + } + return nil +} + +func isWhiteSpace(s string) bool { + return regexp.MustCompile(`^[\s\r\n]*$`).MatchString(s) +} + +// attributeOffsets returns the offset of the attribute key and value in the XML text, in that order. +// The containing element offset is the offset of the end of the element that contains the attribute. +func (dec *Decoder) attributeOffsets(attribute xml.Attr, startElement xml.StartElement, containingElementOffset int) (int, int, error) { + //find the starting index of the element + elementStartIdx := containingElementOffset - 1 + for elementStartIdx >= 0 && dec.xmlText[elementStartIdx] != '<' { + elementStartIdx-- + } + if elementStartIdx == -1 { + return -1, -1, fmt.Errorf("could not find start of element") + } + elementText := dec.xmlText[elementStartIdx:containingElementOffset] + + //get the full attribute name including the namespace prefix + attrName, err := prefixedAttrName(attribute, startElement, dec.currXmlElement) + if err != nil { + return -1, -1, err + } + + // find the start index of the attribute key + // including the attribute start quote in the search ensures that we are not matching a substring of another attribute + re := regexp.MustCompile(`\s+(` + attrName + `\s*=\s*["'])`) + matches := re.FindStringIndex(elementText) + offsetFindErr := fmt.Errorf("could not find attribute %s in element %s", attrName, startElement.Name.Local) + if matches == nil { + return -1, -1, offsetFindErr + } + attrKeyIndex := matches[0] + + //increment the attrKeyIndex for each space found before the 
attribute name + attrKeyIndex += bytes.IndexFunc([]byte(elementText[attrKeyIndex:]), func(r rune) bool { return r != ' ' }) + + //find the start index of the value + attrValueIndex := bytes.IndexByte([]byte(elementText[attrKeyIndex:]), '"') + attrKeyIndex + + if attrKeyIndex == -1 || attrValueIndex == -1 { + return -1, -1, offsetFindErr + } + + return elementStartIdx + attrKeyIndex, elementStartIdx + attrValueIndex, nil +} + +// find the start of the $$content that ends at the endElementOffset +func (dec *Decoder) contentOffset(endElementOffset int) int { + //find the start of the content of the element + contentStartIdx := endElementOffset + for i := endElementOffset; i > 0; i-- { + if dec.xmlText[i] == '>' { + return i + 1 + } + } + return contentStartIdx +} + +// create a new ast.Field to model the XML element +func (dec *Decoder) cueFieldFromXmlElement(elem xml.StartElement, offset int, xmlNode *XMLElement) (*ast.Field, error) { + elementName, err := prefixedElementName(elem, xmlNode) + if err != nil { + return nil, err + } + + resLabel := ast.NewString(elementName) + resLabel.ValuePos = dec.tokenFile.Pos(offset, token.NoRelPos) + + result := &ast.Field{ + Label: resLabel, + Value: &ast.StructLit{}, + TokenPos: dec.tokenFile.Pos(offset, token.NoRelPos), + } + + // Extract attributes as children + for _, a := range elem.Attr { + attrName, err := prefixedAttrName(a, elem, xmlNode) + if err != nil { + return nil, err + } + label := ast.NewString(AttributeSymbol + attrName) + value := convertToBasicLit(a.Value) + + attrKeyOffset, attrValOffset, err := dec.attributeOffsets(a, elem, offset) + if err != nil { + return nil, err + } + + label.ValuePos = dec.tokenFile.Pos(attrKeyOffset, token.NoRelPos) + value.ValuePos = dec.tokenFile.Pos(attrValOffset, token.NoRelPos) + + attrExpr := &ast.Field{ + Label: label, + Value: value, + TokenPos: dec.tokenFile.Pos(attrKeyOffset, token.NoRelPos), + } + + result.Value.(*ast.StructLit).Elts = 
append(result.Value.(*ast.StructLit).Elts, attrExpr) + } + + return result, nil +} + +// return the name of an element, including its namespace prefix if it has one; but without namespace prefix if it is "xmlns" +func prefixedElementName(elem xml.StartElement, xmlNode *XMLElement) (string, error) { + elementName := elem.Name.Local + if elem.Name.Space != "" { + prefixNS, err := nsPrefix(elem.Name.Space, elem.Attr, xmlNode) + if err != nil { + return elementName, err + } + if prefixNS != "xmlns" { + elementName = prefixNS + ":" + elem.Name.Local + } + } + return elementName, nil +} + +// return the name of an attribute, including its namespace prefix if it has one +func prefixedAttrName(a xml.Attr, elem xml.StartElement, xmlNode *XMLElement) (string, error) { + attrName := a.Name.Local + if a.Name.Space != "" { + prefix, err := nsPrefix(a.Name.Space, elem.Attr, xmlNode) + if err != nil { + return attrName, err + } + attrName = prefix + ":" + a.Name.Local + } + return attrName, nil +} + +func convertToBasicLit(s string) *ast.BasicLit { + //discard carriage returns from s + s = strings.ReplaceAll(s, "\r", "") + return ast.NewString(s) +} + +// find the prefix label for a given namespace by looking at the current node's attributes and then +// walking up the hierarchy of XML nodes +func nsPrefix(nameSpace string, attributes []xml.Attr, xmlNode *XMLElement) (string, error) { + //when the prefix is xmlns, then the namespace is xmlns according to the golang XML parser + if nameSpace == "xmlns" { + return "xmlns", nil + } + for _, attr := range attributes { + if attr.Value == nameSpace { + return attr.Name.Local, nil + } + } + if xmlNode != nil { + if xmlNode.parent != nil { + return nsPrefix(nameSpace, xmlNode.parent.attr, xmlNode.parent) + } + } + return "", fmt.Errorf("could not find prefix for namespace %s", nameSpace) +} + +func bytesFromReader(r io.Reader) ([]byte, io.Reader, error) { + //read all bytes from r + data, err := io.ReadAll(r) + if err != nil { + return 
nil, nil, err + } + //create reader from bytes + reader := bytes.NewReader(data) + return data, reader, nil +} + +func (dec *Decoder) addFieldToCurrElement(field *ast.Field) error { + if dec.currField == nil { + return fmt.Errorf("current field is nil") + } + if dec.currField.Value == nil { + return fmt.Errorf("current field value is nil") + } + structLit, ok := dec.currField.Value.(*ast.StructLit) + if !ok { + return fmt.Errorf("current field value is not a StructLit") + } + dec.currField.Value.(*ast.StructLit).Elts = append(structLit.Elts, field) + return nil +} diff --git a/encoding/koala/decode_test.go b/encoding/koala/decode_test.go new file mode 100644 index 00000000000..dc5ed5d3505 --- /dev/null +++ b/encoding/koala/decode_test.go @@ -0,0 +1,523 @@ +// Copyright 2025 The CUE Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package koala converts XML to and from CUE as described here: https://github.com/cue-lang/cue/discussions/3776 +// +// WARNING: THIS PACKAGE IS EXPERIMENTAL. +// ITS API MAY CHANGE AT ANY TIME. 
+package koala + +import ( + "fmt" + "strings" + "testing" + + "cuelang.org/go/cue" + "cuelang.org/go/cue/ast/astutil" + "cuelang.org/go/cue/cuecontext" + "cuelang.org/go/cue/errors" + "github.com/go-quicktest/qt" +) + +func TestErrorReporting(t *testing.T) { + t.Parallel() + tests := []struct { + name string + inputXML string + cueConstraints string + expectedError string + }{{ + name: "Element Text Content Constraint Error", + inputXML: ` + + + + + + + + + content + `, + cueConstraints: `test: { + $v: string + edge: { + $n: string + $o: string + } + container: [...{ + $id: string + l: [...{ + $attr: string + }] + }] + text: { + $$: int + } + }`, + expectedError: `myXmlFile.xml:10:10 +schema.cue:14:8 +test.text.$$: conflicting values int and "content" (mismatched types int and string) +`, + }, { + name: "Attribute Constraint Error", + inputXML: ` + + + + + + + + + content + `, + cueConstraints: `test: { + $v: int + edge: { + $n: string + $o: string + } + container: [...{ + $id: string + l: [...{ + $attr: string + }] + }] + text: { + $$: string + } + }`, + expectedError: `myXmlFile.xml:2:11 +schema.cue:2:7 +test.$v: conflicting values int and "v2.1" (mismatched types int and string) +`, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + t.Parallel() + + var err error + fileName := "myXmlFile.xml" + dec := NewDecoder(fileName, strings.NewReader(test.inputXML)) + cueExpr, err := dec.Decode() + + qt.Assert(t, qt.IsNil(err)) + + rootCueFile, _ := astutil.ToFile(cueExpr) + c := cuecontext.New() + rootCueVal := c.BuildFile(rootCueFile, cue.Filename(fileName)) + + // compile some CUE into a Value + compiledSchema := c.CompileString(test.cueConstraints, cue.Filename("schema.cue")) + + //unify the compiledSchema against the formattedConfig + unified := compiledSchema.Unify(rootCueVal) + + actualError := "" + if err := unified.Validate(cue.Concrete(true), cue.Schema()); err != nil { + + for _, e := range errors.Errors(err) { + + positions := 
errors.Positions(e) + for _, p := range positions { + actualError += fmt.Sprintf("%s\n", p) + } + actualError += fmt.Sprintf("%s\n", e.Error()) + } + } + + qt.Assert(t, qt.Equals(actualError, test.expectedError)) + qt.Assert(t, qt.IsNil(err)) + }) + } +} + +func TestElementDecoding(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + inputXML string + wantCUE string + }{{ + name: "1. Simple Elements", + inputXML: ` + + Jani + Reminder + Don't forget me this weekend! +`, + wantCUE: `{ + note: { + to: { + $$: " " + } + from: { + $$: "Jani" + } + heading: { + $$: "Reminder" + } + body: { + $$: "Don't forget me this weekend!" + } + } +}`, + }, + { + name: "2. Attribute", + inputXML: ` + Tove + Jani + Reminder + Don't forget me this weekend! +`, + wantCUE: `{ + note: { + $alpha: "abcd" + to: { + $$: "Tove" + } + from: { + $$: "Jani" + } + heading: { + $$: "Reminder" + } + body: { + $$: "Don't forget me this weekend!" + } + } +}`, + }, + { + name: "3. Attribute and Element with the same name", + inputXML: ` + Tove + Jani + Reminder + Don't forget me this weekend! + efgh +`, + wantCUE: `{ + note: { + $alpha: "abcd" + to: { + $$: "Tove" + } + from: { + $$: "Jani" + } + heading: { + $$: "Reminder" + } + body: { + $$: "Don't forget me this weekend!" + } + alpha: { + $$: "efgh" + } + } +}`, + }, + { + name: "4. Mapping for content when an attribute exists", + inputXML: ` + hello +`, + wantCUE: `{ + note: { + $alpha: "abcd" + $$: """ + + \thello + + """ + } +}`, + }, + { + name: "5. Nested Element", + inputXML: ` + hello +`, + wantCUE: `{ + notes: { + note: { + $alpha: "abcd" + $$: "hello" + } + } +}`, + }, + { + name: "6. Collections", + inputXML: ` + hello + goodbye +`, + wantCUE: `{ + notes: { + note: [{ + $alpha: "abcd" + $$: "hello" + }, { + $alpha: "abcdef" + $$: "goodbye" + }] + } +}`, + }, + { + name: "7. 
Interleaving Element Types", + inputXML: ` + hello + goodbye + mybook + goodbye + direct +`, + wantCUE: `{ + notes: { + note: [{ + $alpha: "abcd" + $$: "hello" + }, { + $alpha: "abcdef" + $$: "goodbye" + }, { + $alpha: "ab" + $$: "goodbye" + }, { + $$: "direct" + }] + book: { + $$: "mybook" + } + } +}`, + }, + { + name: "8. Namespaces", + inputXML: ` + + Apples + Bananas + +`, + wantCUE: `{ + "h:table": { + "$xmlns:h": "http://www.w3.org/TR/html4/" + "h:tr": { + "h:td": [{ + $$: "Apples" + }, { + $$: "Bananas" + }] + } + } +}`, + }, + { + name: "8.1. Attribute namespace prefix", + inputXML: ` + + Apples + Bananas + +`, + wantCUE: `{ + "h:table": { + "$xmlns:h": "http://www.w3.org/TR/html4/" + "$xmlns:f": "http://www.w3.org/TR/html5/" + "h:tr": { + "h:td": [{ + "$f:type": "fruit" + $$: "Apples" + }, { + $$: "Bananas" + }] + } + } +}`, + }, + { + name: "9. Mixed Namespaces", + inputXML: ` + + Apples + Bananas + e3r + +`, + wantCUE: `{ + "h:table": { + "$xmlns:h": "http://www.w3.org/TR/html4/" + "$xmlns:r": "d" + "h:tr": { + "h:td": [{ + $$: "Apples" + }, { + $$: "Bananas" + }] + "r:blah": { + $$: "e3r" + } + } + } +}`, + }, + { + name: "10. Elements with same name but different namespaces", + inputXML: ` + + Apples + Bananas + e3r + +`, + wantCUE: `{ + "h:table": { + "$xmlns:h": "http://www.w3.org/TR/html4/" + "$xmlns:r": "d" + "h:tr": { + "h:td": [{ + $$: "Apples" + }, { + $$: "Bananas" + }] + "r:td": { + $$: "e3r" + } + } + } +}`, + }, + { + name: "11. 
Collection of elements, where elements have optional properties", + inputXML: ` + + title + John Doe + + + title2 + Jane Doe + + + Lord of the rings + JRR Tolkien + + Fellowship + JRR Tolkien + + + Two Towers + JRR Tolkien + + + Return of the King + JRR Tolkien + + +`, + wantCUE: `{ + books: { + book: [{ + title: { + $$: "title" + } + author: { + $$: "John Doe" + } + }, { + title: { + $$: "title2" + } + author: { + $$: "Jane Doe" + } + }, { + title: { + $$: "Lord of the rings" + } + author: { + $$: "JRR Tolkien" + } + volume: [{ + title: { + $$: "Fellowship" + } + author: { + $$: "JRR Tolkien" + } + }, { + title: { + $$: "Two Towers" + } + author: { + $$: "JRR Tolkien" + } + }, { + title: { + $$: "Return of the King" + } + author: { + $$: "JRR Tolkien" + } + }] + }] + } +}`, + }, + { + name: "12. Carriage Return Filter Test", + inputXML: "\r\nhello\r\n", + wantCUE: `{ + node: { + $$: """ + + hello + + """ + } +}`, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + t.Parallel() + + fileName := "myXmlFile.xml" + + dec := NewDecoder(fileName, strings.NewReader(test.inputXML)) + cueExpr, err := dec.Decode() + + qt.Assert(t, qt.IsNil(err)) + + rootCueFile, _ := astutil.ToFile(cueExpr) + c := cuecontext.New() + rootCueVal := c.BuildFile(rootCueFile, cue.Filename(fileName)) + + actualCue := fmt.Sprintf("%v", rootCueVal) + + qt.Assert(t, qt.Equals(actualCue, test.wantCUE)) + qt.Assert(t, qt.IsNil(err)) + }) + } +} diff --git a/encoding/koala/encode.go b/encoding/koala/encode.go new file mode 100644 index 00000000000..600adbde588 --- /dev/null +++ b/encoding/koala/encode.go @@ -0,0 +1,234 @@ +// Copyright 2025 The CUE Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package koala converts XML to and from CUE as described here: https://github.com/cue-lang/cue/discussions/3776 +// +// WARNING: THIS PACKAGE IS EXPERIMENTAL. +// ITS API MAY CHANGE AT ANY TIME. +package koala + +import ( + "bytes" + "fmt" + "io" + "strings" + + "cuelang.org/go/cue" + "github.com/go-xmlfmt/xmlfmt" + "github.com/matthew-sladescu/gosaxml" +) + +// koala is an XML encoding for CUE described here: https://github.com/cue-lang/cue/discussions/3776 +// Encoder implements the encoding state for the koala encoder. +type Encoder struct { + writer io.Writer + internalWriter *bytes.Buffer + PrettyPrint bool +} + +// NewEncoder creates an encoder to write out encoded XML bytes to the given writer +// By default, the encoder will pretty print the XML output. +func NewEncoder(w io.Writer) *Encoder { + // Create a new internal writer to handle the XML encoding + internalWriter := bytes.NewBufferString("") + return &Encoder{writer: w, internalWriter: internalWriter, PrettyPrint: true} +} + +// Encode encodes the given CUE value into XML format and writes it to the encoder's writer. 
+// Note that the CUE value is expected to follow the XML mapping described by the koala +// format here: https://github.com/cue-lang/cue/discussions/3776 +func (e *Encoder) Encode(val cue.Value) error { + // using gosaxml encoder since it preserves special characters like apostraphe without escaping them, unlike the standard xml encoder + // this is consistent with the decoder, which also preserves these special characters + encoder := gosaxml.NewEncoder(e.internalWriter) + err := encodeValue(encoder, val, nil) + if err != nil { + return err + } + err = encoder.Flush() + if err != nil { + return err + } + //write to external buffer + rawXml := e.internalWriter.Bytes() + var writeError error + if e.PrettyPrint { + // pretty print the XML output + fmtXml := xmlfmt.FormatXML(string(rawXml), "", "\t") + strippedCR := strings.ReplaceAll(fmtXml, "\r", "") + trimmedXml := strings.TrimSpace(strippedCR) + _, writeError = e.writer.Write([]byte(trimmedXml)) + } else { + // write the raw XML output without pretty printing + _, writeError = e.writer.Write(rawXml) + } + return writeError +} + +// Convert from the koala CUE representation of a given cue.Value to the XML representation +func encodeValue(encoder *gosaxml.Encoder, val cue.Value, currStartElement *gosaxml.Token) error { + switch val.Kind() { + //encode element attributes, text, and sub-elements + case cue.StructKind: + //fetch the attributes + attributes, err := fetchAttributes(val) + if err != nil { + return err + } + //encode the attributes + for _, attr := range attributes { + if currStartElement == nil { + return fmt.Errorf("cannot encode attribute %s=%s without an element", attr.Name.Local, attr.Value) + } + currStartElement.Attr = append(currStartElement.Attr, attr) + } + + if currStartElement != nil { + //encode the start element + encoder.EncodeToken(currStartElement) + } + + //fetch the content + content, err := fetchInnerText(val) + if err != nil { + return err + } + //encode the content + if content != 
nil { + contentToken := gosaxml.Token{ByteData: []byte(*content), Kind: gosaxml.TokenTypeTextElement} + encoder.EncodeToken(&contentToken) + } + + //fetch the sub-elements + subElements, err := fetchSubElements(val) + if err != nil { + return err + } + //encode the sub-elements + for _, subElement := range subElements { + subElementLabel, _ := subElement.Label() + elem := gosaxml.Token{Name: gosaxml.Name{Local: []byte(subElementLabel)}, Kind: gosaxml.TokenTypeStartElement} + if err := encodeValue(encoder, subElement, &elem); err != nil { + return err + } + encoder.EncodeToken(&gosaxml.Token{Name: elem.Name, Kind: gosaxml.TokenTypeEndElement}) + } + + //fetch the element collections + elementCollections, err := fetchElementCollections(val) + if err != nil { + return err + } + //encode the element collections + for _, elementCollection := range elementCollections { + elementCollectionLabel, _ := elementCollection.Label() + + //go through the element collection + iter, err := elementCollection.List() + if err != nil { + return err + } + for iter.Next() { + currElement := iter.Value() + elem := gosaxml.Token{Name: gosaxml.Name{Local: []byte(elementCollectionLabel)}, Kind: gosaxml.TokenTypeStartElement} + //encode the element collection + if err := encodeValue(encoder, currElement, &elem); err != nil { + return err + } + encoder.EncodeToken(&gosaxml.Token{Name: elem.Name, Kind: gosaxml.TokenTypeEndElement}) + } + } + default: + return fmt.Errorf("unsupported kind: %v", val.Kind()) + } + return nil +} + +// fetch attributes from a given cue.Value +func fetchAttributes(val cue.Value) ([]gosaxml.Attr, error) { + var attributes []gosaxml.Attr + iter, err := val.Fields() + if err != nil { + return nil, err + } + for iter.Next() { + name := iter.Selector().Unquoted() + if len(name) > 0 && string(name[0]) == AttributeSymbol && name != ContentAttribute { + // Handle names that start with "$" + attributeName := name[1:] + attributeValue, err := iter.Value().String() + if err != 
nil { + return nil, err + } + attr := gosaxml.Attr{Name: gosaxml.Name{Local: []byte(attributeName)}, Value: []byte(attributeValue)} + attributes = append(attributes, attr) + } + } + return attributes, nil +} + +// fetch inner text content from a given cue.Value +func fetchInnerText(val cue.Value) (*string, error) { + iter, err := val.Fields() + if err != nil { + return nil, err + } + for iter.Next() { + name := iter.Selector().Unquoted() + if name == ContentAttribute { + val, err := iter.Value().String() + if err != nil { + return nil, err + } + return &val, nil + } + } + return nil, nil +} + +// fetch sub-elements from a given cue.Value +func fetchSubElements(val cue.Value) ([]cue.Value, error) { + var subElements []cue.Value + iter, err := val.Fields() + if err != nil { + return nil, err + } + for iter.Next() { + name := iter.Selector().Unquoted() + //check that iter.Value() is not a ListKind + if iter.Value().Kind() == cue.ListKind { + continue + } + if len(name) > 0 && name[0] != AttributeSymbol[0] { + subElements = append(subElements, iter.Value()) + } + } + return subElements, nil +} + +// fetch element collections from a given cue.Value +func fetchElementCollections(val cue.Value) ([]cue.Value, error) { + var elementCollections []cue.Value + iter, err := val.Fields() + if err != nil { + return nil, err + } + for iter.Next() { + //check that iter.Value() is a ListKind + if iter.Value().Kind() == cue.ListKind { + elementCollections = append(elementCollections, iter.Value()) + } + } + return elementCollections, nil +} diff --git a/encoding/koala/encode_test.go b/encoding/koala/encode_test.go new file mode 100644 index 00000000000..d2356b1769a --- /dev/null +++ b/encoding/koala/encode_test.go @@ -0,0 +1,377 @@ +// Copyright 2025 The CUE Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package koala converts XML to and from CUE as described here: https://github.com/cue-lang/cue/discussions/3776 +// +// WARNING: THIS PACKAGE IS EXPERIMENTAL. +// ITS API MAY CHANGE AT ANY TIME. +package koala + +import ( + "bytes" + "testing" + + "cuelang.org/go/cue/cuecontext" + "github.com/go-quicktest/qt" +) + +// TestElementEncoder runs each table case as a parallel subtest: it compiles inputCUE with a fresh cuecontext, +// encodes the value through NewEncoder into a buffer, and asserts that the buffer content equals wantXML. +func TestElementEncoder(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + wantXML string + inputCUE string + }{ + { + name: "1. Simple Elements", + wantXML: ` + Tove + Jani + Reminder + Don't forget me this weekend! +`, + inputCUE: `{ + note: { + to: { + $$: "Tove" + } + from: { + $$: "Jani" + } + heading: { + $$: "Reminder" + } + body: { + $$: "Don't forget me this weekend!" + } + } + }`, + }, { + name: "2. Attribute", + wantXML: ` + Tove + Jani + Reminder + Don't forget me this weekend! +`, + inputCUE: `{ + note: { + $alpha: "abcd" + to: { + $$: "Tove" + } + from: { + $$: "Jani" + } + heading: { + $$: "Reminder" + } + body: { + $$: "Don't forget me this weekend!" + } + } + }`, + }, + { + name: "3. Attribute and Element with the same name", + wantXML: ` + Tove + Jani + Reminder + Don't forget me this weekend! + efgh +`, + inputCUE: `{ + note: { + $alpha: "abcd" + to: { + $$: "Tove" + } + from: { + $$: "Jani" + } + heading: { + $$: "Reminder" + } + body: { + $$: "Don't forget me this weekend!" + } + alpha: { + $$: "efgh" + } + } + }`, + }, + { + name: "4. Mapping for content when an attribute exists", + wantXML: `hello`, + inputCUE: `{ + note: { + $alpha: "abcd" + $$: "hello" + } + }`, + }, + { + name: "5. 
Nested Element", + wantXML: ` + hello +`, + inputCUE: `{ + notes: { + note: { + $alpha: "abcd" + $$: "hello" + } + } + }`, + }, + { + name: "6. Collections", + wantXML: ` + hello + goodbye +`, + inputCUE: `{ + notes: { + note: [{ + $alpha: "abcd" + $$: "hello" + }, { + $alpha: "abcdef" + $$: "goodbye" + }] + } + }`, + }, + { + name: "7. Interleaving Element Types", + wantXML: ` + mybook + hello + goodbye + goodbye + direct +`, + inputCUE: `{ + notes: { + note: [{ + $alpha: "abcd" + $$: "hello" + }, { + $alpha: "abcdef" + $$: "goodbye" + }, { + $alpha: "ab" + $$: "goodbye" + }, { + $$: "direct" + }] + book: { + $$: "mybook" + } + } + }`, + }, + { + name: "8. Namespaces", + wantXML: ` + + Apples + Bananas + +`, + inputCUE: `{ + "h:table": { + "$xmlns:h": "http://www.w3.org/TR/html4/" + "h:tr": { + "h:td": [{ + $$: "Apples" + }, { + $$: "Bananas" + }] + } + } + }`, + }, + { + name: "9. Mixed Namespaces", + wantXML: ` + + e3r + Apples + Bananas + +`, + inputCUE: `{ + "h:table": { + "$xmlns:h": "http://www.w3.org/TR/html4/" + "$xmlns:r": "d" + "h:tr": { + "h:td": [{ + $$: "Apples" + }, { + $$: "Bananas" + }] + "r:blah": { + $$: "e3r" + } + } + } + }`, + }, + { + name: "10. Elements with same name but different namespaces", + wantXML: ` + + e3r + Apples + Bananas + +`, + inputCUE: `{ + "h:table": { + "$xmlns:h": "http://www.w3.org/TR/html4/" + "$xmlns:r": "d" + "h:tr": { + "h:td": [{ + $$: "Apples" + }, { + $$: "Bananas" + }] + "r:td": { + $$: "e3r" + } + } + } + }`, + }, + { + name: "11. 
Collection of elements, where elements have optional properties", + wantXML: ` + + title + John Doe + + + title2 + Jane Doe + + + Lord of the rings + JRR Tolkien + + Fellowship + JRR Tolkien + + + Two Towers + JRR Tolkien + + + Return of the King + JRR Tolkien + + +`, + inputCUE: `{ + books: { + book: [{ + title: { + $$: "title" + } + author: { + $$: "John Doe" + } + }, { + title: { + $$: "title2" + } + author: { + $$: "Jane Doe" + } + }, { + title: { + $$: "Lord of the rings" + } + author: { + $$: "JRR Tolkien" + } + volume: [{ + title: { + $$: "Fellowship" + } + author: { + $$: "JRR Tolkien" + } + }, { + title: { + $$: "Two Towers" + } + author: { + $$: "JRR Tolkien" + } + }, { + title: { + $$: "Return of the King" + } + author: { + $$: "JRR Tolkien" + } + }] + }] + } + }`, + }, + { + name: "12. New line restore Test", + wantXML: ` +hello +`, + inputCUE: `{ + node: { + $$: """ + + hello + + """ + } +}`, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + t.Parallel() + + c := cuecontext.New() + val := c.CompileString(test.inputCUE) + if val.Err() != nil { + t.Fatalf("failed to compile CUE: %v", val.Err()) + } + + var buf bytes.Buffer + + dec := NewEncoder(&buf) + err := dec.Encode(val) + qt.Assert(t, qt.IsNil(err)) + + xmlContent := buf.String() + + qt.Assert(t, qt.Equals(xmlContent, test.wantXML)) + + }) + } +} diff --git a/go.mod b/go.mod index e397fdc9675..983055f6c0c 100644 --- a/go.mod +++ b/go.mod @@ -7,10 +7,12 @@ require ( github.com/cockroachdb/apd/v3 v3.2.1 github.com/emicklei/proto v1.14.0 github.com/go-quicktest/qt v1.101.0 + github.com/go-xmlfmt/xmlfmt v1.1.3 github.com/google/go-cmp v0.7.0 github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 github.com/google/uuid v1.6.0 github.com/kr/pretty v0.3.1 + github.com/matthew-sladescu/gosaxml v0.0.2 github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec v1.1.1 github.com/pelletier/go-toml/v2 v2.2.3 diff --git a/go.sum b/go.sum index 
46bf1ce24bb..0e93de4f840 100644 --- a/go.sum +++ b/go.sum @@ -10,6 +10,8 @@ github.com/emicklei/proto v1.14.0 h1:WYxC0OrBuuC+FUCTZvb8+fzEHdZMwLEF+OnVfZA3LXU github.com/emicklei/proto v1.14.0/go.mod h1:rn1FgRS/FANiZdD2djyH7TMA9jdRDcYQ9IEN9yvjX0A= github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI= github.com/go-quicktest/qt v1.101.0/go.mod h1:14Bz/f7NwaXPtdYEgzsx46kqSxVwTbzVZsDC26tQJow= +github.com/go-xmlfmt/xmlfmt v1.1.3 h1:t8Ey3Uy7jDSEisW2K3somuMKIpzktkWptA0iFCnRUWY= +github.com/go-xmlfmt/xmlfmt v1.1.3/go.mod h1:aUCEOzzezBEjDBbFBoSiya/gduyIiWYRP6CnSFIV8AM= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= @@ -26,6 +28,8 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lib/pq v1.10.7 h1:p7ZhMD+KsSRozJr34udlUrhboJwWAgCg34+/ZZNvZZw= github.com/lib/pq v1.10.7/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/matthew-sladescu/gosaxml v0.0.2 h1:687OfQjqh/dz5Mv9HhA/ZkL4BY0GuHrZgTcz0wfrUgk= +github.com/matthew-sladescu/gosaxml v0.0.2/go.mod h1:bSDsqJ0rNs+uTnIglOHF9PSeBcI2HMwclBtMTddx2es= github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=