Skip to content

Commit

Permalink
Add ReadSettings option to PreserveCData
Browse files Browse the repository at this point in the history
This allows XML CDATA blocks to be preserved when reading. By default
this does not happen, because the underlying encoding/xml package does
not support it.
  • Loading branch information
beevik committed May 8, 2023
1 parent 2a0c46b commit 672dfa1
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 18 deletions.
55 changes: 45 additions & 10 deletions etree.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ const (
// ErrXML is returned when XML parsing fails due to incorrect formatting.
var ErrXML = errors.New("etree: invalid XML format")

// cdataPrefix is used to detect CDATA text when ReadSettings.PreserveCData is
// true.
var cdataPrefix = []byte("<![CDATA[")

// ReadSettings determine the default behavior of the Document's ReadFrom*
// methods.
type ReadSettings struct {
Expand All @@ -36,6 +40,12 @@ type ReadSettings struct {
// or attribute values. Default: false.
Permissive bool

// Preserve CDATA character data blocks when decoding XML (instead of
// converting them to normal character text). This entails additional
// processing and memory usage during ReadFrom* operations. Default:
// false.
PreserveCData bool

// Entity to be passed to standard xml.Decoder. Default: nil.
Entity map[string]string
}
Expand All @@ -46,7 +56,9 @@ func newReadSettings() ReadSettings {
CharsetReader: func(label string, input io.Reader) (io.Reader, error) {
return input, nil
},
Permissive: false,
Permissive: false,
PreserveCData: false,
Entity: nil,
}
}

Expand Down Expand Up @@ -767,25 +779,39 @@ func (e *Element) RemoveChildAt(index int) Token {
// readFrom reads XML from the reader 'ri' and stores the result as a new
// child of this element.
func (e *Element) readFrom(ri io.Reader, settings ReadSettings) (n int64, err error) {
xr := newXmlReader(ri)
dec := xml.NewDecoder(xr)
var r xmlReader
var pr *xmlPeekReader
if settings.PreserveCData {
pr = newXmlPeekReader(ri)
r = pr
} else {
r = newXmlSimpleReader(ri)
}

dec := xml.NewDecoder(r)
dec.CharsetReader = settings.CharsetReader
dec.Strict = !settings.Permissive
dec.Entity = settings.Entity

var stack stack
stack.push(e)
for {
if pr != nil {
pr.PeekPrepare(dec.InputOffset(), len(cdataPrefix))
}

t, err := dec.RawToken()

switch {
case err == io.EOF:
if len(stack.data) != 1 {
return xr.bytes, ErrXML
return r.Bytes(), ErrXML
}
return xr.bytes, nil
return r.Bytes(), nil
case err != nil:
return xr.bytes, err
return r.Bytes(), err
case stack.empty():
return xr.bytes, ErrXML
return r.Bytes(), ErrXML
}

top := stack.peek().(*Element)
Expand All @@ -799,14 +825,23 @@ func (e *Element) readFrom(ri io.Reader, settings ReadSettings) (n int64, err er
stack.push(e)
case xml.EndElement:
if top.Tag != t.Name.Local || top.Space != t.Name.Space {
return xr.bytes, ErrXML
return r.Bytes(), ErrXML
}
stack.pop()
case xml.CharData:
data := string(t)
var flags charDataFlags
if isWhitespace(data) {
flags = whitespaceFlag
if pr != nil {
peekBuf := pr.PeekFinalize()
if bytes.Equal(peekBuf, cdataPrefix) {
flags = cdataFlag
} else if isWhitespace(data) {
flags = whitespaceFlag
}
} else {
if isWhitespace(data) {
flags = whitespaceFlag
}
}
newCharData(data, flags, top)
case xml.Comment:
Expand Down
50 changes: 49 additions & 1 deletion etree_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ func TestImbalancedXML(t *testing.T) {

func TestDocumentReadNonUTF8Encodings(t *testing.T) {
s := `<?xml version="1.0" encoding="ISO-8859-1"?>
store>
<store>
<book lang="en">
<title>Great Expectations</title>
<author>Charles Dickens</author>
Expand Down Expand Up @@ -918,6 +918,54 @@ func TestIndentPreserveWhitespace(t *testing.T) {
}
}

// TestPreserveCData reads each input twice, once with
// ReadSettings.PreserveCData enabled and once with it disabled, writes the
// document back to a string, and compares the result with the expected
// output for that setting.
func TestPreserveCData(t *testing.T) {
	tests := []struct {
		input                   string
		expectedWithPreserve    string
		expectedWithoutPreserve string
	}{
		{
			"<test><![CDATA[x]]></test>",
			"<test><![CDATA[x]]></test>",
			"<test>x</test>",
		},
		{
			"<tag><![CDATA[x <b>foo</b>]]></tag>",
			"<tag><![CDATA[x <b>foo</b>]]></tag>",
			"<tag>x &lt;b&gt;foo&lt;/b&gt;</tag>",
		},
		{
			"<name><![CDATA[My]]> <b>name</b> <![CDATA[is]]></name>",
			"<name><![CDATA[My]]> <b>name</b> <![CDATA[is]]></name>",
			"<name>My <b>name</b> is</name>",
		},
	}

	// roundTrip reads input with the given PreserveCData setting, writes the
	// document back out, and compares the output with expected. Both the read
	// and the write error are reported so that a failure in either step is
	// not mistaken for a plain output mismatch.
	roundTrip := func(preserve bool, input, expected string) {
		doc := NewDocument()
		doc.ReadSettings.PreserveCData = preserve
		if err := doc.ReadFromString(input); err != nil {
			t.Error("etree: failed to read string")
		}

		output, err := doc.WriteToString()
		if err != nil {
			t.Error("etree: failed to write string")
		}
		checkStrEq(t, output, expected)
	}

	for _, test := range tests {
		roundTrip(true, test.input, test.expectedWithPreserve)
	}

	for _, test := range tests {
		roundTrip(false, test.input, test.expectedWithoutPreserve)
	}
}

func TestTokenIndexing(t *testing.T) {
s := `<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="style.xsl"?>
Expand Down
132 changes: 125 additions & 7 deletions helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,24 +83,142 @@ func (f *fifo) grow() {
f.data, f.head, f.tail = buf, 0, count
}

// xmlReader implements a proxy reader that counts the number of
// bytes read from its encapsulated reader and detects when a CDATA
// prefix has been parsed.
type xmlReader struct {
// xmlReader provides the interface by which an XML byte stream is
// processed and decoded.
type xmlReader interface {
Bytes() int64
Read(p []byte) (n int, err error)
}

// xmlSimpleReader implements a proxy reader that counts the number of
// bytes read from its encapsulated reader.
type xmlSimpleReader struct {
r io.Reader
bytes int64
}

func newXmlReader(r io.Reader) *xmlReader {
return &xmlReader{r, 0}
func newXmlSimpleReader(r io.Reader) xmlReader {
return &xmlSimpleReader{r, 0}
}

func (xr *xmlReader) Read(p []byte) (n int, err error) {
func (xr *xmlSimpleReader) Bytes() int64 {
return xr.bytes
}

func (xr *xmlSimpleReader) Read(p []byte) (n int, err error) {
n, err = xr.r.Read(p)
xr.bytes += int64(n)
return n, err
}

// xmlPeekReader implements a proxy reader that counts the number of
// bytes read from its encapsulated reader. It also allows the caller to
// "peek" at the previous portions of the buffer after they have been
// parsed.
type xmlPeekReader struct {
	r          io.Reader
	bytes      int64  // total bytes read by the Read function
	buf        []byte // internal read buffer
	bufSize    int    // total bytes used in the read buffer
	bufOffset  int64  // total bytes read when buf was last filled
	window     []byte // current read buffer window
	peekBuf    []byte // buffer used to store data to be peeked at later
	peekOffset int64  // total read offset of the start of the peek buffer
	err        error  // error returned together with data by the last fill; reported once the window is drained
}

// newXmlPeekReader returns an xmlPeekReader that reads from r through a
// 4096-byte internal buffer. Peeking is disabled (peekOffset is -1) until
// PeekPrepare is called.
func newXmlPeekReader(r io.Reader) *xmlPeekReader {
	buf := make([]byte, 4096)
	return &xmlPeekReader{
		r:          r,
		bytes:      0,
		buf:        buf,
		bufSize:    0,
		bufOffset:  0,
		window:     buf[0:0],
		peekBuf:    make([]byte, 0),
		peekOffset: -1,
	}
}

// Bytes returns the total number of bytes handed out by Read so far.
func (xr *xmlPeekReader) Bytes() int64 {
	return xr.bytes
}

// Read copies buffered data into p, refilling the internal buffer from the
// encapsulated reader when it is empty. It returns the number of bytes
// copied. An error from the encapsulated reader is returned only after all
// data that accompanied it has been delivered.
func (xr *xmlPeekReader) Read(p []byte) (n int, err error) {
	if len(xr.window) == 0 {
		// Report an error that arrived together with the data that has
		// just been drained. It is cleared so that the next call consults
		// the encapsulated reader again.
		if xr.err != nil {
			err, xr.err = xr.err, nil
			return 0, err
		}
		err = xr.fill()
		if err != nil {
			return 0, err
		}
		if len(xr.window) == 0 {
			return 0, nil
		}
	}

	if len(xr.window) < len(p) {
		n = len(xr.window)
	} else {
		n = len(p)
	}

	copy(p, xr.window)
	xr.window = xr.window[n:]
	xr.bytes += int64(n)

	return n, err
}

// PeekPrepare starts recording the bytes of the stream that begin at the
// absolute read offset 'offset'. The peek buffer is emptied and reallocated
// if its capacity is smaller than maxLen; recording stops once the buffer's
// capacity is reached. Any matching bytes already in the internal read
// buffer are captured immediately.
func (xr *xmlPeekReader) PeekPrepare(offset int64, maxLen int) {
	if maxLen > cap(xr.peekBuf) {
		xr.peekBuf = make([]byte, 0, maxLen)
	}
	xr.peekBuf = xr.peekBuf[0:0]
	xr.peekOffset = offset
	xr.updatePeekBuf()
}

// PeekFinalize returns the bytes recorded since the last PeekPrepare call.
// The returned slice is reused by the next PeekPrepare call.
func (xr *xmlPeekReader) PeekFinalize() []byte {
	xr.updatePeekBuf()
	return xr.peekBuf
}

// fill replaces the contents of the internal buffer with the next chunk from
// the encapsulated reader and records any bytes that belong to the peek
// buffer. If the reader returns data together with an error, the data is kept
// and the error is stored for Read to report after the data is consumed.
// The error is returned directly only when no data was read.
func (xr *xmlPeekReader) fill() error {
	xr.bufOffset = xr.bytes
	xr.bufSize = 0
	n, err := xr.r.Read(xr.buf)
	if n <= 0 {
		xr.window, xr.bufSize = xr.buf[0:0], 0
		return err
	}
	xr.window, xr.bufSize = xr.buf[:n], n
	xr.err = err
	xr.updatePeekBuf()
	return nil
}

// updatePeekBuf appends to the peek buffer whatever part of the peek range
// is present in the internal read buffer and has not been recorded yet.
func (xr *xmlPeekReader) updatePeekBuf() {
	peekRemain := cap(xr.peekBuf) - len(xr.peekBuf)
	if xr.peekOffset < 0 || peekRemain <= 0 {
		return
	}

	// Resume after the bytes that were already recorded, so that calling
	// this function twice on the same buffer never records a byte twice.
	rangeMin := xr.peekOffset + int64(len(xr.peekBuf))
	rangeMax := xr.peekOffset + int64(cap(xr.peekBuf))
	bufMin := xr.bufOffset
	bufMax := xr.bufOffset + int64(xr.bufSize)
	if rangeMin < bufMin {
		rangeMin = bufMin
	}
	if rangeMax > bufMax {
		rangeMax = bufMax
	}
	if rangeMax <= rangeMin {
		return
	}

	// Translate absolute stream offsets into indexes of the read buffer.
	xr.peekBuf = append(xr.peekBuf, xr.buf[rangeMin-bufMin:rangeMax-bufMin]...)
}

// xmlWriter implements a proxy writer that counts the number of
// bytes written by its encapsulated writer.
type xmlWriter struct {
Expand Down

0 comments on commit 672dfa1

Please sign in to comment.