Fix a bug in xml stream parsing where a previously unmatched node cau…

…sing all subsequent valid matches to fail. Recall that in streaming mode we have two XPaths: one for matching the element, the other (optional) for additional filtering. Imagine the following example, where the XML doc is: ``` <ROOT> <AAA> <CCC>c1</CCC> <BBB>b1</BBB> <DDD>d1</DDD> <BBB>b2<ZZZ z="1">z1</ZZZ></BBB> <BBB>b3</BBB> </AAA> <ZZZ> <BBB>b4</BBB> <BBB>b5</BBB> <CCC>c3</CCC> </ZZZ> </ROOT> ``` The stream parser is created as: ``` CreateStreamParser(strings.NewReader(s), "/ROOT/*/BBB", "/ROOT/*/BBB[. != 'b3']") ``` Basically, we want the stream parser to return all the `BBB` nodes whose text isn't `b3`. Looking at the sample XML, we know it should return the `<BBB>` nodes whose texts are `b1`, `b2`, `b4`, and `b5`. However, the current code only returns `b1` and `b2`. The problem lies in the stream element matching inside `case xml.StartElement`. Currently the code does this: ``` case xml.StartElement: ... ... if p.streamElementXPath != nil { if p.streamNode == nil { if QuerySelector(p.doc, p.streamElementXPath) != nil { // Now we assume this node is the stream node candidate. } ``` We originally operated under the assumption that if the `streamElementXPath` query returns anything, it must be this node itself; thus, if it returns a node, this node is the stream node candidate. But that is clearly wrong in the `b3` example above. The node `<BBB>b3</BBB>` is first considered the stream candidate, but the later filtering (`[. != 'b3']`) removes its stream node status, treats it just like any other non-stream node, and keeps it in the node tree. The problem is that, by keeping it in the tree, any future XML element start will **always** "match" `streamElementXPath`. So in the example above, the node `<ZZZ>` is now erroneously considered a stream node, and its child nodes are not even tested for streaming anymore. There are two possible fixes: 1) In the `xml.StartElement` stream match, instead of just doing a `QuerySelector(...) 
!= nil` check, we need to issue a `QuerySelectorAll(...)` call and examine all the returned nodes; if the current node is one of them, then the current node is considered the stream candidate. 2) Simpler: if a stream candidate is later filtered out inside the `case xml.EndElement` handling, then simply remove it from the node tree, thus preventing future erroneous matches. Fix 1) seems like overkill: if a stream candidate gets filtered out, it's hard to imagine a caller would want to interact with it in any capacity. Thus we chose fix 2).
antchfx · Sep 13, 2020 · e26cec5 · e26cec5
1 parent 5648b2f
commit e26cec5
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 18 deletions.
diff --git a/parse.go b/parse.go
@@ -169,7 +169,11 @@ func (p *parser) parse() (*Node, error) {
 					if p.streamElementFilter == nil || QuerySelector(p.doc, p.streamElementFilter) != nil {
 						return p.streamNode, nil
 					}
-					// otherwise, this isn't our target node. clean things up.
+					// otherwise, this isn't our target node, clean things up.
+					// note we also remove the underlying *Node from the node tree, to prevent
+					// future stream node candidate selection error.
+					RemoveFromTree(p.streamNode)
+					p.prev = p.streamNodePrev
 					p.streamNode = nil
 					p.streamNodePrev = nil
 				}

diff --git a/parse_test.go b/parse_test.go
@@ -278,18 +278,22 @@ func testOutputXML(t *testing.T, msg string, expectedXML string, n *Node) {
 
 func TestStreamParser_Success1(t *testing.T) {
 	s := `
-	<AAA>
-		<CCC>c1</CCC>
-		<BBB>b1</BBB>
-		<DDD>d1</DDD>
-		<BBB>b2<ZZZ z="1">z1</ZZZ></BBB>
-		<BBB>b3</BBB>
-		<BBB>b4</BBB>
-		<BBB>b5</BBB>
-		<CCC>c3</CCC>
-	</AAA>`
-
-	sp, err := CreateStreamParser(strings.NewReader(s), "/AAA/BBB", "/AAA/BBB[. != 'b3']")
+	<ROOT>
+		<AAA>
+			<CCC>c1</CCC>
+			<BBB>b1</BBB>
+			<DDD>d1</DDD>
+			<BBB>b2<ZZZ z="1">z1</ZZZ></BBB>
+			<BBB>b3</BBB>
+		</AAA>
+		<ZZZ>
+			<BBB>b4</BBB>
+			<BBB>b5</BBB>
+			<CCC>c3</CCC>
+		</ZZZ>
+	</ROOT>`
+
+	sp, err := CreateStreamParser(strings.NewReader(s), "/ROOT/*/BBB", "/ROOT/*/BBB[. != 'b3']")
 	if err != nil {
 		t.Fatal(err.Error())
 	}
@@ -300,7 +304,8 @@ func TestStreamParser_Success1(t *testing.T) {
 		t.Fatal(err.Error())
 	}
 	testOutputXML(t, "first call result", `<BBB>b1</BBB>`, n)
-	testOutputXML(t, "doc after first call", `<><?xml?><AAA><CCC>c1</CCC><BBB>b1</BBB></AAA></>`, findRoot(n))
+	testOutputXML(t, "doc after first call",
+		`<><?xml?><ROOT><AAA><CCC>c1</CCC><BBB>b1</BBB></AAA></ROOT></>`, findRoot(n))
 
 	// Second `<BBB>` read
 	n, err = sp.Read()
@@ -309,7 +314,7 @@ func TestStreamParser_Success1(t *testing.T) {
 	}
 	testOutputXML(t, "second call result", `<BBB>b2<ZZZ z="1">z1</ZZZ></BBB>`, n)
 	testOutputXML(t, "doc after second call",
-		`<><?xml?><AAA><CCC>c1</CCC><DDD>d1</DDD><BBB>b2<ZZZ z="1">z1</ZZZ></BBB></AAA></>`, findRoot(n))
+		`<><?xml?><ROOT><AAA><CCC>c1</CCC><DDD>d1</DDD><BBB>b2<ZZZ z="1">z1</ZZZ></BBB></AAA></ROOT></>`, findRoot(n))
 
 	// Third `<BBB>` read (Note we will skip 'b3' since the streamElementFilter excludes it)
 	n, err = sp.Read()
@@ -321,17 +326,18 @@ func TestStreamParser_Success1(t *testing.T) {
 	// been filtered out and is not our target node, thus it is considered just like any other
 	// non target nodes such as `<CCC>`` or `<DDD>`
 	testOutputXML(t, "doc after third call",
-		`<><?xml?><AAA><CCC>c1</CCC><DDD>d1</DDD><BBB>b3</BBB><BBB>b4</BBB></AAA></>`, findRoot(n))
+		`<><?xml?><ROOT><AAA><CCC>c1</CCC><DDD>d1</DDD></AAA><ZZZ><BBB>b4</BBB></ZZZ></ROOT></>`,
+		findRoot(n))
 
 	// Fourth `<BBB>` read
 	n, err = sp.Read()
 	if err != nil {
 		t.Fatal(err.Error())
 	}
 	testOutputXML(t, "fourth call result", `<BBB>b5</BBB>`, n)
-	// Note the inclusion of `<BBB>b3</BBB>` in the document.
 	testOutputXML(t, "doc after fourth call",
-		`<><?xml?><AAA><CCC>c1</CCC><DDD>d1</DDD><BBB>b3</BBB><BBB>b5</BBB></AAA></>`, findRoot(n))
+		`<><?xml?><ROOT><AAA><CCC>c1</CCC><DDD>d1</DDD></AAA><ZZZ><BBB>b5</BBB></ZZZ></ROOT></>`,
+		findRoot(n))
 
 	_, err = sp.Read()
 	if err != io.EOF {