Skip to content

Commit 6a9f5cb

Browse files
authored
[Java.Interop.Tools.JavaSource] Improve <a> parsing (#1126)
Parsing of `<a/>` elements would occasionally fail when they didn't match our expectations/requirements: * Unquoted URLs, a'la `android/database/sqlite/SQLiteDatabase.java`: * <p> See <a href=https://www.sqlite.org/pragma.html#pragma_journal_mode>here</a> for more Resulting in: System.Xml.XmlException: 'https' is an unexpected token. The expected token is '"' or '''. Line 1, position 11. or a'la `java/io/PipedOutputStream.java` * @exception IOException if the pipe is <a href=#BROKEN> broken</a>, resulting in: System.Xml.XmlException: '#' is an unexpected token. The expected token is '"' or '''. Line 1, position 11. * Improperly quoted attributes, a'la `android/telephony/PhoneNumberUtils.java`: * Matching is based on <a href="https://github.com/google/libphonenumber>libphonenumber</a>. Resulting in: System.Xml.XmlException: '<', hexadecimal value 0x3C, is an invalid attribute character. Line 1, position 67. * Use of "raw" `&`, a'la `android/widget/ProgressBar.java`: * <a href="https://material.io/guidelines/components/progress-activity.html#progress-activity-types-of-indicators"> * Progress & activity</a>. Resulting in: System.Xml.XmlException: An error occurred while parsing EntityName. Line 2, position 11. Fix this by updating updating the `InlineHyperLinkOpenTerm` terminal to *not* require `href`, and updating the `InlineHyperLinkDeclaration` rule to better deal with whatever chaos is there. When we encounter an `<a/>` element that points to code or a local path we will now only include the element value in the javadoc, and not the full `href` attribute value. Replace the `IgnorableDeclaration` rule with an `IgnorableCharTerminal` terminal. This better supports `@` in the content stream when it's not part of a Javadoc inline tag, e.g. `<a href="mailto:nobody@google.com">nobody</a>`.
1 parent d0231c5 commit 6a9f5cb

File tree

3 files changed

+94
-46
lines changed

3 files changed

+94
-46
lines changed

src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs

Lines changed: 42 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar)
2525
AllHtmlTerms.Rule = TopLevelInlineDeclaration
2626
| PBlockDeclaration
2727
| PreBlockDeclaration
28-
| IgnorableElementDeclaration
2928
;
3029

3130
var inlineDeclaration = new NonTerminal ("<html inline decl>", ConcatChildNodes) {
@@ -102,41 +101,36 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar)
102101

103102
InlineHyperLinkDeclaration.Rule = InlineHyperLinkOpenTerm + InlineDeclarations + CreateEndElement ("a", grammar, optional: true);
104103
InlineHyperLinkDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
105-
var unparsedAElementValue = string.Empty;
106-
foreach (var cn in parseNode.ChildNodes) {
107-
if (cn.ChildNodes?.Count > 1) {
108-
foreach (var gcn in cn.ChildNodes) {
109-
unparsedAElementValue += gcn.AstNode?.ToString ();
110-
}
111-
} else {
112-
unparsedAElementValue += cn.AstNode?.ToString ();
113-
}
114-
}
104+
var nodesAsString = GetChildNodesAsString (parseNode);
105+
var tokenValue = parseNode.ChildNodes [0].Token.Text;
106+
int stopIndex = nodesAsString.IndexOf ('>');
115107

116-
var seeElement = TryParseHRef (unparsedAElementValue);
117-
if (seeElement == null)
118-
seeElement = TryParseHRef (WebUtility.HtmlDecode (unparsedAElementValue), logError: true);
108+
if (stopIndex == -1 || !tokenValue.Contains ("href", StringComparison.OrdinalIgnoreCase)) {
109+
parseNode.AstNode = new XText (nodesAsString);
110+
return;
111+
}
119112

120-
var hrefValue = seeElement?.Attribute ("href")?.Value ?? string.Empty;
121-
if (!string.IsNullOrEmpty (hrefValue) &&
122-
(hrefValue.StartsWith ("http", StringComparison.OrdinalIgnoreCase) || hrefValue.StartsWith ("www", StringComparison.OrdinalIgnoreCase))) {
123-
parseNode.AstNode = seeElement;
113+
var attributeName = parseNode.ChildNodes [0].Term.Name;
114+
var attributeValue = nodesAsString.Substring (0, stopIndex).Trim ().Trim ('\'', '"');
115+
var elementValue = nodesAsString.Substring (stopIndex + 1);
116+
if (!string.IsNullOrEmpty (attributeValue) && attributeValue.StartsWith ("http", StringComparison.OrdinalIgnoreCase)) {
117+
var unparsed = $"<see href=\"{attributeValue}\">{elementValue}</see>";
118+
XNode? seeElement = TryParseElement (unparsed);
119+
if (seeElement == null) {
120+
// Try to parse with HTML entities decoded
121+
seeElement = TryParseElement (WebUtility.HtmlDecode (unparsed));
122+
if (seeElement == null) {
123+
// Finally, try to parse with only the element value encoded
124+
seeElement = TryParseElement ($"<see href=\"{attributeValue}\">{WebUtility.HtmlEncode (elementValue)}</see>", logError: true);
125+
}
126+
}
127+
parseNode.AstNode = seeElement ?? new XText (nodesAsString);
124128
} else {
125129
// TODO: Need to convert relative paths or code references to appropriate CREF value.
126-
parseNode.AstNode = new XText (unparsedAElementValue);
130+
parseNode.AstNode = new XText (elementValue);
127131
}
128132
};
129133

130-
// Start to trim out unusable HTML elements/tags, but not any inner values
131-
IgnorableElementDeclaration.Rule =
132-
CreateStartElementIgnoreAttribute ("a", "name") + InlineDeclarations + CreateEndElement ("a", grammar, optional: true)
133-
| CreateStartElementIgnoreAttribute ("a", "id") + InlineDeclarations + CreateEndElement ("a", grammar, optional: true)
134-
;
135-
IgnorableElementDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
136-
var aElementValue = new XText (parseNode.ChildNodes [1].AstNode.ToString () ?? string.Empty);
137-
parseNode.AstNode = aElementValue;
138-
};
139-
140134
CodeElementDeclaration.Rule = CreateStartElement ("code", grammar) + InlineDeclarations + CreateEndElement ("code", grammar);
141135
CodeElementDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
142136
var target = parseNode.ChildNodes [1].AstNode;
@@ -184,13 +178,28 @@ static IEnumerable<XElement> GetParagraphs (ParseTreeNodeList children)
184178
}
185179
}
186180

187-
static XElement? TryParseHRef (string unparsedAElementValue, bool logError = false)
181+
static string GetChildNodesAsString (ParseTreeNode parseNode)
182+
{
183+
var unparsed = string.Empty;
184+
foreach (var cn in parseNode.ChildNodes) {
185+
if (cn.ChildNodes?.Count > 1) {
186+
foreach (var gcn in cn.ChildNodes) {
187+
unparsed += gcn.AstNode?.ToString ();
188+
}
189+
} else {
190+
unparsed += cn.AstNode?.ToString ();
191+
}
192+
}
193+
return unparsed;
194+
}
195+
196+
static XElement? TryParseElement (string unparsed, bool logError = false)
188197
{
189198
try {
190-
return XElement.Parse ($"<see href={unparsedAElementValue}</see>");
199+
return XElement.Parse (unparsed);
191200
} catch (Exception x) {
192201
if (logError)
193-
Console.Error.WriteLine ($"## Unable to parse HTML element: <see href={unparsedAElementValue}</see>\n{x.GetType ()}: {x.Message}");
202+
Console.Error.WriteLine ($"## Unable to parse HTML element: `{unparsed}`\n{x.GetType ()}: {x.Message}");
194203
return null;
195204
}
196205
}
@@ -221,10 +230,9 @@ static IEnumerable<XElement> GetParagraphs (ParseTreeNodeList children)
221230
public readonly NonTerminal PBlockDeclaration = new NonTerminal (nameof (PBlockDeclaration), ConcatChildNodes);
222231
public readonly NonTerminal PreBlockDeclaration = new NonTerminal (nameof (PreBlockDeclaration), ConcatChildNodes);
223232
public readonly NonTerminal InlineHyperLinkDeclaration = new NonTerminal (nameof (InlineHyperLinkDeclaration), ConcatChildNodes);
224-
public readonly NonTerminal IgnorableElementDeclaration = new NonTerminal (nameof (IgnorableElementDeclaration), ConcatChildNodes);
225233
public readonly NonTerminal CodeElementDeclaration = new NonTerminal (nameof (CodeElementDeclaration), ConcatChildNodes);
226234

227-
public readonly Terminal InlineHyperLinkOpenTerm = new RegexBasedTerminal ("<a href=", @"(?i)<a\s*href\s*=") {
235+
public readonly Terminal InlineHyperLinkOpenTerm = new RegexBasedTerminal ("<a attr=", @"(?i)<a\s*.*=") {
228236
AstConfig = new AstNodeConfig {
229237
NodeCreator = (context, parseNode) => parseNode.AstNode = "",
230238
},

src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.InlineTagsBnfTerms.cs

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -109,15 +109,6 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar)
109109
}
110110
};
111111

112-
// Inline content may contain reserved characters with no tags or special parsing rules, do not throw when encountering them
113-
IgnorableDeclaration.Rule = grammar.ToTerm ("@ ")
114-
| grammar.ToTerm ("{")
115-
| grammar.ToTerm ("}")
116-
;
117-
IgnorableDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
118-
parseNode.AstNode = new XText (parseNode.ChildNodes [0].Term.Name.Trim ());
119-
};
120-
121112
InlineParamDeclaration.Rule = grammar.ToTerm ("{@param") + InlineValue + "}";
122113
InlineParamDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
123114
var target = parseNode.ChildNodes [1].AstNode;
@@ -156,9 +147,38 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar)
156147
// https://docs.oracle.com/javase/7/docs/technotes/tools/windows/javadoc.html#value
157148
public readonly NonTerminal ValueDeclaration = new NonTerminal (nameof (ValueDeclaration));
158149

159-
public readonly NonTerminal IgnorableDeclaration = new NonTerminal (nameof (IgnorableDeclaration));
160-
161150
public readonly NonTerminal InlineParamDeclaration = new NonTerminal (nameof (InlineParamDeclaration));
151+
152+
public readonly Terminal IgnorableDeclaration = new IgnorableCharTerminal (nameof (IgnorableDeclaration)) {
153+
AstConfig = new AstNodeConfig {
154+
NodeCreator = (context, parseNode) => parseNode.AstNode = parseNode.Token.Value.ToString (),
155+
},
156+
};
157+
162158
}
163159
}
160+
161+
class IgnorableCharTerminal : Terminal
162+
{
163+
public IgnorableCharTerminal (string name)
164+
: base (name)
165+
{
166+
Priority = TerminalPriority.Low - 1;
167+
}
168+
169+
public override Token? TryMatch (ParsingContext context, ISourceStream source)
170+
{
171+
var startChar = source.Text [source.Location.Position];
172+
if (startChar != '@'
173+
&& startChar != '{'
174+
&& startChar != '}'
175+
) {
176+
return null;
177+
}
178+
source.PreviewPosition += 1;
179+
return source.CreateToken (OutputTerminal, startChar);
180+
}
181+
182+
}
183+
164184
}

tests/Java.Interop.Tools.JavaSource-Tests/SourceJavadocToXmldocGrammar.HtmlBnfTermsTests.cs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,27 @@ public void HyperLinkDeclaration ()
6969

7070
r = p.Parse ("<a href=\"AutofillService.html#FieldClassification\">field classification</a>");
7171
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
72-
Assert.AreEqual ("\"AutofillService.html#FieldClassification\"&gt;field classification",
72+
Assert.AreEqual ("field classification", r.Root.AstNode.ToString ());
73+
74+
r = p.Parse ("<a href=https://www.sqlite.org/pragma.html#pragma_journal_mode>here</a>");
75+
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
76+
Assert.AreEqual ("<see href=\"https://www.sqlite.org/pragma.html#pragma_journal_mode\">here</see>", r.Root.AstNode.ToString ());
77+
78+
r = p.Parse ("<a href=\"https://github.com/google/libphonenumber>libphonenumber</a>");
79+
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
80+
Assert.AreEqual ("<see href=\"https://github.com/google/libphonenumber\">libphonenumber</see>", r.Root.AstNode.ToString ());
81+
82+
r = p.Parse ("<a href=#BROKEN> broken</a>");
83+
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
84+
Assert.AreEqual (" broken", r.Root.AstNode.ToString ());
85+
86+
r = p.Parse ("<a href=\"mailto:nobody@google.com\">nobody</a>");
87+
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
88+
Assert.AreEqual ("nobody", r.Root.AstNode.ToString ());
89+
90+
r = p.Parse ("<a href='https://material.io/guidelines/components/progress-activity.html#progress-activity-types-of-indicators'>\nProgress & activity</a>");
91+
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
92+
Assert.AreEqual ($"<see href=\"https://material.io/guidelines/components/progress-activity.html#progress-activity-types-of-indicators\">{Environment.NewLine}Progress &amp; activity</see>",
7393
r.Root.AstNode.ToString ());
7494
}
7595

0 commit comments

Comments
 (0)