Skip to content

Commit

Permalink
[Java.Interop.Tools.JavaSource] Improve <a> parsing (#1126)
Browse files Browse the repository at this point in the history
Parsing of `<a/>` elements would occasionally fail when they didn't
match our expectations/requirements:

  * Unquoted URLs, a'la `android/database/sqlite/SQLiteDatabase.java`:

        * <p> See <a href=https://www.sqlite.org/pragma.html#pragma_journal_mode>here</a> for more

    Resulting in:

        System.Xml.XmlException: 'https' is an unexpected token. The expected token is '"' or '''. Line 1, position 11.

    or a'la `java/io/PipedOutputStream.java`

        * @exception IOException if the pipe is <a href=#BROKEN> broken</a>,

    resulting in:

        System.Xml.XmlException: '#' is an unexpected token. The expected token is '"' or '''. Line 1, position 11.


  * Improperly quoted attributes, a'la `android/telephony/PhoneNumberUtils.java`:

        * Matching is based on <a href="https://github.com/google/libphonenumber>libphonenumber</a>.

    Resulting in:

        System.Xml.XmlException: '<', hexadecimal value 0x3C, is an invalid attribute character. Line 1, position 67.

  * Use of "raw" `&`, a'la `android/widget/ProgressBar.java`:

        * <a href="https://material.io/guidelines/components/progress-activity.html#progress-activity-types-of-indicators">
        * Progress & activity</a>.

    Resulting in:

        System.Xml.XmlException: An error occurred while parsing EntityName. Line 2, position 11.


Fix this by updating the `InlineHyperLinkOpenTerm` terminal
to *not* require `href`, and updating the `InlineHyperLinkDeclaration`
rule to better deal with whatever chaos is there.

When we encounter an `<a/>` element that points to code or a local
path we will now only include the element value in the javadoc, and
not the full `href` attribute value.

Replace the `IgnorableDeclaration` rule with an
`IgnorableCharTerminal` terminal.  This better supports `@` in the
content stream when it's not part of a Javadoc inline tag, e.g.
`<a href="mailto:nobody@google.com">nobody</a>`.
  • Loading branch information
pjcollins authored Jul 12, 2023
1 parent d0231c5 commit 6a9f5cb
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar)
AllHtmlTerms.Rule = TopLevelInlineDeclaration
| PBlockDeclaration
| PreBlockDeclaration
| IgnorableElementDeclaration
;

var inlineDeclaration = new NonTerminal ("<html inline decl>", ConcatChildNodes) {
Expand Down Expand Up @@ -102,41 +101,36 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar)

InlineHyperLinkDeclaration.Rule = InlineHyperLinkOpenTerm + InlineDeclarations + CreateEndElement ("a", grammar, optional: true);
InlineHyperLinkDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
var unparsedAElementValue = string.Empty;
foreach (var cn in parseNode.ChildNodes) {
if (cn.ChildNodes?.Count > 1) {
foreach (var gcn in cn.ChildNodes) {
unparsedAElementValue += gcn.AstNode?.ToString ();
}
} else {
unparsedAElementValue += cn.AstNode?.ToString ();
}
}
var nodesAsString = GetChildNodesAsString (parseNode);
var tokenValue = parseNode.ChildNodes [0].Token.Text;
int stopIndex = nodesAsString.IndexOf ('>');
var seeElement = TryParseHRef (unparsedAElementValue);
if (seeElement == null)
seeElement = TryParseHRef (WebUtility.HtmlDecode (unparsedAElementValue), logError: true);
if (stopIndex == -1 || !tokenValue.Contains ("href", StringComparison.OrdinalIgnoreCase)) {
parseNode.AstNode = new XText (nodesAsString);
return;
}
var hrefValue = seeElement?.Attribute ("href")?.Value ?? string.Empty;
if (!string.IsNullOrEmpty (hrefValue) &&
(hrefValue.StartsWith ("http", StringComparison.OrdinalIgnoreCase) || hrefValue.StartsWith ("www", StringComparison.OrdinalIgnoreCase))) {
parseNode.AstNode = seeElement;
var attributeName = parseNode.ChildNodes [0].Term.Name;
var attributeValue = nodesAsString.Substring (0, stopIndex).Trim ().Trim ('\'', '"');
var elementValue = nodesAsString.Substring (stopIndex + 1);
if (!string.IsNullOrEmpty (attributeValue) && attributeValue.StartsWith ("http", StringComparison.OrdinalIgnoreCase)) {
var unparsed = $"<see href=\"{attributeValue}\">{elementValue}</see>";
XNode? seeElement = TryParseElement (unparsed);
if (seeElement == null) {
// Try to parse with HTML entities decoded
seeElement = TryParseElement (WebUtility.HtmlDecode (unparsed));
if (seeElement == null) {
// Finally, try to parse with only the element value encoded
seeElement = TryParseElement ($"<see href=\"{attributeValue}\">{WebUtility.HtmlEncode (elementValue)}</see>", logError: true);
}
}
parseNode.AstNode = seeElement ?? new XText (nodesAsString);
} else {
// TODO: Need to convert relative paths or code references to appropriate CREF value.
parseNode.AstNode = new XText (unparsedAElementValue);
parseNode.AstNode = new XText (elementValue);
}
};

// Start to trim out unusable HTML elements/tags, but not any inner values
IgnorableElementDeclaration.Rule =
CreateStartElementIgnoreAttribute ("a", "name") + InlineDeclarations + CreateEndElement ("a", grammar, optional: true)
| CreateStartElementIgnoreAttribute ("a", "id") + InlineDeclarations + CreateEndElement ("a", grammar, optional: true)
;
IgnorableElementDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
var aElementValue = new XText (parseNode.ChildNodes [1].AstNode.ToString () ?? string.Empty);
parseNode.AstNode = aElementValue;
};

CodeElementDeclaration.Rule = CreateStartElement ("code", grammar) + InlineDeclarations + CreateEndElement ("code", grammar);
CodeElementDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
var target = parseNode.ChildNodes [1].AstNode;
Expand Down Expand Up @@ -184,13 +178,28 @@ static IEnumerable<XElement> GetParagraphs (ParseTreeNodeList children)
}
}

static XElement? TryParseHRef (string unparsedAElementValue, bool logError = false)
/// <summary>
/// Concatenates the string form of the AST nodes found under <paramref name="parseNode"/>.
/// A child with more than one child node of its own contributes each of those
/// grandchildren's AST nodes; any other child contributes its own AST node.
/// Missing (null) AST nodes contribute nothing.
/// </summary>
/// <param name="parseNode">Parse node whose children are flattened into text.</param>
/// <returns>The concatenated text, or an empty string when nothing was appended.</returns>
static string GetChildNodesAsString (ParseTreeNode parseNode)
{
	var text = string.Empty;
	foreach (var child in parseNode.ChildNodes) {
		var grandChildren = child.ChildNodes;
		if (grandChildren == null || grandChildren.Count <= 1) {
			// Zero or one grandchild: use the child's own AST node.
			text += child.AstNode?.ToString ();
			continue;
		}
		foreach (var grandChild in grandChildren)
			text += grandChild.AstNode?.ToString ();
	}
	return text;
}

static XElement? TryParseElement (string unparsed, bool logError = false)
{
try {
return XElement.Parse ($"<see href={unparsedAElementValue}</see>");
return XElement.Parse (unparsed);
} catch (Exception x) {
if (logError)
Console.Error.WriteLine ($"## Unable to parse HTML element: <see href={unparsedAElementValue}</see>\n{x.GetType ()}: {x.Message}");
Console.Error.WriteLine ($"## Unable to parse HTML element: `{unparsed}`\n{x.GetType ()}: {x.Message}");
return null;
}
}
Expand Down Expand Up @@ -221,10 +230,9 @@ static IEnumerable<XElement> GetParagraphs (ParseTreeNodeList children)
public readonly NonTerminal PBlockDeclaration = new NonTerminal (nameof (PBlockDeclaration), ConcatChildNodes);
public readonly NonTerminal PreBlockDeclaration = new NonTerminal (nameof (PreBlockDeclaration), ConcatChildNodes);
public readonly NonTerminal InlineHyperLinkDeclaration = new NonTerminal (nameof (InlineHyperLinkDeclaration), ConcatChildNodes);
public readonly NonTerminal IgnorableElementDeclaration = new NonTerminal (nameof (IgnorableElementDeclaration), ConcatChildNodes);
public readonly NonTerminal CodeElementDeclaration = new NonTerminal (nameof (CodeElementDeclaration), ConcatChildNodes);

public readonly Terminal InlineHyperLinkOpenTerm = new RegexBasedTerminal ("<a href=", @"(?i)<a\s*href\s*=") {
public readonly Terminal InlineHyperLinkOpenTerm = new RegexBasedTerminal ("<a attr=", @"(?i)<a\s*.*=") {
AstConfig = new AstNodeConfig {
NodeCreator = (context, parseNode) => parseNode.AstNode = "",
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,6 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar)
}
};

// Inline content may contain reserved characters with no tags or special parsing rules, do not throw when encountering them
IgnorableDeclaration.Rule = grammar.ToTerm ("@ ")
| grammar.ToTerm ("{")
| grammar.ToTerm ("}")
;
IgnorableDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
parseNode.AstNode = new XText (parseNode.ChildNodes [0].Term.Name.Trim ());
};

InlineParamDeclaration.Rule = grammar.ToTerm ("{@param") + InlineValue + "}";
InlineParamDeclaration.AstConfig.NodeCreator = (context, parseNode) => {
var target = parseNode.ChildNodes [1].AstNode;
Expand Down Expand Up @@ -156,9 +147,38 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar)
// https://docs.oracle.com/javase/7/docs/technotes/tools/windows/javadoc.html#value
public readonly NonTerminal ValueDeclaration = new NonTerminal (nameof (ValueDeclaration));

public readonly NonTerminal IgnorableDeclaration = new NonTerminal (nameof (IgnorableDeclaration));

public readonly NonTerminal InlineParamDeclaration = new NonTerminal (nameof (InlineParamDeclaration));

// Terminal for a single '@', '{', or '}' character (see IgnorableCharTerminal.TryMatch).
// The resulting AST node is the string form of the matched token's value.
public readonly Terminal IgnorableDeclaration = new IgnorableCharTerminal (nameof (IgnorableDeclaration)) {
AstConfig = new AstNodeConfig {
NodeCreator = (context, parseNode) => parseNode.AstNode = parseNode.Token.Value.ToString (),
},
};

}
}

/// <summary>
/// Terminal that matches exactly one <c>@</c>, <c>{</c>, or <c>}</c> character at the
/// current source position and emits it as a token whose value is that character.
/// </summary>
class IgnorableCharTerminal : Terminal
{
	public IgnorableCharTerminal (string name)
		: base (name)
	{
		// One step below TerminalPriority.Low; presumably so that other terminals
		// starting with the same characters are preferred -- TODO confirm against the scanner.
		Priority = TerminalPriority.Low - 1;
	}

	/// <summary>
	/// Attempts to match a single ignorable character at the current location.
	/// </summary>
	/// <returns>
	/// A token for the matched character, or <c>null</c> when the character at the
	/// current position is not one of <c>@</c>, <c>{</c>, <c>}</c>.
	/// </returns>
	public override Token? TryMatch (ParsingContext context, ISourceStream source)
	{
		var current = source.Text [source.Location.Position];
		switch (current) {
		case '@':
		case '{':
		case '}':
			// Consume the single character before building the token.
			source.PreviewPosition += 1;
			return source.CreateToken (OutputTerminal, current);
		default:
			return null;
		}
	}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,27 @@ public void HyperLinkDeclaration ()

r = p.Parse ("<a href=\"AutofillService.html#FieldClassification\">field classification</a>");
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
Assert.AreEqual ("\"AutofillService.html#FieldClassification\"&gt;field classification",
Assert.AreEqual ("field classification", r.Root.AstNode.ToString ());

r = p.Parse ("<a href=https://www.sqlite.org/pragma.html#pragma_journal_mode>here</a>");
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
Assert.AreEqual ("<see href=\"https://www.sqlite.org/pragma.html#pragma_journal_mode\">here</see>", r.Root.AstNode.ToString ());

r = p.Parse ("<a href=\"https://github.com/google/libphonenumber>libphonenumber</a>");
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
Assert.AreEqual ("<see href=\"https://github.com/google/libphonenumber\">libphonenumber</see>", r.Root.AstNode.ToString ());

r = p.Parse ("<a href=#BROKEN> broken</a>");
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
Assert.AreEqual (" broken", r.Root.AstNode.ToString ());

r = p.Parse ("<a href=\"mailto:nobody@google.com\">nobody</a>");
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
Assert.AreEqual ("nobody", r.Root.AstNode.ToString ());

r = p.Parse ("<a href='https://material.io/guidelines/components/progress-activity.html#progress-activity-types-of-indicators'>\nProgress & activity</a>");
Assert.IsFalse (r.HasErrors (), DumpMessages (r, p));
Assert.AreEqual ($"<see href=\"https://material.io/guidelines/components/progress-activity.html#progress-activity-types-of-indicators\">{Environment.NewLine}Progress &amp; activity</see>",
r.Root.AstNode.ToString ());
}

Expand Down

0 comments on commit 6a9f5cb

Please sign in to comment.