Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Emoji and abbreviations parser #305

Merged
merged 8 commits into from
Feb 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/Markdig.Benchmarks/TestMatchPerf.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) Alexandre Mutel. All rights reserved.
// Copyright (c) Alexandre Mutel. All rights reserved.
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.

Expand Down Expand Up @@ -52,7 +52,7 @@ public bool TryMatch(string text, int offset, out string matchText, out string r
}
}


/*
public class TestMatchPerf
{
private readonly TextMatchHelper matcher;
Expand Down Expand Up @@ -82,4 +82,5 @@ public void TestMatch()
}
}
}
*/
}
41 changes: 40 additions & 1 deletion src/Markdig.Tests/Specs/AbbreviationSpecs.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Generated: 21. 01. 2019 14:26:34
// Generated: 6. 02. 2019 16:15:54

// --------------------------------
// Abbreviations
Expand Down Expand Up @@ -193,5 +193,44 @@ public void ExtensionsAbbreviation_Example009()
Console.WriteLine("Example 9\nSection Extensions / Abbreviation\n");
TestParser.TestSpec("*[PR]: Pull Request\n\nPRAA", "<p>PRAA</p>", "abbreviations|advanced");
}

// Single character abbreviations should be matched
[Test]
public void ExtensionsAbbreviation_Example010()
{
    // Spec example 10 -- section "Extensions / Abbreviation".
    //
    // Markdown input:
    //     *[A]: Foo
    //
    //     A
    //
    // Expected HTML output:
    //     <p><abbr title="Foo">A</abbr></p>
    const string markdown = "*[A]: Foo\n\nA";
    const string expectedHtml = "<p><abbr title=\"Foo\">A</abbr></p>";
    const string extensions = "abbreviations|advanced";

    // Print the example header before running the spec check.
    Console.WriteLine("Example 10\nSection Extensions / Abbreviation\n");
    TestParser.TestSpec(markdown, expectedHtml, extensions);
}

// The longest matching abbreviation should be used
[Test]
public void ExtensionsAbbreviation_Example011()
{
    // Spec example 11 -- section "Extensions / Abbreviation".
    //
    // Markdown input:
    //     *[Foo]: foo
    //     *[Foo Bar]: foobar
    //
    //     Foo B
    //
    // Expected HTML output:
    //     <p><abbr title="foo">Foo</abbr> B</p>
    const string markdown = "*[Foo]: foo\n*[Foo Bar]: foobar\n\nFoo B";
    const string expectedHtml = "<p><abbr title=\"foo\">Foo</abbr> B</p>";
    const string extensions = "abbreviations|advanced";

    // Print the example header before running the spec check.
    Console.WriteLine("Example 11\nSection Extensions / Abbreviation\n");
    TestParser.TestSpec(markdown, expectedHtml, extensions);
}
}
}
21 changes: 21 additions & 0 deletions src/Markdig.Tests/Specs/AbbreviationSpecs.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,25 @@ Abbreviations should only match when surrounded by whitespace:
PRAA
.
<p>PRAA</p>
````````````````````````````````

Single character abbreviations should be matched

```````````````````````````````` example
*[A]: Foo

A
.
<p><abbr title="Foo">A</abbr></p>
````````````````````````````````

The longest matching abbreviation should be used

```````````````````````````````` example
*[Foo]: foo
*[Foo Bar]: foobar

Foo B
.
<p><abbr title="foo">Foo</abbr> B</p>
````````````````````````````````
92 changes: 38 additions & 54 deletions src/Markdig/Extensions/Abbreviations/AbbreviationParser.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) Alexandre Mutel. All rights reserved.
// Copyright (c) Alexandre Mutel. All rights reserved.
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.
using System.Collections.Generic;
Expand All @@ -12,7 +12,7 @@ namespace Markdig.Extensions.Abbreviations
/// <summary>
/// A block parser for abbreviations.
/// </summary>
/// <seealso cref="Markdig.Parsers.BlockParser" />
/// <seealso cref="BlockParser" />
public class AbbreviationParser : BlockParser
{
/// <summary>
Expand Down Expand Up @@ -40,8 +40,7 @@ public override BlockState TryOpen(BlockProcessor processor)
}

SourceSpan labelSpan;
string label;
if (!LinkHelper.TryParseLabel(ref slice, out label, out labelSpan))
if (!LinkHelper.TryParseLabel(ref slice, out string label, out labelSpan))
{
return BlockState.None;
}
Expand Down Expand Up @@ -85,8 +84,7 @@ private void DocumentOnProcessInlinesBegin(InlineProcessor inlineProcessor, Inli
}

// Build a text matcher from the abbreviations labels
var labels = new HashSet<string>(abbreviations.Keys);
var matcher = new TextMatchHelper(labels);
var prefixTree = new CompactPrefixTree<Abbreviation>(abbreviations);

inlineProcessor.LiteralInlineParser.PostMatch += (InlineProcessor processor, ref StringSlice slice) =>
{
Expand All @@ -98,20 +96,35 @@ private void DocumentOnProcessInlinesBegin(InlineProcessor inlineProcessor, Inli
// This is slow, but we don't have much choice
var content = literal.Content;
var text = content.Text;
for (int i = content.Start; i < content.End; i++)
{
string match;
if (matcher.TryMatch(text, i, content.End - i + 1, out match) && IsValidAbbreviation(match, content, i))
{
var indexAfterMatch = i + match.Length;

// We should have a match, but in case...
Abbreviation abbr;
if (!abbreviations.TryGetValue(match, out abbr))
for (int i = content.Start; i <= content.End; i++)
{
// Abbreviation must be a whole word == start at the start of a line or after a whitespace
if (i != 0)
{
for (i = i - 1; i <= content.End; i++)
{
if (text[i].IsWhitespace())
{
i++;
goto ValidAbbreviationStart;
}
}
break;
}

ValidAbbreviationStart:;

if (prefixTree.TryMatchLongest(text, i, content.End - i + 1, out KeyValuePair<string, Abbreviation> abbreviationMatch))
{
var match = abbreviationMatch.Key;
if (!IsValidAbbreviationEnding(match, content, i))
{
continue;
continue;
}

var indexAfterMatch = i + match.Length;

// If we don't have a container, create a new one
if (container == null)
{
Expand All @@ -124,39 +137,32 @@ private void DocumentOnProcessInlinesBegin(InlineProcessor inlineProcessor, Inli
};
}

int line;
int column;
var abbrInline = new AbbreviationInline(abbr)
var abbrInline = new AbbreviationInline(abbreviationMatch.Value)
{
Span =
{
Start = processor.GetSourcePosition(i, out line, out column),
Start = processor.GetSourcePosition(i, out int line, out int column),
},
Line = line,
Column = column
};
abbrInline.Span.End = abbrInline.Span.Start + match.Length - 1;

// Append the previous literal
if (i > content.Start)
{
if (literal.Parent == null)
{
container.AppendChild(literal);
}

if (i > content.Start && literal.Parent == null)
{
container.AppendChild(literal);
}

literal.Span.End = abbrInline.Span.Start - 1;
// Truncate it before the abbreviation
literal.Content.End = i - 1;


// Appned the abbreviation
// Append the abbreviation
container.AppendChild(abbrInline);

// If this is the end of the string, clear the literal
// and exit
// If this is the end of the string, clear the literal and exit
if (content.End == indexAfterMatch - 1)
{
literal = null;
Expand Down Expand Up @@ -188,34 +194,12 @@ private void DocumentOnProcessInlinesBegin(InlineProcessor inlineProcessor, Inli
};
}

private static bool IsValidAbbreviation(string match, StringSlice content, int matchIndex)
private static bool IsValidAbbreviationEnding(string match, StringSlice content, int matchIndex)
{
// The word matched must be embraced by punctuation or whitespace or \0.
var index = matchIndex - 1;
while (index >= content.Start)
{
var c = content.PeekCharAbsolute(index);
if (!(c == '\0' || c.IsWhitespace() || c.IsAsciiPunctuation()))
{
return false;
}

if (c.IsAlphaNumeric())
{
return false;
}

if (!c.IsAsciiPunctuation() || c.IsWhitespace())
{
break;
}
index--;
}

// This will check if the next char at the end of the StringSlice is whitespace, punctuation or \0.
var contentNew = content;
contentNew.End = content.End + 1;
index = matchIndex + match.Length;
int index = matchIndex + match.Length;
while (index <= contentNew.End)
{
var c = contentNew.PeekCharAbsolute(index);
Expand Down
Loading