Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# Changelog

## 3.2.0

- Add new public API to allow parsing into Header and Footer #162. Some API methods have been flagged as obsolete with a clear message of what to use instead.
This is not a breaking change as it keeps existing behaviour.
- Add support for `SVG` format (either from img src or the SVG node tag)
- Automatically create the `_top` bookmark if needed
- Fix a crash when a hyperlink contains both `img` and `figcaption`
- Fix a crash when `li` is empty #161

## 3.1.1

- Fix respecting layout with `div`/`p` ending with line break #158
Expand Down
8 changes: 4 additions & 4 deletions HtmlToOpenXml.sln
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.16
# Visual Studio 17
VisualStudioVersion = 17.8.34511.84
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HtmlToOpenXml", "src\Html2OpenXml\HtmlToOpenXml.csproj", "{EF700F30-C9BB-49A6-912C-E3B77857B514}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{58520A98-BA53-4BA4-AAE3-786AA21331D6}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{84EA02ED-2E97-47D2-992E-32CC104A3A7A}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Demo", "examples\Demo\Demo.csproj", "{A1ECC760-B9F7-4A00-AF5F-568B5FD6F09F}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Demo", "examples\Demo\Demo.csproj", "{A1ECC760-B9F7-4A00-AF5F-568B5FD6F09F}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HtmlToOpenXml.Tests", "test\HtmlToOpenXml.Tests\HtmlToOpenXml.Tests.csproj", "{CA0A68E0-45A0-4A01-A061-F951D93D6906}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HtmlToOpenXml.Tests", "test\HtmlToOpenXml.Tests\HtmlToOpenXml.Tests.csproj", "{CA0A68E0-45A0-4A01-A061-F951D93D6906}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
![Download Counts](https://img.shields.io/nuget/dt/HtmlToOpenXml.dll.svg)
[![MIT License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/onizet/html2openxml/blob/dev/LICENSE)

# What is Html2OpenXml?
# What is HtmlToOpenXml?

Html2OpenXml is a small .Net library that convert simple or advanced HTML to plain OpenXml components. This program has started in 2009, initially to convert user's comments into Word.
HtmlToOpenXml is a small .Net library that converts simple or advanced HTML to plain OpenXml components. This program started in 2009, initially to convert user's comments into Word.

This library supports **.NET Framework 4.6.2**, **.NET Standard 2.0** and **.NET 8**, which are all LTS.

Expand All @@ -28,7 +28,7 @@ Refer to [w3schools’ tag](http://www.w3schools.com/tags/default.asp) list to s
* `abbr` and `acronym`
* `b`, `i`, `u`, `s`, `del`, `ins`, `em`, `strike`, `strong`
* `br` and `hr`
* `img`, `figcaption`
* `img`, `figcaption` and `svg`
* `table`, `td`, `tr`, `th`, `tbody`, `thead`, `tfoot`, `caption` and `col`
* `cite`
* `div`, `span`, `time`, `font` and `p`
Expand Down
7 changes: 6 additions & 1 deletion examples/Demo/Demo.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,19 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="DocumentFormat.OpenXml" Version="3.0.2" />
<PackageReference Include="DocumentFormat.OpenXml" Version="3.1.0" />
<PackageReference Include="System.Diagnostics.Process" Version="4.3.0" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\src\Html2OpenXml\HtmlToOpenXml.csproj" />
</ItemGroup>

<PropertyGroup Condition="'$(Configuration)' == 'Debug'">
<DefineConstants>$(DefineConstants);DEBUG</DefineConstants>
<Optimize>false</Optimize>
</PropertyGroup>

<ItemGroup>
<EmbeddedResource Include="Resources\*" />
</ItemGroup>
Expand Down
4 changes: 2 additions & 2 deletions examples/Demo/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ static class Program
static async Task Main(string[] args)
{
const string filename = "test.docx";
string html = ResourceHelper.GetString("Resources.CompleteRunTest.html");
string html = ResourceHelper.GetString("Resources.AdvancedTable.html");
if (File.Exists(filename)) File.Delete(filename);

using (MemoryStream generatedDocument = new MemoryStream())
Expand All @@ -42,7 +42,7 @@ static async Task Main(string[] args)
converter.RenderPreAsTable = true;
Body body = mainPart.Document.Body;

await converter.ParseHtml(html);
await converter.ParseBody(html);
mainPart.Document.Save();

AssertThatOpenXmlDocumentIsValid(package);
Expand Down
3 changes: 2 additions & 1 deletion src/Html2OpenXml/Expressions/AbbreviationExpression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ public override IEnumerable<OpenXmlElement> Interpret(ParsingContext context)
var childElements = base.Interpret(context);

// Transform the inline acronym/abbreviation to a reference to a foot note.
// Footnote or endnote are invalid inside header and footer
string? description = node.Title;
if (string.IsNullOrEmpty(description))
if (string.IsNullOrEmpty(description) || context.HostingPart is not MainDocumentPart)
return childElements;

string runStyle;
Expand Down
2 changes: 1 addition & 1 deletion src/Html2OpenXml/Expressions/BlockElementExpression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ private static Paragraph CreateParagraph(ParsingContext context, IList<OpenXmlEl
/// <summary>
/// Resolve the next available <see cref="BookmarkStart.Id"/> (they must be unique).
/// </summary>
private static int IncrementBookmarkId(ParsingContext context)
protected static int IncrementBookmarkId(ParsingContext context)
{
var bookmarkRef = context.Properties<int?>("bookmarkRef");

Expand Down
9 changes: 7 additions & 2 deletions src/Html2OpenXml/Expressions/BlockQuoteExpression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
using System.Linq;
using AngleSharp.Html.Dom;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;

namespace HtmlToOpenXml.Expressions;
Expand All @@ -26,15 +27,19 @@ sealed class BlockQuoteExpression(IHtmlElement node) : BlockElementExpression(no
/// <inheritdoc/>
public override IEnumerable<OpenXmlElement> Interpret(ParsingContext context)
{
string? description = node.GetAttribute("cite");

var childElements = base.Interpret(context);
if (!childElements.Any())
return [];

// Footnote or endnote are invalid inside header and footer
if (context.HostingPart is not MainDocumentPart)
return childElements;

// Transform the inline acronym/abbreviation to a reference to a foot note.
if (childElements.First() is Paragraph paragraph)
{
string? description = node.GetAttribute("cite");

paragraph.ParagraphProperties ??= new();
if (paragraph.ParagraphProperties.ParagraphStyleId is null)
paragraph.ParagraphProperties.ParagraphStyleId =
Expand Down
43 changes: 37 additions & 6 deletions src/Html2OpenXml/Expressions/BodyExpression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* PARTICULAR PURPOSE.
*/
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
Expand All @@ -24,28 +25,51 @@ namespace HtmlToOpenXml.Expressions;
/// </summary>
sealed class BodyExpression(IHtmlElement node) : BlockElementExpression(node)
{
private bool shouldRegisterTopBookmark;

public override IEnumerable<OpenXmlElement> Interpret(ParsingContext context)
{
MarkAllBookmarks();

return base.Interpret(context);
var elements = base.Interpret(context);

if (shouldRegisterTopBookmark && elements.Any())
{
// Check whether it already exists
var body = context.MainPart.Document.Body!;
if (body.Descendants<BookmarkStart>().Where(b => b.Name?.Value == "_top").Any())
{
return elements;
}

var bookmarkId = IncrementBookmarkId(context).ToString(CultureInfo.InvariantCulture);
// this is expected to stand in the 1st paragraph
Paragraph? p = body.FirstChild as Paragraph;
p ??= body.PrependChild(new Paragraph());
p.InsertAfter(new BookmarkEnd() { Id = bookmarkId }, p.ParagraphProperties);
p.InsertAfter(new BookmarkStart() { Id = bookmarkId, Name = "_top" }, p.ParagraphProperties);
}

return elements;
}

protected override void ComposeStyles(ParsingContext context)
{
base.ComposeStyles(context);

var mainPart = context.MainPart;

// Unsupported W3C attribute but claimed by users. Specified at <body> level, the page
// orientation is applied on the whole document
string? attr = styleAttributes!["page-orientation"];
if (attr != null)
{
PageOrientationValues orientation = Converter.ToPageOrientation(attr);

var sectionProperties = context.MainPart.Document.Body!.GetFirstChild<SectionProperties>();
var sectionProperties = mainPart.Document.Body!.GetFirstChild<SectionProperties>();
if (sectionProperties == null || sectionProperties.GetFirstChild<PageSize>() == null)
{
context.MainPart.Document.Body.Append(ChangePageOrientation(orientation));
mainPart.Document.Body.Append(ChangePageOrientation(orientation));
}
else
{
Expand All @@ -61,10 +85,10 @@ protected override void ComposeStyles(ParsingContext context)

if (paraProperties.BiDi is not null)
{
var sectionProperties = context.MainPart.Document.Body!.GetFirstChild<SectionProperties>();
var sectionProperties = mainPart.Document.Body!.GetFirstChild<SectionProperties>();
if (sectionProperties == null || sectionProperties.GetFirstChild<PageSize>() == null)
{
context.MainPart.Document.Body.Append(sectionProperties = new());
mainPart.Document.Body.Append(sectionProperties = new());
}

sectionProperties.AddChild(paraProperties.BiDi.CloneNode(true));
Expand Down Expand Up @@ -105,10 +129,17 @@ private void MarkAllBookmarks()
var links = node.QuerySelectorAll("a[href^='#']");
if (links.Length == 0) return;

foreach (var link in links.Cast<IHtmlAnchorElement>())
foreach (var link in links.Cast<IHtmlAnchorElement>().Where(l => l.Hash.Length > 0))
{
if (link.IsTopAnchor())
{
shouldRegisterTopBookmark = true;
return;
}

var id = link.Hash.Substring(1);
var target = node.Owner!.GetElementById(id);

// `id` attribute is preferred but `name` is also valid
target ??= node.Owner!.GetElementsByName(id).FirstOrDefault();

Expand Down
1 change: 1 addition & 0 deletions src/Html2OpenXml/Expressions/HtmlDomExpression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ private static Dictionary<string, Func<IElement, HtmlDomExpression>> InitKnownTa
{ TagNames.Strong, el => new PhrasingElementExpression((IHtmlElement) el, new Bold()) },
{ TagNames.Sub, el => new PhrasingElementExpression((IHtmlElement) el, new VerticalTextAlignment() { Val = VerticalPositionValues.Subscript }) },
{ TagNames.Sup, el => new PhrasingElementExpression((IHtmlElement) el, new VerticalTextAlignment() { Val = VerticalPositionValues.Superscript }) },
{ TagNames.Svg, el => new SvgExpression((AngleSharp.Svg.Dom.ISvgSvgElement) el) },
{ TagNames.Table, el => new TableExpression((IHtmlTableElement) el) },
{ TagNames.Time, el => new PhrasingElementExpression((IHtmlElement) el) },
{ TagNames.U, el => new PhrasingElementExpression((IHtmlElement) el, new Underline() { Val = UnderlineValues.Single }) },
Expand Down
61 changes: 41 additions & 20 deletions src/Html2OpenXml/Expressions/HyperlinkExpression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
using System.Linq;
using AngleSharp.Html.Dom;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;

namespace HtmlToOpenXml.Expressions;
Expand Down Expand Up @@ -42,8 +43,10 @@ public override IEnumerable<OpenXmlElement> Interpret (ParsingContext context)
// Let's see whether the link tag include an image inside its body.
// If so, the Hyperlink OpenXmlElement is lost and we'll keep only the images
// and apply a HyperlinkOnClick attribute.
var imagesInLink = childElements.Where(e => e.HasChild<Drawing>());
if (imagesInLink.Any())
IEnumerable<OpenXmlElement> imagesInLink;
// Clickable image is only supported in body but not in header/footer
if (context.HostingPart is MainDocumentPart &&
(imagesInLink = childElements.Where(e => e.HasChild<Drawing>())).Any())
{
foreach (var img in imagesInLink)
{
Expand All @@ -56,25 +59,42 @@ public override IEnumerable<OpenXmlElement> Interpret (ParsingContext context)
if (enDp.MoveNext()) alt = enDp.Current.Description;
else alt = null;

d.InsertInDocProperties(
new a.HyperlinkOnClick() { Id = h.Id ?? h.Anchor, Tooltip = alt });
d.Inline ??= new a.Wordprocessing.Inline();
d.Inline.DocProperties ??= new a.Wordprocessing.DocProperties();

if (h.Anchor == "_top")
{
// exception case: a clickable image requires the _top bookmark to be registered with a relationship
var extLink = context.HostingPart.AddHyperlinkRelationship(new Uri("#_top", UriKind.Relative), false);
d.Inline.DocProperties.Append(
new a.HyperlinkOnClick() { Id = extLink.Id, Tooltip = alt });
}
else
{
d.Inline.DocProperties.Append(
new a.HyperlinkOnClick() { Id = h.Id ?? h.Anchor, Tooltip = alt });
}
}
}

// can't use GetFirstChild<Run> or we may find the one containing the image
List<Run> runs = [];
foreach (var el in childElements)
{
if (el is Run run && !run.HasChild<Drawing>())
{
run.RunProperties ??= new();
run.RunProperties.RunStyle = context.DocumentStyle.GetRunStyle(
context.DocumentStyle.DefaultStyles.HyperlinkStyle);
break;
}
if (el is Run r) runs.Add(r);
// unroll paragraphs. CloneNode is needed to unparent the run
else runs.AddRange(el.Elements<Run>().Select(r => (Run) r.CloneNode(true)));
}

foreach (var run in runs.Where(run => !run.HasChild<Drawing>()))
{
run.RunProperties ??= new();
run.RunProperties.RunStyle = context.DocumentStyle.GetRunStyle(
context.DocumentStyle.DefaultStyles.HyperlinkStyle);
}

// Append the processed elements and put them to the Run of the Hyperlink
h.Append(childElements);
h.Append(runs);

return [h];
}
Expand All @@ -87,20 +107,21 @@ public override IEnumerable<OpenXmlElement> Interpret (ParsingContext context)
if (string.IsNullOrEmpty(att))
return null;

// Always accept _top anchor
if (linkNode.IsTopAnchor())
{
h = new Hyperlink() { History = true, Anchor = "_top" };
}
// is it an anchor?
if (att![0] == '#' && att.Length > 1)
else if (!context.Converter.ExcludeLinkAnchor && linkNode.Hash.Length > 1 && linkNode.Hash[0] == '#')
{
// Always accept _top anchor
if (!context.Converter.ExcludeLinkAnchor || att == "#_top")
{
h = new Hyperlink(
) { History = true, Anchor = att.Substring(1) };
}
h = new Hyperlink(
) { History = true, Anchor = linkNode.Hash.Substring(1) };
}
// ensure the link does not start with javascript:
else if (AngleSharpExtensions.TryParseUrl(att, UriKind.Absolute, out var uri))
{
var extLink = context.MainPart.AddHyperlinkRelationship(uri!, true);
var extLink = context.HostingPart.AddHyperlinkRelationship(uri!, true);

h = new Hyperlink(
) { History = true, Id = extLink.Id };
Expand Down
Loading