diff --git a/CHANGELOG.md b/CHANGELOG.md index 14e39bd1..a56d874d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## 3.2.0 + +- Add new public API to allow parsing into Header and Footer #162. Some API methods have been flagged as obsolete with a clear message of what to use instead. + This is not a breaking change as it keeps existing behaviour. +- Add support for `SVG` format (either from img src or the SVG node tag) +- Automatically create the `_top` bookmark if needed +- Fix a crash when a hyperlink contains both `img` and `figcaption` +- Fix a crash when `li` is empty #161 + ## 3.1.1 - Fix respecting layout with `div`/`p` ending with line break #158 diff --git a/HtmlToOpenXml.sln b/HtmlToOpenXml.sln index 6ed4cc54..18814542 100644 --- a/HtmlToOpenXml.sln +++ b/HtmlToOpenXml.sln @@ -1,7 +1,7 @@ ﻿ Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.26730.16 +# Visual Studio 17 +VisualStudioVersion = 17.8.34511.84 MinimumVisualStudioVersion = 10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HtmlToOpenXml", "src\Html2OpenXml\HtmlToOpenXml.csproj", "{EF700F30-C9BB-49A6-912C-E3B77857B514}" EndProject @@ -9,9 +9,9 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{58520A98-BA5 EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{84EA02ED-2E97-47D2-992E-32CC104A3A7A}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Demo", "examples\Demo\Demo.csproj", "{A1ECC760-B9F7-4A00-AF5F-568B5FD6F09F}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Demo", "examples\Demo\Demo.csproj", "{A1ECC760-B9F7-4A00-AF5F-568B5FD6F09F}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HtmlToOpenXml.Tests", "test\HtmlToOpenXml.Tests\HtmlToOpenXml.Tests.csproj", "{CA0A68E0-45A0-4A01-A061-F951D93D6906}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HtmlToOpenXml.Tests", 
"test\HtmlToOpenXml.Tests\HtmlToOpenXml.Tests.csproj", "{CA0A68E0-45A0-4A01-A061-F951D93D6906}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/README.md b/README.md index 92f3166f..529b26f1 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ ![Download Counts](https://img.shields.io/nuget/dt/HtmlToOpenXml.dll.svg) [![MIT License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/onizet/html2openxml/blob/dev/LICENSE) -# What is Html2OpenXml? +# What is HtmlToOpenXml? -Html2OpenXml is a small .Net library that convert simple or advanced HTML to plain OpenXml components. This program has started in 2009, initially to convert user's comments into Word. +HtmlToOpenXml is a small .Net library that converts simple or advanced HTML to plain OpenXml components. This program started in 2009, initially to convert user's comments into Word. This library supports both **.Net Framework 4.6.2**, **.NET Standard 2.0** and **.NET 8** which are all LTS. 
@@ -28,7 +28,7 @@ Refer to [w3schools’ tag](http://www.w3schools.com/tags/default.asp) list to s * `abbr` and `acronym` * `b`, `i`, `u`, `s`, `del`, `ins`, `em`, `strike`, `strong` * `br` and `hr` -* `img`, `figcaption` +* `img`, `figcaption` and `svg` * `table`, `td`, `tr`, `th`, `tbody`, `thead`, `tfoot`, `caption` and `col` * `cite` * `div`, `span`, `time`, `font` and `p` diff --git a/examples/Demo/Demo.csproj b/examples/Demo/Demo.csproj index b869b3f9..3e76b9dc 100644 --- a/examples/Demo/Demo.csproj +++ b/examples/Demo/Demo.csproj @@ -6,7 +6,7 @@ - + @@ -14,6 +14,11 @@ + + $(DefineConstants);DEBUG + false + + diff --git a/examples/Demo/Program.cs b/examples/Demo/Program.cs index c3c54245..47c0124b 100644 --- a/examples/Demo/Program.cs +++ b/examples/Demo/Program.cs @@ -15,7 +15,7 @@ static class Program static async Task Main(string[] args) { const string filename = "test.docx"; - string html = ResourceHelper.GetString("Resources.CompleteRunTest.html"); + string html = ResourceHelper.GetString("Resources.AdvancedTable.html"); if (File.Exists(filename)) File.Delete(filename); using (MemoryStream generatedDocument = new MemoryStream()) @@ -42,7 +42,7 @@ static async Task Main(string[] args) converter.RenderPreAsTable = true; Body body = mainPart.Document.Body; - await converter.ParseHtml(html); + await converter.ParseBody(html); mainPart.Document.Save(); AssertThatOpenXmlDocumentIsValid(package); diff --git a/src/Html2OpenXml/Expressions/AbbreviationExpression.cs b/src/Html2OpenXml/Expressions/AbbreviationExpression.cs index 654f1fb3..d0dcca86 100644 --- a/src/Html2OpenXml/Expressions/AbbreviationExpression.cs +++ b/src/Html2OpenXml/Expressions/AbbreviationExpression.cs @@ -32,8 +32,9 @@ public override IEnumerable Interpret(ParsingContext context) var childElements = base.Interpret(context); // Transform the inline acronym/abbreviation to a reference to a foot note. + // Footnote or endnote are invalid inside header and footer string? 
description = node.Title; - if (string.IsNullOrEmpty(description)) + if (string.IsNullOrEmpty(description) || context.HostingPart is not MainDocumentPart) return childElements; string runStyle; diff --git a/src/Html2OpenXml/Expressions/BlockElementExpression.cs b/src/Html2OpenXml/Expressions/BlockElementExpression.cs index de22330c..e15db508 100644 --- a/src/Html2OpenXml/Expressions/BlockElementExpression.cs +++ b/src/Html2OpenXml/Expressions/BlockElementExpression.cs @@ -325,7 +325,7 @@ private static Paragraph CreateParagraph(ParsingContext context, IList /// Resolve the next available (they must be unique). /// - private static int IncrementBookmarkId(ParsingContext context) + protected static int IncrementBookmarkId(ParsingContext context) { var bookmarkRef = context.Properties("bookmarkRef"); diff --git a/src/Html2OpenXml/Expressions/BlockQuoteExpression.cs b/src/Html2OpenXml/Expressions/BlockQuoteExpression.cs index 18900c2c..3fa3d431 100644 --- a/src/Html2OpenXml/Expressions/BlockQuoteExpression.cs +++ b/src/Html2OpenXml/Expressions/BlockQuoteExpression.cs @@ -13,6 +13,7 @@ using System.Linq; using AngleSharp.Html.Dom; using DocumentFormat.OpenXml; +using DocumentFormat.OpenXml.Packaging; using DocumentFormat.OpenXml.Wordprocessing; namespace HtmlToOpenXml.Expressions; @@ -26,15 +27,19 @@ sealed class BlockQuoteExpression(IHtmlElement node) : BlockElementExpression(no /// public override IEnumerable Interpret(ParsingContext context) { - string? description = node.GetAttribute("cite"); - var childElements = base.Interpret(context); if (!childElements.Any()) return []; + + // Footnote or endnote are invalid inside header and footer + if (context.HostingPart is not MainDocumentPart) + return childElements; // Transform the inline acronym/abbreviation to a reference to a foot note. if (childElements.First() is Paragraph paragraph) { + string? 
description = node.GetAttribute("cite"); + paragraph.ParagraphProperties ??= new(); if (paragraph.ParagraphProperties.ParagraphStyleId is null) paragraph.ParagraphProperties.ParagraphStyleId = diff --git a/src/Html2OpenXml/Expressions/BodyExpression.cs b/src/Html2OpenXml/Expressions/BodyExpression.cs index ed663484..7ff5ee16 100644 --- a/src/Html2OpenXml/Expressions/BodyExpression.cs +++ b/src/Html2OpenXml/Expressions/BodyExpression.cs @@ -10,6 +10,7 @@ * PARTICULAR PURPOSE. */ using System.Collections.Generic; +using System.Globalization; using System.Linq; using AngleSharp.Dom; using AngleSharp.Html.Dom; @@ -24,17 +25,40 @@ namespace HtmlToOpenXml.Expressions; /// sealed class BodyExpression(IHtmlElement node) : BlockElementExpression(node) { + private bool shouldRegisterTopBookmark; + public override IEnumerable Interpret(ParsingContext context) { MarkAllBookmarks(); - return base.Interpret(context); + var elements = base.Interpret(context); + + if (shouldRegisterTopBookmark && elements.Any()) + { + // Check whether it already exists + var body = context.MainPart.Document.Body!; + if (body.Descendants().Where(b => b.Name?.Value == "_top").Any()) + { + return elements; + } + + var bookmarkId = IncrementBookmarkId(context).ToString(CultureInfo.InvariantCulture); + // this is expected to stand in the 1st paragraph + Paragraph? p = body.FirstChild as Paragraph; + p ??= body.PrependChild(new Paragraph()); + p.InsertAfter(new BookmarkEnd() { Id = bookmarkId }, p.ParagraphProperties); + p.InsertAfter(new BookmarkStart() { Id = bookmarkId, Name = "_top" }, p.ParagraphProperties); + } + + return elements; } protected override void ComposeStyles(ParsingContext context) { base.ComposeStyles(context); + var mainPart = context.MainPart; + // Unsupported W3C attribute but claimed by users. Specified at level, the page // orientation is applied on the whole document string? 
attr = styleAttributes!["page-orientation"]; @@ -42,10 +66,10 @@ protected override void ComposeStyles(ParsingContext context) { PageOrientationValues orientation = Converter.ToPageOrientation(attr); - var sectionProperties = context.MainPart.Document.Body!.GetFirstChild(); + var sectionProperties = mainPart.Document.Body!.GetFirstChild(); if (sectionProperties == null || sectionProperties.GetFirstChild() == null) { - context.MainPart.Document.Body.Append(ChangePageOrientation(orientation)); + mainPart.Document.Body.Append(ChangePageOrientation(orientation)); } else { @@ -61,10 +85,10 @@ protected override void ComposeStyles(ParsingContext context) if (paraProperties.BiDi is not null) { - var sectionProperties = context.MainPart.Document.Body!.GetFirstChild(); + var sectionProperties = mainPart.Document.Body!.GetFirstChild(); if (sectionProperties == null || sectionProperties.GetFirstChild() == null) { - context.MainPart.Document.Body.Append(sectionProperties = new()); + mainPart.Document.Body.Append(sectionProperties = new()); } sectionProperties.AddChild(paraProperties.BiDi.CloneNode(true)); @@ -105,10 +129,17 @@ private void MarkAllBookmarks() var links = node.QuerySelectorAll("a[href^='#']"); if (links.Length == 0) return; - foreach (var link in links.Cast()) + foreach (var link in links.Cast().Where(l => l.Hash.Length > 0)) { + if (link.IsTopAnchor()) + { + shouldRegisterTopBookmark = true; + return; + } + var id = link.Hash.Substring(1); var target = node.Owner!.GetElementById(id); + // `id` attribute is preferred but `name` is also valid target ??= node.Owner!.GetElementsByName(id).FirstOrDefault(); diff --git a/src/Html2OpenXml/Expressions/HtmlDomExpression.cs b/src/Html2OpenXml/Expressions/HtmlDomExpression.cs index a60c6033..44ec6bc0 100644 --- a/src/Html2OpenXml/Expressions/HtmlDomExpression.cs +++ b/src/Html2OpenXml/Expressions/HtmlDomExpression.cs @@ -67,6 +67,7 @@ private static Dictionary> InitKnownTa { TagNames.Strong, el => new 
PhrasingElementExpression((IHtmlElement) el, new Bold()) }, { TagNames.Sub, el => new PhrasingElementExpression((IHtmlElement) el, new VerticalTextAlignment() { Val = VerticalPositionValues.Subscript }) }, { TagNames.Sup, el => new PhrasingElementExpression((IHtmlElement) el, new VerticalTextAlignment() { Val = VerticalPositionValues.Superscript }) }, + { TagNames.Svg, el => new SvgExpression((AngleSharp.Svg.Dom.ISvgSvgElement) el) }, { TagNames.Table, el => new TableExpression((IHtmlTableElement) el) }, { TagNames.Time, el => new PhrasingElementExpression((IHtmlElement) el) }, { TagNames.U, el => new PhrasingElementExpression((IHtmlElement) el, new Underline() { Val = UnderlineValues.Single }) }, diff --git a/src/Html2OpenXml/Expressions/HyperlinkExpression.cs b/src/Html2OpenXml/Expressions/HyperlinkExpression.cs index 2e11edd6..46fdc9a4 100644 --- a/src/Html2OpenXml/Expressions/HyperlinkExpression.cs +++ b/src/Html2OpenXml/Expressions/HyperlinkExpression.cs @@ -14,6 +14,7 @@ using System.Linq; using AngleSharp.Html.Dom; using DocumentFormat.OpenXml; +using DocumentFormat.OpenXml.Packaging; using DocumentFormat.OpenXml.Wordprocessing; namespace HtmlToOpenXml.Expressions; @@ -42,8 +43,10 @@ public override IEnumerable Interpret (ParsingContext context) // Let's see whether the link tag include an image inside its body. // If so, the Hyperlink OpenXmlElement is lost and we'll keep only the images // and applied a HyperlinkOnClick attribute. 
- var imagesInLink = childElements.Where(e => e.HasChild()); - if (imagesInLink.Any()) + IEnumerable imagesInLink; + // Clickable image is only supported in body but not in header/footer + if (context.HostingPart is MainDocumentPart && + (imagesInLink = childElements.Where(e => e.HasChild())).Any()) { foreach (var img in imagesInLink) { @@ -56,25 +59,42 @@ public override IEnumerable Interpret (ParsingContext context) if (enDp.MoveNext()) alt = enDp.Current.Description; else alt = null; - d.InsertInDocProperties( - new a.HyperlinkOnClick() { Id = h.Id ?? h.Anchor, Tooltip = alt }); + d.Inline ??= new a.Wordprocessing.Inline(); + d.Inline.DocProperties ??= new a.Wordprocessing.DocProperties(); + + if (h.Anchor == "_top") + { + // exception case: clickable image requires the _top bookmark to be registered with a relationship + var extLink = context.HostingPart.AddHyperlinkRelationship(new Uri("#_top", UriKind.Relative), false); + d.Inline.DocProperties.Append( + new a.HyperlinkOnClick() { Id = extLink.Id, Tooltip = alt }); + } + else + { + d.Inline.DocProperties.Append( + new a.HyperlinkOnClick() { Id = h.Id ?? h.Anchor, Tooltip = alt }); + } } } // can't use GetFirstChild or we may find the one containing the image + List runs = []; foreach (var el in childElements) { - if (el is Run run && !run.HasChild()) - { - run.RunProperties ??= new(); - run.RunProperties.RunStyle = context.DocumentStyle.GetRunStyle( - context.DocumentStyle.DefaultStyles.HyperlinkStyle); - break; - } + if (el is Run r) runs.Add(r); + // unroll paragraphs. 
CloneNode is needed to unparent the run + else runs.AddRange(el.Elements().Select(r => (Run) r.CloneNode(true))); + } + + foreach (var run in runs.Where(run => !run.HasChild())) + { + run.RunProperties ??= new(); + run.RunProperties.RunStyle = context.DocumentStyle.GetRunStyle( + context.DocumentStyle.DefaultStyles.HyperlinkStyle); } // Append the processed elements and put them to the Run of the Hyperlink - h.Append(childElements); + h.Append(runs); return [h]; } @@ -87,20 +107,21 @@ public override IEnumerable Interpret (ParsingContext context) if (string.IsNullOrEmpty(att)) return null; + // Always accept _top anchor + if (linkNode.IsTopAnchor()) + { + h = new Hyperlink() { History = true, Anchor = "_top" }; + } // is it an anchor? - if (att![0] == '#' && att.Length > 1) + else if (!context.Converter.ExcludeLinkAnchor && linkNode.Hash.Length > 1 && linkNode.Hash[0] == '#') { - // Always accept _top anchor - if (!context.Converter.ExcludeLinkAnchor || att == "#_top") - { - h = new Hyperlink( - ) { History = true, Anchor = att.Substring(1) }; - } + h = new Hyperlink( + ) { History = true, Anchor = linkNode.Hash.Substring(1) }; } // ensure the links does not start with javascript: else if (AngleSharpExtensions.TryParseUrl(att, UriKind.Absolute, out var uri)) { - var extLink = context.MainPart.AddHyperlinkRelationship(uri!, true); + var extLink = context.HostingPart.AddHyperlinkRelationship(uri!, true); h = new Hyperlink( ) { History = true, Id = extLink.Id }; diff --git a/src/Html2OpenXml/Expressions/Image/ImageExpression.cs b/src/Html2OpenXml/Expressions/Image/ImageExpression.cs index 52b3e2f1..a4b61c30 100644 --- a/src/Html2OpenXml/Expressions/Image/ImageExpression.cs +++ b/src/Html2OpenXml/Expressions/Image/ImageExpression.cs @@ -11,8 +11,11 @@ */ using System; using System.Threading; +using AngleSharp.Dom; using AngleSharp.Html.Dom; +using AngleSharp.Svg.Dom; using DocumentFormat.OpenXml; +using DocumentFormat.OpenXml.Packaging; using 
DocumentFormat.OpenXml.Wordprocessing; using HtmlToOpenXml.IO; @@ -57,14 +60,24 @@ class ImageExpression(IHtmlImageElement node) : ImageExpressionBase(node) preferredSize.Height = imgNode.DisplayHeight; } - var (imageObjId, drawingObjId) = IncrementDrawingObjId(context); - - HtmlImageInfo? iinfo = context.Converter.ImagePrefetcher.Download(src, CancellationToken.None) + HtmlImageInfo? iinfo = context.ImageLoader.Download(src, CancellationToken.None) .ConfigureAwait(false).GetAwaiter().GetResult(); if (iinfo == null) return null; + if (iinfo.TypeInfo == ImagePartType.Svg) + { + var imagePart = context.HostingPart.GetPartById(iinfo.ImagePartId); + using var stream = imagePart.GetStream(System.IO.FileMode.Open); + using var sreader = new System.IO.StreamReader(stream); + imgNode.Insert(AdjacentPosition.AfterBegin, sreader.ReadToEnd()); + + var svgNode = imgNode.FindChild(); + if (svgNode is null) return null; + return SvgExpression.CreateSvgDrawing(context, svgNode, iinfo.ImagePartId, preferredSize); + } + if (preferredSize.IsEmpty) { preferredSize = iinfo.Size; @@ -78,6 +91,7 @@ class ImageExpression(IHtmlImageElement node) : ImageExpressionBase(node) long widthInEmus = new Unit(UnitMetric.Pixel, preferredSize.Width).ValueInEmus; long heightInEmus = new Unit(UnitMetric.Pixel, preferredSize.Height).ValueInEmus; + var (imageObjId, drawingObjId) = IncrementDrawingObjId(context); var img = new Drawing( new wp.Inline( new wp.Extent() { Cx = widthInEmus, Cy = heightInEmus }, diff --git a/src/Html2OpenXml/Expressions/Image/ImageExpressionBase.cs b/src/Html2OpenXml/Expressions/Image/ImageExpressionBase.cs index 170b0464..3cf0422c 100644 --- a/src/Html2OpenXml/Expressions/Image/ImageExpressionBase.cs +++ b/src/Html2OpenXml/Expressions/Image/ImageExpressionBase.cs @@ -12,6 +12,7 @@ using System.Collections.Generic; using System.Linq; using DocumentFormat.OpenXml; +using DocumentFormat.OpenXml.Packaging; using DocumentFormat.OpenXml.Wordprocessing; using a = 
DocumentFormat.OpenXml.Drawing; @@ -85,10 +86,11 @@ internal static (uint imageObjId, uint drawingObjId) IncrementDrawingObjId(Parsi drawingObjId ??= 1; // 1 is the minimum ID set by MS Office. imageObjId ??= 1; + var mainPart = context.MainPart; foreach (var part in new[] { - context.MainPart.Document.Body!.Descendants(), - context.MainPart.HeaderParts.Where(f => f.Header != null).SelectMany(f => f.Header.Descendants()), - context.MainPart.FooterParts.Where(f => f.Footer != null).SelectMany(f => f.Footer.Descendants()) + mainPart.Document.Body!.Descendants(), + mainPart.HeaderParts.Where(f => f.Header != null).SelectMany(f => f.Header.Descendants()), + mainPart.FooterParts.Where(f => f.Footer != null).SelectMany(f => f.Footer.Descendants()) }) foreach (Drawing d in part) { diff --git a/src/Html2OpenXml/Expressions/Image/SvgExpression.cs b/src/Html2OpenXml/Expressions/Image/SvgExpression.cs new file mode 100644 index 00000000..7bb9a7f6 --- /dev/null +++ b/src/Html2OpenXml/Expressions/Image/SvgExpression.cs @@ -0,0 +1,105 @@ +/* Copyright (C) Olivier Nizet https://github.com/onizet/html2openxml - All Rights Reserved + * + * This source is subject to the Microsoft Permissive License. + * Please see the License.txt file for more information. + * All other rights reserved. + * + * THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY + * KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A + * PARTICULAR PURPOSE. 
+ */ +using AngleSharp.Svg.Dom; +using DocumentFormat.OpenXml; +using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml.Wordprocessing; +using DocumentFormat.OpenXml.Office2019.Drawing.SVG; +using System.Text; + +using a = DocumentFormat.OpenXml.Drawing; +using pic = DocumentFormat.OpenXml.Drawing.Pictures; +using wp = DocumentFormat.OpenXml.Drawing.Wordprocessing; +using AngleSharp.Text; + +namespace HtmlToOpenXml.Expressions; + +/// +/// Process the parsing of a svg element. +/// +sealed class SvgExpression(ISvgSvgElement node) : ImageExpressionBase(node) +{ + private readonly ISvgSvgElement svgNode = node; + + + protected override Drawing? CreateDrawing(ParsingContext context) + { + var imgPart = context.MainPart.AddImagePart(ImagePartType.Svg); + using var stream = new System.IO.MemoryStream(Encoding.UTF8.GetBytes(svgNode.OuterHtml), writable: false); + imgPart.FeedData(stream); + var imagePartId = context.MainPart.GetIdOfPart(imgPart); + return CreateSvgDrawing(context, svgNode, imagePartId, Size.Empty); + } + + internal static Drawing CreateSvgDrawing(ParsingContext context, ISvgSvgElement svgNode, string imagePartId, Size preferredSize) + { + var width = Unit.Parse(svgNode.GetAttribute("width")); + var height = Unit.Parse(svgNode.GetAttribute("height")); + long widthInEmus, heightInEmus; + if (width.IsValid && height.IsValid) + { + widthInEmus = width.ValueInEmus; + heightInEmus = height.ValueInEmus; + } + else + { + widthInEmus = new Unit(UnitMetric.Pixel, preferredSize.Width).ValueInEmus; + heightInEmus = new Unit(UnitMetric.Pixel, preferredSize.Height).ValueInEmus; + } + + var (imageObjId, drawingObjId) = IncrementDrawingObjId(context); + + string? title = svgNode.QuerySelector("title")?.TextContent?.CollapseAndStrip() ?? "Picture " + imageObjId; + string? description = svgNode.QuerySelector("desc")?.TextContent?.CollapseAndStrip() ?? 
string.Empty; + + var img = new Drawing( + new wp.Inline( + new wp.Extent() { Cx = widthInEmus, Cy = heightInEmus }, + new wp.EffectExtent() { LeftEdge = 0L, TopEdge = 0L, RightEdge = 0L, BottomEdge = 0L }, + new wp.DocProperties() { Id = drawingObjId, Name = title, Description = description }, + new wp.NonVisualGraphicFrameDrawingProperties { + GraphicFrameLocks = new a.GraphicFrameLocks() { NoChangeAspect = true } + }, + new a.Graphic( + new a.GraphicData( + new pic.Picture( + new pic.NonVisualPictureProperties { + NonVisualDrawingProperties = new pic.NonVisualDrawingProperties() { + Id = imageObjId, Name = title + }, + NonVisualPictureDrawingProperties = new() + }, + new pic.BlipFill( + new a.Blip( + new a.BlipExtensionList( + new a.BlipExtension(new SVGBlip { Embed = imagePartId }) { + Uri = "{96DAC541-7B7A-43D3-8B79-37D633B846F1}" + }) + ) { Embed = imagePartId /* ideally, that should be a png representation of the svg */ }, + new a.Stretch( + new a.FillRectangle()) + ), + new pic.ShapeProperties( + new a.Transform2D( + new a.Offset() { X = 0L, Y = 0L }, + new a.Extents() { Cx = widthInEmus, Cy = heightInEmus }), + new a.PresetGeometry( + new a.AdjustValueList() + ) { Preset = a.ShapeTypeValues.Rectangle }) + ) + ) { Uri = "http://schemas.openxmlformats.org/drawingml/2006/picture" }) + ) { DistanceFromTop = (UInt32Value)0U, DistanceFromBottom = (UInt32Value)0U, DistanceFromLeft = (UInt32Value)0U, DistanceFromRight = (UInt32Value)0U } + ); + + return img; + } +} \ No newline at end of file diff --git a/src/Html2OpenXml/Expressions/Numbering/ListExpression.cs b/src/Html2OpenXml/Expressions/Numbering/ListExpression.cs index 87d3a3e6..8bb2a369 100644 --- a/src/Html2OpenXml/Expressions/Numbering/ListExpression.cs +++ b/src/Html2OpenXml/Expressions/Numbering/ListExpression.cs @@ -78,6 +78,7 @@ public override IEnumerable Interpret(ParsingContext context) { var expression = new BlockElementExpression(liNode); var childElements = expression.Interpret(context); + if 
(!childElements.Any()) continue; Paragraph p = (Paragraph) childElements.First(); p.ParagraphProperties ??= new(); diff --git a/src/Html2OpenXml/HtmlConverter.cs b/src/Html2OpenXml/HtmlConverter.cs index 5b64b31b..8531f7ed 100755 --- a/src/Html2OpenXml/HtmlConverter.cs +++ b/src/Html2OpenXml/HtmlConverter.cs @@ -28,8 +28,8 @@ namespace HtmlToOpenXml; public partial class HtmlConverter { private readonly MainDocumentPart mainPart; - /// Cache all the ImagePart processed to avoid downloading the same image. - private ImagePrefetcher? imagePrefetcher; + // Cache all the ImagePart processed to avoid downloading the same image + private IImageLoader? headerImageLoader, bodyImageLoader, footerImageLoader; private readonly WordDocumentStyle htmlStyles; private readonly IWebRequest webRequester; @@ -57,79 +57,135 @@ public HtmlConverter(MainDocumentPart mainPart, IWebRequest? webRequester = null } /// - /// Start the parse processing. + /// Parse some HTML content where the output is intended to be inserted in . /// /// The HTML content to parse /// Returns a list of parsed paragraph. public IList Parse(string html) { - return Parse(html, CancellationToken.None).ConfigureAwait(false).GetAwaiter().GetResult().ToList(); + bodyImageLoader ??= new ImagePrefetcher(mainPart, webRequester); + return ParseCoreAsync(html, mainPart, bodyImageLoader, + new ParallelOptions() { CancellationToken = CancellationToken.None }) + .ConfigureAwait(false).GetAwaiter().GetResult().ToList(); } /// - /// Start the parse processing. + /// Start the asynchronous parse processing where the output is intended to be inserted in . /// /// The HTML content to parse /// The cancellation token. /// Returns a list of parsed paragraph. 
+ [Obsolete("Use ParseAsync instead to respect naming convention")] + [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] public Task> Parse(string html, CancellationToken cancellationToken = default) { - return Parse(html, new ParallelOptions() { CancellationToken = cancellationToken }); + return ParseAsync(html, cancellationToken); } /// - /// Start the parse processing. Use this overload if you want to control the downloading of images. + /// Start the asynchronous parse processing where the output is intended to be inserted in . + /// + /// The HTML content to parse + /// The cancellation token. + /// Returns a list of parsed paragraph. + public Task> ParseAsync(string html, CancellationToken cancellationToken = default) + { + return ParseAsync(html, new ParallelOptions { CancellationToken = cancellationToken }); + } + + /// + /// Start the asynchronous parse processing where the output is intended to be inserted in . /// /// The HTML content to parse /// The configuration of parallelism while downloading the remote resources. /// Returns a list of parsed paragraph. - public async Task> Parse(string html, ParallelOptions parallelOptions) + public Task> ParseAsync(string html, ParallelOptions parallelOptions) { - if (string.IsNullOrWhiteSpace(html)) - return []; + bodyImageLoader ??= new ImagePrefetcher(mainPart, webRequester); - // ensure a body exists to avoid any errors when trying to access it - if (mainPart.Document == null) - new Document(new Body()).Save(mainPart); - else if (mainPart.Document.Body == null) - mainPart.Document.Body = new Body(); + return ParseCoreAsync(html, mainPart, bodyImageLoader, parallelOptions); + } - var browsingContext = BrowsingContext.New(); - var htmlDocument = await browsingContext.OpenAsync(req => req.Content(html), parallelOptions.CancellationToken); - if (htmlDocument == null) - return []; + /// + /// Parse asynchronously the Html and append the output into the Header of the document. 
+ /// + /// The HTML content to parse + /// Determines the page(s) on which the current header shall be displayed. + /// If omitted, the value is used. + /// The cancellation token. + /// + public async Task ParseHeader(string html, HeaderFooterValues? headerType = null, + CancellationToken cancellationToken = default) + { + headerType ??= HeaderFooterValues.Default; + var headerPart = ResolveHeaderFooterPart(headerType); - await PreloadImages(htmlDocument, parallelOptions).ConfigureAwait(false); + headerPart.Header ??= new(); + headerImageLoader ??= new ImagePrefetcher(headerPart, webRequester); - var parsingContext = new ParsingContext(this, mainPart); - var body = new Expressions.BodyExpression (htmlDocument.Body!); - var paragraphs = body.Interpret (parsingContext); - return paragraphs.Cast(); + var paragraphs = await ParseCoreAsync(html, headerPart, headerImageLoader, + new ParallelOptions() { CancellationToken = cancellationToken }, + htmlStyles.GetParagraphStyle(htmlStyles.DefaultStyles.HeaderStyle)); + + foreach (var p in paragraphs) + headerPart.Header.AddChild(p); } /// - /// Start the parse processing and append the converted paragraphs into the Body of the document. + /// Parse asynchronously the Html and append the output into the Footer of the document. /// /// The HTML content to parse + /// Determines the page(s) on which the current footer shall be displayed. + /// If omitted, the value is used. /// The cancellation token. - public async Task ParseHtml(string html, CancellationToken cancellationToken = default) + /// + public async Task ParseFooter(string html, HeaderFooterValues? footerType = null, + CancellationToken cancellationToken = default) { - // This method exists because we may ensure the SectionProperties remains the last element of the body. 
- // It's mandatory when dealing with page orientation + footerType ??= HeaderFooterValues.Default; + var footerPart = ResolveHeaderFooterPart(footerType); + + footerPart.Footer ??= new(); + footerImageLoader ??= new ImagePrefetcher(footerPart, webRequester); + + var paragraphs = await ParseCoreAsync(html, footerPart, footerImageLoader, + new ParallelOptions() { CancellationToken = cancellationToken }, + htmlStyles.GetParagraphStyle(htmlStyles.DefaultStyles.FooterStyle)); + + foreach (var p in paragraphs) + footerPart.Footer.AddChild(p); + } - var paragraphs = await Parse(html, cancellationToken); + /// + /// Parse asynchronously the Html and append the output into the Body of the document. + /// + /// The HTML content to parse + /// The cancellation token. + /// + public async Task ParseBody(string html, CancellationToken cancellationToken = default) + { + bodyImageLoader ??= new ImagePrefetcher(mainPart, webRequester); + var paragraphs = await ParseCoreAsync(html, mainPart, bodyImageLoader, + new ParallelOptions() { CancellationToken = cancellationToken }); - Body body = mainPart.Document.Body!; + if (!paragraphs.Any()) + return; + + Body body = mainPart.Document!.Body!; SectionProperties? sectionProperties = body.GetLastChild(); foreach (var para in paragraphs) body.Append(para); - // move the paragraph with BookmarkStart `_GoBack` as the last child + // we automatically create the _top bookmark if missing. To avoid having an empty paragraph, + // let's try to merge with its next paragraph. 
var p = body.GetFirstChild(); - if (p != null && p.GetFirstChild()?.Id == "_GoBack") + if (p != null && p.GetFirstChild()?.Name == "_top" + && !p.HasChild() + && p.NextSibling() is Paragraph nextPara) { + nextPara.PrependChild(p.GetFirstChild()?.CloneNode(false)); + nextPara.PrependChild(p.GetFirstChild()!.CloneNode(false)); p.Remove(); - body.Append(p); } // Push the sectionProperties as the last element of the Body @@ -141,6 +197,33 @@ public async Task ParseHtml(string html, CancellationToken cancellationToken = d } } + /// + /// Start the asynchronous parse processing. Use this overload if you want to control the downloading of images. + /// + /// The HTML content to parse + /// The configuration of parallelism while downloading the remote resources. + /// Returns a list of parsed paragraphs. + [Obsolete("Use ParseAsync instead to respect naming convention")] + [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] + public Task> Parse(string html, ParallelOptions parallelOptions) + { + bodyImageLoader ??= new ImagePrefetcher(mainPart, webRequester); + + return ParseCoreAsync(html, mainPart, bodyImageLoader, parallelOptions); + } + + /// + /// Start the asynchronous parse processing and append the output into the Body of the document. + /// + /// The HTML content to parse + /// The cancellation token. + [Obsolete("Use ParseBody instead for output clarification")] + [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] + public Task ParseHtml(string html, CancellationToken cancellationToken = default) + { + return ParseBody(html, cancellationToken); + } + /// /// Refresh the cache of styles presents in the document. /// @@ -149,10 +232,53 @@ public void RefreshStyles() htmlStyles.PrepareStyles(mainPart); } + /// + /// Start the asynchronous parse processing. Use this overload if you want to control the downloading of images. + /// + /// The HTML content to parse + /// The OpenXml container where the content will be inserted into. 
+ /// The image resolver service linked to the . + /// The configuration of parallelism while downloading the remote resources. + /// The default OpenXml style to apply on paragraphs. + /// Returns a list of parsed paragraph. + private async Task> ParseCoreAsync(string html, + OpenXmlPartContainer hostingPart, IImageLoader imageLoader, + ParallelOptions parallelOptions, + ParagraphStyleId? defaultParagraphStyleId = null) + { + if (string.IsNullOrWhiteSpace(html)) + return []; + + var browsingContext = BrowsingContext.New(); + var htmlDocument = await browsingContext.OpenAsync(req => req.Content(html), parallelOptions.CancellationToken).ConfigureAwait(false); + if (htmlDocument == null) + return []; + + if (mainPart.Document == null) + new Document(new Body()).Save(mainPart); + else if (mainPart.Document.Body == null) + mainPart.Document.Body = new Body(); + + await PreloadImages(htmlDocument, imageLoader, parallelOptions).ConfigureAwait(false); + + Expressions.HtmlDomExpression expression; + if (hostingPart is MainDocumentPart) + expression = new Expressions.BodyExpression(htmlDocument.Body!); + else if (defaultParagraphStyleId?.Val?.HasValue == true) + expression = new Expressions.BlockElementExpression(htmlDocument.Body!, defaultParagraphStyleId); + else + expression = new Expressions.BlockElementExpression(htmlDocument.Body!); + + var parsingContext = new ParsingContext(this, hostingPart, imageLoader); + var paragraphs = expression.Interpret(parsingContext); + return paragraphs.Cast(); + } + /// /// Walk through all the img tags and preload all the remote images. 
/// - private async Task PreloadImages(AngleSharp.Dom.IDocument htmlDocument, ParallelOptions parallelOptions) + private async Task PreloadImages(AngleSharp.Dom.IDocument htmlDocument, + IImageLoader imageLoader, ParallelOptions parallelOptions) { var imageUris = htmlDocument.QuerySelectorAll("img[src]") .Cast() @@ -162,10 +288,50 @@ private async Task PreloadImages(AngleSharp.Dom.IDocument htmlDocument, Parallel return; await imageUris.ForEachAsync( - async (img, cts) => await ImagePrefetcher.Download(img, cts), + async (img, cts) => await imageLoader.Download(img, cts), parallelOptions).ConfigureAwait(false); } + /// + /// Create or resolve the header/footer related to the type. + /// + private TPart ResolveHeaderFooterPart(HeaderFooterValues? type) + where TPart: OpenXmlPart, IFixedContentTypePart + where TRefType: HeaderFooterReferenceType, new() + { + bool wasRefSet = false; + TPart? part = null; + + var sectionProps = mainPart.Document.Body!.Elements(); + if (!sectionProps.Any()) + { + sectionProps = [new SectionProperties()]; + mainPart.Document.Body!.AddChild(sectionProps.First()); + } + else + { + var reference = sectionProps.SelectMany(sectPr => sectPr.Elements()) + .Where(r => r.Id?.HasValue == true) + .FirstOrDefault(r => r.Type?.Value == type); + + if (reference != null) + part = (TPart) mainPart.GetPartById(reference.Id!); + wasRefSet = part is not null; + } + + part ??= mainPart.AddNewPart(); + + if (!wasRefSet) + { + sectionProps.First().PrependChild(new TRefType() { + Id = mainPart.GetIdOfPart(part), + Type = type + }); + } + + return part; + } + //____________________________________________________________________ // // Configuration @@ -216,10 +382,10 @@ public WordDocumentStyle HtmlStyles public bool ContinueNumbering { get; set; } = true; /// - /// Resolve a remote or inline image resource. + /// Gets the mainDocumentPart of the destination OpenXml document. 
/// - internal ImagePrefetcher ImagePrefetcher + internal MainDocumentPart MainPart { - get => imagePrefetcher ??= new ImagePrefetcher(mainPart, webRequester); + get => mainPart; } } diff --git a/src/Html2OpenXml/HtmlToOpenXml.csproj b/src/Html2OpenXml/HtmlToOpenXml.csproj index 34e3b321..b9475508 100644 --- a/src/Html2OpenXml/HtmlToOpenXml.csproj +++ b/src/Html2OpenXml/HtmlToOpenXml.csproj @@ -9,13 +9,13 @@ HtmlToOpenXml HtmlToOpenXml HtmlToOpenXml.dll - 3.1.1 + 3.2.0 icon.png Copyright 2009-$([System.DateTime]::Now.Year) Olivier Nizet See changelog https://github.com/onizet/html2openxml/blob/master/CHANGELOG.md README.md office openxml netcore html - 3.1.1 + 3.2.0 MIT https://github.com/onizet/html2openxml https://github.com/onizet/html2openxml @@ -45,11 +45,15 @@ - + + + $(DefineConstants);DEBUG + false + true snupkg diff --git a/src/Html2OpenXml/IO/DefaultWebRequest.cs b/src/Html2OpenXml/IO/DefaultWebRequest.cs index 2c85335c..9414a02b 100644 --- a/src/Html2OpenXml/IO/DefaultWebRequest.cs +++ b/src/Html2OpenXml/IO/DefaultWebRequest.cs @@ -63,7 +63,17 @@ public DefaultWebRequest(HttpClient httpClient, ILogger? logger = null) requestUri = UrlCombine(BaseImageUrl, requestUri.OriginalString); } - if (requestUri.IsFile) + bool isLocalFile; + try + { + isLocalFile = requestUri.IsFile; + } + catch (InvalidOperationException) + { + isLocalFile = false; + } + + if (isLocalFile) { return DownloadLocalFile(requestUri, cancellationToken); } @@ -108,6 +118,9 @@ public DefaultWebRequest(HttpClient httpClient, ILogger? 
logger = null) { logger?.LogDebug("Downloading remote file: {0}", requestUri); + if (httpClient.BaseAddress is null && !requestUri.IsAbsoluteUri) + return null; + var response = await httpClient.GetAsync(requestUri, cancellationToken).ConfigureAwait(false); if (response == null) return null; resource.StatusCode = response.StatusCode; diff --git a/src/Html2OpenXml/IO/ImageHeader.cs b/src/Html2OpenXml/IO/ImageHeader.cs index 762c9c90..858a923d 100755 --- a/src/Html2OpenXml/IO/ImageHeader.cs +++ b/src/Html2OpenXml/IO/ImageHeader.cs @@ -18,6 +18,7 @@ using System.IO; using System.Linq; using System.Text; +using System.Xml.XPath; namespace HtmlToOpenXml.IO; @@ -29,7 +30,7 @@ public static class ImageHeader // https://en.wikipedia.org/wiki/List_of_file_signatures #pragma warning disable CS1591 // Missing XML comment for publicly visible type or member - public enum FileType { Unrecognized, Bitmap, Gif, Png, Jpeg, Emf } + public enum FileType { Unrecognized, Bitmap, Gif, Png, Jpeg, Emf, Xml } #pragma warning restore CS1591 // Missing XML comment for publicly visible type or member private static readonly byte[] pngSignatureBytes = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]; @@ -41,7 +42,8 @@ public enum FileType { Unrecognized, Bitmap, Gif, Png, Jpeg, Emf } { Encoding.UTF8.GetBytes("GIF89a"), FileType.Gif }, // animated gif { pngSignatureBytes, FileType.Png }, { new byte[] { 0xff, 0xd8 }, FileType.Jpeg }, - { new byte[] { 0x1, 0, 0, 0 }, FileType.Emf } + { new byte[] { 0x1, 0, 0, 0 }, FileType.Emf }, + { Encoding.UTF8.GetBytes(" - /// Download and provison the metadata of a requested image. + /// Download the remote or local image located at the specified url. /// - sealed class ImagePrefetcher + Task Download(string imageUri, CancellationToken cancellationToken); +} + +/// +/// Download and provison the metadata of a requested image. 
+/// +sealed class ImagePrefetcher : IImageLoader + where T: OpenXmlPartContainer, ISupportedRelationship +{ + // Map extension to PartTypeInfo + private static readonly Dictionary knownExtensions = new(StringComparer.OrdinalIgnoreCase) { + { ".gif", ImagePartType.Gif }, + { ".bmp", ImagePartType.Bmp }, + { ".emf", ImagePartType.Emf }, + { ".ico", ImagePartType.Icon }, + { ".jp2", ImagePartType.Jp2 }, + { ".jpeg", ImagePartType.Jpeg }, + { ".jpg", ImagePartType.Jpeg }, + { ".jpe", ImagePartType.Jpeg }, + { ".pcx", ImagePartType.Pcx }, + { ".png", ImagePartType.Png }, + { ".svg", ImagePartType.Svg }, + { ".tif", ImagePartType.Tif }, + { ".tiff", ImagePartType.Tiff }, + { ".wmf", ImagePartType.Wmf } + }; + private readonly T hostingPart; + private readonly IWebRequest resourceLoader; + private readonly HtmlImageInfoCollection prefetchedImages; + + + /// + /// Constructor. + /// + /// The image will be linked to that hosting part. + /// Images are not shared between header, footer and body. + /// Service to resolve an image. 
+ public ImagePrefetcher(T hostingPart, IWebRequest resourceLoader) { - // Map extension to PartTypeInfo - private static readonly Dictionary knownExtensions = new(StringComparer.OrdinalIgnoreCase) { - { ".gif", ImagePartType.Gif }, - { ".bmp", ImagePartType.Bmp }, - { ".emf", ImagePartType.Emf }, - { ".ico", ImagePartType.Icon }, - { ".jp2", ImagePartType.Jp2 }, - { ".jpeg", ImagePartType.Jpeg }, - { ".jpg", ImagePartType.Jpeg }, - { ".jpe", ImagePartType.Jpeg }, - { ".pcx", ImagePartType.Pcx }, - { ".png", ImagePartType.Png }, - { ".svg", ImagePartType.Svg }, - { ".tif", ImagePartType.Tif }, - { ".tiff", ImagePartType.Tiff }, - { ".wmf", ImagePartType.Wmf } - }; - private readonly MainDocumentPart mainPart; - private readonly IWebRequest resourceLoader; - private readonly HtmlImageInfoCollection prefetchedImages; - - - public ImagePrefetcher(MainDocumentPart mainPart, IWebRequest resourceLoader) - { - this.mainPart = mainPart; - this.resourceLoader = resourceLoader; - this.prefetchedImages = new HtmlImageInfoCollection(); - } + this.hostingPart = hostingPart; + this.resourceLoader = resourceLoader; + this.prefetchedImages = new HtmlImageInfoCollection(); + } + + //____________________________________________________________________ + // + // Public Functionality - //____________________________________________________________________ - // - // Public Functionality + /// + /// Download the remote or local image located at the specified url. + /// + public async Task Download(string imageUri, CancellationToken cancellationToken) + { + if (prefetchedImages.Contains(imageUri)) + return prefetchedImages[imageUri]; - /// - /// Download the remote or local image located at the specified url. - /// - public async Task Download(string imageUri, CancellationToken cancellationToken) + HtmlImageInfo? 
iinfo; + if (DataUri.IsWellFormed(imageUri)) // data inline, encoded in base64 { - if (prefetchedImages.Contains(imageUri)) - return prefetchedImages[imageUri]; + iinfo = ReadDataUri(imageUri); + } + else + { + iinfo = await DownloadRemoteImage(imageUri, cancellationToken); + } - HtmlImageInfo? iinfo; - if (DataUri.IsWellFormed(imageUri)) // data inline, encoded in base64 - { - iinfo = ReadDataUri(imageUri); - } - else - { - iinfo = await DownloadRemoteImage(imageUri, cancellationToken); - } + if (iinfo != null) + prefetchedImages.Add(iinfo); - if (iinfo != null) - prefetchedImages.Add(iinfo); + return iinfo; + } - return iinfo; - } + /// + /// Download the image and try to find its format type. + /// + private async Task DownloadRemoteImage(string src, CancellationToken cancellationToken) + { + Uri imageUri = new Uri(src, UriKind.RelativeOrAbsolute); + if (imageUri.IsAbsoluteUri && !resourceLoader.SupportsProtocol(imageUri.Scheme)) + return null; - /// - /// Download the image and try to find its format type. - /// - private async Task DownloadRemoteImage(string src, CancellationToken cancellationToken) - { - Uri imageUri = new Uri(src, UriKind.RelativeOrAbsolute); - if (imageUri.IsAbsoluteUri && !resourceLoader.SupportsProtocol(imageUri.Scheme)) - return null; + Resource? response; - Resource? 
response; + response = await resourceLoader.FetchAsync(imageUri, cancellationToken).ConfigureAwait(false); + if (response?.Content == null) + return null; - response = await resourceLoader.FetchAsync(imageUri, cancellationToken).ConfigureAwait(false); - if (response?.Content == null) + using (response) + { + // For requested url with no filename, we need to read the media mime type if provided + response.Headers.TryGetValue("Content-Type", out var mime); + if (!TryInspectMimeType(mime, out PartTypeInfo type) + && !TryGuessTypeFromUri(imageUri, out type) + && !TryGuessTypeFromStream(response.Content, out type)) + { return null; + } - HtmlImageInfo info = new HtmlImageInfo(src); - using (response) + var ipart = hostingPart.AddImagePart(type); + Size originalSize; + using (var outputStream = ipart.GetStream(FileMode.Create)) { - // For requested url with no filename, we need to read the media mime type if provided - response.Headers.TryGetValue("Content-Type", out var mime); - if (!TryInspectMimeType(mime, out PartTypeInfo type) - && !TryGuessTypeFromUri(imageUri, out type) - && !TryGuessTypeFromStream(response.Content, out type)) - { - return null; - } - - var ipart = mainPart.AddImagePart(type); - using (var outputStream = ipart.GetStream(FileMode.Create)) - { - response.Content.CopyTo(outputStream); - - outputStream.Seek(0L, SeekOrigin.Begin); - info.Size = GetImageSize(outputStream); - } - - info.ImagePartId = mainPart.GetIdOfPart(ipart); - return info; + response.Content.CopyTo(outputStream); + + outputStream.Seek(0L, SeekOrigin.Begin); + originalSize = GetImageSize(outputStream); } + + return new HtmlImageInfo(src, hostingPart.GetIdOfPart(ipart)) { + TypeInfo = type, + Size = originalSize + }; } + } - /// - /// Parse the Data inline image. - /// - private HtmlImageInfo? ReadDataUri(string src) + /// + /// Parse the Data inline image. + /// + private HtmlImageInfo? 
ReadDataUri(string src) + { + if (DataUri.TryCreate(src, out var dataUri)) { - if (DataUri.TryCreate(src, out var dataUri)) + Size originalSize; + knownContentType.TryGetValue(dataUri!.Mime, out PartTypeInfo type); + var ipart = hostingPart.AddImagePart(type); + using (var outputStream = ipart.GetStream(FileMode.Create)) { - Size size; - knownContentType.TryGetValue(dataUri!.Mime, out PartTypeInfo type); - var ipart = mainPart.AddImagePart(type); - using (var outputStream = ipart.GetStream(FileMode.Create)) - { - outputStream.Write(dataUri.Data, 0, dataUri.Data.Length); - - outputStream.Seek(0L, SeekOrigin.Begin); - size = GetImageSize(outputStream); - } - - return new HtmlImageInfo(src) { - ImagePartId = mainPart.GetIdOfPart(ipart), - Size = size - }; + outputStream.Write(dataUri.Data, 0, dataUri.Data.Length); + + outputStream.Seek(0L, SeekOrigin.Begin); + originalSize = GetImageSize(outputStream); } - return null; + return new HtmlImageInfo(src, hostingPart.GetIdOfPart(ipart)) { + TypeInfo = type, + Size = originalSize + }; } - //____________________________________________________________________ - // - // Private Implementation - - // http://stackoverflow.com/questions/58510/using-net-how-can-you-find-the-mime-type-of-a-file-based-on-the-file-signature - private static readonly Dictionary knownContentType = new(StringComparer.OrdinalIgnoreCase) { - { "image/gif", ImagePartType.Gif }, - { "image/pjpeg", ImagePartType.Jpeg }, - { "image/jp2", ImagePartType.Jp2 }, - { "image/jpg", ImagePartType.Jpeg }, - { "image/jpeg", ImagePartType.Jpeg }, - { "image/x-png", ImagePartType.Png }, - { "image/png", ImagePartType.Png }, - { "image/tiff", ImagePartType.Tiff }, - { "image/vnd.microsoft.icon", ImagePartType.Icon }, - // these icons mime type are wrong but we should nevertheless take care (http://en.wikipedia.org/wiki/ICO_%28file_format%29#MIME_type) - { "image/x-icon", ImagePartType.Icon }, - { "image/icon", ImagePartType.Icon }, - { "image/ico", ImagePartType.Icon }, 
- { "text/ico", ImagePartType.Icon }, - { "text/application-ico", ImagePartType.Icon }, - { "image/bmp", ImagePartType.Bmp }, - { "image/svg+xml", ImagePartType.Svg }, - }; - - /// - /// Inspect the response headers of a web request and decode the mime type if provided - /// - /// Returns the extension of the image if provideds. - private static bool TryInspectMimeType(string? contentType, out PartTypeInfo type) - { - // can be null when the protocol used doesn't allow response headers - if (contentType != null && - knownContentType.TryGetValue(contentType, out type)) - return true; + return null; + } - type = default; - return false; - } + //____________________________________________________________________ + // + // Private Implementation - /// - /// Gets the OpenXml PartTypeInfo associated to an image. - /// - private static bool TryGuessTypeFromUri(Uri uri, out PartTypeInfo type) - { - string extension = Path.GetExtension(uri.IsAbsoluteUri ? uri.Segments[uri.Segments.Length - 1] : uri.OriginalString); - if (knownExtensions.TryGetValue(extension, out type)) return true; + // http://stackoverflow.com/questions/58510/using-net-how-can-you-find-the-mime-type-of-a-file-based-on-the-file-signature + private static readonly Dictionary knownContentType = new(StringComparer.OrdinalIgnoreCase) { + { "image/gif", ImagePartType.Gif }, + { "image/pjpeg", ImagePartType.Jpeg }, + { "image/jp2", ImagePartType.Jp2 }, + { "image/jpg", ImagePartType.Jpeg }, + { "image/jpeg", ImagePartType.Jpeg }, + { "image/x-png", ImagePartType.Png }, + { "image/png", ImagePartType.Png }, + { "image/tiff", ImagePartType.Tiff }, + { "image/vnd.microsoft.icon", ImagePartType.Icon }, + // these icons mime type are wrong but we should nevertheless take care (http://en.wikipedia.org/wiki/ICO_%28file_format%29#MIME_type) + { "image/x-icon", ImagePartType.Icon }, + { "image/icon", ImagePartType.Icon }, + { "image/ico", ImagePartType.Icon }, + { "text/ico", ImagePartType.Icon }, + { 
+ "text/application-ico", ImagePartType.Icon }, + { "image/bmp", ImagePartType.Bmp }, + { "image/svg+xml", ImagePartType.Svg }, + }; - // extension not recognized, try with checking the query string. Expecting to resolve something like: - // ./image.axd?picture=img1.jpg - extension = Path.GetExtension(uri.IsAbsoluteUri ? uri.AbsoluteUri : uri.ToString()); - if (knownExtensions.TryGetValue(extension, out type)) return true; + /// + /// Inspect the response headers of a web request and decode the mime type if provided + /// + /// Returns the extension of the image if provided. + private static bool TryInspectMimeType(string? contentType, out PartTypeInfo type) + { + // can be null when the protocol used doesn't allow response headers + if (contentType != null && + knownContentType.TryGetValue(contentType, out type)) + return true; - return false; - } + type = default; + return false; + } - /// - /// Gets the OpenXml PartTypeInfo associated to an image. - /// - private static bool TryGuessTypeFromStream(Stream stream, out PartTypeInfo type) + /// + /// Gets the OpenXml PartTypeInfo associated to an image. + /// + private static bool TryGuessTypeFromUri(Uri uri, out PartTypeInfo type) + { + string extension = Path.GetExtension(uri.IsAbsoluteUri ? uri.Segments[uri.Segments.Length - 1] : uri.OriginalString); + if (knownExtensions.TryGetValue(extension, out type)) return true; + + // extension not recognized, try with checking the query string. Expecting to resolve something like: + // ./image.axd?picture=img1.jpg + extension = Path.GetExtension(uri.IsAbsoluteUri ? uri.AbsoluteUri : uri.ToString()); + if (knownExtensions.TryGetValue(extension, out type)) return true; + + return false; + } + + /// + /// Gets the OpenXml PartTypeInfo associated to an image. 
+ /// + private static bool TryGuessTypeFromStream(Stream stream, out PartTypeInfo type) + { + if (ImageHeader.TryDetectFileType(stream, out ImageHeader.FileType guessType)) { - if (ImageHeader.TryDetectFileType(stream, out ImageHeader.FileType guessType)) + switch (guessType) { - switch (guessType) - { - case ImageHeader.FileType.Bitmap: type = ImagePartType.Bmp; return true; - case ImageHeader.FileType.Emf: type = ImagePartType.Emf; return true; - case ImageHeader.FileType.Gif: type = ImagePartType.Gif; return true; - case ImageHeader.FileType.Jpeg: type = ImagePartType.Jpeg; return true; - case ImageHeader.FileType.Png: type = ImagePartType.Png; return true; - } + case ImageHeader.FileType.Bitmap: type = ImagePartType.Bmp; return true; + case ImageHeader.FileType.Emf: type = ImagePartType.Emf; return true; + case ImageHeader.FileType.Gif: type = ImagePartType.Gif; return true; + case ImageHeader.FileType.Jpeg: type = ImagePartType.Jpeg; return true; + case ImageHeader.FileType.Png: type = ImagePartType.Png; return true; } - type = ImagePartType.Bmp; - return false; } + type = ImagePartType.Bmp; + return false; + } - /// - /// Loads an image from a stream and grab its size. - /// - private static Size GetImageSize(Stream imageStream) + /// + /// Loads an image from a stream and grab its size. 
+ /// + private static Size GetImageSize(Stream imageStream) + { + // Read only the size of the image + try { - // Read only the size of the image - try - { - return ImageHeader.GetDimensions(imageStream); - } - catch (ArgumentException) - { - return Size.Empty; - } + return ImageHeader.GetDimensions(imageStream); + } + catch (ArgumentException) + { + return Size.Empty; } } -} \ No newline at end of file +} diff --git a/src/Html2OpenXml/ParsingContext.cs b/src/Html2OpenXml/ParsingContext.cs index a26bc9f4..18dd7657 100644 --- a/src/Html2OpenXml/ParsingContext.cs +++ b/src/Html2OpenXml/ParsingContext.cs @@ -20,14 +20,19 @@ namespace HtmlToOpenXml; /// Contains information that is global to the parsing. /// /// The list of paragraphs that will be returned. -sealed class ParsingContext(HtmlConverter converter, MainDocumentPart mainPart) +sealed class ParsingContext(HtmlConverter converter, OpenXmlPartContainer hostingPart, IO.IImageLoader imageLoader) { /// Shorthand for .HtmlStyles public WordDocumentStyle DocumentStyle { get => Converter.HtmlStyles; } public HtmlConverter Converter { get; } = converter; - public MainDocumentPart MainPart { get; } = mainPart; + public MainDocumentPart MainPart { get; } = converter.MainPart; + + public OpenXmlPartContainer HostingPart { get; } = hostingPart; + + public IO.IImageLoader ImageLoader { get; } = imageLoader; + private HtmlElementExpression? parentExpression; private ParsingContext? 
parentContext; @@ -49,7 +54,7 @@ public void CascadeStyles (OpenXmlElement element) public ParsingContext CreateChild(HtmlElementExpression expression) { - var childContext = new ParsingContext(Converter, MainPart) { + var childContext = new ParsingContext(Converter, HostingPart, ImageLoader) { propertyBag = propertyBag, parentExpression = expression, parentContext = this diff --git a/src/Html2OpenXml/PredefinedStyles.cs b/src/Html2OpenXml/PredefinedStyles.cs index 4e0d5356..a7cf2a5a 100755 --- a/src/Html2OpenXml/PredefinedStyles.cs +++ b/src/Html2OpenXml/PredefinedStyles.cs @@ -22,6 +22,9 @@ internal class PredefinedStyles public const string Quote = "Quote"; public const string QuoteChar = "QuoteChar"; public const string TableGrid = "TableGrid"; + public const string Header = "Header"; + public const string Footer = "Footer"; + /// diff --git a/src/Html2OpenXml/Primitives/DefaultStyles.cs b/src/Html2OpenXml/Primitives/DefaultStyles.cs index ca57ac29..44d2a1ce 100644 --- a/src/Html2OpenXml/Primitives/DefaultStyles.cs +++ b/src/Html2OpenXml/Primitives/DefaultStyles.cs @@ -67,7 +67,7 @@ public class DefaultStyles public string ListParagraphStyle { get; set; } = PredefinedStyles.ListParagraph; /// - /// Default style for the <pre> table + /// Default style for the pre table /// /// TableGrid public string PreTableStyle { get; set; } = PredefinedStyles.TableGrid; @@ -89,4 +89,16 @@ public class DefaultStyles /// /// TableGrid public string TableStyle { get; set; } = PredefinedStyles.TableGrid; + + /// + /// Default style for header paragraphs. + /// + /// Header + public string HeaderStyle { get; set; } = PredefinedStyles.Header; + + /// + /// Default style for footer paragraphs. 
+ /// + /// Footer + public string FooterStyle { get; set; } = PredefinedStyles.Footer; } \ No newline at end of file diff --git a/src/Html2OpenXml/Primitives/HtmlImageInfo.cs b/src/Html2OpenXml/Primitives/HtmlImageInfo.cs index 7ecda1f6..4ffca088 100755 --- a/src/Html2OpenXml/Primitives/HtmlImageInfo.cs +++ b/src/Html2OpenXml/Primitives/HtmlImageInfo.cs @@ -16,7 +16,7 @@ namespace HtmlToOpenXml; /// /// Represents an image and its metadata. /// -sealed class HtmlImageInfo(string source) +sealed class HtmlImageInfo(string source, string partId) { /// /// The URI identifying this cached image information. @@ -26,12 +26,17 @@ sealed class HtmlImageInfo(string source) /// /// The Unique identifier of the ImagePart in the . /// - public string? ImagePartId { get; set; } + public string ImagePartId { get; set; } = partId; /// - /// Gets or sets the size of the image + /// Gets or sets the original size of the image. /// public Size Size { get; set; } + + /// + /// Gets the content type of the image. 
+ /// + public PartTypeInfo TypeInfo { get; set; } } /// diff --git a/src/Html2OpenXml/Primitives/Unit.cs b/src/Html2OpenXml/Primitives/Unit.cs index e7370c1b..938df509 100755 --- a/src/Html2OpenXml/Primitives/Unit.cs +++ b/src/Html2OpenXml/Primitives/Unit.cs @@ -111,7 +111,7 @@ private static long ComputeInEmus(UnitMetric type, double value) case UnitMetric.Emus: return (long) value; case UnitMetric.Inch: return (long) (value * 914400L); case UnitMetric.Centimeter: return (long) (value * 360000L); - case UnitMetric.Millimeter: return (long) (value * 3600000L); + case UnitMetric.Millimeter: return (long) (value * 36000L); case UnitMetric.EM: // well this is a rough conversion but considering 1em = 12pt (http://sureshjain.wordpress.com/2007/07/06/53/) return (long) (value / 72 * 914400L * 12); diff --git a/src/Html2OpenXml/Utilities/AngleSharpExtensions.cs b/src/Html2OpenXml/Utilities/AngleSharpExtensions.cs index b7c38b90..3d896241 100644 --- a/src/Html2OpenXml/Utilities/AngleSharpExtensions.cs +++ b/src/Html2OpenXml/Utilities/AngleSharpExtensions.cs @@ -39,6 +39,17 @@ public static HtmlAttributeCollection GetStyles(this IElement element) return null; } + /// + /// Gets whether the anchor is redirect to the `top` of the document. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsTopAnchor(this IHtmlAnchorElement element) + { + if (element.Hash.Length <= 1) return false; + return "#top".Equals(element.Hash, StringComparison.OrdinalIgnoreCase) + || "#_top".Equals(element.Hash, StringComparison.OrdinalIgnoreCase); + } + /// /// Gets whether the given child is preceded by any list element (ol or ul). 
/// diff --git a/src/Html2OpenXml/Utilities/OpenXmlExtensions.cs b/src/Html2OpenXml/Utilities/OpenXmlExtensions.cs index e79da7c5..5242f49e 100755 --- a/src/Html2OpenXml/Utilities/OpenXmlExtensions.cs +++ b/src/Html2OpenXml/Utilities/OpenXmlExtensions.cs @@ -42,15 +42,6 @@ public static bool HasChild(this OpenXmlElement element) where T : OpenXmlEle return null; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void InsertInDocProperties(this Drawing d, params OpenXmlElement[] newChildren) - { - d.Inline ??= new Inline(); - if (d.Inline.DocProperties == null) d.Inline.DocProperties = new DocProperties(); - - d.Inline.DocProperties.Append(newChildren); - } - public static bool Compare(this PageSize pageSize, PageOrientationValues orientation) { PageOrientationValues pageOrientation; diff --git a/test/HtmlToOpenXml.Tests/AbbrTests.cs b/test/HtmlToOpenXml.Tests/AbbrTests.cs index f3063ba0..2f65ad1f 100644 --- a/test/HtmlToOpenXml.Tests/AbbrTests.cs +++ b/test/HtmlToOpenXml.Tests/AbbrTests.cs @@ -101,7 +101,7 @@ public void Empty_ShouldBeIgnored() Assert.That(elements, Is.Empty); } - [TestCase("Placeholder")] + [TestCase("Placeholder")] [TestCase("Placeholder")] [TestCase("
Placeholder
")] public void WithNoDescription_ReturnsSimpleParagraph(string html) @@ -110,6 +110,8 @@ public void WithNoDescription_ReturnsSimpleParagraph(string html) var elements = converter.Parse(html); Assert.That(elements, Has.Count.EqualTo(1)); Assert.That(elements, Is.All.TypeOf()); + Assert.That(mainPart.FootnotesPart, Is.Null); + Assert.That(mainPart.EndnotesPart, Is.Null); } [TestCase("HTML", AcronymPosition.DocumentEnd, Description = "Read existing endnotes references")] @@ -177,5 +179,41 @@ public void InsideParagraph_ReturnsMultipleRuns() Assert.That(elements[0].Elements().Any(r => r.HasChild()), Is.True); }); } + + [TestCase("NASA")] + [TestCase("
NASA
")] + public async Task ParseIntoHeader_ReturnsSimpleParagraph(string html) + { + await converter.ParseHeader(html); + var header = mainPart.HeaderParts?.FirstOrDefault()?.Header; + Assert.That(header, Is.Not.Null); + Assert.That(header.ChildElements, Has.Count.EqualTo(1)); + Assert.Multiple(() => + { + Assert.That(header.ChildElements, Is.All.TypeOf()); + Assert.That(header.FirstChild!.InnerText, Is.EqualTo("NASA")); + Assert.That(mainPart.FootnotesPart, Is.Null); + Assert.That(mainPart.EndnotesPart, Is.Null); + AssertThatOpenXmlDocumentIsValid(); + }); + } + + [TestCase("NASA")] + [TestCase("
NASA
")] + public async Task ParseIntoFooter_ShouldBeIgnored(string html) + { + await converter.ParseFooter(html); + var footer = mainPart.FooterParts?.FirstOrDefault()?.Footer; + Assert.That(footer, Is.Not.Null); + Assert.That(footer.ChildElements, Has.Count.EqualTo(1)); + Assert.Multiple(() => + { + Assert.That(footer.ChildElements, Is.All.TypeOf()); + Assert.That(footer.FirstChild!.InnerText, Is.EqualTo("NASA")); + Assert.That(mainPart.FootnotesPart, Is.Null); + Assert.That(mainPart.EndnotesPart, Is.Null); + AssertThatOpenXmlDocumentIsValid(); + }); + } } } \ No newline at end of file diff --git a/test/HtmlToOpenXml.Tests/HeaderFooterTests.cs b/test/HtmlToOpenXml.Tests/HeaderFooterTests.cs new file mode 100644 index 00000000..1f399e6b --- /dev/null +++ b/test/HtmlToOpenXml.Tests/HeaderFooterTests.cs @@ -0,0 +1,113 @@ +using NUnit.Framework; +using DocumentFormat.OpenXml.Wordprocessing; +using DocumentFormat.OpenXml.Packaging; + +namespace HtmlToOpenXml.Tests +{ + /// + /// Tests on ParseHeader and ParseFooter methods. + /// + [TestFixture] + public class HeaderFooterTests : HtmlConverterTestBase + { + [Test] + public async Task Header_ReturnsHeaderPartLinkedToBody() + { + await converter.ParseHeader("

Header content

", HeaderFooterValues.First); + + var headerPart = mainPart.HeaderParts?.FirstOrDefault(); + Assert.That(headerPart, Is.Not.Null); + Assert.That(headerPart.Header, Is.Not.Null); + var p = headerPart.Header.Elements(); + Assert.That(p, Is.Not.Empty); + Assert.That(p.Select(p => p.ParagraphProperties?.ParagraphStyleId?.Val?.Value), + Has.All.EqualTo(converter.HtmlStyles.DefaultStyles.HeaderStyle)); + + var sectionProperties = mainPart.Document.Body!.Elements(); + Assert.That(sectionProperties, Is.Not.Empty); + Assert.That(sectionProperties.SelectMany(s => s.Elements()) + .Any(r => r.Type?.Value == HeaderFooterValues.First), Is.True); + AssertThatOpenXmlDocumentIsValid(); + } + + [Test] + public async Task Footer_ReturnsFooterPartLinkedToBody() + { + await converter.ParseFooter("

Footer content

"); + + var footerPart = mainPart.FooterParts?.FirstOrDefault(); + Assert.That(footerPart, Is.Not.Null); + Assert.That(footerPart.Footer, Is.Not.Null); + + var sectionProperties = mainPart.Document.Body!.Elements(); + Assert.That(sectionProperties, Is.Not.Empty); + Assert.That(sectionProperties.Any(s => s.HasChild()), Is.True); + AssertThatOpenXmlDocumentIsValid(); + } + + [Test(Description = "Overwrite existing Default header")] + public async Task WithExistingHeader_Default_ReturnsOverridenHeaderPart() + { + using var generatedDocument = new MemoryStream(); + using (var buffer = ResourceHelper.GetStream("Resources.DocWithImgHeaderFooter.docx")) + buffer.CopyTo(generatedDocument); + + generatedDocument.Position = 0L; + using WordprocessingDocument package = WordprocessingDocument.Open(generatedDocument, true); + MainDocumentPart mainPart = package.MainDocumentPart!; + + var sectionProperties = mainPart.Document.Body!.Elements(); + Assert.That(sectionProperties, Is.Not.Empty); + var headerRefs = sectionProperties.SelectMany(s => s.Elements()); + Assert.Multiple(() => + { + Assert.That(headerRefs.Count(), Is.EqualTo(1)); + Assert.That(headerRefs.Count(r => r.Type?.Value == HeaderFooterValues.Default), Is.EqualTo(1), "Default header exist"); + }); + + HtmlConverter converter = new(mainPart); + await converter.ParseHeader("Header content"); + + sectionProperties = mainPart.Document.Body!.Elements(); + Assert.That(sectionProperties, Is.Not.Empty); + Assert.That(sectionProperties.SelectMany(s => s.Elements()) + .Count(r => r.Type?.Value == HeaderFooterValues.Default), Is.EqualTo(1)); + AssertThatOpenXmlDocumentIsValid(); + } + + [Test(Description = "Create additional header for even pages")] + public async Task WithExistingHeader_Even_ReturnsAnotherHeaderPart() + { + using var generatedDocument = new MemoryStream(); + using (var buffer = ResourceHelper.GetStream("Resources.DocWithImgHeaderFooter.docx")) + buffer.CopyTo(generatedDocument); + + generatedDocument.Position 
= 0L; + using WordprocessingDocument package = WordprocessingDocument.Open(generatedDocument, true); + MainDocumentPart mainPart = package.MainDocumentPart!; + + var sectionProperties = mainPart.Document.Body!.Elements(); + Assert.That(sectionProperties, Is.Not.Empty); + var headerRefs = sectionProperties.SelectMany(s => s.Elements()); + Assert.Multiple(() => + { + Assert.That(headerRefs.Count(r => r.Type?.Value == HeaderFooterValues.Default), Is.EqualTo(1), "Default header exist"); + Assert.That(headerRefs.Count(r => r.Type?.Value == HeaderFooterValues.Even), Is.Zero, "No event header has been yet defined"); + }); + + HtmlConverter converter = new(mainPart); + await converter.ParseHeader("Header even content", HeaderFooterValues.Even); + + sectionProperties = mainPart.Document.Body!.Elements(); + Assert.That(sectionProperties, Is.Not.Empty); + Assert.That(sectionProperties.Count(s => s.HasChild()), Is.EqualTo(1)); + headerRefs = sectionProperties.SelectMany(s => s.Elements()); + Assert.Multiple(() => + { + Assert.That(headerRefs.Count(r => r.Type?.Value == HeaderFooterValues.Default), Is.EqualTo(1)); + Assert.That(headerRefs.Count(r => r.Type?.Value == HeaderFooterValues.Even), Is.EqualTo(1)); + }); + AssertThatOpenXmlDocumentIsValid(); + } + } +} \ No newline at end of file diff --git a/test/HtmlToOpenXml.Tests/HrTests.cs b/test/HtmlToOpenXml.Tests/HrTests.cs index 8df6af9c..b7fdc1d5 100644 --- a/test/HtmlToOpenXml.Tests/HrTests.cs +++ b/test/HtmlToOpenXml.Tests/HrTests.cs @@ -17,13 +17,25 @@ public void Standalone_ReturnsWithNoSpacing () AssertIsHr(elements[0], false); } - [Test(Description = "should not generate a particular spacing because border-bottom is empty")] + [Test(Description = "Should not generate a particular spacing because border-bottom is empty")] public void AfterBorderlessContent_ReturnsWithNoSpacing () { var elements = converter.Parse("

Before


"); AssertIsHr(elements[1], false); } + [Test(Description = "User can provide his own stylised horizontal separator")] + public void Bordered_ReturnsWithStylisedBorder () + { + var elements = converter.Parse("
"); + AssertIsHr(elements[0], false); + var borders = elements[0].GetFirstChild()?.ParagraphBorders; + Assert.That(borders, Is.Not.Null); + Assert.That(borders.TopBorder?.Val?.Value, Is.EqualTo(BorderValues.Dotted)); + Assert.That(borders.TopBorder?.Color?.Value, Is.EqualTo("FF0000")); + Assert.That(borders.TopBorder?.Size?.Value, Is.EqualTo(2)); + } + [TestCase("

Before


")] [TestCase("

Before


")] [TestCase("
Cell

")] diff --git a/test/HtmlToOpenXml.Tests/HtmlConverterTestBase.cs b/test/HtmlToOpenXml.Tests/HtmlConverterTestBase.cs index ca50f928..3e056798 100644 --- a/test/HtmlToOpenXml.Tests/HtmlConverterTestBase.cs +++ b/test/HtmlToOpenXml.Tests/HtmlConverterTestBase.cs @@ -1,5 +1,6 @@ using DocumentFormat.OpenXml; using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml.Validation; using DocumentFormat.OpenXml.Wordprocessing; using NUnit.Framework; @@ -36,5 +37,21 @@ public void Close () package?.Dispose(); generatedDocument?.Dispose(); } + + protected void AssertThatOpenXmlDocumentIsValid() + { + var validator = new OpenXmlValidator(FileFormatVersions.Office2021); + var errors = validator.Validate(package); + + if (!errors.GetEnumerator().MoveNext()) + return; + + foreach (ValidationErrorInfo error in errors) + { + TestContext.Error.Write("{0}\n\t{1}\n", error.Path?.XPath, error.Description); + } + + Assert.Fail("The document isn't conformant with Office 2021"); + } } } \ No newline at end of file diff --git a/test/HtmlToOpenXml.Tests/HtmlToOpenXml.Tests.csproj b/test/HtmlToOpenXml.Tests/HtmlToOpenXml.Tests.csproj index d612ff3c..e781a424 100755 --- a/test/HtmlToOpenXml.Tests/HtmlToOpenXml.Tests.csproj +++ b/test/HtmlToOpenXml.Tests/HtmlToOpenXml.Tests.csproj @@ -16,11 +16,11 @@ runtime; build; native; contentfiles; analyzers; buildtransitive all - - - + + + - + all runtime; build; native; contentfiles; analyzers diff --git a/test/HtmlToOpenXml.Tests/ImageFormats/ImageHeaderTests.cs b/test/HtmlToOpenXml.Tests/ImageFormats/ImageHeaderTests.cs index 8f8a14f6..8dabcd50 100644 --- a/test/HtmlToOpenXml.Tests/ImageFormats/ImageHeaderTests.cs +++ b/test/HtmlToOpenXml.Tests/ImageFormats/ImageHeaderTests.cs @@ -28,6 +28,7 @@ public void GuessFormat_ReturnsImageSize((string resourceName, Size expectedSize yield return ("Resources.html2openxml.emf", new Size(100, 100)); // animated gif: yield return ("Resources.stan.gif", new Size(252, 318)); + yield return 
("Resources.kiwi.svg", new Size(612, 502)); } /// @@ -53,6 +54,7 @@ public void PngSof2_ReturnsImageSize() [TestCase("Resources.html2openxml.gif", ExpectedResult = ImageHeader.FileType.Gif)] [TestCase("Resources.html2openxml.jpg", ExpectedResult = ImageHeader.FileType.Jpeg)] [TestCase("Resources.html2openxml.png", ExpectedResult = ImageHeader.FileType.Png)] + [TestCase("Resources.kiwi.svg", ExpectedResult = ImageHeader.FileType.Xml)] public ImageHeader.FileType GuessFormat_ReturnsFileType(string resourceName) { using var imageStream = ResourceHelper.GetStream(resourceName); diff --git a/test/HtmlToOpenXml.Tests/ImgTests.cs b/test/HtmlToOpenXml.Tests/ImgTests.cs index 72a23534..64d97ae6 100644 --- a/test/HtmlToOpenXml.Tests/ImgTests.cs +++ b/test/HtmlToOpenXml.Tests/ImgTests.cs @@ -15,12 +15,14 @@ namespace HtmlToOpenXml.Tests [TestFixture] public class ImgTests : HtmlConverterTestBase { - [Test] - public void AbsoluteUri_ReturnsDrawing_WithDownloadedData() + [TestCase("https://www.w3schools.com/tags/smiley.gif", "image/gif")] + [TestCase("https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/helloworld.svg", "image/svg+xml")] + public void AbsoluteUri_ReturnsDrawing_WithDownloadedData(string imageUri, string contentType) { - var elements = converter.Parse(@"Smiley face"); + var elements = converter.Parse(@$"Smiley face"); Assert.That(elements, Has.Count.EqualTo(1)); - AssertIsImg(elements[0]); + var (_, imagePart) = AssertIsImg(mainPart, elements[0]); + Assert.That(imagePart.ContentType, Is.EqualTo(contentType)); } [Test] @@ -28,14 +30,14 @@ public void DataUri_ReturnsDrawing_WithDecryptedData() { var elements = converter.Parse(@"Smiley face"); Assert.That(elements, Has.Count.EqualTo(1)); - AssertIsImg(elements[0]); + AssertIsImg(mainPart, elements[0]); } [Test] public void WithBorder_ReturnsRunWithBorder() { var elements = converter.Parse(@""); - AssertIsImg(elements[0]); + AssertIsImg(mainPart, elements[0]); var run = elements[0].GetFirstChild(); var 
runProperties = run?.GetFirstChild(); Assert.That(runProperties, Is.Not.Null); @@ -55,11 +57,11 @@ public void ManualProvisioning_ReturnsDrawing_WithProvidedData() var elements = converter.Parse(@"Smiley face"); Assert.That(elements, Has.Count.EqualTo(1)); - AssertIsImg(elements[0]); + AssertIsImg(mainPart, elements[0]); } - [TestCase("Smiley face", Description = "Empty image")] [TestCase("", Description = "Unsupported protocol")] + [TestCase("", Description = "Relative url without providing BaseImagerUri")] public void IgnoreImage_ShouldBeIgnored(string html) { var elements = converter.Parse(html); @@ -88,9 +90,20 @@ public async Task FileSystem_LocalImage_WithSpaceInName_ShouldSucceed() await resourceStream.CopyToAsync(fileStream); var localUri = "file:///" + filepath.TrimStart('/').Replace(" ", "%20"); - var elements = await converter.Parse($"", CancellationToken.None); + var elements = await converter.ParseAsync($""); Assert.That(elements.Count(), Is.EqualTo(1)); - AssertIsImg(elements.First()); + AssertIsImg(mainPart, elements.First()); + } + + [Test] + public void SvgNode_ReturnsImage() + { + var elements = converter.Parse(ResourceHelper.GetString("Resources.kiwi.svg")); + Assert.That(elements, Has.Count.EqualTo(1)); + var (drawing, imagePart) = AssertIsImg(mainPart, elements[0]); + Assert.That(drawing.Inline!.DocProperties?.Name?.Value, Is.EqualTo("Illustration of a Kiwi")); + Assert.That(drawing.Inline!.DocProperties?.Description?.Value, Is.EqualTo("Kiwi (/ˈkiːwiː/ KEE-wee)[4] are flightless birds endemic to New Zealand of the order Apterygiformes.")); + Assert.That(imagePart.ContentType, Is.EqualTo("image/svg+xml")); } [Test(Description = "Reading local file containing a space in the name")] @@ -99,9 +112,9 @@ public async Task RemoteImage_WithBaseUri_ShouldSucceed() converter = new HtmlConverter(mainPart, new IO.DefaultWebRequest() { BaseImageUrl = new Uri("http://github.com/onizet/html2openxml") }); - var elements = await converter.Parse($"", 
CancellationToken.None); + var elements = await converter.ParseAsync($""); Assert.That(elements, Is.Not.Empty); - AssertIsImg(elements.First()); + AssertIsImg(mainPart, elements.First()); } [Test(Description = "Image ID must be unique, amongst header, body and footer parts")] @@ -123,7 +136,7 @@ public async Task ImageIds_IsUniqueAcrossPackagingParts() Assert.That(beforeMaxDocPropId, Is.Not.Null); HtmlConverter converter = new(mainPart); - await converter.ParseHtml(""); + await converter.ParseBody(""); mainPart.Document.Save(); var img = mainPart.Document.Body!.Descendants().FirstOrDefault(); @@ -148,24 +161,63 @@ public void WithIncompleteHeader_ShouldNotThrow() where T : OpenXmlPart, IFix HtmlConverter converter = new(mainPart); Assert.DoesNotThrowAsync(async () => - await converter.ParseHtml("")); + await converter.ParseBody("")); } - private Drawing AssertIsImg (OpenXmlCompositeElement element) + [TestCase(typeof(HeaderPart))] + [TestCase(typeof(FooterPart))] + [TestCase(typeof(MainDocumentPart))] + public async Task ParseIntoDocumentPart_ReturnsImageParentedToPart (Type openXmlPartType) { - var run = element.GetFirstChild(); + string html = @"Smiley face"; + OpenXmlElement host; + OpenXmlPartContainer container; + + if (openXmlPartType == typeof(HeaderPart)) + { + await converter.ParseHeader(html); + container = mainPart.HeaderParts.First(); + host = mainPart.HeaderParts.First().Header; + } + else if (openXmlPartType == typeof(FooterPart)) + { + await converter.ParseFooter(html); + container = mainPart.FooterParts.First(); + host = mainPart.FooterParts.First().Footer; + } + else if (openXmlPartType == typeof(MainDocumentPart)) + { + await converter.ParseBody(html); + container = mainPart; + host = mainPart.Document.Body!; + } + else + { + throw new NotSupportedException($"Test case not supported for {openXmlPartType.FullName}"); + } + + Assert.That(host.ChildElements, Has.Count.EqualTo(1)); + var p = host.ChildElements.FirstOrDefault(c => c is Paragraph); + 
Assert.That(p, Is.Not.Null); + AssertIsImg(container, p); + AssertThatOpenXmlDocumentIsValid(); + } + + private static (Drawing, ImagePart) AssertIsImg (OpenXmlPartContainer container, OpenXmlElement paragraph) + { + var run = paragraph.GetFirstChild(); Assert.That(run, Is.Not.Null); - var img = run.GetFirstChild(); - Assert.That(img, Is.Not.Null); - Assert.That(img.Inline?.Graphic?.GraphicData, Is.Not.Null); - var pic = img.Inline.Graphic.GraphicData.GetFirstChild(); + var drawing = run.GetFirstChild(); + Assert.That(drawing, Is.Not.Null); + Assert.That(drawing.Inline?.Graphic?.GraphicData, Is.Not.Null); + var pic = drawing.Inline.Graphic.GraphicData.GetFirstChild(); Assert.That(pic?.BlipFill?.Blip?.Embed, Is.Not.Null); var imagePartId = pic.BlipFill.Blip.Embed.Value; Assert.That(imagePartId, Is.Not.Null); - var part = mainPart.GetPartById(imagePartId); - Assert.That(part, Is.TypeOf(typeof(ImagePart))); - return img; + var imagePart = container.GetPartById(imagePartId); + Assert.That(imagePart, Is.TypeOf(typeof(ImagePart))); + return (drawing, (ImagePart) imagePart); } } } \ No newline at end of file diff --git a/test/HtmlToOpenXml.Tests/LinkTests.cs b/test/HtmlToOpenXml.Tests/LinkTests.cs index 147bf360..27443ce9 100644 --- a/test/HtmlToOpenXml.Tests/LinkTests.cs +++ b/test/HtmlToOpenXml.Tests/LinkTests.cs @@ -1,8 +1,12 @@ using NUnit.Framework; using DocumentFormat.OpenXml.Wordprocessing; +using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml; namespace HtmlToOpenXml.Tests { + using a = DocumentFormat.OpenXml.Drawing; + /// /// Tests hyperlink. 
/// @@ -15,29 +19,13 @@ public class LinkTests : HtmlConverterTestBase public void ExternalLink_ShouldSucceed (string link) { var elements = converter.Parse($@"Test Caption"); - Assert.That(elements, Has.Count.EqualTo(1)); - Assert.Multiple(() => { - Assert.That(elements[0], Is.TypeOf(typeof(Paragraph))); - Assert.That(elements[0].HasChild(), Is.True); - }); - var hyperlink = elements[0].GetFirstChild()!; - Assert.That(hyperlink.Tooltip, Is.Not.Null); - Assert.That(hyperlink.Tooltip.Value, Is.EqualTo("Test Tooltip")); - Assert.That(hyperlink.HasChild(), Is.True); - Assert.That(elements[0].InnerText, Is.EqualTo("Test Caption")); - - Assert.That(hyperlink.Id, Is.Not.Null); - Assert.That(hyperlink.History?.Value, Is.EqualTo(true)); - Assert.That(mainPart.HyperlinkRelationships.Count(), Is.GreaterThan(0)); - - var extLink = mainPart.HyperlinkRelationships.FirstOrDefault(r => r.Id == hyperlink.Id); - Assert.That(extLink, Is.Not.Null); - Assert.That(extLink.IsExternal, Is.EqualTo(true)); - Assert.That(extLink.Uri.AbsoluteUri, Is.EqualTo("http://www.site.com/")); + AssertHyperlink(mainPart, elements); } [TestCase(@"Js")] [TestCase(@"Unknown site")] + [TestCase(@"Empty link")] + [TestCase(@"Empty bookmark")] public void InvalidLink_ReturnsSimpleRun (string html) { // invalid link leads to simple Run with no link @@ -64,6 +52,20 @@ public void TextImageLink_ReturnsHyperlinkWithTextAndImage () Assert.That(hyperlink.LastChild?.InnerText, Is.EqualTo(" Test Caption")); } + [Test(Description = "Assert that `figcaption` tag doesn't generate paragraphs")] + public void ImageFigcaptionLink_ReturnsHyperlinkWithTextAndImage () + { + var elements = converter.Parse(@"Go to + +
Caption for the image
"); + Assert.That(elements[0].FirstChild, Is.TypeOf(typeof(Hyperlink))); + + var hyperlink = (Hyperlink) elements[0].FirstChild; + Assert.That(hyperlink.ChildElements, Has.Count.EqualTo(4)); + Assert.That(hyperlink.ChildElements, Has.All.TypeOf(typeof(Run)), "Hyperlinks don't accept inner paragraphs"); + Assert.That(hyperlink.Descendants(), Is.Not.Null); + } + [Test] public void Anchoring_WithUnknownTarget_ReturnsHyperlinkWithBookmark () { @@ -88,8 +90,8 @@ public void SetExcludeAnchoring_ReturnsSimpleRun () Assert.That(elements[0], Is.TypeOf(typeof(Paragraph))); Assert.That(elements[0].HasChild(), Is.True); - var hyperlink = (Hyperlink) elements[0].FirstChild!; - Assert.That(hyperlink.Anchor?.Value, Is.EqualTo("_top")); + var hyperlink = elements[0].GetFirstChild(); + Assert.That(hyperlink?.Anchor?.Value, Is.EqualTo("_top")); // this should generate a Run and not an Hyperlink elements = converter.Parse(@"Anchor3"); @@ -159,5 +161,119 @@ public void WithMultipleRun_ReturnsHyperlinkWithMultipleRuns() Assert.That(h.ChildElements, Has.All.TypeOf(typeof(Run))); Assert.That(h.InnerText, Is.EqualTo("Html to OpenXml !")); } + + [TestCase(typeof(HeaderPart))] + [TestCase(typeof(FooterPart))] + [TestCase(typeof(MainDocumentPart))] + public async Task ParseIntoDocumentPart_ReturnsHyperlinkParentedToPart (Type openXmlPartType) + { + string html = @"Test Caption"; + OpenXmlElement host; + OpenXmlPartContainer container; + + if (openXmlPartType == typeof(HeaderPart)) + { + await converter.ParseHeader(html); + container = mainPart.HeaderParts.First(); + host = mainPart.HeaderParts.First().Header; + } + else if (openXmlPartType == typeof(FooterPart)) + { + await converter.ParseFooter(html); + container = mainPart.FooterParts.First(); + host = mainPart.FooterParts.First().Footer; + } + else if (openXmlPartType == typeof(MainDocumentPart)) + { + await converter.ParseBody(html); + container = mainPart; + host = mainPart.Document.Body!; + } + else + { + throw new 
NotSupportedException($"Test case not supported for {openXmlPartType.FullName}"); + } + + AssertHyperlink(container, host.ChildElements); + AssertThatOpenXmlDocumentIsValid(); + } + + [TestCase("_top", Description = "Bookmark _top is reserved and stands in the top of the document")] + [TestCase("top", Description = "Alternate supported bookmark for user convenience")] + public async Task WithTopAnchoring_ReturnsAutoCreatedBookmark(string anchor) + { + await converter.ParseBody($"
Cell1
Move to top"); + + Assert.That(mainPart.Document.Body!.Elements().Count(), Is.EqualTo(3)); + Assert.That(mainPart.Document.Body!.FirstChild, Is.TypeOf()); + Assert.That(mainPart.Document.Body!.ElementAt(1), Is.TypeOf()); + Assert.That(mainPart.Document.Body!.LastChild, Is.TypeOf()); + + var p = mainPart.Document.Body!.GetFirstChild()!; + Assert.That(p.GetFirstChild()?.Name?.Value, Is.EqualTo("_top"), "Reserved keyword `_top`"); + + p = mainPart.Document.Body!.GetLastChild()!; + Assert.That(p.GetFirstChild()?.Anchor?.Value, Is.EqualTo("_top")); + } + + [Test(Description = "Bookmark must not stand as a single paragraph but inserted into the heading")] + public async Task WithHeading_ThenTopAnchoring_PrependBookmarkIntoHeading() + { + await converter.ParseBody(@"

Heading 1

+ Move to top"); + + var p = mainPart.Document.Body!.GetFirstChild(); + Assert.That(p, Is.Not.Null); + Assert.Multiple(() => + { + Assert.That(p.GetFirstChild()?.Name?.Value, Is.EqualTo("_top"), + "Expected `_top` bookmark in the first body paragraph"); + Assert.That(p.GetFirstChild(), Is.Not.Null); + Assert.That(p.ParagraphProperties?.ParagraphStyleId?.Val?.Value, Is.EqualTo("Heading1"), + "Expected first paragraph is the heading"); + }); + } + + [Test(Description = "Clickable image pointing to `_top` bookmark requires additional link relationship")] + public async Task WithImageTopAnchoring_ReturnsClickableLink() + { + await converter.ParseBody(@"Move to top + + "); + var p = mainPart.Document.Body!.GetFirstChild(); + var drawing = p?.Descendants().FirstOrDefault(); + Assert.That(drawing, Is.Not.Null); + var linkTarget = drawing?.Inline?.DocProperties?.GetFirstChild()?.Id?.Value; + Assert.That(linkTarget, Is.Not.Null); + var rel = mainPart.HyperlinkRelationships.FirstOrDefault(r => r.Id == linkTarget); + Assert.That(rel, Is.Not.Null); + Assert.That(rel.Uri.ToString(), Is.EqualTo("#_top")); + } + + private static void AssertHyperlink(OpenXmlPartContainer container, IEnumerable elements) + { + Assert.That(elements.Count(), Is.EqualTo(1)); + Assert.Multiple(() => { + Assert.That(elements.First(), Is.TypeOf(typeof(Paragraph))); + Assert.That(elements.First().HasChild(), Is.True); + }); + var hyperlink = elements.First().GetFirstChild()!; + Assert.That(hyperlink.Tooltip, Is.Not.Null); + Assert.That(hyperlink.Tooltip.Value, Is.EqualTo("Test Tooltip")); + Assert.That(hyperlink.HasChild(), Is.True); + Assert.That(elements.First().InnerText, Is.EqualTo("Test Caption")); + + Assert.Multiple(() => + { + Assert.That(hyperlink.Id, Is.Not.Null); + Assert.That(hyperlink.History?.Value, Is.EqualTo(true)); + Assert.That(container.HyperlinkRelationships.Count(), Is.GreaterThan(0)); + }); + + var extLink = container.HyperlinkRelationships.FirstOrDefault(r => r.Id == hyperlink.Id); 
+ Assert.That(extLink, Is.Not.Null); + Assert.That(extLink.IsExternal, Is.EqualTo(true)); + Assert.That(extLink.Uri.AbsoluteUri, Is.EqualTo("http://www.site.com/")); + } } } \ No newline at end of file diff --git a/test/HtmlToOpenXml.Tests/NumberingTests.cs b/test/HtmlToOpenXml.Tests/NumberingTests.cs index eee33ef8..5288765e 100644 --- a/test/HtmlToOpenXml.Tests/NumberingTests.cs +++ b/test/HtmlToOpenXml.Tests/NumberingTests.cs @@ -131,6 +131,16 @@ public void EmptyList_ShouldBeIgnored() } } + [Test(Description = "Empty list item should not be registred")] + public void EmptyLiElement_ShouldBeIgnored() + { + var elements = converter.Parse(@"
    +
  • not empty
  • +
  • +
"); + Assert.That(elements, Has.Count.EqualTo(1)); + } + [Test(Description = "Increment instanceId based on existing lists")] public void WithExistingNumbering_ReturnsUniqueInstanceId() { @@ -257,9 +267,9 @@ public void UseVariantStyle_ListItem_ReturnsAppliedStyle() [Test(Description = "Resume indenting from existing numbering (default behaviour)")] public async Task ContinueNumbering_ReturnsSecondList_ContinueOrder() { - await converter.ParseHtml(@"
  1. Item 1
"); + await converter.ParseBody(@"
  1. Item 1
"); - await converter.ParseHtml("
  1. Item 2
"); + await converter.ParseBody("
  1. Item 2
"); var absNum = mainPart.NumberingDefinitionsPart?.Numbering .Elements() @@ -281,15 +291,16 @@ public async Task ContinueNumbering_ReturnsSecondList_ContinueOrder() e.ParagraphProperties?.NumberingProperties?.NumberingId?.Val?.Value), Has.All.EqualTo(instances.First().NumberID!.Value), "All paragraphs are linked to the same list instance"); + AssertThatOpenXmlDocumentIsValid(); } [Test(Description = "Stop indenting from existing numbering (issue #57)")] public async Task DisableContinueNumbering_ReturnsSecondList_RestartingOrder() { - await converter.ParseHtml(@"
  1. Item 1
"); + await converter.ParseBody(@"
  1. Item 1
"); converter.ContinueNumbering = false; - await converter.ParseHtml("
  1. Item 2
"); + await converter.ParseBody("
  1. Item 2
"); var absNum = mainPart.NumberingDefinitionsPart?.Numbering .Elements() @@ -311,6 +322,7 @@ public async Task DisableContinueNumbering_ReturnsSecondList_RestartingOrder() e.ParagraphProperties?.NumberingProperties?.NumberingId?.Val?.Value), Is.Unique, "All paragraphs use different list instances"); + AssertThatOpenXmlDocumentIsValid(); } /// diff --git a/test/HtmlToOpenXml.Tests/Resources/kiwi.svg b/test/HtmlToOpenXml.Tests/Resources/kiwi.svg new file mode 100644 index 00000000..9a5b8dd7 --- /dev/null +++ b/test/HtmlToOpenXml.Tests/Resources/kiwi.svg @@ -0,0 +1,30 @@ + + + + + + Illustration of a Kiwi + + + Kiwi (/ˈkiːwiː/ KEE-wee)[4] are flightless birds endemic to New Zealand of the order Apterygiformes. + + + +