From 654bf93840af06a6c630403e42c8bc890054bf39 Mon Sep 17 00:00:00 2001 From: Michael Ganss Date: Mon, 7 Mar 2016 13:14:33 +0100 Subject: [PATCH 1/3] Add sanitization of complete HTML documents. --- HtmlSanitizer.Tests/Tests.cs | 34 +++++++++++++ HtmlSanitizer/HtmlSanitizer.cs | 91 ++++++++++++++++++++++++++++------ 2 files changed, 109 insertions(+), 16 deletions(-) diff --git a/HtmlSanitizer.Tests/Tests.cs b/HtmlSanitizer.Tests/Tests.cs index 34268ff..bf49869 100644 --- a/HtmlSanitizer.Tests/Tests.cs +++ b/HtmlSanitizer.Tests/Tests.cs @@ -2487,6 +2487,40 @@ public void RemoveEventForNotAllowedTag_ScriptTagAndSpan() s.Sanitize("Hi"); Assert.That(actual, Is.EqualTo(RemoveReason.NotAllowedTag)); } + + [Test] + public void DocumentTest() + { + var s = new HtmlSanitizer(); + s.AllowedTags.Add("title"); + var html = "Test
Test
"; + + var actual = s.SanitizeDocument(html); + + Assert.That(actual, Is.EqualTo(html)); + } + + [Test] + public void DocumentFromFragmentTest() + { + var s = new HtmlSanitizer(); + var html = "
Test
"; + + var actual = s.SanitizeDocument(html); + + Assert.That(actual, Is.EqualTo("
Test
")); + } + + [Test] + public void FragmentFromDocumentTest() + { + var s = new HtmlSanitizer(); + var html = "Test
Test
"; + + var actual = s.Sanitize(html); + + Assert.That(actual, Is.EqualTo("
Test
")); + } } } diff --git a/HtmlSanitizer/HtmlSanitizer.cs b/HtmlSanitizer/HtmlSanitizer.cs index 2bfe680..70923b2 100644 --- a/HtmlSanitizer/HtmlSanitizer.cs +++ b/HtmlSanitizer/HtmlSanitizer.cs @@ -114,7 +114,9 @@ public HtmlSanitizer(IEnumerable allowedTags = null, IEnumerable // Forms "datalist", "keygen", "output", "progress", "meter", // Interactive elements - "details", "summary", "menuitem" + "details", "summary", "menuitem", + // document elements + "html", "head", "body" }; /// @@ -296,6 +298,11 @@ protected virtual void OnRemovingStyle(RemovingStyleEventArgs e) /// public static readonly Regex DefaultDisallowedCssPropertyValue = new Regex(@"[<>]", RegexOptions.Compiled); + /// + /// Return all nested subnodes of a node. + /// + /// The root node. + /// All nested subnodes. private static IEnumerable GetAllNodes(INode dom) { if (dom == null) yield break; @@ -311,31 +318,78 @@ private static IEnumerable GetAllNodes(INode dom) } /// - /// Sanitizes the specified HTML. + /// Sanitizes the specified HTML body fragment. If a document is given, only the body part will be returned. /// - /// The HTML to sanitize. + /// The HTML body fragment to sanitize. /// The base URL relative URLs are resolved against. No resolution if empty. - /// The CsQuery output formatter used to render the DOM. Using the default formatter if null. - /// The sanitized HTML. + /// The formatter used to render the DOM. Using the default formatter if null. + /// The sanitized HTML body fragment. public string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null) { - var parser = new HtmlParser(new Configuration().WithCss(e => e.Options = new CssParserOptions + var parser = CreateParser(); + var dom = parser.Parse("" + html + ""); + + DoSanitize(dom, dom.Body, baseUrl, outputFormatter); + + var output = dom.Body.ChildNodes.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance); + + return output; + } + + /// + /// Sanitizes the specified HTML document. 
Even if only a fragment is given, a whole document will be returned. + /// + /// The HTML document to sanitize. + /// The base URL relative URLs are resolved against. No resolution if empty. + /// The formatter used to render the DOM. Using the default formatter if null. + /// The sanitized HTML document. + public string SanitizeDocument(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null) + { + var parser = CreateParser(); + var dom = parser.Parse(html); + + DoSanitize(dom, dom.DocumentElement, baseUrl, outputFormatter); + + var output = dom.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance); + + return output; + } + + /// + /// Creates an instance of . + /// + /// An instance of . + private static HtmlParser CreateParser() + { + return new HtmlParser(new Configuration().WithCss(e => e.Options = new CssParserOptions { IsIncludingUnknownDeclarations = true, IsIncludingUnknownRules = true, IsToleratingInvalidConstraints = true, IsToleratingInvalidValues = true })); - var dom = parser.Parse("" + html + ""); + } + /// + /// Removes all comment nodes from a list of nodes. + /// + /// The list of nodes. 
+ private static void RemoveComments(List nodes) + { + foreach (var comment in nodes.OfType()) + comment.Remove(); + } + + private void DoSanitize(IHtmlDocument dom, IElement context, string baseUrl = "", IMarkupFormatter outputFormatter = null) + { // remove non-whitelisted tags - foreach (var tag in dom.Body.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList()) + foreach (var tag in context.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList()) { RemoveTag(tag, RemoveReason.NotAllowedTag); } // cleanup attributes - foreach (var tag in dom.Body.QuerySelectorAll("*").OfType().ToList()) + foreach (var tag in context.QuerySelectorAll("*").OfType().ToList()) { // remove non-whitelisted attributes foreach (var attribute in tag.Attributes.Where(a => !IsAllowedAttribute(a)).ToList()) @@ -372,11 +426,20 @@ public string Sanitize(string html, string baseUrl = "", IMarkupFormatter output } } - var nodes = GetAllNodes(dom.Body).ToList(); + var nodes = GetAllNodes(context).ToList(); - foreach (var comment in nodes.OfType()) - comment.Remove(); + RemoveComments(nodes); + + DoPostProcess(dom, nodes); + } + /// + /// Performs post processing on all nodes in the document. + /// + /// The HTML document. + /// The list of nodes in the document. + private void DoPostProcess(IHtmlDocument dom, List nodes) + { if (PostProcessNode != null) { foreach (var node in nodes) @@ -387,10 +450,6 @@ public string Sanitize(string html, string baseUrl = "", IMarkupFormatter output ((IChildNode)node).Replace(e.ReplacementNodes.ToArray()); } } - - var output = dom.Body.ChildNodes.ToHtml(outputFormatter ?? 
HtmlMarkupFormatter.Instance); - - return output; } /// From e56146f99d5b260d342e19719252ee32393a13dd Mon Sep 17 00:00:00 2001 From: Michael Ganss Date: Mon, 7 Mar 2016 13:47:53 +0100 Subject: [PATCH 2/3] Clean up documentation --- HtmlSanitizer.Tests/Tests.cs | 3 +-- HtmlSanitizer/HtmlSanitizer.cs | 7 +++--- HtmlSanitizer/IHtmlSanitizer.cs | 38 +++------------------------------ README.md | 2 +- 4 files changed, 8 insertions(+), 42 deletions(-) diff --git a/HtmlSanitizer.Tests/Tests.cs b/HtmlSanitizer.Tests/Tests.cs index bf49869..6093372 100644 --- a/HtmlSanitizer.Tests/Tests.cs +++ b/HtmlSanitizer.Tests/Tests.cs @@ -2202,8 +2202,7 @@ public void RussianTextTest() // Act var htmlFragment = "Тест"; - //var outputFormatter = new CsQuery.Output.FormatDefault(DomRenderingOptions.RemoveComments | DomRenderingOptions.QuoteAllAttributes, HtmlEncoders.Minimum); - var actual = s.Sanitize(htmlFragment, ""/*, outputFormatter*/); + var actual = s.Sanitize(htmlFragment, ""); // Assert var expected = htmlFragment; diff --git a/HtmlSanitizer/HtmlSanitizer.cs b/HtmlSanitizer/HtmlSanitizer.cs index 70923b2..0820ddc 100644 --- a/HtmlSanitizer/HtmlSanitizer.cs +++ b/HtmlSanitizer/HtmlSanitizer.cs @@ -14,10 +14,10 @@ namespace Ganss.XSS { /// - /// Cleans HTML fragments from constructs that can lead to XSS attacks. + /// Cleans HTML documents and fragments from constructs that can lead to XSS attacks. /// /// - /// XSS attacks can occur at several levels within an HTML fragment: + /// XSS attacks can occur at several levels within an HTML document or fragment: /// /// HTML Tags (e.g. the <script> tag) /// HTML attributes (e.g. the "onload" attribute) @@ -25,8 +25,7 @@ namespace Ganss.XSS /// malformed HTML or HTML that exploits parser bugs in specific browsers /// /// - /// The HtmlSanitizer class addresses all of these possible attack vectors by using an HTML parser that is based on the one used - /// in the Gecko browser engine (see CsQuery). 
+ /// The HtmlSanitizer class addresses all of these possible attack vectors by using a sophisticated HTML parser (AngleSharp). /// /// /// In order to facilitate different use cases, HtmlSanitizer can be customized at the levels mentioned above: diff --git a/HtmlSanitizer/IHtmlSanitizer.cs b/HtmlSanitizer/IHtmlSanitizer.cs index 9b268c7..1da7619 100644 --- a/HtmlSanitizer/IHtmlSanitizer.cs +++ b/HtmlSanitizer/IHtmlSanitizer.cs @@ -6,41 +6,9 @@ namespace Ganss.XSS { /// - /// Cleans HTML fragments from constructs that can lead to XSS attacks. + /// Enables an inheriting class to implement an HtmlSanitizer class, which cleans HTML documents and fragments + /// from constructs that can lead to XSS attacks. /// - /// - /// XSS attacks can occur at several levels within an HTML fragment: - /// - /// HTML Tags (e.g. the <script> tag) - /// HTML attributes (e.g. the "onload" attribute) - /// CSS styles (url property values) - /// malformed HTML or HTML that exploits parser bugs in specific browsers - /// - /// - /// The HtmlSanitizer class addresses all of these possible attack vectors by using an HTML parser that is based on the one used - /// in the Gecko browser engine (see CsQuery). - /// - /// - /// In order to facilitate different use cases, HtmlSanitizer can be customized at the levels mentioned above: - /// - /// You can specify the allowed HTML tags through the property . All other tags will be stripped. - /// You can specify the allowed HTML attributes through the property . All other attributes will be stripped. - /// You can specify the allowed CSS property names through the property . All other styles will be stripped. - /// You can specify the allowed URI schemes through the property . All other URIs will be stripped. - /// You can specify the HTML attributes that contain URIs (such as "src", "href" etc.) through the property . - /// - /// - /// - /// - /// - /// alert('xss')
Test
"; - /// var sanitized = sanitizer.Sanitize(html, "http://www.example.com"); - /// // -> "
Test
" - /// ]]> - ///
- ///
public interface IHtmlSanitizer { /// @@ -121,7 +89,7 @@ public interface IHtmlSanitizer /// /// The HTML to sanitize. /// The base URL relative URLs are resolved against. No resolution if empty. - /// The CsQuery output formatter used to render the DOM. Using the default formatter if null. + /// The formatter used to render the DOM. Using the default formatter if null. /// The sanitized HTML. string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null); } diff --git a/README.md b/README.md index cd7b525..6e9a5a8 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ HtmlSanitizer [![Build status](https://ci.appveyor.com/api/projects/status/418bmfx643iae00c/branch/master?svg=true)](https://ci.appveyor.com/project/mganss/htmlsanitizer/branch/master) [![codecov.io](https://codecov.io/github/mganss/HtmlSanitizer/coverage.svg?branch=master)](https://codecov.io/github/mganss/HtmlSanitizer?branch=master) -HtmlSanitizer is a .NET library for cleaning HTML fragments from constructs that can lead to [XSS attacks](https://en.wikipedia.org/wiki/Cross-site_scripting). +HtmlSanitizer is a .NET library for cleaning HTML fragments and documents from constructs that can lead to [XSS attacks](https://en.wikipedia.org/wiki/Cross-site_scripting). It uses [AngleSharp](https://github.com/AngleSharp/AngleSharp) to parse, manipulate, and render HTML and CSS. 
Because HtmlSanitizer is based on a robust HTML parser it can also shield you from deliberate or accidental From 2f23ab4a2fffa8f0f09e449c8ab6fb7fcf45f6a3 Mon Sep 17 00:00:00 2001 From: Michael Ganss Date: Mon, 7 Mar 2016 15:06:42 +0100 Subject: [PATCH 3/3] Fix NUnit path Add -beta to NuGet version --- HtmlSanitizer/HtmlSanitizer.nuspec | 2 +- appveyor.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/HtmlSanitizer/HtmlSanitizer.nuspec b/HtmlSanitizer/HtmlSanitizer.nuspec index cb31a6c..68484c1 100644 --- a/HtmlSanitizer/HtmlSanitizer.nuspec +++ b/HtmlSanitizer/HtmlSanitizer.nuspec @@ -2,7 +2,7 @@ $id$ - $version$ + $version$-beta $title$ $author$ $author$ diff --git a/appveyor.yml b/appveyor.yml index 8b1af9b..86ff04b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -version: 3.1.{build} +version: 3.2.{build} install: - nuget restore configuration: Release @@ -31,7 +31,7 @@ build: test_script: - nuget.exe install OpenCover -ExcludeVersion - nuget.exe install NUnit.Runners -ExcludeVersion - - OpenCover\tools\OpenCover.Console.exe -register:user -filter:"+[HtmlSanitizer]*" -target:"NUnit.Console\tools\nunit3-console.exe" "-targetargs:/domain:single HtmlSanitizer.Tests\bin\release\HtmlSanitizer.Tests.dll" -returntargetcode -hideskipped:All -output:coverage.xml + - OpenCover\tools\OpenCover.Console.exe -register:user -filter:"+[HtmlSanitizer]*" -target:"NUnit.ConsoleRunner\tools\nunit3-console.exe" "-targetargs:/domain:single HtmlSanitizer.Tests\bin\release\HtmlSanitizer.Tests.dll" -returntargetcode -hideskipped:All -output:coverage.xml - "SET PATH=C:\\Python34;C:\\Python34\\Scripts;%PATH%" - pip install codecov - codecov -f "coverage.xml"