From 53a18779679487d79d369c2ba3e66e2e4e9637bd Mon Sep 17 00:00:00 2001 From: Jan Walther Date: Tue, 20 Aug 2024 11:55:36 +0200 Subject: [PATCH 1/2] URL-encode URLs instead of HTML-encode --- src/Sitemap/Url/GoogleImage.php | 2 +- .../Url/GoogleMultilangUrlDecorator.php | 2 +- src/Sitemap/Url/GoogleVideo.php | 4 +- src/Sitemap/Url/UrlConcrete.php | 2 +- src/Sitemap/Utils.php | 40 +++++++++++++++++++ tests/Unit/Sitemap/UtilsTest.php | 6 +++ 6 files changed, 51 insertions(+), 5 deletions(-) diff --git a/src/Sitemap/Url/GoogleImage.php b/src/Sitemap/Url/GoogleImage.php index 42eab61..fe96710 100644 --- a/src/Sitemap/Url/GoogleImage.php +++ b/src/Sitemap/Url/GoogleImage.php @@ -177,7 +177,7 @@ public function toXML(): string { $xml = ''; - $xml .= '' . Utils::encode($this->getLocation()) . ''; + $xml .= '' . Utils::encodeUrl($this->getLocation()) . ''; if ($this->getCaption()) { $xml .= '' . Utils::cdata($this->getCaption()) . ''; diff --git a/src/Sitemap/Url/GoogleMultilangUrlDecorator.php b/src/Sitemap/Url/GoogleMultilangUrlDecorator.php index c30e401..54a6a27 100644 --- a/src/Sitemap/Url/GoogleMultilangUrlDecorator.php +++ b/src/Sitemap/Url/GoogleMultilangUrlDecorator.php @@ -65,7 +65,7 @@ protected function generateLinkXml(string $href, string $hreflang, string $rel = return ''; + . '" href="' . Utils::encodeUrl($href) . '" />'; } /** diff --git a/src/Sitemap/Url/GoogleVideo.php b/src/Sitemap/Url/GoogleVideo.php index b1f456d..48349b8 100644 --- a/src/Sitemap/Url/GoogleVideo.php +++ b/src/Sitemap/Url/GoogleVideo.php @@ -941,7 +941,7 @@ public function toXml(): string //---------------------- // required fields - $videoXml .= '' . Utils::encode($this->getThumbnailLocation()) . ''; + $videoXml .= '' . Utils::encodeUrl($this->getThumbnailLocation()) . ''; $videoXml .= '' . Utils::cdata($this->getTitle()) . ''; $videoXml .= '' . Utils::cdata($this->getDescription()) . ''; @@ -952,7 +952,7 @@ public function toXml(): string $videoXml .= '' . Utils::cdata($category) . 
''; } if ($location = $this->getContentLocation()) { - $videoXml .= '' . Utils::encode($location) . ''; + $videoXml .= '' . Utils::encodeUrl($location) . ''; } if ($duration = $this->getDuration()) { $videoXml .= '' . $duration . ''; diff --git a/src/Sitemap/Url/UrlConcrete.php b/src/Sitemap/Url/UrlConcrete.php index 5e07c4a..ea72a1b 100644 --- a/src/Sitemap/Url/UrlConcrete.php +++ b/src/Sitemap/Url/UrlConcrete.php @@ -199,7 +199,7 @@ public function getPriority(): ?float */ public function toXml(): string { - $xml = '' . Utils::encode($this->getLoc()) . ''; + $xml = '' . Utils::encodeUrl($this->getLoc()) . ''; $lastmod = $this->getLastmod(); if ($lastmod) { diff --git a/src/Sitemap/Utils.php b/src/Sitemap/Utils.php index 1dcc0dd..5d65eb2 100644 --- a/src/Sitemap/Utils.php +++ b/src/Sitemap/Utils.php @@ -39,4 +39,44 @@ public static function encode(string $string): string { return htmlspecialchars($string, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); } + + /** + * Percent-encode the path segments of an absolute URL; empty and "0" segments (and therefore any trailing slash) are dropped + * + * @param string $url URL to encode + * + * @return string rebuilt URL, or $url unchanged when it cannot be parsed or has no scheme or host + */ + public static function encodeUrl(string $url): string + { + $parts = parse_url($url); + + // Only URLs with both scheme and host are rebuilt; anything else is returned as-is + if ($parts === false || empty($parts['scheme']) || empty($parts['host'])) { + return $url; + } + + $sanitizedPath = null; + if (!empty($parts['path'])) { + $pathParts = explode('/', $parts['path']); + foreach ($pathParts as $pathPart) { + if (empty($pathPart)) { + continue; + } + // Decode first so a segment that is already percent-encoded is not encoded twice + $sanitizedPath .= '/'.rawurlencode(rawurldecode($pathPart)); + } + } + + // Reassemble the URL; user info is emitted only when both user and pass are non-empty + $targetUrl = $parts['scheme'].'://'. + ((!empty($parts['user']) && !empty($parts['pass'])) ? $parts['user'].':'.$parts['pass'].'@' : ''). + $parts['host']. + (!empty($parts['port']) ? ':'.$parts['port'] : ''). + (!empty($sanitizedPath) ? $sanitizedPath : ''). + (!empty($parts['query']) ? '?'.$parts['query'] : ''). + (!empty($parts['fragment']) ? 
'#'.$parts['fragment'] : ''); + + return $targetUrl; + } } diff --git a/tests/Unit/Sitemap/UtilsTest.php b/tests/Unit/Sitemap/UtilsTest.php index bb0286c..f580575 100644 --- a/tests/Unit/Sitemap/UtilsTest.php +++ b/tests/Unit/Sitemap/UtilsTest.php @@ -27,4 +27,10 @@ public function testEncode(): void $actual = Utils::encode('data & spécial chars>'); self::assertEquals('data & spécial chars>', $actual); } + + public function testEncodeUrl(): void + { + $actual = Utils::encodeUrl('http://example.org/test_ä'); + self::assertEquals('http://example.org/test_%C3%A4', $actual); + } } From 42a5cca9f33bb2736b112945c4b9f1d9826fb7fd Mon Sep 17 00:00:00 2001 From: Jan Walther Date: Tue, 20 Aug 2024 12:34:42 +0200 Subject: [PATCH 2/2] URL-encode URLs --- src/Sitemap/Utils.php | 2 +- tests/Unit/Sitemap/Url/UrlConcreteTest.php | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Sitemap/Utils.php b/src/Sitemap/Utils.php index 5d65eb2..6dd3049 100644 --- a/src/Sitemap/Utils.php +++ b/src/Sitemap/Utils.php @@ -74,7 +74,7 @@ public static function encodeUrl(string $url): string $parts['host']. (!empty($parts['port']) ? ':'.$parts['port'] : ''). (!empty($sanitizedPath) ? $sanitizedPath : ''). - (!empty($parts['query']) ? '?'.$parts['query'] : ''). + (!empty($parts['query']) ? '?'.self::encode($parts['query']) : ''). (!empty($parts['fragment']) ? 
'#'.$parts['fragment'] : ''); return $targetUrl; diff --git a/tests/Unit/Sitemap/Url/UrlConcreteTest.php b/tests/Unit/Sitemap/Url/UrlConcreteTest.php index 0f5cee1..ea473a2 100644 --- a/tests/Unit/Sitemap/Url/UrlConcreteTest.php +++ b/tests/Unit/Sitemap/Url/UrlConcreteTest.php @@ -31,6 +31,8 @@ public function toXmlProvider(): array ['http://example.com/', 'http://example.com/'], ['http://example.com/abcd', 'http://example.com/abcd'], ['http://example.com/abcd/?a=1&b=cdf', 'http://example.com/abcd/?a=1&b=cdf'], + ['http://example.com/%C3%A4', 'http://example.com/ä'], + ['http://example.com/folder/%C3%A4', 'http://example.com/folder/ä'], [ 'http://example.com/2012-12-29T10:39:12+00:00', 'http://example.com/',