From cf725b1c71d6fa82450607ec895faf66f11d49a2 Mon Sep 17 00:00:00 2001
From: Kevin Decherf <kevin@kdecherf.com>
Date: Sat, 23 Jan 2021 00:52:07 +0100
Subject: [PATCH 1/3] HttpClient: refactor the way we clean utm_ query params

The old preg_replace call incorrectly removed the ? of the query string
section which could lead to things like this:

  http://example.com/foo?utm_source=a&var=value =>
http://example.com/foo&var=value

Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
---
 src/Extractor/HttpClient.php       |  7 ++++++-
 tests/Extractor/HttpClientTest.php | 32 ++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php
index ba8d4db7..e1f774a6 100644
--- a/src/Extractor/HttpClient.php
+++ b/src/Extractor/HttpClient.php
@@ -263,7 +263,12 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
         }
 
         // remove utm parameters & fragment
-        $effectiveUrl = preg_replace('/((\?)?(&(amp;)?)?utm_(.*?)\=[^&]+)|(#(.*?)\=[^&]+)/', '', rawurldecode($effectiveUrl));
+        $uri = new Uri(str_replace('&amp;', '&', $effectiveUrl));
+        parse_str($uri->getQuery(), $query);
+        $queryParameters = array_filter($query, function ($k) {
+            return !(0 === stripos($k, 'utm_'));
+        }, \ARRAY_FILTER_USE_KEY);
+        $effectiveUrl = (string) Uri::withQueryValues(new Uri($uri->withFragment('')->withQuery('')), $queryParameters);
 
         $this->logger->info('Data fetched: {data}', ['data' => [
             'effective_url' => $effectiveUrl,
diff --git a/tests/Extractor/HttpClientTest.php b/tests/Extractor/HttpClientTest.php
index 3ca97dcd..5b3f7de6 100644
--- a/tests/Extractor/HttpClientTest.php
+++ b/tests/Extractor/HttpClientTest.php
@@ -620,4 +620,36 @@ public function testAccept(string $url, array $httpHeader, $expectedAccept): voi
             $this->assertArrayNotHasKey('accept', $records[3]['context']);
         }
     }
+
+    public function dataForWithUrlContainingQueryAndFragment(): array
+    {
+        return [
+            [
+                'url' => 'https://example.com/foo?utm_content=111315005&utm_medium=social&utm_source=twitter&hss_channel=tw-hello',
+                'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello',
+            ],
+            [
+                'url' => 'https://example.com/foo?hss_channel=tw-hello#fragment',
+                'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello',
+            ],
+            [
+                'url' => 'https://example.com/foo?utm_content=111315005',
+                'expectedUrl' => 'https://example.com/foo',
+            ],
+        ];
+    }
+
+    /**
+     * @dataProvider dataForWithUrlContainingQueryAndFragment
+     */
+    public function testWithUrlContainingQueryAndFragment(string $url, string $expectedUrl): void
+    {
+        $httpMockClient = new HttpMockClient();
+        $httpMockClient->addResponse(new Response(200));
+
+        $http = new HttpClient($httpMockClient);
+        $res = $http->fetch($url);
+
+        $this->assertSame($expectedUrl, $res['effective_url']);
+    }
 }

From 5fab6691446e4dd2f39169ddfbd1aa5634082bd8 Mon Sep 17 00:00:00 2001
From: Kevin Decherf <kevin@kdecherf.com>
Date: Sat, 23 Jan 2021 18:44:14 +0100
Subject: [PATCH 2/3] tests: fix tests following query string refactor

As per RFC3986, Section 3.4 indicates that we can avoid percent-encoding
the'/' and '?' characters. However the '=' character in the nested query
string should be encoded.

Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
---
 tests/GrabyFunctionalTest.php | 2 +-
 tests/GrabyTest.php           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/GrabyFunctionalTest.php b/tests/GrabyFunctionalTest.php
index d7718d95..91a09a19 100644
--- a/tests/GrabyFunctionalTest.php
+++ b/tests/GrabyFunctionalTest.php
@@ -220,7 +220,7 @@ public function testYoutubeOembed(): void
 
         $this->assertSame(200, $res['status']);
         $this->assertEmpty($res['language']);
-        $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v=td0P8qrS8iI&format=xml', $res['url']);
+        $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v%3Dtd0P8qrS8iI&format=xml', $res['url']);
         $this->assertSame('[Review] The Matrix Falling (Rain) Source Code C++', $res['title']);
         // $this->assertSame('<iframe id="video" width="480" height="270" src="https://www.youtube.com/embed/td0P8qrS8iI?feature=oembed" frameborder="0" allowfullscreen="allowfullscreen">[embedded content]</iframe>', $res['html']);
         $this->assertSame('[embedded content]', $res['summary']);
diff --git a/tests/GrabyTest.php b/tests/GrabyTest.php
index 3c349b5f..38f0516f 100644
--- a/tests/GrabyTest.php
+++ b/tests/GrabyTest.php
@@ -403,7 +403,7 @@ public function testAssetExtensionTXT(): void
     public function dataForSinglePage(): array
     {
         return [
-            'single_page_link will return a string (ie the text content of <a> node)' => ['singlepage1.com', 'http://singlepage1.com/printed view', 'http://moreintelligentlife.com/print/content'],
+            'single_page_link will return a string (ie the text content of <a> node)' => ['singlepage1.com', 'http://singlepage1.com/printed%20view', 'http://moreintelligentlife.com/print/content'],
             'single_page_link will return the a node' => ['singlepage2.com', 'http://singlepage2.com/print/content', 'http://singlepage2.com/print/content'],
             'single_page_link will return the href from a node' => ['singlepage3.com', 'http://singlepage3.com/print/content', 'http://singlepage3.com/print/content'],
             'single_page_link will return nothing useful' => ['singlepage4.com', 'http://singlepage4.com', 'http://singlepage4.com/print/content'],

From d8f93d915e784fb130e9b4e520e66e66d117dbdf Mon Sep 17 00:00:00 2001
From: Kevin Decherf <kevin@kdecherf.com>
Date: Sat, 23 Jan 2021 19:05:25 +0100
Subject: [PATCH 3/3] dependencies: set min version of guzzlehttp/psr7 to 1.5.0

Uri::withQueryValues() has been added in psr7 1.5.0 but our direct
dependency http-factory-guzzle is currently set to require ^1.4.2 which
could cause issues on some installations.

Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
---
 composer.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/composer.json b/composer.json
index e6068e94..6b4585aa 100644
--- a/composer.json
+++ b/composer.json
@@ -31,7 +31,8 @@
         "simplepie/simplepie": "^1.5",
         "smalot/pdfparser": "^1.0",
         "symfony/options-resolver": "^3.4|^4.4|^5.3",
-        "true/punycode": "^2.1"
+        "true/punycode": "^2.1",
+        "guzzlehttp/psr7": "^1.5.0"
     },
     "require-dev": {
         "friendsofphp/php-cs-fixer": "^3.0",