From cf725b1c71d6fa82450607ec895faf66f11d49a2 Mon Sep 17 00:00:00 2001 From: Kevin Decherf <kevin@kdecherf.com> Date: Sat, 23 Jan 2021 00:52:07 +0100 Subject: [PATCH 1/3] HttpClient: refactor the way we clean utm_ query params The old preg_replace call incorrectly removed the ? of the query string section which could lead to things like this: http://example.com/foo?utm_source=a&var=value => http://example.com/foo&var=value Signed-off-by: Kevin Decherf <kevin@kdecherf.com> --- src/Extractor/HttpClient.php | 7 ++++++- tests/Extractor/HttpClientTest.php | 32 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php index ba8d4db7..e1f774a6 100644 --- a/src/Extractor/HttpClient.php +++ b/src/Extractor/HttpClient.php @@ -263,7 +263,12 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = []) } // remove utm parameters & fragment - $effectiveUrl = preg_replace('/((\?)?(&(amp;)?)?utm_(.*?)\=[^&]+)|(#(.*?)\=[^&]+)/', '', rawurldecode($effectiveUrl)); + $uri = new Uri(str_replace('&', '&', $effectiveUrl)); + parse_str($uri->getQuery(), $query); + $queryParameters = array_filter($query, function ($k) { + return !(0 === stripos($k, 'utm_')); + }, \ARRAY_FILTER_USE_KEY); + $effectiveUrl = (string) Uri::withQueryValues(new Uri($uri->withFragment('')->withQuery('')), $queryParameters); $this->logger->info('Data fetched: {data}', ['data' => [ 'effective_url' => $effectiveUrl, diff --git a/tests/Extractor/HttpClientTest.php b/tests/Extractor/HttpClientTest.php index 3ca97dcd..5b3f7de6 100644 --- a/tests/Extractor/HttpClientTest.php +++ b/tests/Extractor/HttpClientTest.php @@ -620,4 +620,36 @@ public function testAccept(string $url, array $httpHeader, $expectedAccept): voi $this->assertArrayNotHasKey('accept', $records[3]['context']); } } + + public function dataForWithUrlContainingQueryAndFragment(): array + { + return [ + [ + 'url' => 'https://example.com/foo?utm_content=111315005&utm_medium=social&utm_source=twitter&hss_channel=tw-hello', + 'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello', + ], + [ + 'url' => 'https://example.com/foo?hss_channel=tw-hello#fragment', + 'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello', + ], + [ + 'url' => 'https://example.com/foo?utm_content=111315005', + 'expectedUrl' => 'https://example.com/foo', + ], + ]; + } + + /** + * @dataProvider dataForWithUrlContainingQueryAndFragment + */ + public function testWithUrlContainingQueryAndFragment(string $url, string $expectedUrl): void + { + $httpMockClient = new HttpMockClient(); + $httpMockClient->addResponse(new Response(200)); + + $http = new HttpClient($httpMockClient); + $res = $http->fetch($url); + + $this->assertSame($expectedUrl, $res['effective_url']); + } } From 5fab6691446e4dd2f39169ddfbd1aa5634082bd8 Mon Sep 17 00:00:00 2001 From: Kevin Decherf <kevin@kdecherf.com> Date: Sat, 23 Jan 2021 18:44:14 +0100 Subject: [PATCH 2/3] tests: fix tests following query string refactor As per RFC3986, Section 3.4 indicates that we can avoid percent-encoding the'/' and '?' characters. However the '=' character in the nested query string should be encoded. Signed-off-by: Kevin Decherf <kevin@kdecherf.com> --- tests/GrabyFunctionalTest.php | 2 +- tests/GrabyTest.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/GrabyFunctionalTest.php b/tests/GrabyFunctionalTest.php index d7718d95..91a09a19 100644 --- a/tests/GrabyFunctionalTest.php +++ b/tests/GrabyFunctionalTest.php @@ -220,7 +220,7 @@ public function testYoutubeOembed(): void $this->assertSame(200, $res['status']); $this->assertEmpty($res['language']); - $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v=td0P8qrS8iI&format=xml', $res['url']); + $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v%3Dtd0P8qrS8iI&format=xml', $res['url']); $this->assertSame('[Review] The Matrix Falling (Rain) Source Code C++', $res['title']); // $this->assertSame('<iframe id="video" width="480" height="270" src="https://www.youtube.com/embed/td0P8qrS8iI?feature=oembed" frameborder="0" allowfullscreen="allowfullscreen">[embedded content]</iframe>', $res['html']); $this->assertSame('[embedded content]', $res['summary']); diff --git a/tests/GrabyTest.php b/tests/GrabyTest.php index 3c349b5f..38f0516f 100644 --- a/tests/GrabyTest.php +++ b/tests/GrabyTest.php @@ -403,7 +403,7 @@ public function testAssetExtensionTXT(): void public function dataForSinglePage(): array { return [ - 'single_page_link will return a string (ie the text content of <a> node)' => ['singlepage1.com', 'http://singlepage1.com/printed view', 'http://moreintelligentlife.com/print/content'], + 'single_page_link will return a string (ie the text content of <a> node)' => ['singlepage1.com', 'http://singlepage1.com/printed%20view', 'http://moreintelligentlife.com/print/content'], 'single_page_link will return the a node' => ['singlepage2.com', 'http://singlepage2.com/print/content', 'http://singlepage2.com/print/content'], 'single_page_link will return the href from a node' => ['singlepage3.com', 'http://singlepage3.com/print/content', 'http://singlepage3.com/print/content'], 'single_page_link will return nothing useful' => ['singlepage4.com', 'http://singlepage4.com', 'http://singlepage4.com/print/content'], From d8f93d915e784fb130e9b4e520e66e66d117dbdf Mon Sep 17 00:00:00 2001 From: Kevin Decherf <kevin@kdecherf.com> Date: Sat, 23 Jan 2021 19:05:25 +0100 Subject: [PATCH 3/3] dependencies: set min version of guzzlehttp/psr7 to 1.5.0 Uri::withQueryValues() has been added in psr7 1.5.0 but our direct dependency http-factory-guzzle is currently set to require ^1.4.2 which could cause issues on some installations. Signed-off-by: Kevin Decherf <kevin@kdecherf.com> --- composer.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer.json b/composer.json index e6068e94..6b4585aa 100644 --- a/composer.json +++ b/composer.json @@ -31,7 +31,8 @@ "simplepie/simplepie": "^1.5", "smalot/pdfparser": "^1.0", "symfony/options-resolver": "^3.4|^4.4|^5.3", - "true/punycode": "^2.1" + "true/punycode": "^2.1", + "guzzlehttp/psr7": "^1.5.0" }, "require-dev": { "friendsofphp/php-cs-fixer": "^3.0",