Skip to content

Commit

Permalink
Merge pull request #276 from Kdecherf/fix/utm-query-2.x
Browse files Browse the repository at this point in the history
  • Loading branch information
j0k3r authored Dec 13, 2021
2 parents 7514e82 + d8f93d9 commit 93b7c63
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 4 deletions.
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
"simplepie/simplepie": "^1.5",
"smalot/pdfparser": "^1.0",
"symfony/options-resolver": "^3.4|^4.4|^5.3",
"true/punycode": "^2.1"
"true/punycode": "^2.1",
"guzzlehttp/psr7": "^1.5.0"
},
"require-dev": {
"friendsofphp/php-cs-fixer": "^3.0",
Expand Down
7 changes: 6 additions & 1 deletion src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,12 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
}

// remove utm parameters & fragment
$effectiveUrl = preg_replace('/((\?)?(&(amp;)?)?utm_(.*?)\=[^&]+)|(#(.*?)\=[^&]+)/', '', rawurldecode($effectiveUrl));
$uri = new Uri(str_replace('&amp;', '&', $effectiveUrl));
parse_str($uri->getQuery(), $query);
$queryParameters = array_filter($query, function ($k) {
return !(0 === stripos($k, 'utm_'));
}, \ARRAY_FILTER_USE_KEY);
$effectiveUrl = (string) Uri::withQueryValues(new Uri($uri->withFragment('')->withQuery('')), $queryParameters);

$this->logger->info('Data fetched: {data}', ['data' => [
'effective_url' => $effectiveUrl,
Expand Down
32 changes: 32 additions & 0 deletions tests/Extractor/HttpClientTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -620,4 +620,36 @@ public function testAccept(string $url, array $httpHeader, $expectedAccept): voi
$this->assertArrayNotHasKey('accept', $records[3]['context']);
}
}

/**
 * Data sets for testWithUrlContainingQueryAndFragment.
 *
 * Each entry pairs the URL handed to fetch() with the effective URL
 * the test expects to get back.
 *
 * @return array<int, array<string, string>>
 */
public function dataForWithUrlContainingQueryAndFragment(): array
{
    $datasets = [];

    // several utm_* parameters mixed with an unrelated one: only the unrelated one is expected to remain
    $datasets[] = [
        'url' => 'https://example.com/foo?utm_content=111315005&utm_medium=social&utm_source=twitter&hss_channel=tw-hello',
        'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello',
    ];

    // a fragment after the query: the fragment is expected to be dropped
    $datasets[] = [
        'url' => 'https://example.com/foo?hss_channel=tw-hello#fragment',
        'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello',
    ];

    // a single utm_* parameter: no query string is expected to remain
    $datasets[] = [
        'url' => 'https://example.com/foo?utm_content=111315005',
        'expectedUrl' => 'https://example.com/foo',
    ];

    return $datasets;
}

/**
 * Fetches each provided URL through a mocked HTTP client and checks the
 * `effective_url` entry of the result against the expected URL.
 *
 * @dataProvider dataForWithUrlContainingQueryAndFragment
 */
public function testWithUrlContainingQueryAndFragment(string $url, string $expectedUrl): void
{
    // queue a single bare 200 response on the mock client
    $mockClient = new HttpMockClient();
    $mockClient->addResponse(new Response(200));

    $result = (new HttpClient($mockClient))->fetch($url);

    $this->assertSame($expectedUrl, $result['effective_url']);
}
}
2 changes: 1 addition & 1 deletion tests/GrabyFunctionalTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ public function testYoutubeOembed(): void

$this->assertSame(200, $res['status']);
$this->assertEmpty($res['language']);
$this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v=td0P8qrS8iI&format=xml', $res['url']);
$this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v%3Dtd0P8qrS8iI&format=xml', $res['url']);
$this->assertSame('[Review] The Matrix Falling (Rain) Source Code C++', $res['title']);
// $this->assertSame('<iframe id="video" width="480" height="270" src="https://www.youtube.com/embed/td0P8qrS8iI?feature=oembed" frameborder="0" allowfullscreen="allowfullscreen">[embedded content]</iframe>', $res['html']);
$this->assertSame('[embedded content]', $res['summary']);
Expand Down
2 changes: 1 addition & 1 deletion tests/GrabyTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ public function testAssetExtensionTXT(): void
public function dataForSinglePage(): array
{
return [
'single_page_link will return a string (ie the text content of <a> node)' => ['singlepage1.com', 'http://singlepage1.com/printed view', 'http://moreintelligentlife.com/print/content'],
'single_page_link will return a string (ie the text content of <a> node)' => ['singlepage1.com', 'http://singlepage1.com/printed%20view', 'http://moreintelligentlife.com/print/content'],
'single_page_link will return the a node' => ['singlepage2.com', 'http://singlepage2.com/print/content', 'http://singlepage2.com/print/content'],
'single_page_link will return the href from a node' => ['singlepage3.com', 'http://singlepage3.com/print/content', 'http://singlepage3.com/print/content'],
'single_page_link will return nothing useful' => ['singlepage4.com', 'http://singlepage4.com', 'http://singlepage4.com/print/content'],
Expand Down

0 comments on commit 93b7c63

Please sign in to comment.