Skip to content

Commit

Permalink
Rebuild siteconfig if host for singlepage is not the same as the so…
Browse files Browse the repository at this point in the history
…urce

Consider a `single_page_link` directive on twitter.com that follows any
link found in a tweet. Here is an example with the following tweet:

  https://twitter.com/Cloudflare/status/1341353044504694787

This eventually leads to the following page:

  https://blog.cloudflare.com/beat-an-acoustics-inspired-ddos-attack/

Until now, Graby was incorrectly sending the HTTP headers defined for
twitter.com to cloudflare.com.

Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
  • Loading branch information
Kdecherf committed Jan 17, 2021
1 parent 987aa6c commit 898a6be
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 1 deletion.
14 changes: 13 additions & 1 deletion src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,19 @@ private function getSinglePage($html, $url)
// check it's not what we have already!
if (false !== $singlePageUrl && $singlePageUrl !== $url) {
// it's not, so let's try to fetch it...
$response = $this->httpClient->fetch($singlePageUrl, false, $siteConfig->http_header);
$headers = $siteConfig->http_header;

$sourceUrl = parse_url($url);
$targetUrl = parse_url($singlePageUrl);
if (\is_array($sourceUrl)
&& \is_array($targetUrl)
&& \array_key_exists('host', $sourceUrl)
&& \array_key_exists('host', $targetUrl)
&& $sourceUrl['host'] !== $targetUrl['host']) {
$targetSiteConfig = $this->configBuilder->buildForHost($targetUrl['host']);
$headers = $targetSiteConfig->http_header;
}
$response = $this->httpClient->fetch($singlePageUrl, false, $headers);

if ($response['status'] < 300) {
$this->logger->info('Single page content found with url', ['url' => $singlePageUrl]);
Expand Down
29 changes: 29 additions & 0 deletions tests/GrabyTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,35 @@ public function testSinglePageMimeAction(): void
$this->assertFalse($res['native_ad']);
}

/**
 * A single-page link that targets a different host must be followed, and the
 * extracted content and final URL must come from that second host.
 *
 * Two responses are queued on the mock client, in order: the singlepage2.com
 * page, which embeds a "printed view" anchor to singlepage5.com, and then the
 * singlepage5.com page itself.
 *
 * NOTE(review): presumably the singlepage2.com fixture declares a
 * single_page_link matching that anchor; that fixture is not visible here.
 */
public function testSinglePageReloadSiteConfig(): void
{
    // Both hosts resolve to the same mocked A record.
    $aRecord = [['type' => 'A', 'ip' => self::AN_IPV4]];
    DnsMock::withMockedHosts([
        'singlepage2.com' => $aRecord,
        'singlepage5.com' => $aRecord,
    ]);

    $htmlHeaders = ['Content-Type' => 'text/html'];
    $sourcePage = '<html><h1 class="print-title">my title</h1><div class="print-submitted">my singlepage2</div><ul><li class="service-links-print"><a href="http://singlepage5.com/hello" class="service-links-print">printed view</a></li></ul></html>';
    $targetPage = '<html><h1 class="print-title">my title</h1><div class="main-article">my singlepage5</div></html>';

    // Responses are queued in the order they are added: source page first, then the target.
    $client = new HttpMockClient();
    foreach ([$sourcePage, $targetPage] as $pageBody) {
        $client->addResponse(new Response(200, $htmlHeaders, $pageBody));
    }

    $options = [
        'debug' => true,
        'xss_filter' => false,
        'extractor' => [
            'config_builder' => [
                'site_config' => [__DIR__ . '/fixtures/site_config'],
            ],
        ],
    ];
    $graby = new Graby($options, $client);

    $result = $graby->fetchContent('http://singlepage2.com/hello');

    $this->assertStringContainsString('my singlepage5', $result['html']);
    $this->assertSame('http://singlepage5.com/hello', $result['url']);
}

/**
* @group dns-sensitive
*/
Expand Down
3 changes: 3 additions & 0 deletions tests/fixtures/site_config/singlepage5.com.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
title: //h1[@class='print-title']
body: //div[@class='main-article']
prune: no

0 comments on commit 898a6be

Please sign in to comment.