Skip to content

Commit

Permalink
Merge pull request #172 from j0k3r/fix/cookie
Browse files Browse the repository at this point in the history
Fix cookies injection into request
  • Loading branch information
j0k3r authored Nov 1, 2018
2 parents cf4a49d + 888e34f commit 1e56056
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 4 deletions.
16 changes: 12 additions & 4 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace Graby\Extractor;

use Guzzle\Parser\Cookie\CookieParser;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Message\Response;
Expand Down Expand Up @@ -127,7 +128,7 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
// don't add an empty line with cookie if none are defined
$cookie = $this->getCookie($url, $httpHeader);
if ($cookie) {
$options['headers']['Cookie'] = $cookie;
$options['cookies'] = $cookie;
}

try {
Expand Down Expand Up @@ -406,17 +407,24 @@ private function getReferer($url, $httpHeader = [])
* @param string $url Absolute url
* @param array $httpHeader Custom HTTP Headers from SiteConfig
*
* @return string
* @return array|false
*/
private function getCookie($url, $httpHeader = [])
{
    // No cookie defined in the site config: return false so the caller's
    // `if ($cookie)` check skips the cookie option entirely.
    if (empty($httpHeader['cookie'])) {
        return false;
    }

    $this->logger->log('debug', 'Found cookie "{cookie}" for url "{url}" from site config', ['cookie' => $httpHeader['cookie'], 'url' => $url]);

    // Parse the raw cookie string from the site config; only the 'cookies'
    // entry of the parsed result is handed back to the caller.
    $parser = new CookieParser();
    $data = $parser->parseCookie($httpHeader['cookie']);

    // parseCookie() signals failure with a strict false; propagate it so
    // the request is sent without cookies.
    if (false === $data) {
        return false;
    }

    return $data['cookies'];
}

/**
Expand Down
32 changes: 32 additions & 0 deletions tests/GrabyFunctionalTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -491,4 +491,36 @@ public function testJsonLd()
$this->assertCount(1, $res['authors']);
$this->assertSame('Jeremy Goujon', $res['authors'][0]);
}

public function testCookie()
{
    // Use the fixture site configs so the Cookie header declared for
    // npr.org is picked up when fetching the article.
    $siteConfigDir = __DIR__ . '/fixtures/site_config';
    $extractorOptions = [
        'config_builder' => [
            'site_config' => [$siteConfigDir],
        ],
    ];
    $client = new Graby([
        'debug' => true,
        'extractor' => $extractorOptions,
    ]);

    $result = $client->fetchContent('http://www.npr.org/sections/parallels/2017/05/19/529148729/michael-flynns-contradictory-line-on-russia');

    // The result must expose exactly these twelve entries.
    $this->assertCount(12, $result);

    $expectedKeys = [
        'status',
        'html',
        'title',
        'language',
        'date',
        'authors',
        'url',
        'content_type',
        'summary',
        'open_graph',
        'native_ad',
        'all_headers',
    ];
    foreach ($expectedKeys as $expectedKey) {
        $this->assertArrayHasKey($expectedKey, $result);
    }

    $this->assertSame(200, $result['status']);
    // if the cookie wasn't taken into account, the title would be "NPR Choice page"
    $this->assertSame('Michael Flynn\'s Contradictory Line On Russia', $result['title']);
}
}
36 changes: 36 additions & 0 deletions tests/fixtures/site_config/npr.org.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
title: //div[contains(@class, 'storytitle')]//h1
author: //p[@class="byline"]/span
body: //div[@id='primaryaudio']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext' or @id='supplementarycontent' or contains(@class, 'transcript')]
date: //meta[@name="date"]/@content

strip_id_or_class: enlarge_measure
strip_id_or_class: enlarge_html
strip: //a[contains(@class, 'enlargeicon')]
strip: //div[contains(@class, 'bookedition')]
strip: //div[@class='textsize']
strip: //ul[@class='genres']
strip: //span[@class='bull']
strip_id_or_class: secondary
strip_id_or_class: con1col
strip: //h3[@class='conheader']

replace_string(<a name="more">&nbsp;</a>): <!-- no more -->
replace_string(<div class="transcript">): <div class="transcript"><h2>Transcript</h2>
replace_string(<div class="transcript storytext">): <div class="transcript storytext"><h2>Transcript</h2>

http_header(Cookie): trackingChoice=true; choiceVersion=1

prune: no
strip://div[@class="ecommercepop"]
strip://span[@class="bull"]
strip://span[@class="purchaseLink"]
strip://div[@class="enlarge_html"]
strip://div[@class="enlarge_measure"]
strip://div[@class="container con1col small"]
strip://a[contains(@class, "enlargebtn")]
strip://div[contains(@class, "bucketwrap internallink")]

test_url: http://www.npr.org/blogs/thetwo-way/2011/07/12/137799301/sports-loses-its-escapist-gleam-in-a-summer-of-court-dates
test_url: http://www.npr.org/2012/07/04/156190948/feeling-under-siege-catholic-leadership-shifts-right
test_url: http://www.npr.org/2012/12/13/166480907/the-years-best-sci-fi-crosses-galaxies-and-genres
test_url: http://www.npr.org/templates/story/story.php?storyId=229103221

0 comments on commit 1e56056

Please sign in to comment.