Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix cookies injection into request #172

Merged
merged 2 commits into from
Nov 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace Graby\Extractor;

use Guzzle\Parser\Cookie\CookieParser;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Message\Response;
Expand Down Expand Up @@ -127,7 +128,7 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
// don't add an empty line with cookie if none are defined
$cookie = $this->getCookie($url, $httpHeader);
if ($cookie) {
$options['headers']['Cookie'] = $cookie;
$options['cookies'] = $cookie;
}

try {
Expand Down Expand Up @@ -406,17 +407,24 @@ private function getReferer($url, $httpHeader = [])
* @param string $url Absolute url
* @param array $httpHeader Custom HTTP Headers from SiteConfig
*
* @return string
* @return array|false
*/
private function getCookie($url, $httpHeader = [])
{
    // No cookie defined in the site config: return false so the caller's
    // `if ($cookie)` guard leaves the request options untouched.
    if (empty($httpHeader['cookie'])) {
        return false;
    }

    $this->logger->log('debug', 'Found cookie "{cookie}" for url "{url}" from site config', ['cookie' => $httpHeader['cookie'], 'url' => $url]);

    // The result is handed to the `cookies` request option, so the raw
    // header string has to be parsed rather than returned as-is.
    $parser = new CookieParser();
    $data = $parser->parseCookie($httpHeader['cookie']);

    // The parser signals an unparsable cookie string with false.
    if (false === $data) {
        return false;
    }

    return $data['cookies'];
}

/**
Expand Down
32 changes: 32 additions & 0 deletions tests/GrabyFunctionalTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -491,4 +491,36 @@ public function testJsonLd()
$this->assertCount(1, $res['authors']);
$this->assertSame('Jeremy Goujon', $res['authors'][0]);
}

/**
 * Fetch an NPR article using the fixture site config (which defines a
 * Cookie http_header) and check that the real article is returned.
 */
public function testCookie()
{
    $siteConfigDir = __DIR__ . '/fixtures/site_config';
    $config = [
        'debug' => true,
        'extractor' => [
            'config_builder' => [
                'site_config' => [$siteConfigDir],
            ],
        ],
    ];

    $graby = new Graby($config);
    $res = $graby->fetchContent('http://www.npr.org/sections/parallels/2017/05/19/529148729/michael-flynns-contradictory-line-on-russia');

    $this->assertCount(12, $res);

    // every key the result is expected to expose, checked in a fixed order
    $expectedKeys = [
        'status',
        'html',
        'title',
        'language',
        'date',
        'authors',
        'url',
        'content_type',
        'summary',
        'open_graph',
        'native_ad',
        'all_headers',
    ];
    foreach ($expectedKeys as $expectedKey) {
        $this->assertArrayHasKey($expectedKey, $res);
    }

    $this->assertSame(200, $res['status']);
    // if the cookie wasn't taken into account, the title would be "NPR Choice page"
    $this->assertSame('Michael Flynn\'s Contradictory Line On Russia', $res['title']);
}
}
36 changes: 36 additions & 0 deletions tests/fixtures/site_config/npr.org.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
title: //div[contains(@class, 'storytitle')]//h1
author: //p[@class="byline"]/span
body: //div[@id='primaryaudio']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext' or @id='supplementarycontent' or contains(@class, 'transcript')]
date: //meta[@name="date"]/@content

strip_id_or_class: enlarge_measure
strip_id_or_class: enlarge_html
strip: //a[contains(@class, 'enlargeicon')]
strip: //div[contains(@class, 'bookedition')]
strip: //div[@class='textsize']
strip: //ul[@class='genres']
strip: //span[@class='bull']
strip_id_or_class: secondary
strip_id_or_class: con1col
strip: //h3[@class='conheader']

replace_string(<a name="more">&nbsp;</a>): <!-- no more -->
replace_string(<div class="transcript">): <div class="transcript"><h2>Transcript</h2>
replace_string(<div class="transcript storytext">): <div class="transcript storytext"><h2>Transcript</h2>

http_header(Cookie): trackingChoice=true; choiceVersion=1

prune: no
strip://div[@class="ecommercepop"]
strip://span[@class="bull"]
strip://span[@class="purchaseLink"]
strip://div[@class="enlarge_html"]
strip://div[@class="enlarge_measure"]
strip://div[@class="container con1col small"]
strip://a[contains(@class, "enlargebtn")]
strip://div[contains(@class, "bucketwrap internallink")]

test_url: http://www.npr.org/blogs/thetwo-way/2011/07/12/137799301/sports-loses-its-escapist-gleam-in-a-summer-of-court-dates
test_url: http://www.npr.org/2012/07/04/156190948/feeling-under-siege-catholic-leadership-shifts-right
test_url: http://www.npr.org/2012/12/13/166480907/the-years-best-sci-fi-crosses-galaxies-and-genres
test_url: http://www.npr.org/templates/story/story.php?storyId=229103221