Skip to content

Commit

Permalink
Merge pull request #190 from j0k3r/if-page-contains
Browse files Browse the repository at this point in the history
Handle "if_page_contains" for "single_page_link"
  • Loading branch information
j0k3r authored Jan 8, 2019
2 parents ca4aaba + b456522 commit 7ac1e2d
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,18 @@ private function getSinglePage($html, $url)
$singlePageUrl = null;

foreach ($siteConfig->single_page_link as $pattern) {
// Do we have conditions?
$condition = $siteConfig->getIfPageContainsCondition('single_page_link', $pattern);

if ($condition) {
$elems = $xpath->evaluate($condition, $readability->dom);

// move on to next single page link XPath in case condition isn't met
if (!($elems instanceof \DOMNodeList && $elems->length > 0)) {
continue;
}
}

$elems = $xpath->evaluate($pattern, $readability->dom);

if (\is_string($elems)) {
Expand Down
26 changes: 26 additions & 0 deletions src/SiteConfig/ConfigBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,15 @@ public function mergeConfig(SiteConfig $currentConfig, SiteConfig $newConfig)
$currentConfig->$var = array_unique(array_merge($currentConfig->$var, $newConfig->$var));
}

// special handling of if_page_contains directive
foreach (['single_page_link'] as $var) {
if (isset($currentConfig->if_page_contains[$var]) && isset($newConfig->if_page_contains[$var])) {
$currentConfig->if_page_contains[$var] = array_merge($newConfig->if_page_contains[$var], $currentConfig->if_page_contains[$var]);
} elseif (isset($newConfig->if_page_contains[$var])) {
$currentConfig->if_page_contains[$var] = $newConfig->if_page_contains[$var];
}
}

// check for single statement commands
// we do not overwrite existing non null values
foreach (['tidy', 'prune', 'parser', 'autodetect_on_failure', 'requires_login', 'skip_json_ld'] as $var) {
Expand Down Expand Up @@ -359,9 +368,26 @@ public function parseLines(array $lines)
array_push($config->replace_string, $val);
} elseif ((')' === substr($command, -1)) && preg_match('!^([a-z0-9_]+)\(([a-z0-9_-]+)\)$!i', $command, $match) && 'http_header' === $match[1] && \in_array(strtolower($match[2]), ['user-agent', 'referer', 'cookie', 'accept'], true)) {
$config->http_header[strtolower(trim($match[2]))] = $val;
// special treatment for if_page_contains
} elseif (\in_array($command, ['if_page_contains'], true)) {
$this->handleIfPageContainsCondition($config, $val);
}
}

return $config;
}

/**
 * Register an if_page_contains condition for the last single_page_link entry.
 *
 * The condition is written to $config->if_page_contains['single_page_link'], keyed by the
 * last element of $config->single_page_link. When that list is empty, nothing is stored.
 *
 * TODO: Expand so it can be used with other rules too
 *
 * @param SiteConfig $config    Config object being filled in (its if_page_contains property is updated)
 * @param mixed      $condition Condition to store; it is cast to string first
 */
private function handleIfPageContainsCondition(SiteConfig $config, $condition)
{
    if (!empty($config->single_page_link)) {
        // end() moves the array's internal pointer, so rewind it right after reading the value
        $lastPattern = end($config->single_page_link);
        reset($config->single_page_link);

        $config->if_page_contains['single_page_link'][$lastPattern] = (string) $condition;
    }
}
}
19 changes: 19 additions & 0 deletions src/SiteConfig/SiteConfig.php
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ class SiteConfig
// Test URL - if present, can be used to test the config above
public $test_url = [];

// If page contains - XPath expression. Used to determine if the preceding rule gets evaluated or not.
// Currently only works with single_page_link.
// Stored as ['single_page_link' => [<single_page_link pattern> => <condition XPath>]].
public $if_page_contains = [];

// Single-page link - should identify a link element or URL pointing to the page holding the entire article
// This is useful for sites which split their articles across multiple pages. Links to such pages tend to
// display the first page with links to the other pages at the bottom. Often there is also a link to a page
Expand Down Expand Up @@ -207,4 +211,19 @@ public function autodetect_on_failure($use_default = true)

return $this->autodetect_on_failure;
}

/**
 * Look up the if_page_contains condition registered for a given rule value.
 *
 * @param string $name  Rule name (only single_page_link is supported for now)
 * @param string $value Value of that rule, used as the lookup key (for single_page_link, the pattern itself)
 *
 * @return string|null The stored condition, or null when none is registered for this name/value pair
 */
public function getIfPageContainsCondition($name, $value)
{
    // A nested isset() also covers the case where no entry exists for $name at all
    if (!isset($this->if_page_contains[$name][$value])) {
        return null;
    }

    return $this->if_page_contains[$name][$value];
}
}
36 changes: 36 additions & 0 deletions tests/GrabyTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -1633,6 +1633,42 @@ public function testAuthors($url, $file, $expectedAuthors)
$this->assertSame($expectedAuthors, $res['authors']);
}

/**
 * Fetch a mocked page using the site_config in "tests/fixtures" and check the shape of the result.
 */
public function testIfPageContains()
{
    $siteConfigDir = __DIR__ . '/fixtures/site_config';

    $graby = $this->getGrabyWithMock(
        '/fixtures/content/timothysykes-keepol.html',
        200,
        ['extractor' => ['config_builder' => ['site_config' => [$siteConfigDir]]]]
    );

    $res = $graby->fetchContent('https://www.timothysykes.com/blog/10-things-know-short-selling/');

    // Every key the result array is expected to expose, in the order they are checked
    $expectedKeys = [
        'status',
        'html',
        'title',
        'language',
        'date',
        'authors',
        'url',
        'content_type',
        'summary',
        'open_graph',
        'native_ad',
        'all_headers',
    ];

    $this->assertCount(12, $res);

    foreach ($expectedKeys as $expectedKey) {
        $this->assertArrayHasKey($expectedKey, $res);
    }

    $this->assertSame(200, $res['status']);
}

/**
* Return an instance of graby with a mocked Guzzle client returning data from a predefined file.
*/
Expand Down
8 changes: 8 additions & 0 deletions tests/SiteConfig/ConfigBuilderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ public function testBuildFromArray()
'http_header(Cookie): GDPR_consent=1',
'strip_attr: @class',
'strip_attr: @style',
'single_page_link: //canonical',
'if_page_contains: //div/article/header',
]);

$configExpected = new SiteConfig();
Expand All @@ -59,6 +61,12 @@ public function testBuildFromArray()
];
$configExpected->date = ['foo'];
$configExpected->strip_attr = ['@class', '@style'];
$configExpected->single_page_link = ['//canonical'];
$configExpected->if_page_contains = [
'single_page_link' => [
'//canonical' => '//div/article/header',
],
];

$this->assertEquals($configExpected, $configActual);

Expand Down
2 changes: 2 additions & 0 deletions tests/fixtures/site_config/timothysykes.com.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_print.html")
if_page_contains: //link[@rel="canonical" and contains(@href, '_story.html')]

0 comments on commit 7ac1e2d

Please sign in to comment.