Skip to content

Commit

Permalink
[GQMagazineBridge] Fix bridge (RSS-Bridge#1195)
Browse files Browse the repository at this point in the history
* Fix bridge by changing the way the articles are loaded AND their titles are found
Riduidel authored and logmanoriginal committed Jun 28, 2019
1 parent aab6eae commit cc32955
Showing 1 changed file with 39 additions and 29 deletions.
68 changes: 39 additions & 29 deletions bridges/GQMagazineBridge.php
Original file line number Diff line number Diff line change
@@ -40,6 +40,11 @@ class GQMagazineBridge extends BridgeAbstract
'data-original' => 'src'
);

// Heading tags that may carry an article's title inside a teaser link.
// findTitleOf() tries them in the order listed here.
const POSSIBLE_TITLES = array(
'h2',
'h3'
);

private function getDomain() {
$domain = $this->getInput('domain');
if (empty($domain))
@@ -54,6 +59,17 @@ public function getURI()
return $this->getDomain() . '/' . $this->getInput('page');
}

/**
 * Extracts the title text from a teaser link.
 *
 * The tags listed in self::POSSIBLE_TITLES are tried in order. The first
 * one that exists inside the link and holds non-blank text wins, so an
 * empty heading does not shadow a populated one further down the list.
 *
 * @param object $link Element exposing find($selector, $index)
 *                     (presumably a simple_html_dom node - confirm against caller).
 * @return string|null Trimmed title text, or null when no candidate tag
 *                     carries any text.
 */
private function findTitleOf($link) {
    foreach (self::POSSIBLE_TITLES as $tag) {
        $title = $link->find($tag, 0);
        if($title === null) {
            continue;
        }
        // Read the property once and keep the result in a local.
        $text = $title->plaintext;
        if($text === null) {
            continue;
        }
        $text = trim($text);
        if($text !== '') {
            return $text;
        }
    }
    // Made explicit: no candidate heading produced a usable title.
    return null;
}

public function collectData()
{
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
@@ -62,30 +78,33 @@ public function collectData()
$main = $html->find('main', 0);
foreach ($main->find('a') as $link) {
$uri = $link->href;
$title = $link->find('h2', 0);
$date = $link->find('time', 0);

$item = array();
$author = $link->find('span[itemprop=name]', 0);
$item['author'] = $author->plaintext;
$item['title'] = $title->plaintext;
if(substr($uri, 0, 1) === 'h') { // absolute uri
$item['uri'] = $uri;
} else if(substr($uri, 0, 1) === '/') { // domain relative url
$item['uri'] = $this->getDomain() . $uri;
} else {
$item['uri'] = $this->getDomain() . '/' . $uri;
}

$article = $this->loadFullArticle($item['uri']);
if($article) {
$item['content'] = $this->replaceUriInHtmlElement($article);
} else {
$item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
if($author !== null) {
$item['author'] = $author->plaintext;
$item['title'] = $this->findTitleOf($link);
switch(substr($uri, 0, 1)) {
case 'h': // absolute uri
$item['uri'] = $uri;
break;
case '/': // domain relative uri
$item['uri'] = $this->getDomain() . $uri;
break;
default:
$item['uri'] = $this->getDomain() . '/' . $uri;
}
$article = $this->loadFullArticle($item['uri']);
if($article) {
$item['content'] = $this->replaceUriInHtmlElement($article);
} else {
$item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
}
$short_date = $date->datetime;
$item['timestamp'] = strtotime($short_date);
$this->items[] = $item;
}
$short_date = $date->datetime;
$item['timestamp'] = strtotime($short_date);
$this->items[] = $item;
}
}

@@ -96,16 +115,7 @@ public function collectData()
*/
private function loadFullArticle($uri){
    $html = getSimpleHTMLDOMCached($uri);
    // Guard in case the helper hands back a falsy value (fetch or parse
    // failure): there is nothing to search, so report "no article body".
    if(!$html) {
        return null;
    }
    // GQ generates unstable CSS class names (e.g. "ArticleBodySection-fkggUW"),
    // so the article body is located through its data-test-id attribute
    // rather than by scanning every div's class list. The lookup used to sit
    // behind an unconditional "return null" and was never reached.
    return $html->find('section[data-test-id=ArticleBodyContent]', 0);
}

/**

0 comments on commit cc32955

Please sign in to comment.