Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NationalGeographicBridge] Rewrite bridge #2177

Merged
merged 16 commits into from
Oct 1, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
312 changes: 240 additions & 72 deletions bridges/NationalGeographicBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ class NationalGeographicBridge extends BridgeAbstract {
const PARAMETER_FULL_ARTICLE = 'full';
const TOPIC_MAGAZINE = 'Magazine';
const TOPIC_LATEST_STORIES = 'Latest Stories';
const CACHE_TIMEOUT = 900; //15 min

// Bridge metadata: display name, site root, description and maintainer.
const NAME = 'National Geographic';
const URI = 'https://www.nationalgeographic.com/';
const DESCRIPTION = 'Fetches the latest articles from the National Geographic Magazine';
// A class constant may be declared only once, so a single maintainer entry is kept.
const MAINTAINER = 'csisoap';
const PARAMETERS = array(
self::CONTEXT_BY_TOPIC => array(
self::PARAMETER_TOPIC => array(
Expand All @@ -28,12 +29,22 @@ class NationalGeographicBridge extends BridgeAbstract {
self::PARAMETER_FULL_ARTICLE => array(
'name' => 'Full Article',
'type' => 'checkbox',
'title' => 'Enable to load full articles (takes longer)'
'title' => 'Enable to load full articles and other infos (takes longer)'
)
)
);

// Topic name resolved in collectData() from the topic input parameter.
private $topicName = '';
// Value sent as the `context` query parameter by getAPIURL(). The literal is
// wrapped over several lines; getAPIURL() removes the whitespace before use.
// NOTE(review): looks like base64-encoded JSON describing the latest-stories
// hub locator — confirm before editing.
const CONTEXT = 'eyJjb250ZW50VHlwZSI6IlVuaXNvbkh1YiIsInZhcmlhYmxlcyI6eyJsb2NhdG9yIjoiL3BhZ2VzL3
RvcGljL2xhdGVzdC1zdG9yaWVzIiwicG9ydGZvbGlvIjoibmF0Z2VvIiwicXVlcn
lUeXBlIjoiTE9DQVRPUiJ9LCJtb2R1bGVJZCI6bnVsbH0';
// Feed module ids requested by collectLatestStories(), one request per id.
const LATEST_STORIES_ID = array(
'1df278bb-0e3d-4a67-a0ce-8fae48392822-f2-m1'
);
// Feed module ids requested by collectMagazine(), one request per id.
const MAGAZINE_ID = array(
'94d87d74-f41a-4a32-9acd-b591ba2df288-f2-m1',
'94d87d74-f41a-4a32-9acd-b591ba2df288-f5-m2',
);

public function getURI() {
switch ($this->queriedContext) {
Expand All @@ -46,9 +57,16 @@ public function getURI() {
}
}

/**
 * Builds the proxy/hub request URL for the feed module with the given id.
 *
 * @param string $id Feed module id (see LATEST_STORIES_ID and MAGAZINE_ID).
 * @return string Absolute request URL.
 */
private function getAPIURL($id) {
    // CONTEXT is wrapped over several lines in the source file; drop every
    // whitespace run so the token is sent as one contiguous string. `\s+`
    // avoids the useless empty matches that `\s*` produces at each offset.
    $context = preg_replace('/\s+/', '', self::CONTEXT);

    // The id is encoded so that it can never alter the query string.
    return 'https://www.nationalgeographic.com/proxy/hub?context=' . $context
        . '&id=' . urlencode($id)
        . '&moduleType=InfiniteFeedModule&_xhr=pageContent';
}

public function collectData() {
$this->topicName = $this->getTopicName($this->getInput(self::PARAMETER_TOPIC));

switch($this->topicName) {
case self::TOPIC_MAGAZINE: {
return $this->collectMagazine();
Expand Down Expand Up @@ -78,117 +96,267 @@ private function getTopicName($topic) {
}

/**
 * Collects the stories of the "Magazine" topic.
 *
 * Requests every feed module listed in MAGAZINE_ID, gathers the entries of
 * each response's 'tiles' list and hands every entry to addStory().
 */
private function collectMagazine() {
    $stories = array();

    foreach (self::MAGAZINE_ID as $id) {
        $uri = $this->getAPIURL($id);

        $json_raw = getContents($uri)
            or returnServerError('Could not request ' . $uri);

        // Fail with a clear message instead of indexing into a response
        // that is not the expected JSON document.
        $json = json_decode($json_raw, true);
        if (!is_array($json) || !isset($json['tiles']) || !is_array($json['tiles'])) {
            returnServerError('Unexpected response from ' . $uri);
        }

        // Tiles of a later module are placed in front of the earlier ones.
        $stories = array_merge($json['tiles'], $stories);
    }

    foreach ($stories as $story) {
        $this->addStory($story);
    }
}

/**
 * Collects the stories of the "Latest Stories" topic.
 *
 * Requests every feed module listed in LATEST_STORIES_ID, gathers the
 * entries of each response's 'tiles' list in request order and hands every
 * entry to addStory().
 */
private function collectLatestStories() {
    $stories = array();

    foreach (self::LATEST_STORIES_ID as $id) {
        $uri = $this->getAPIURL($id);

        $json_raw = getContents($uri)
            or returnServerError('Could not request ' . $uri);

        // Fail with a clear message instead of indexing into a response
        // that is not the expected JSON document.
        $json = json_decode($json_raw, true);
        if (!is_array($json) || !isset($json['tiles']) || !is_array($json['tiles'])) {
            returnServerError('Unexpected response from ' . $uri);
        }

        $stories = array_merge($stories, $json['tiles']);
    }

    foreach ($stories as $story) {
        $this->addStory($story);
    }
}

/**
 * Converts one tile of the hub feed into a feed item and appends it to
 * $this->items.
 *
 * @param array $story Decoded tile. The keys read here are 'ctas', 'title',
 *                     'description', 'img' and 'tags'; each of them is
 *                     treated as optional.
 */
private function addStory($story) {
    $content = '';
    $story_type = '';
    $uri = '';

    // When a tile carries several call-to-action entries the last one wins.
    if (isset($story['ctas']) && is_array($story['ctas'])) {
        foreach ($story['ctas'] as $component) {
            $uri = isset($component['url']) ? $component['url'] : '';
            $story_type = isset($component['icon']) ? $component['icon'] : '';
        }
    }

    if (isset($story['description'])) {
        $content = '<p>' . $story['description'] . '</p>';
    }

    $item = array();
    $item['uri'] = $uri;
    $item['title'] = isset($story['title']) ? $story['title'] : 'Unknown title';
    $item['content'] = $content;

    // Interactive pages expose little data and need JavaScript to render,
    // so the full article is only fetched for every other story type.
    if ($this->getInput(self::PARAMETER_FULL_ARTICLE) && $story_type !== 'interactive') {
        $article_data = $this->getFullArticle($item['uri']);
        $item['timestamp'] = $article_data['published_date'];
        $item['author'] = $article_data['authors'];
        $item['content'] = $content . $article_data['content'];
    }

    // Only add an enclosure when the tile really has an image source.
    if (isset($story['img']['src'])) {
        $item['enclosures'][] = $story['img']['src'];
    }

    if (isset($story['tags']) && is_array($story['tags'])) {
        foreach ($story['tags'] as $tag) {
            if (isset($tag['name'])) {
                $item['categories'][] = $tag['name'];
            }
        }
    }

    $this->items[] = $item;
}

/**
 * Picks the article module out of a page's frame list.
 *
 * Looks at the frames whose 'id' is 'natgeo-template1-frame-1' and, inside
 * their 'mods' lists, at the modules whose 'id' is
 * 'natgeo-template1-frame-1-module-1'.
 *
 * @param array $data List of frames.
 * @return array|null The first matching module, or null when no frame
 *                    contains one.
 */
private function filterArticleData($data) {
    if (!is_array($data)) {
        return null;
    }

    foreach ($data as $frame) {
        // Frames without an id or without a module list cannot match.
        if (!isset($frame['id'], $frame['mods']) || !is_array($frame['mods'])) {
            continue;
        }
        if ($frame['id'] !== 'natgeo-template1-frame-1') {
            continue;
        }

        foreach ($frame['mods'] as $module) {
            if (isset($module['id']) && $module['id'] === 'natgeo-template1-frame-1-module-1') {
                return $module;
            }
        }
    }

    return null;
}

/**
 * Renders one media module of an article as an HTML <figure>.
 *
 * @param array  $image_module Decoded media module.
 * @param string $image_type   'image', 'imagegroup', 'photogallery' or
 *                             'video'. Any other value yields a figure
 *                             with empty source and alternative text.
 * @return string HTML markup of the figure.
 */
private function handleImages($image_module, $image_type) {
    $image_alt = '';
    $image_credit = '';
    $image_src = '';
    $caption = '';

    switch ($image_type) {
        case 'image':
        case 'imagegroup':
            $image = isset($image_module['image']) ? $image_module['image'] : array();
            $image_src = isset($image['src']) ? $image['src'] : '';
            // The module's own alt text takes precedence over the image's.
            if (isset($image_module['alt'])) {
                $image_alt = $image_module['alt'];
            } elseif (isset($image['altText'])) {
                $image_alt = $image['altText'];
            }
            if (isset($image['crdt'])) {
                $image_credit = $image['crdt'];
            }
            $caption = isset($image_module['caption']) ? $image_module['caption'] : '';
            break;
        case 'photogallery':
            $image_credit = isset($image_module['caption']['credit']) ? $image_module['caption']['credit'] : '';
            $caption = isset($image_module['caption']['text']) ? $image_module['caption']['text'] : '';
            $image_src = isset($image_module['img']['src']) ? $image_module['img']['src'] : '';
            $image_alt = isset($image_module['img']['altText']) ? $image_module['img']['altText'] : '';
            break;
        case 'video':
            $image_credit = isset($image_module['credit']) ? $image_module['credit'] : '';
            $description = isset($image_module['description']) ? $image_module['description'] : '';
            $caption = $description . ' Video can be watched on the article\'s page';
            $image_alt = isset($image_module['image']['altText']) ? $image_module['image']['altText'] : '';
            $image_src = isset($image_module['image']['src']) ? $image_module['image']['src'] : '';
            break;
    }

    // Both values end up inside double-quoted attributes, so quotes and angle
    // brackets must be escaped; entities that are already encoded are kept.
    $image_src = htmlspecialchars($image_src, ENT_QUOTES, 'UTF-8', false);
    $image_alt = htmlspecialchars($image_alt, ENT_QUOTES, 'UTF-8', false);

    // The caption is emitted as-is (not escaped).
    $image_caption = $caption . ' ' . $image_credit
        . '. Notes: Some image may have copyrighted on it.';

    // The closing heredoc marker stays in column 0 for PHP < 7.3.
    $wrapper = <<<EOD
<figure>
<img src="{$image_src}" alt="{$image_alt}">
<figcaption>$image_caption</figcaption>
</figure>
EOD;
    return $wrapper;
}

/**
 * Downloads an article page and converts its embedded data into feed fields.
 *
 * The page carries its content as JSON assigned to window['__natgeo__'].
 * The article module is located with filterArticleData() and its
 * paragraphs, headings, lists and inline modules are rendered to HTML.
 * When the page does not have the expected structure the fields that could
 * not be read keep their empty defaults instead of raising warnings.
 *
 * @param string $uri Article URL.
 * @return array Map with the keys 'content' (HTML string), 'published_date'
 *               (the article's 'pbDt' value, null when unavailable) and
 *               'authors' (display names separated by ', ').
 */
private function getFullArticle($uri) {
    $html = getContents($uri)
        or returnServerError('Could not load ' . $uri);

    $result = array(
        'content' => '',
        'published_date' => null,
        'authors' => ''
    );

    $scriptRegex = '/window\[\'__natgeo__\'\]=(.*);<\/script>/';
    if (preg_match($scriptRegex, $html, $matches) !== 1) {
        return $result;
    }

    $json = json_decode($matches[1], true);
    if (!isset($json['page']['content']['article']['frms'])
        || !is_array($json['page']['content']['article']['frms'])) {
        return $result;
    }

    $filtered_data = $this->filterArticleData($json['page']['content']['article']['frms']);
    if (!isset($filtered_data['edgs'][0])) {
        return $result;
    }
    $article = $filtered_data['edgs'][0];

    // Only the first contributor group is used for the author list.
    if (isset($article['cntrbGrp'][0]['contributors']) && is_array($article['cntrbGrp'][0]['contributors'])) {
        $names = array();
        foreach ($article['cntrbGrp'][0]['contributors'] as $contributor) {
            if (isset($contributor['displayName'])) {
                $names[] = $contributor['displayName'];
            }
        }
        $result['authors'] = implode(', ', $names);
    }

    if (isset($article['pbDt'])) {
        $result['published_date'] = $article['pbDt'];
    }

    $article_body = (isset($article['bdy']) && is_array($article['bdy'])) ? $article['bdy'] : array();
    $content = '';

    foreach ($article_body as $body) {
        if (!isset($body['type'])) {
            continue;
        }

        switch ($body['type']) {
            case 'p':
                $content .= '<p>' . $body['cntnt']['mrkup'] . '</p>';
                break;
            case 'h2':
                $content .= '<h2>' . $body['cntnt']['mrkup'] . '</h2>';
                break;
            case 'inline':
                $content .= $this->renderInlineModule(isset($body['cntnt']) ? $body['cntnt'] : array());
                break;
            case 'ul':
                $content .= $body['cntnt']['mrkup'] . '<hr>';
                break;
        }
    }

    $result['content'] = $content;

    return $result;
}

/**
 * Renders one inline module of the article body (selected by its
 * 'cmsType'); empty or unknown modules produce an empty string.
 *
 * @param array $module Decoded inline module.
 * @return string HTML markup.
 */
private function renderInlineModule($module) {
    if (empty($module) || !isset($module['cmsType'])) {
        return '';
    }

    $type = $module['cmsType'];
    $markup = '';

    switch ($type) {
        case 'image':
        case 'video':
            $markup .= $this->handleImages($module, $type);
            break;
        case 'imagegroup':
            $images = (isset($module['images']) && is_array($module['images'])) ? $module['images'] : array();
            foreach ($images as $image) {
                $markup .= $this->handleImages($image, $type);
            }
            break;
        case 'editorsNote':
            $markup .= isset($module['note']) ? $module['note'] : '';
            break;
        case 'listicle':
            $markup .= '<h2>' . (isset($module['title']) ? $module['title'] : '') . '</h2>';
            if (isset($module['image']['cmsType'])) {
                $markup .= $this->handleImages($module['image'], $module['image']['cmsType']);
            }
            $markup .= '<p>' . (isset($module['text']) ? $module['text'] : '') . '</p>';
            break;
        case 'photogallery':
            $gallery = (isset($module['media']) && is_array($module['media'])) ? $module['media'] : array();
            foreach ($gallery as $image) {
                $markup .= $this->handleImages($image, $type);
            }
            break;
        case 'pullquote':
            $markup .= $this->renderPullquote($module);
            break;
    }

    return $markup;
}

/**
 * Renders a pull quote module as a <figure> holding the quote and its byline.
 *
 * @param array $module Decoded 'pullquote' module.
 * @return string HTML markup.
 */
private function renderPullquote($module) {
    $quote = isset($module['quote']) ? $module['quote'] : '';
    $quoted = isset($module['byLineProps']['authors']) ? $module['byLineProps']['authors'] : array();

    // One "name, description" entry per author; the description is optional
    // and the entries are separated so that several authors stay readable.
    $bylines = array();
    foreach ($quoted as $author) {
        $byline = isset($author['displayName']) ? $author['displayName'] : '';
        if (isset($author['authorDesc']) && $author['authorDesc'] !== '') {
            $byline .= ', ' . $author['authorDesc'];
        }
        $bylines[] = $byline;
    }
    $author_name = implode('; ', $bylines);

    // The closing heredoc marker stays in column 0 for PHP < 7.3.
    return <<<EOD
<figure>
<blockquote>
<p>$quote</p>
</blockquote>
<figcaption>$author_name</figcaption>
</figure>
EOD;
}
}