From c1c998dd134c0c6be660d78ec4ffe75547bd0a41 Mon Sep 17 00:00:00 2001 From: ORelio Date: Sat, 29 Jan 2022 06:29:01 +0100 Subject: [PATCH] [GBAtempBridge] Fix content extraction (#2314) Bridge was broken since GBAtemp's Xenforo 2 upgrade on 2021-09-23 --- bridges/GBAtempBridge.php | 80 +++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/bridges/GBAtempBridge.php b/bridges/GBAtempBridge.php index 8f9a55bdc20..79fe313fecd 100644 --- a/bridges/GBAtempBridge.php +++ b/bridges/GBAtempBridge.php @@ -32,45 +32,48 @@ private function buildItem($uri, $title, $author, $timestamp, $thumbnail, $conte return $item; } + private function decodeHtmlEntities($text) { + $text = html_entity_decode($text); + $convmap = array(0x0, 0x2FFFF, 0, 0xFFFF); + return trim(mb_decode_numericentity($text, $convmap, 'UTF-8')); + } + private function cleanupPostContent($content, $site_url){ - $content = str_replace(':arrow:', '➤', $content); - $content = str_replace('href="attachments/', 'href="' . $site_url . 'attachments/', $content); + $content = defaultLinkTo($content, self::URI); $content = stripWithDelimiters($content, ''); - return $content; + $content = stripWithDelimiters($content, ''); + $content = stripRecursiveHTMLSection($content, 'div', '
find('span.DateTime', 0); - $time = DateTime::createFromFormat( - 'M j, Y \a\t g:i A', - extractFromDelimiters( - $dateField->outertext, - 'title="', - '"' - ) - )->getTimestamp(); + $time = strtotime($dateField->datetime); } return $time; } + private function findItemImage($item, $selector){ + $img = extractFromDelimiters($item->find($selector, 0)->style, 'url(', ')'); + $paramPos = strpos($img, '?'); + if ($paramPos !== false) { + $img = substr($img, 0, $paramPos); + } + if (!str_ends_with($img, '.png') && !str_ends_with($img, '.jpg')) { + $img = $img . '#.image'; + } + return urljoin(self::URI, $img); + } + private function fetchPostContent($uri, $site_url){ $html = getSimpleHTMLDOMCached($uri); if(!$html) { return 'Could not request GBAtemp: ' . $uri; } - $content = $html->find('div.messageContent, blockquote.baseHtml', 0)->innertext; + $content = $html->find('article.message-body', 0)->innertext; return $this->cleanupPostContent($content, $site_url); } @@ -80,12 +83,12 @@ public function collectData(){ switch($this->getInput('type')) { case 'N': - foreach($html->find('li[class=news_item news full]') as $newsItem) { - $url = self::URI . $newsItem->find('a', 0)->href; - $img = $this->getURI() . extractFromDelimiters($newsItem->find('a.news_image', 0)->style, 'url(', ')') . '#.image'; + foreach($html->find('li.news_item.full') as $newsItem) { + $url = urljoin(self::URI, $newsItem->find('a', 0)->href); + $img = $this->findItemImage($newsItem, 'a.news_image'); $time = $this->findItemDate($newsItem); $author = $newsItem->find('a.username', 0)->plaintext; - $title = $newsItem->find('a', 1)->plaintext; + $title = $this->decodeHtmlEntities($newsItem->find('h3.news_title', 0)->plaintext); $content = $this->fetchPostContent($url, self::URI); $this->items[] = $this->buildItem($url, $title, $author, $time, $img, $content); unset($newsItem); // Some items are heavy, freeing the item proactively helps saving memory @@ -93,26 +96,23 @@ public function collectData(){ break; case 'R': foreach($html->find('li.portal_review') as $reviewItem) { - $url = self::URI . $reviewItem->find('a', 0)->href; - $img = $this->getURI() . extractFromDelimiters($reviewItem->find('a', 0)->style, 'image:url(', ')'); - $title = $reviewItem->find('span.review_title', 0)->plaintext; - $content = getSimpleHTMLDOM($url); - $author = $content->find('a.username', 0)->plaintext; + $url = urljoin(self::URI, $reviewItem->find('a.review_boxart', 0)->href); + $img = $this->findItemImage($reviewItem, 'a.review_boxart'); + $title = $this->decodeHtmlEntities($reviewItem->find('h2.review_title', 0)->plaintext); + $content = getSimpleHTMLDOMCached($url); + $author = $content->find('span.author--name', 0)->plaintext; $time = $this->findItemDate($content); - $intro = '

' . ($content->find('div#review_intro', 0)->plaintext) . '

'; + $intro = '

' . ($content->find('div#review_introduction', 0)->plaintext) . '

'; $review = $content->find('div#review_main', 0)->innertext; - $subheader = '

' . $content->find('div.review_subheader', 0)->plaintext . '

'; - $procons = $content->find('table.review_procons', 0)->outertext; - $scores = $content->find('table.reviewscores', 0)->outertext; - $content = $this->cleanupPostContent($intro . $review . $subheader . $procons . $scores, self::URI); + $content = $this->cleanupPostContent($intro . $review, self::URI); $this->items[] = $this->buildItem($url, $title, $author, $time, $img, $content); unset($reviewItem); // Free up memory } break; case 'T': foreach($html->find('li.portal-tutorial') as $tutorialItem) { - $url = self::URI . $tutorialItem->find('a', 1)->href; - $title = $tutorialItem->find('a', 1)->plaintext; + $url = urljoin(self::URI, $tutorialItem->find('a', 1)->href); + $title = $this->decodeHtmlEntities($tutorialItem->find('a', 1)->plaintext); $time = $this->findItemDate($tutorialItem); $author = $tutorialItem->find('a.username', 0)->plaintext; $content = $this->fetchPostContent($url, self::URI); @@ -122,8 +122,8 @@ public function collectData(){ break; case 'F': foreach($html->find('li.rc_item') as $postItem) { - $url = self::URI . $postItem->find('a', 1)->href; - $title = $postItem->find('a', 1)->plaintext; + $url = urljoin(self::URI, $postItem->find('a', 1)->href); + $title = $this->decodeHtmlEntities($postItem->find('a', 1)->plaintext); $time = $this->findItemDate($postItem); $author = $postItem->find('a.username', 0)->plaintext; $content = $this->fetchPostContent($url, self::URI);