diff --git a/actions/ConnectivityAction.php b/actions/ConnectivityAction.php index c11e6595fa1..604b78069d9 100644 --- a/actions/ConnectivityAction.php +++ b/actions/ConnectivityAction.php @@ -37,12 +37,14 @@ public function execute(array $request) throw new \Exception('This action is only available in debug mode!'); } - if (!isset($request['bridge'])) { + $bridgeName = $request['bridge'] ?? null; + if (!$bridgeName) { return render_template('connectivity.html.php'); } - - $bridgeClassName = $this->bridgeFactory->createBridgeClassName($request['bridge']); - + $bridgeClassName = $this->bridgeFactory->createBridgeClassName($bridgeName); + if (!$bridgeClassName) { + throw new \Exception(sprintf('Bridge not found: %s', $bridgeName)); + } return $this->reportBridgeConnectivity($bridgeClassName); } diff --git a/actions/DisplayAction.php b/actions/DisplayAction.php index 129d45871be..7b2efec1dd4 100644 --- a/actions/DisplayAction.php +++ b/actions/DisplayAction.php @@ -33,9 +33,15 @@ public function execute(array $request) private function createResponse(array $request) { $bridgeFactory = new BridgeFactory(); - $bridgeClassName = $bridgeFactory->createBridgeClassName($request['bridge'] ?? ''); + $formatFactory = new FormatFactory(); + $bridgeName = $request['bridge'] ?? null; $format = $request['format'] ?? null; + + $bridgeClassName = $bridgeFactory->createBridgeClassName($bridgeName); + if (!$bridgeClassName) { + throw new \Exception(sprintf('Bridge not found: %s', $bridgeName)); + } if (!$format) { throw new \Exception('You must specify a format!'); } @@ -43,7 +49,6 @@ private function createResponse(array $request) throw new \Exception('This bridge is not whitelisted'); } - $formatFactory = new FormatFactory(); $format = $formatFactory->create($format); $bridge = $bridgeFactory->create($bridgeClassName); diff --git a/actions/FrontpageAction.php b/actions/FrontpageAction.php index 40d25ea4805..64281b1e9e2 100644 --- a/actions/FrontpageAction.php +++ b/actions/FrontpageAction.php @@ -4,12 +4,20 @@ final class FrontpageAction implements ActionInterface { public function execute(array $request) { + $messages = []; $showInactive = (bool) ($request['show_inactive'] ?? null); $activeBridges = 0; $bridgeFactory = new BridgeFactory(); $bridgeClassNames = $bridgeFactory->getBridgeClassNames(); + foreach ($bridgeFactory->getMissingEnabledBridges() as $missingEnabledBridge) { + $messages[] = [ + 'body' => sprintf('Warning : Bridge "%s" not found', $missingEnabledBridge), + 'level' => 'warning' + ]; + } + $formatFactory = new FormatFactory(); $formats = $formatFactory->getFormatNames(); @@ -24,7 +32,7 @@ public function execute(array $request) } return render(__DIR__ . '/../templates/frontpage.html.php', [ - 'messages' => [], + 'messages' => $messages, 'admin_email' => Configuration::getConfig('admin', 'email'), 'admin_telegram' => Configuration::getConfig('admin', 'telegram'), 'bridges' => $body, diff --git a/actions/SetBridgeCacheAction.php b/actions/SetBridgeCacheAction.php index a9a598bd426..416f2378804 100644 --- a/actions/SetBridgeCacheAction.php +++ b/actions/SetBridgeCacheAction.php @@ -23,7 +23,11 @@ public function execute(array $request) $bridgeFactory = new BridgeFactory(); - $bridgeClassName = $bridgeFactory->createBridgeClassName($request['bridge'] ?? ''); + $bridgeName = $request['bridge'] ?? null; + $bridgeClassName = $bridgeFactory->createBridgeClassName($bridgeName); + if (!$bridgeClassName) { + throw new \Exception(sprintf('Bridge not found: %s', $bridgeName)); + } // whitelist control if (!$bridgeFactory->isEnabled($bridgeClassName)) { diff --git a/bridges/AppleMusicBridge.php b/bridges/AppleMusicBridge.php index 4c3e0e2f294..900a7009c8f 100644 --- a/bridges/AppleMusicBridge.php +++ b/bridges/AppleMusicBridge.php @@ -40,6 +40,8 @@ public function collectData() foreach ($json->results as $obj) { if ($obj->wrapperType === 'collection') { + $copyright = $obj->copyright ?? ''; + $this->items[] = [ 'title' => $obj->artistName . ' - ' . $obj->collectionName, 'uri' => $obj->collectionViewUrl, @@ -49,7 +51,7 @@ public function collectData() . '>

' . $obj->artistName . ' - ' . $obj->collectionName . '
' - . $obj->copyright, + . $copyright, ]; } } diff --git a/bridges/AskfmBridge.php b/bridges/AskfmBridge.php index 0a32641799d..d0422890804 100644 --- a/bridges/AskfmBridge.php +++ b/bridges/AskfmBridge.php @@ -37,7 +37,8 @@ public function collectData() $item['timestamp'] = strtotime($element->find('time', 0)->datetime); - $answer = trim($element->find('div.streamItem_content', 0)->innertext); + $var = $element->find('div.streamItem_content', 0); + $answer = trim($var->innertext ?? ''); // This probably should be cleaned up, especially for YouTube embeds if ($visual = $element->find('div.streamItem_visual', 0)) { diff --git a/bridges/CssSelectorBridge.php b/bridges/CssSelectorBridge.php index 2d7489de40f..ce15875861c 100644 --- a/bridges/CssSelectorBridge.php +++ b/bridges/CssSelectorBridge.php @@ -198,6 +198,9 @@ protected function htmlFindEntries($page, $url_selector, $url_pattern = '', $lim } if ($link->tag != 'a') { $link = $link->find('a', 0); + if (is_null($link)) { + continue; + } } $item['uri'] = $link->href; $item['title'] = $link->plaintext; @@ -209,6 +212,10 @@ protected function htmlFindEntries($page, $url_selector, $url_pattern = '', $lim $link_to_item[$link->href] = $item; } + if (empty($link_to_item)) { + returnClientError('The provided URL selector matches some elements, but they do not contain links.'); + } + $links = $this->filterUrlList(array_keys($link_to_item), $url_pattern, $limit); if (empty($links)) { diff --git a/bridges/EconomistBridge.php b/bridges/EconomistBridge.php index 1b15555dffd..9a73a852744 100644 --- a/bridges/EconomistBridge.php +++ b/bridges/EconomistBridge.php @@ -159,7 +159,7 @@ private function processContent($html, $elem) $svelte->parent->removeChild($svelte); } foreach ($elem->find('img') as $strange_img) { - if (!str_contains($strange_img->src, 'https://economist.com')) { + if (!str_contains($strange_img->src, 'economist.com')) { $strange_img->src = 'https://economist.com' . $strange_img->src; } } diff --git a/bridges/FeedReducerBridge.php b/bridges/FeedReducerBridge.php index a37824c9f94..37bf9809e67 100644 --- a/bridges/FeedReducerBridge.php +++ b/bridges/FeedReducerBridge.php @@ -23,8 +23,9 @@ class FeedReducerBridge extends FeedExpander public function collectData() { - if (preg_match('#^http(s?)://#i', $this->getInput('url'))) { - $this->collectExpandableDatas($this->getInput('url')); + $url = $this->getInput('url'); + if (preg_match('#^http(s?)://#i', $url)) { + $this->collectExpandableDatas($url); } else { throw new Exception('URI must begin with http(s)://'); } @@ -35,7 +36,7 @@ public function getItems() $filteredItems = []; $intPercentage = (int)preg_replace('/[^0-9]/', '', $this->getInput('percentage')); - foreach ($this->items as $thisItem) { + foreach ($this->items as $item) { // The URL is included in the hash: // - so you can change the output by adding a local-part to the URL // - so items with the same URI in different feeds won't be correlated @@ -43,13 +44,13 @@ public function getItems() // $pseudoRandomInteger will be a 16 bit unsigned int mod 100. // This won't be uniformly distributed 1-100, but should be close enough. - $pseudoRandomInteger = unpack( - 'S', // unsigned 16-bit int - hash('sha256', $thisItem['uri'] . '::' . $this->getInput('url'), true) - )[1] % 100; + $data = $item['uri'] . '::' . $this->getInput('url'); + $hash = hash('sha256', $data, true); + // S = unsigned 16-bit int + $pseudoRandomInteger = unpack('S', $hash)[1] % 100; if ($pseudoRandomInteger < $intPercentage) { - $filteredItems[] = $thisItem; + $filteredItems[] = $item; } } diff --git a/bridges/FilterBridge.php b/bridges/FilterBridge.php index 992fe0c3779..ef739de30e1 100644 --- a/bridges/FilterBridge.php +++ b/bridges/FilterBridge.php @@ -12,7 +12,7 @@ class FilterBridge extends FeedExpander 'url' => [ 'name' => 'Feed URL', 'type' => 'text', - 'defaultValue' => 'https://lorem-rss.herokuapp.com/feed?unit=day', + 'exampleValue' => 'https://lorem-rss.herokuapp.com/feed?unit=day', 'required' => true, ], 'filter' => [ diff --git a/bridges/FourchanBridge.php b/bridges/FourchanBridge.php index 179ae91f743..f67d6026e1a 100644 --- a/bridges/FourchanBridge.php +++ b/bridges/FourchanBridge.php @@ -45,7 +45,8 @@ public function collectData() $file = $element->find('.file', 0); if (!empty($file)) { - $item['image'] = $element->find('.file a', 0)->href; + $var = $element->find('.file a', 0); + $item['image'] = $var->href ?? ''; $item['imageThumb'] = $element->find('.file img', 0)->src; if (!isset($item['imageThumb']) and strpos($item['image'], '.swf') !== false) { $item['imageThumb'] = 'http://i.imgur.com/eO0cxf9.jpg'; diff --git a/bridges/GettrBridge.php b/bridges/GettrBridge.php index 2b019523d2c..74804043049 100644 --- a/bridges/GettrBridge.php +++ b/bridges/GettrBridge.php @@ -27,9 +27,10 @@ class GettrBridge extends BridgeAbstract public function collectData() { + $user = $this->getInput('user'); $api = sprintf( 'https://api.gettr.com/u/user/%s/posts?offset=0&max=%s&dir=fwd&incl=posts&fp=f_uo', - $this->getInput('user'), + $user, min($this->getInput('limit'), 20) ); $data = json_decode(getContents($api), false); diff --git a/bridges/HeiseBridge.php b/bridges/HeiseBridge.php index ec8bb96f426..08c43dc5111 100644 --- a/bridges/HeiseBridge.php +++ b/bridges/HeiseBridge.php @@ -103,6 +103,16 @@ class HeiseBridge extends FeedExpander 'required' => false, 'title' => 'Specify number of full articles to return', 'defaultValue' => 5 + ], + 'sessioncookie' => [ + 'name' => 'Session Cookie', + 'required' => false, + 'title' => <<<'TITLE' + If you have a heise+ subscription, + you can enter your cookie (ssohls) here to + have heise+ articles displayed in full. + By default the cookie is 1 year valid. + TITLE, ] ]]; const LIMIT = 5; @@ -118,6 +128,7 @@ public function collectData() protected function parseItem($feedItem) { $item = parent::parseItem($feedItem); + $sessioncookie = $this->getInput('sessioncookie'); // strip rss parameter $item['uri'] = explode('?', $item['uri'])[0]; @@ -128,13 +139,15 @@ protected function parseItem($feedItem) } // abort on heise+ articles and link to archive.ph for full-text content - if (str_starts_with($item['title'], 'heise+ |')) { + if ($sessioncookie == '' && str_starts_with($item['title'], 'heise+ |')) { $item['uri'] = 'https://archive.ph/?run=1&url=' . urlencode($item['uri']); return $item; } $item['uri'] .= '?seite=all'; - $article = getSimpleHTMLDOMCached($item['uri']); + $article = getSimpleHTMLDOM($item['uri'], [ + 'cookie: ssohls=' . $sessioncookie + ]); if ($article) { $article = defaultLinkTo($article, $item['uri']); diff --git a/bridges/NiusBridge.php b/bridges/NiusBridge.php new file mode 100644 index 00000000000..e5773a7d1f5 --- /dev/null +++ b/bridges/NiusBridge.php @@ -0,0 +1,32 @@ + [ + 'name' => 'Site URL: Home page with latest articles', + 'title' => << 'https://example.com/blog/', + 'required' => true + ], + 'url_pattern' => [ + 'name' => 'Pattern for site URLs to take in feed', + 'title' => 'Select items by applying a regular expression on their URL', + 'exampleValue' => 'https://example.com/article/.*', + 'required' => true + ], + 'content_selector' => [ + 'name' => 'Selector for each article content', + 'title' => <<. + Everything inside that element becomes feed item content. + EOT, + 'exampleValue' => 'article.content', + 'required' => true + ], + 'content_cleanup' => [ + 'name' => '[Optional] Content cleanup: List of items to remove', + 'title' => 'Selector for unnecessary elements to remove inside article contents.', + 'exampleValue' => 'div.ads, div.comments', + ], + 'title_cleanup' => [ + 'name' => '[Optional] Text to remove from article title', + 'title' => 'Specify here some text from page title that need to be removed, e.g. " | BlogName".', + 'exampleValue' => ' | BlogName', + ], + 'site_map' => [ + 'name' => '[Optional] sitemap.xml URL', + 'title' => << and fields for the bridge to work: + Eg. https://article/url2000-12-31T23:59Z + is feed item URL, for selecting the most recent entries. + EOT, + 'exampleValue' => 'https://example.com/sitemap.xml', + ], + 'limit' => self::LIMIT + ] + ]; + + public function collectData() + { + $url = $this->getInput('home_page'); + $url_pattern = $this->getInput('url_pattern'); + $content_selector = $this->getInput('content_selector'); + $content_cleanup = $this->getInput('content_cleanup'); + $title_cleanup = $this->getInput('title_cleanup'); + $site_map = $this->getInput('site_map'); + $limit = $this->getInput('limit'); + + $this->feedName = $this->getPageTitle($url, $title_cleanup); + $sitemap_url = empty($site_map) ? $url : $site_map; + $sitemap_xml = $this->getSitemapXml($sitemap_url, !empty($site_map)); + $links = $this->sitemapXmlToList($sitemap_xml, $url_pattern, empty($limit) ? 10 : $limit); + + if (empty($links) && empty(sitemapXmlToList($sitemap_xml))) { + returnClientError('Could not retrieve URLs with Timestamps from Sitemap: ' . $sitemap_url); + } + + foreach ($links as $link) { + $this->items[] = $this->expandEntryWithSelector($link, $content_selector, $content_cleanup, $title_cleanup); + } + } + + /** + * Retrieve site map from specified URL + * @param string $url URL pointing to any page of the site, e.g. "https://example.com/blog" OR directly to the site map e.g. "https://example.com/sitemap.xml" + * @param string $is_site_map TRUE if the specified URL points directly to the sitemap XML + * @return object Sitemap DOM (from parsed XML) + */ + protected function getSitemapXml(&$url, $is_site_map = false) + { + if (!$is_site_map) { + $robots_txt = getSimpleHTMLDOM(urljoin($url, '/robots.txt'))->outertext; + preg_match('/Sitemap: ([^ ]+)/', $robots_txt, $matches); + if (empty($matches)) { + returnClientError('Failed to determine Sitemap from robots.txt. Try setting it manually.'); + } + $url = $matches[1]; + } + return getSimpleHTMLDOM($url); + } + + /** + * Retrieve N most recent URLs from Site Map + * @param object $sitemap Site map XML DOM + * @param string $url_pattern Optional pattern to look for in URLs + * @param int $limit Optional maximum amount of URLs to return + * @param bool $keep_date TRUE to keep dates (url => date array instead of url array) + * @return array Array of URLs + */ + protected function sitemapXmlToList($sitemap, $url_pattern = '', $limit = 0, $keep_date = false) + { + $links = []; + + foreach ($sitemap->find('sitemap') as $nested_sitemap) { + $url = $nested_sitemap->find('loc'); + if (!empty($url)) { + $url = $url[0]->plaintext; + if (str_ends_with(strtolower($url), '.xml')) { + $nested_sitemap_xml = $this->getSitemapXml($url, true); + $nested_sitemap_links = $this->sitemapXmlToList($nested_sitemap_xml, $url_pattern, null, true); + $links = array_merge($links, $nested_sitemap_links); + } + } + } + + if (!empty($url_pattern)) { + $url_pattern = str_replace('/', '\/', $url_pattern); + } + + foreach ($sitemap->find('url') as $item) { + $url = $item->find('loc'); + $lastmod = $item->find('lastmod'); + if (!empty($url) && !empty($lastmod)) { + $url = $url[0]->plaintext; + $lastmod = $lastmod[0]->plaintext; + $timestamp = strtotime($lastmod); + if (empty($url_pattern) || preg_match('/' . $url_pattern . '/', $url) === 1) { + $links[$url] = $timestamp; + } + } + } + + arsort($links); + + if ($limit > 0 && count($links) > $limit) { + $links = array_slice($links, 0, $limit); + } + + return $keep_date ? $links : array_keys($links); + } +} diff --git a/bridges/TelegramBridge.php b/bridges/TelegramBridge.php index 1435900916c..9d73e06e394 100644 --- a/bridges/TelegramBridge.php +++ b/bridges/TelegramBridge.php @@ -169,11 +169,19 @@ private function processSticker($messageDiv) $stickerDiv->find('picture', 0)->style = ''; return $stickerDiv; - } elseif (preg_match(self::BACKGROUND_IMAGE_REGEX, $stickerDiv->find('i', 0)->style, $sticker)) { - return <<find('i', 0); + if ($var) { + $style = $var->style; + if (preg_match(self::BACKGROUND_IMAGE_REGEX, $style, $sticker)) { + return << EOD; + } } + + return ''; } private function processPoll($messageDiv) diff --git a/bridges/TikTokBridge.php b/bridges/TikTokBridge.php index 556e5ffcac0..769bc625aff 100644 --- a/bridges/TikTokBridge.php +++ b/bridges/TikTokBridge.php @@ -34,6 +34,9 @@ public function collectData() $this->feedName = htmlspecialchars_decode($title); $var = $html->find('script[id=SIGI_STATE]', 0); + if (!$var) { + throw new \Exception('Unable to find tiktok user data for ' . $this->processUsername()); + } $SIGI_STATE_RAW = $var->innertext; $SIGI_STATE = Json::decode($SIGI_STATE_RAW, false); diff --git a/bridges/TldrTechBridge.php b/bridges/TldrTechBridge.php index 7d8febe1c40..b89686bb959 100644 --- a/bridges/TldrTechBridge.php +++ b/bridges/TldrTechBridge.php @@ -25,7 +25,8 @@ class TldrTechBridge extends BridgeAbstract 'Crypto' => 'crypto', 'AI' => 'ai', 'Web Dev' => 'engineering', - 'Founders' => 'founders' + 'Founders' => 'founders', + 'Cybersecurity' => 'cybersecurity' ], 'defaultValue' => 'tech' ] diff --git a/bridges/VkBridge.php b/bridges/VkBridge.php index 967734ef7bd..8c18f26af25 100644 --- a/bridges/VkBridge.php +++ b/bridges/VkBridge.php @@ -84,7 +84,10 @@ public function collectData() foreach ($html->find('div.replies') as $comment_block) { $comment_block->outertext = ''; } - $html->load($html->save()); + + // expensive operation + $save = $html->save(); + $html->load($save); $pinned_post_item = null; $last_post_id = 0; diff --git a/lib/BridgeAbstract.php b/lib/BridgeAbstract.php index eb9d5a3cc01..e58ddb917b6 100644 --- a/lib/BridgeAbstract.php +++ b/lib/BridgeAbstract.php @@ -58,8 +58,6 @@ abstract class BridgeAbstract implements BridgeInterface /** * Configuration for the bridge - * - * Use {@see BridgeAbstract::getConfiguration()} to read this parameter */ const CONFIGURATION = []; @@ -113,6 +111,11 @@ abstract class BridgeAbstract implements BridgeInterface */ protected $queriedContext = ''; + /** + * Holds the list of bridge-specific configurations from config.ini.php, used by the bridge. + */ + private array $configuration = []; + /** {@inheritdoc} */ public function getItems() { @@ -144,6 +147,10 @@ protected function setInputs(array $inputs, $queriedContext) } foreach ($contexts as $context) { + if (!isset(static::PARAMETERS[$context])) { + // unknown context provided by client, throw exception here? or continue? + } + foreach (static::PARAMETERS[$context] as $name => $properties) { if (isset($this->inputs[$context][$name]['value'])) { continue; @@ -361,12 +368,6 @@ public function getIcon() return static::URI . '/favicon.ico'; } - /** {@inheritdoc} */ - public function getConfiguration() - { - return static::CONFIGURATION; - } - /** {@inheritdoc} */ public function getParameters() { diff --git a/lib/BridgeFactory.php b/lib/BridgeFactory.php index db2c394a2c8..f302a27acb7 100644 --- a/lib/BridgeFactory.php +++ b/lib/BridgeFactory.php @@ -4,6 +4,7 @@ final class BridgeFactory { private $bridgeClassNames = []; private $enabledBridges = []; + private $missingEnabledBridges = []; public function __construct() { @@ -23,7 +24,13 @@ public function __construct() $this->enabledBridges = $this->bridgeClassNames; break; } - $this->enabledBridges[] = $this->createBridgeClassName($enabledBridge); + $bridgeClassName = $this->createBridgeClassName($enabledBridge); + if ($bridgeClassName) { + $this->enabledBridges[] = $bridgeClassName; + } else { + $this->missingEnabledBridges[] = $enabledBridge; + Logger::info(sprintf('Bridge not found: %s', $enabledBridge)); + } } } @@ -42,13 +49,10 @@ public function createBridgeClassName(string $bridgeName): ?string $name = self::normalizeBridgeName($bridgeName); $namesLoweredCase = array_map('strtolower', $this->bridgeClassNames); $nameLoweredCase = strtolower($name); - if (! in_array($nameLoweredCase, $namesLoweredCase)) { - throw new \Exception(sprintf('Bridge name invalid: %s', $bridgeName)); + return null; } - $index = array_search($nameLoweredCase, $namesLoweredCase); - return $this->bridgeClassNames[$index]; } @@ -67,4 +71,9 @@ public function getBridgeClassNames(): array { return $this->bridgeClassNames; } + + public function getMissingEnabledBridges(): array + { + return $this->missingEnabledBridges; + } } diff --git a/lib/BridgeInterface.php b/lib/BridgeInterface.php index b461ed12cce..977ad7f61d5 100644 --- a/lib/BridgeInterface.php +++ b/lib/BridgeInterface.php @@ -60,11 +60,6 @@ interface BridgeInterface */ public function collectData(); - /** - * Get the user's supplied configuration for the bridge - */ - public function getConfiguration(); - /** * Returns the value for the selected configuration * diff --git a/lib/FeedExpander.php b/lib/FeedExpander.php index 1bac617999a..c91586d71cd 100644 --- a/lib/FeedExpander.php +++ b/lib/FeedExpander.php @@ -308,7 +308,16 @@ protected function parseATOMItem($feedItem) $item['author'] = (string)$feedItem->author->name; } if (isset($feedItem->content)) { - $item['content'] = (string)$feedItem->content; + $contentChildren = $feedItem->content->children(); + if (count($contentChildren) > 0) { + $content = ''; + foreach ($contentChildren as $contentChild) { + $content .= $contentChild->asXML(); + } + $item['content'] = $content; + } else { + $item['content'] = (string)$feedItem->content; + } } //When "link" field is present, URL is more reliable than "id" field diff --git a/lib/contents.php b/lib/contents.php index 5587a98e8b8..c842ccbcda2 100644 --- a/lib/contents.php +++ b/lib/contents.php @@ -432,8 +432,6 @@ function getSimpleHTMLDOMCached( $content = $cache->loadData($timeout); if (!$content || Debug::isEnabled()) { $content = getContents($url, $header ?? [], $opts ?? []); - } - if ($content) { $cache->setScope('pages'); $cache->setKey([$url]); $cache->saveData($content);