From c5cbab123120e243f091a20644b4de16d068667e Mon Sep 17 00:00:00 2001 From: veloute <21003408+veloute@users.noreply.github.com> Date: Tue, 22 Aug 2023 15:59:06 +0000 Subject: [PATCH 001/152] Update TheGuardianBridge.php (#3629) --- bridges/TheGuardianBridge.php | 1 + 1 file changed, 1 insertion(+) diff --git a/bridges/TheGuardianBridge.php b/bridges/TheGuardianBridge.php index 2e14de7ae95..e05bde75d1e 100644 --- a/bridges/TheGuardianBridge.php +++ b/bridges/TheGuardianBridge.php @@ -15,6 +15,7 @@ class TheGuardianBridge extends FeedExpander 'World News' => 'world/rss', 'US News' => '/us-news/rss', 'UK News' => '/uk-news/rss', + 'Australia News' => '/australia-news/rss', 'Europe News' => '/world/europe-news/rss', 'Asia News' => '/world/asia/rss', 'Tech' => '/uk/technology/rss', From 3ac861a86681b480fd194e102713044c485c57c0 Mon Sep 17 00:00:00 2001 From: Dag Date: Tue, 22 Aug 2023 19:47:32 +0200 Subject: [PATCH 002/152] fix(twitch): Invalid argument supplied for foreach() at bridges/TwitchBridge.php line 115 (#3630) --- bridges/TwitchBridge.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bridges/TwitchBridge.php b/bridges/TwitchBridge.php index 8976174afb8..146fed3d4e1 100644 --- a/bridges/TwitchBridge.php +++ b/bridges/TwitchBridge.php @@ -112,7 +112,9 @@ public function collectData() if (!is_null($video->game)) { $item['categories'][] = $video->game->displayName; } - foreach ($video->contentTags as $tag) { + + $contentTags = $video->contentTags ?? 
[]; + foreach ($contentTags as $tag) { if (!$tag->isLanguageTag) { $item['categories'][] = $tag->localizedName; } From 54045be951cf7deab15535fccd93dc89998150d8 Mon Sep 17 00:00:00 2001 From: Dag Date: Tue, 22 Aug 2023 20:06:16 +0200 Subject: [PATCH 003/152] fix(tpb): add missing cat (#3631) --- bridges/ThePirateBayBridge.php | 1 + 1 file changed, 1 insertion(+) diff --git a/bridges/ThePirateBayBridge.php b/bridges/ThePirateBayBridge.php index 130950621ad..5b305c828db 100644 --- a/bridges/ThePirateBayBridge.php +++ b/bridges/ThePirateBayBridge.php @@ -65,6 +65,7 @@ class ThePirateBayBridge extends BridgeAbstract '207' => 'HD Movies', '208' => 'HD TV-Shows', '209' => '3D', + '212' => 'UHD/4k TV-Shows', '299' => 'Other', '301' => 'Windows', '302' => 'Mac/Apple', From 7591b10219222f818d523f457fa6d01e1891260b Mon Sep 17 00:00:00 2001 From: sysadminstory Date: Tue, 22 Aug 2023 20:44:36 +0200 Subject: [PATCH 004/152] [Core] New feature : User Interface to "Detect" Feed from an URL (#3436) * [Core] New feature : User Interface to "Detect" Feed from an URL Detect Action has been expanded to support returning a Feed in a JSON format instead of a Redirect. Existing usage of the Detect action will keep working as usual. Frontpage template has now a section to display the Feed detection result, and a button to start the Feed Detection. A new JS file contains the necessary JS (Ajax and Event management) to fill the Feed Detection section. 
* Coding policy fixes * [Core] New feature : User Interface to "Detect" Feed from an URL - Switch from old school XMLHttpRequest to fetch - Enhance UX of search results - Revert to its original content - Switch to a new Action : FindfeedAction.php - Switch to template literals instead of string concatenation - FindFeed action could return multiple feeds - Results are sent with an absolute URL - Switch to Json::encode() helper function * [Core] New feature : User Interface to "Detect" Feed from an URL - Move specific JS code to rss-bridge.js - Change HTML tag for the button to have a consistent style with the rest of the page * [Core] New feature : User Interface to "Detect" Feed from an URL - If no context is sent, assume there is only one unnamed context - Find parameter name in global and current context * fix * remove typo --------- Co-authored-by: Dag --- actions/FindfeedAction.php | 89 ++++++++++++++++++++++++++++++++++++ static/rss-bridge.js | 79 ++++++++++++++++++++++++++++++++ static/style.css | 35 ++++++++++++++ templates/frontpage.html.php | 9 ++++ 4 files changed, 212 insertions(+) create mode 100644 actions/FindfeedAction.php diff --git a/actions/FindfeedAction.php b/actions/FindfeedAction.php new file mode 100644 index 00000000000..25fe4714f8c --- /dev/null +++ b/actions/FindfeedAction.php @@ -0,0 +1,89 @@ +getBridgeClassNames() as $bridgeClassName) { + if (!$bridgeFactory->isEnabled($bridgeClassName)) { + continue; + } + + $bridge = $bridgeFactory->create($bridgeClassName); + + $bridgeParams = $bridge->detectParameters($targetURL); + + if ($bridgeParams === null) { + continue; + } + + // It's allowed to have no 'context' in a bridge (only a default context without any name) + // In this case, the reference to the parameters are found in the first element of the PARAMETERS array + + $context = $bridgeParams['context'] ?? 
0; + + $bridgeData = []; + // Construct the array of parameters + foreach ($bridgeParams as $key => $value) { + // 'context' is a special case : it's a bridge parameters, there is no "name" for this parameter + if ($key == 'context') { + $bridgeData[$key]['name'] = 'Context'; + $bridgeData[$key]['value'] = $value; + } else { + $bridgeData[$key]['name'] = $this->getParameterName($bridge, $context, $key); + $bridgeData[$key]['value'] = $value; + } + } + + $bridgeParams['bridge'] = $bridgeClassName; + $bridgeParams['format'] = $format; + $content = [ + 'url' => get_home_page_url() . '?action=display&' . http_build_query($bridgeParams), + 'bridgeParams' => $bridgeParams, + 'bridgeData' => $bridgeData, + 'bridgeMeta' => [ + 'name' => $bridge::NAME, + 'description' => $bridge::DESCRIPTION, + 'parameters' => $bridge::PARAMETERS, + 'icon' => $bridge->getIcon(), + ], + ]; + $results[] = $content; + } + if ($results === []) { + return new Response(Json::encode(['message' => 'No bridge found for given url']), 404, ['content-type' => 'application/json']); + } + return new Response(Json::encode($results), 200, ['content-type' => 'application/json']); + } + + // Get parameter name in the actual context, or in the global parameter + private function getParameterName($bridge, $context, $key) + { + if (isset($bridge::PARAMETERS[$context][$key]['name'])) { + $name = $bridge::PARAMETERS[$context][$key]['name']; + } else if (isset($bridge::PARAMETERS['global'][$key]['name'])) { + $name = $bridge::PARAMETERS['global'][$key]['name']; + } else { + $name = 'Variable "' . $key . 
'" (No name provided)'; + } + return $name; + } +} diff --git a/static/rss-bridge.js b/static/rss-bridge.js index 498acd375ee..82069d8caf5 100644 --- a/static/rss-bridge.js +++ b/static/rss-bridge.js @@ -47,3 +47,82 @@ function rssbridge_toggle_bridge(){ bridge.getElementsByClassName('showmore-box')[0].checked = true; } } + +var rssbridge_feed_finder = (function() { + /* + * Code for "Find feed by URL" feature + */ + + // Start the Feed search + async function rssbridge_feed_search(event) { + const input = document.getElementById('searchfield'); + let content = input.value; + if (content) { + const findfeedresults = document.getElementById('findfeedresults'); + findfeedresults.innerHTML = 'Searching for matching feeds ...'; + let baseurl = window.location.protocol + window.location.pathname; + let url = baseurl + '?action=findfeed&format=Html&url=' + content; + const response = await fetch(url); + if (response.ok) { + const data = await response.json(); + rss_bridge_feed_display_found_feed(data); + } else { + rss_bridge_feed_display_feed_search_fail(); + } + } else { + rss_bridge_feed_display_find_feed_empty(); + } + } + + // Display the found feeds + function rss_bridge_feed_display_found_feed(obj) { + const findfeedresults = document.getElementById('findfeedresults'); + + let content = 'Found Feed(s) :'; + + // Let's go throug every Feed found + for (const element of obj) { + content += `
+
+ +
+
+

${element.bridgeMeta.name}

+

+ ${element.bridgeMeta.description} +

+
+
    `; + + // Now display every Feed parameter + for (const param in element.bridgeData) { + content += `
  • ${element.bridgeData[param].name} : ${element.bridgeData[param].value}
  • `; + } + content += `
+
+
`; + } + content += '

'; + findfeedresults.innerHTML = content; + } + + // Display an error if no feed were found + function rss_bridge_feed_display_feed_search_fail() { + const findfeedresults = document.getElementById('findfeedresults'); + findfeedresults.innerHTML = 'No Feed found !'; + } + + // Empty the Found Feed section + function rss_bridge_feed_display_find_feed_empty() { + const findfeedresults = document.getElementById('findfeedresults'); + findfeedresults.innerHTML = ''; + } + + // Add Event to 'Detect Feed" button + var rssbridge_feed_finder = function() { + const button = document.getElementById('findfeed'); + button.addEventListener("click", rssbridge_feed_search); + button.addEventListener("keyup", rssbridge_feed_search); + }; + return rssbridge_feed_finder; +}()); diff --git a/static/style.css b/static/style.css index a83e25e4214..a9d5933a7e5 100644 --- a/static/style.css +++ b/static/style.css @@ -453,3 +453,38 @@ button { color: #d8d3cb; } } + +/* find-feed */ +.search-result { + background-color: #f0f0f0; + border-radius: 5px; + padding: 15px; + display: flex; + position: relative; + text-align: left; +} +@media (prefers-color-scheme: dark) { + .search-result { + background-color: #202325; + } +} +.search-result h2 { + color: #288cfc; +} + +.search-result a { + text-decoration: none; + color: #248afa; +} +.search-result .icon { + margin: 0 15px 0 0; +} +.search-result span { + margin-right: 10px; +} +.search-result .description { + font-size: 110%; + margin-right: 0 !important; + margin-top: 5px !important; +} +/* end find-feed */ diff --git a/templates/frontpage.html.php b/templates/frontpage.html.php index 63f4a2ab2fa..99e2ffd98c0 100644 --- a/templates/frontpage.html.php +++ b/templates/frontpage.html.php @@ -2,6 +2,7 @@ From eb4ff7099f07859a2a3078c50a7d5366ca722bb1 Mon Sep 17 00:00:00 2001 From: Lars Stegman Date: Tue, 22 Aug 2023 21:28:16 +0200 Subject: [PATCH 005/152] CSS Selector Bridge 2 (#3626) * [CssSelector2Bridge] Implement CSS Selector bridge 2 * 
[CssSelector2Bridge] Fix author not being loaded * [CssSelector2Bridge] Remove unneeded time nullcheck * Fix linting * Fix failing test * Implement PR fixes * Update bridges/CssSelector2Bridge.php Co-authored-by: ORelio * Rename bridge and fix syntax error for php7 --------- Co-authored-by: ORelio --- bridges/CssSelectorComplexBridge.php | 458 +++++++++++++++++++++++++++ 1 file changed, 458 insertions(+) create mode 100644 bridges/CssSelectorComplexBridge.php diff --git a/bridges/CssSelectorComplexBridge.php b/bridges/CssSelectorComplexBridge.php new file mode 100644 index 00000000000..4d44f85325e --- /dev/null +++ b/bridges/CssSelectorComplexBridge.php @@ -0,0 +1,458 @@ + [ + 'name' => 'Site URL: Page with latest articles', + 'exampleValue' => 'https://example.com/blog/', + 'required' => true + ], + 'cookie' => [ + 'name' => '[Optional] Cookie', + 'title' => << 'sessionId=deadb33f' + ], + 'title_cleanup' => [ + 'name' => '[Optional] Text to remove from feed title', + 'title' => << ' | BlogName', + ], + 'entry_element_selector' => [ + 'name' => 'Selector for article entry elements', + 'title' => <<... on home page, each one being treated as a feed item. + + Use the URL selector option to select the `a` element with the + `href` to the article link. If this option is not configured, the first encountered + `a` element is used. 
+ EOT, + 'exampleValue' => 'div.article', + 'required' => true + ], + 'url_selector' => [ + 'name' => '[Optional] Selector for link elements', + 'title' => << 'a.article', + 'defaultValue' => 'a' + ], + 'url_pattern' => [ + 'name' => '[Optional] Pattern for site URLs to keep in feed', + 'title' => 'Optionally filter items by applying a regular expression on their URL', + 'exampleValue' => '/blog/article/.*', + ], + 'limit' => self::LIMIT, + 'use_article_pages' => [ + 'name' => 'Load article from page', + 'title' => << 'checkbox' + ], + 'article_page_content_selector' => [ + 'name' => '[Optional] Selector to select article element', + 'title' => 'Extract the article from its page using the provided selector', + 'exampleValue' => 'article.content', + ], + 'content_cleanup' => [ + 'name' => '[Optional] Content cleanup: selector for items to remove', + 'title' => 'Selector for unnecessary elements to remove inside article contents.', + 'exampleValue' => 'div.ads, div.comments', + ], + 'title_selector' => [ + 'name' => '[Optional] Selector for the article title', + 'title' => 'Selector to select the article title', + 'defaultValue' => 'h1' + ], + 'category_selector' => [ + 'name' => '[Optional] Categories', + 'title' => << 'span.category, #main-category' + ], + 'author_selector' => [ + 'name' => '[Optional] Author', + 'title' => << 'span#author' + ], + 'time_selector' => [ + 'name' => '[Optional] Time selector', + 'title' => << [ + 'name' => '[Optional] Format string for parsing time', + 'title' => << [ + 'name' => '[Optional] Remove styling', + 'title' => 'Remove class and style attributes from the page elements', + 'type' => 'checkbox' + ] + ] + ]; + + private $feedName = ''; + + public function getURI() + { + $url = $this->getInput('home_page'); + if (empty($url)) { + $url = parent::getURI(); + } + return $url; + } + + public function getName() + { + if (!empty($this->feedName)) { + return $this->feedName; + } + return parent::getName(); + } + + protected function 
getHeaders() + { + $headers = []; + $cookie = $this->getInput('cookie'); + if (!empty($cookie)) { + $headers[] = 'Cookie: ' . $cookie; + } + + return $headers; + } + + public function collectData() + { + $url = $this->getInput('home_page'); + $headers = $this->getHeaders(); + + $entry_element_selector = $this->getInput('entry_element_selector'); + $url_selector = $this->getInput('url_selector'); + $url_pattern = $this->getInput('url_pattern'); + $limit = $this->getInput('limit') ?? 10; + + $use_article_pages = $this->getInput('use_article_pages'); + $article_page_content_selector = $this->getInput('article_page_content_selector'); + $content_cleanup = $this->getInput('content_cleanup'); + $title_selector = $this->getInput('title_selector'); + $title_cleanup = $this->getInput('title_cleanup'); + $time_selector = $this->getInput('time_selector'); + $time_format = $this->getInput('time_format'); + + $category_selector = $this->getInput('category_selector'); + $author_selector = $this->getInput('author_selector'); + $remove_styling = $this->getInput('remove_styling'); + + $html = defaultLinkTo(getSimpleHTMLDOM($url, $headers), $url); + $this->feedName = $this->getTitle($html, $title_cleanup); + $entry_elements = $this->htmlFindEntryElements($html, $entry_element_selector, $url_selector, $url_pattern, $limit); + + if (empty($entry_elements)) { + return; + } + + // Fetch the elements from the article pages. 
+ if ($use_article_pages) { + if (empty($article_page_content_selector)) { + returnClientError('`Article selector` is required when `Load article page` is enabled'); + } + + foreach (array_keys($entry_elements) as $uri) { + $entry_elements[$uri] = $this->fetchArticleElementFromPage($uri, $article_page_content_selector); + } + } + + foreach ($entry_elements as $uri => $element) { + $entry = $this->parseEntryElement( + $element, + $title_selector, + $author_selector, + $category_selector, + $time_selector, + $time_format, + $content_cleanup, + $this->feedName, + $remove_styling + ); + + $entry['uri'] = $uri; + $this->items[] = $entry; + } + } + + /** + * Filter a list of URLs using a pattern and limit + * @param array $links List of URLs + * @param string $url_pattern Pattern to look for in URLs + * @param int $limit Optional maximum amount of URLs to return + * @return array Array of URLs + */ + protected function filterUrlList($links, $url_pattern, $limit = 0) + { + if (!empty($url_pattern)) { + $url_pattern = '/' . str_replace('/', '\/', $url_pattern) . '/'; + $links = array_filter($links, function ($url) { + return preg_match($url_pattern, $url) === 1; + }); + } + + if ($limit > 0 && count($links) > $limit) { + $links = array_slice($links, 0, $limit); + } + + return $links; + } + + /** + * Retrieve title from webpage URL or DOM + * @param string|object $page URL or DOM to retrieve title from + * @param string $title_cleanup optional string to remove from webpage title, e.g. 
" | BlogName" + * @return string Webpage title + */ + protected function getTitle($page, $title_cleanup) + { + if (is_string($page)) { + $page = getSimpleHTMLDOMCached($page); + } + $title = html_entity_decode($page->find('title', 0)->plaintext); + if (!empty($title)) { + $title = trim(str_replace($title_cleanup, '', $title)); + } + + return $title; + } + + /** + * Remove all elements from HTML content matching cleanup selector + * @param string|object $content HTML content as HTML object or string + * @return string|object Cleaned content (same type as input) + */ + protected function cleanArticleContent($content, $cleanup_selector, $remove_styling) + { + $string_convert = false; + if (is_string($content)) { + $string_convert = true; + $content = str_get_html($content); + } + + if (!empty($cleanup_selector)) { + foreach ($content->find($cleanup_selector) as $item_to_clean) { + $item_to_clean->outertext = ''; + } + } + + if ($remove_styling) { + foreach (['class', 'style'] as $attribute_to_remove) { + foreach ($content->find('[' . $attribute_to_remove . 
']') as $item_to_clean) { + $item_to_clean->removeAttribute($attribute_to_remove); + } + } + } + + if ($string_convert) { + $content = $content->outertext; + } + return $content; + } + + + /** + * Retrieve first N link+element from webpage URL or DOM satisfying the specified criteria + * @param string|object $page URL or DOM to retrieve feed items from + * @param string $entry_selector DOM selector for matching HTML elements that contain article + * entries + * @param string $url_selector DOM selector for matching links + * @param string $url_pattern Optional filter to keep only links matching the pattern + * @param int $limit Optional maximum amount of URLs to return + * @return array of items { => } + */ + protected function htmlFindEntryElements($page, $entry_selector, $url_selector, $url_pattern = '', $limit = 0) + { + if (is_string($page)) { + $page = getSimpleHTMLDOM($page); + } + + $entryElements = $page->find($entry_selector); + if (empty($entryElements)) { + returnClientError('No entry elements for entry selector'); + } + + // Extract URIs with the associated entry element + $links_with_elements = []; + foreach ($entryElements as $entry) { + $url_element = $entry->find($url_selector, 0); + if (is_null($url_element)) { + // No `a` element found in this entry + if ($entry->tag == 'a') { + $url_element = $entry; + } else { + continue; + } + } + + $links_with_elements[$url_element->href] = $entry; + } + + if (empty($links_with_elements)) { + returnClientError('The provided URL selector matches some elements, but they do not + contain links.'); + } + + // Filter using the URL pattern + $filtered_urls = $this->filterUrlList(array_keys($links_with_elements), $url_pattern, $limit); + + if (empty($filtered_urls)) { + returnClientError('No results for URL pattern'); + } + + $items = []; + foreach ($filtered_urls as $link) { + $items[$link] = $links_with_elements[$link]; + } + + return $items; + } + + + /** + * Retrieve article element from its URL using content 
selector and return the DOM element + * @param string $entry_url URL to retrieve article from + * @param string $content_selector HTML selector for extracting content, e.g. "article.content" + * @return article DOM element + */ + protected function fetchArticleElementFromPage($entry_url, $content_selector) + { + $entry_html = getSimpleHTMLDOMCached($entry_url); + $article_content = $entry_html->find($content_selector, 0); + + if (is_null($article_content)) { + returnClientError('Could not article content at URL: ' . $entry_url); + } + + $article_content = defaultLinkTo($article_content, $entry_url); + return $article_content; + } + + protected function parseTimeStrAsTimestamp($timeStr, $format) + { + $date = date_parse_from_format($format, $timeStr); + if ($date['error_count'] != 0) { + returnClientError('Error while parsing time string'); + } + + $timestamp = mktime( + $date['hour'], + $date['minute'], + $date['second'], + $date['month'], + $date['day'], + $date['year'] + ); + + if ($timestamp == false) { + returnClientError('Error while creating timestamp'); + } + + return $timestamp; + } + + /** + * Retrieve article content from its URL using content selector and return a feed item + * @param object $entry_html A DOM element containing the article + * @param string $title_selector A selector to the article title from the article + * @param string $author_selector A selector to find the article author + * @param string $time_selector A selector to get the article publication time. + * @param string $time_format The format to parse the time_selector. + * @param string $content_cleanup Optional selector for removing elements, e.g. 
"div.ads, + * div.comments" + * @param string $title_default Optional title to use when could not extract title reliably + * @param bool $remove_styling Whether to remove class and style attributes from the HTML + * @return array Entry data: uri, title, content + */ + protected function parseEntryElement( + $entry_html, + $title_selector = null, + $author_selector = null, + $category_selector = null, + $time_selector = null, + $time_format = null, + $content_cleanup = null, + $title_default = null, + $remove_styling = false + ) { + $article_content = convertLazyLoading($entry_html); + + if (is_null($title_selector)) { + $article_title = $title_default; + } else { + $article_title = trim($entry_html->find($title_selector, 0)->innertext); + } + + $author = null; + if (!is_null($author_selector) && $author_selector != '') { + $author = trim($entry_html->find($author_selector, 0)->innertext); + } + + $categories = []; + if (!is_null($category_selector && $category_selector != '')) { + $category_elements = $entry_html->find($category_selector); + foreach ($category_elements as $category_element) { + $categories[] = trim($category_element->innertext); + } + } + + $time = null; + if (!is_null($time_selector) && $time_selector != '') { + $time_element = $entry_html->find($time_selector, 0); + $time = $time_element->getAttribute('datetime'); + if (is_null($time)) { + $time = $time_element->innertext; + } + + $this->parseTimeStrAsTimestamp($time, $time_format); + } + + $article_content = $this->cleanArticleContent($article_content, $content_cleanup, $remove_styling); + + $item = []; + $item['title'] = $article_title; + $item['content'] = $article_content; + $item['categories'] = $categories; + $item['timestamp'] = $time; + $item['author'] = $author; + return $item; + } +} From 0325c2414a6b00311abb291d1189c97f120053bd Mon Sep 17 00:00:00 2001 From: t0stiman <18124323+t0stiman@users.noreply.github.com> Date: Fri, 25 Aug 2023 12:34:35 +0200 Subject: [PATCH 006/152] fix 
carthrottlebridge (#3633) --- bridges/CarThrottleBridge.php | 51 ++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/bridges/CarThrottleBridge.php b/bridges/CarThrottleBridge.php index 95641573f28..5b95dd288ab 100644 --- a/bridges/CarThrottleBridge.php +++ b/bridges/CarThrottleBridge.php @@ -1,44 +1,45 @@ collectExpandableDatas('https://www.carthrottle.com/rss', 10); - } + $news = getSimpleHTMLDOMCached(self::URI . 'news') + or returnServerError('could not retrieve page'); - protected function parseItem($feedItem) - { - $item = parent::parseItem($feedItem); + $this->items[] = []; - //fetch page - $articlePage = getSimpleHTMLDOMCached($feedItem->link) - or returnServerError('Could not retrieve ' . $feedItem->link); + //for each post + foreach ($news->find('div.cmg-card') as $post) { + $item = []; - $subtitle = $articlePage->find('p.standfirst', 0); - $article = $articlePage->find('div.content_field', 0); + $titleElement = $post->find('div.title a.cmg-link')[0]; + $item['uri'] = self::URI . $titleElement->getAttribute('href'); + $item['title'] = $titleElement->innertext; - $item['content'] = str_get_html($subtitle . $article); + $articlePage = getSimpleHTMLDOMCached($item['uri']) + or returnServerError('could not retrieve page'); - //convert