This feed may be only one of the possible feeds. You may find more feeds using one of the bridges with different parameters, for example.
';
+ findfeedresults.innerHTML = content;
+ }
+
+ // Display an error message in the results area when no feed was found.
+ function rss_bridge_feed_display_feed_search_fail() {
+ const findfeedresults = document.getElementById('findfeedresults');
+ // NOTE(review): the string literal below spans several lines in this view; its HTML
+ // markup appears to have been lost in extraction - verify against the real file.
+ findfeedresults.innerHTML = 'No Feed found !
Not every bridge supports feed detection. You can check below within the bridge parameters to create a feed.
';
+ }
+
+ // Empty the Found Feed section
+ function rss_bridge_feed_display_find_feed_empty() {
+ const findfeedresults = document.getElementById('findfeedresults');
+ findfeedresults.innerHTML = '';
+ }
+
+ // Add Event to 'Detect Feed" button
+ var rssbridge_feed_finder = function() {
+ const button = document.getElementById('findfeed');
+ button.addEventListener("click", rssbridge_feed_search);
+ button.addEventListener("keyup", rssbridge_feed_search);
+ };
+ return rssbridge_feed_finder;
+}());
diff --git a/static/style.css b/static/style.css
index a83e25e4214..a9d5933a7e5 100644
--- a/static/style.css
+++ b/static/style.css
@@ -453,3 +453,38 @@ button {
color: #d8d3cb;
}
}
+
+/* find-feed */
+/* Result box: flex container with a light grey background, rounded corners and left-aligned text */
+.search-result {
+ background-color: #f0f0f0;
+ border-radius: 5px;
+ padding: 15px;
+ display: flex;
+ position: relative;
+ text-align: left;
+}
+/* Use a dark background for the result box when the user prefers a dark color scheme */
+@media (prefers-color-scheme: dark) {
+ .search-result {
+ background-color: #202325;
+ }
+}
+/* Heading color inside a result box */
+.search-result h2 {
+ color: #288cfc;
+}
+
+/* Links inside a result box: no underline, blue text */
+.search-result a {
+ text-decoration: none;
+ color: #248afa;
+}
+/* Right-hand gap after elements with the icon class */
+.search-result .icon {
+ margin: 0 15px 0 0;
+}
+/* Right-hand gap after each span inside a result box */
+.search-result span {
+ margin-right: 10px;
+}
+/* NOTE(review): !important makes these margins win over other margin rules regardless of specificity - confirm it is still needed */
+.search-result .description {
+ font-size: 110%;
+ margin-right: 0 !important;
+ margin-top: 5px !important;
+}
+/* end find-feed */
diff --git a/templates/frontpage.html.php b/templates/frontpage.html.php
index 63f4a2ab2fa..99e2ffd98c0 100644
--- a/templates/frontpage.html.php
+++ b/templates/frontpage.html.php
@@ -2,6 +2,7 @@
@@ -15,6 +16,14 @@
onkeyup="rssbridge_list_search()"
value=""
>
+ Find Feed from URL
+
+
+
= raw($bridges) ?>
From eb4ff7099f07859a2a3078c50a7d5366ca722bb1 Mon Sep 17 00:00:00 2001
From: Lars Stegman
Date: Tue, 22 Aug 2023 21:28:16 +0200
Subject: [PATCH 005/152] CSS Selector Bridge 2 (#3626)
* [CssSelector2Bridge] Implement CSS Selector bridge 2
* [CssSelector2Bridge] Fix author not being loaded
* [CssSelector2Bridge] Remove unneeded time nullcheck
* Fix linting
* Fix failing test
* Implement PR fixes
* Update bridges/CssSelector2Bridge.php
Co-authored-by: ORelio
* Rename bridge and fix syntax error for php7
---------
Co-authored-by: ORelio
---
bridges/CssSelectorComplexBridge.php | 458 +++++++++++++++++++++++++++
1 file changed, 458 insertions(+)
create mode 100644 bridges/CssSelectorComplexBridge.php
diff --git a/bridges/CssSelectorComplexBridge.php b/bridges/CssSelectorComplexBridge.php
new file mode 100644
index 00000000000..4d44f85325e
--- /dev/null
+++ b/bridges/CssSelectorComplexBridge.php
@@ -0,0 +1,458 @@
+ [
+ 'name' => 'Site URL: Page with latest articles',
+ 'exampleValue' => 'https://example.com/blog/',
+ 'required' => true
+ ],
+ 'cookie' => [
+ 'name' => '[Optional] Cookie',
+ 'title' => << 'sessionId=deadb33f'
+ ],
+ 'title_cleanup' => [
+ 'name' => '[Optional] Text to remove from feed title',
+ 'title' => << ' | BlogName',
+ ],
+ 'entry_element_selector' => [
+ 'name' => 'Selector for article entry elements',
+ 'title' => <<... on home page, each one being treated as a feed item.
+
+ Use the URL selector option to select the `a` element with the
+ `href` to the article link. If this option is not configured, the first encountered
+ `a` element is used.
+ EOT,
+ 'exampleValue' => 'div.article',
+ 'required' => true
+ ],
+ 'url_selector' => [
+ 'name' => '[Optional] Selector for link elements',
+ 'title' => << 'a.article',
+ 'defaultValue' => 'a'
+ ],
+ 'url_pattern' => [
+ 'name' => '[Optional] Pattern for site URLs to keep in feed',
+ 'title' => 'Optionally filter items by applying a regular expression on their URL',
+ 'exampleValue' => '/blog/article/.*',
+ ],
+ 'limit' => self::LIMIT,
+ 'use_article_pages' => [
+ 'name' => 'Load article from page',
+ 'title' => << 'checkbox'
+ ],
+ 'article_page_content_selector' => [
+ 'name' => '[Optional] Selector to select article element',
+ 'title' => 'Extract the article from its page using the provided selector',
+ 'exampleValue' => 'article.content',
+ ],
+ 'content_cleanup' => [
+ 'name' => '[Optional] Content cleanup: selector for items to remove',
+ 'title' => 'Selector for unnecessary elements to remove inside article contents.',
+ 'exampleValue' => 'div.ads, div.comments',
+ ],
+ 'title_selector' => [
+ 'name' => '[Optional] Selector for the article title',
+ 'title' => 'Selector to select the article title',
+ 'defaultValue' => 'h1'
+ ],
+ 'category_selector' => [
+ 'name' => '[Optional] Categories',
+ 'title' => << 'span.category, #main-category'
+ ],
+ 'author_selector' => [
+ 'name' => '[Optional] Author',
+ 'title' => << 'span#author'
+ ],
+ 'time_selector' => [
+ 'name' => '[Optional] Time selector',
+ 'title' => << [
+ 'name' => '[Optional] Format string for parsing time',
+ 'title' => << [
+ 'name' => '[Optional] Remove styling',
+ 'title' => 'Remove class and style attributes from the page elements',
+ 'type' => 'checkbox'
+ ]
+ ]
+ ];
+
+ private $feedName = '';
+
+ /**
+  * Return the configured home page URL, or the parent's URI when none is configured.
+  */
+ public function getURI()
+ {
+     $home_page = $this->getInput('home_page');
+     return empty($home_page) ? parent::getURI() : $home_page;
+ }
+
+ /**
+  * Return the feed name stored in $feedName, or the parent's name while it is still empty.
+  */
+ public function getName()
+ {
+     return empty($this->feedName) ? parent::getName() : $this->feedName;
+ }
+
+ /**
+  * Build the list of HTTP request headers from the bridge inputs.
+  * @return array Header lines; contains a single `Cookie:` line when the cookie input is set
+  */
+ protected function getHeaders()
+ {
+     $cookie = $this->getInput('cookie');
+     if (empty($cookie)) {
+         return [];
+     }
+
+     return ['Cookie: ' . $cookie];
+ }
+
+ /**
+  * Fetch the home page, locate the entry elements and append one feed item per entry.
+  */
+ public function collectData()
+ {
+     $home_page = $this->getInput('home_page');
+     $request_headers = $this->getHeaders();
+
+     // Inputs that decide which entries are picked up from the home page
+     $entry_selector = $this->getInput('entry_element_selector');
+     $link_selector = $this->getInput('url_selector');
+     $link_pattern = $this->getInput('url_pattern');
+     $max_items = $this->getInput('limit') ?? 10;
+
+     // Inputs that decide how each entry is turned into an item
+     $load_article_pages = $this->getInput('use_article_pages');
+     $article_selector = $this->getInput('article_page_content_selector');
+     $cleanup_selector = $this->getInput('content_cleanup');
+     $item_title_selector = $this->getInput('title_selector');
+     $feed_title_cleanup = $this->getInput('title_cleanup');
+     $item_time_selector = $this->getInput('time_selector');
+     $item_time_format = $this->getInput('time_format');
+     $item_category_selector = $this->getInput('category_selector');
+     $item_author_selector = $this->getInput('author_selector');
+     $strip_styling = $this->getInput('remove_styling');
+
+     $dom = defaultLinkTo(getSimpleHTMLDOM($home_page, $request_headers), $home_page);
+     $this->feedName = $this->getTitle($dom, $feed_title_cleanup);
+
+     $entries = $this->htmlFindEntryElements($dom, $entry_selector, $link_selector, $link_pattern, $max_items);
+     if (empty($entries)) {
+         return;
+     }
+
+     if ($load_article_pages) {
+         if (empty($article_selector)) {
+             returnClientError('`Article selector` is required when `Load article page` is enabled');
+         }
+
+         // Swap each listing element for the article element fetched from its own page.
+         foreach ($entries as $uri => $listing_element) {
+             $entries[$uri] = $this->fetchArticleElementFromPage($uri, $article_selector);
+         }
+     }
+
+     foreach ($entries as $uri => $element) {
+         $item = $this->parseEntryElement(
+             $element,
+             $item_title_selector,
+             $item_author_selector,
+             $item_category_selector,
+             $item_time_selector,
+             $item_time_format,
+             $cleanup_selector,
+             $this->feedName,
+             $strip_styling
+         );
+
+         $item['uri'] = $uri;
+         $this->items[] = $item;
+     }
+ }
+
+ /**
+  * Filter a list of URLs using a pattern and limit
+  * @param array $links List of URLs
+  * @param string $url_pattern Regular expression (without delimiters) to look for in URLs;
+  *                            empty to keep every URL
+  * @param int $limit Optional maximum amount of URLs to return (0 = no limit)
+  * @return array Array of URLs
+  */
+ protected function filterUrlList($links, $url_pattern, $limit = 0)
+ {
+     if (!empty($url_pattern)) {
+         // Wrap the pattern in "/" delimiters, escaping every "/" it contains.
+         $url_pattern = '/' . str_replace('/', '\/', $url_pattern) . '/';
+         // Closures do not see the enclosing scope: the pattern has to be imported with `use`,
+         // otherwise preg_match() receives an undefined pattern and no URL can match.
+         $links = array_filter($links, function ($url) use ($url_pattern) {
+             return preg_match($url_pattern, $url) === 1;
+         });
+     }
+
+     if ($limit > 0 && count($links) > $limit) {
+         $links = array_slice($links, 0, $limit);
+     }
+
+     return $links;
+ }
+
+ /**
+  * Retrieve title from webpage URL or DOM
+  * @param string|object $page URL or DOM to retrieve title from
+  * @param string $title_cleanup optional string to remove from webpage title, e.g. " | BlogName"
+  * @return string Webpage title, or an empty string when the page has no title element
+  */
+ protected function getTitle($page, $title_cleanup)
+ {
+     if (is_string($page)) {
+         $page = getSimpleHTMLDOMCached($page);
+     }
+
+     // A page without a <title> element yields an empty title instead of a null dereference.
+     $title_element = $page->find('title', 0);
+     if (is_null($title_element)) {
+         return '';
+     }
+
+     $title = html_entity_decode($title_element->plaintext);
+     if (!empty($title)) {
+         // Only run the cleanup when there is something to remove; the input is optional.
+         if (!empty($title_cleanup)) {
+             $title = str_replace($title_cleanup, '', $title);
+         }
+         $title = trim($title);
+     }
+
+     return $title;
+ }
+
+ /**
+  * Strip unwanted elements and, optionally, styling attributes from HTML content
+  * @param string|object $content HTML content as HTML object or string
+  * @param string $cleanup_selector Selector for elements to blank out; nothing is removed when empty
+  * @param bool $remove_styling Whether to remove `class` and `style` attributes
+  * @return string|object Cleaned content (same type as input)
+  */
+ protected function cleanArticleContent($content, $cleanup_selector, $remove_styling)
+ {
+     // Strings are parsed into a DOM first and converted back just before returning.
+     $was_string = is_string($content);
+     $dom = $was_string ? str_get_html($content) : $content;
+
+     if (!empty($cleanup_selector)) {
+         foreach ($dom->find($cleanup_selector) as $unwanted) {
+             $unwanted->outertext = '';
+         }
+     }
+
+     if ($remove_styling) {
+         foreach (['class', 'style'] as $attribute) {
+             $attribute_selector = '[' . $attribute . ']';
+             foreach ($dom->find($attribute_selector) as $styled) {
+                 $styled->removeAttribute($attribute);
+             }
+         }
+     }
+
+     return $was_string ? $dom->outertext : $dom;
+ }
+
+
+ /**
+  * Retrieve first N link+element pairs from webpage URL or DOM satisfying the specified criteria
+  * @param string|object $page URL or DOM to retrieve feed items from
+  * @param string $entry_selector DOM selector for matching HTML elements that contain article
+  *                               entries
+  * @param string $url_selector DOM selector for matching links
+  * @param string $url_pattern Optional filter to keep only links matching the pattern
+  * @param int $limit Optional maximum amount of URLs to return
+  * @return array Map of link URL => entry DOM element
+  */
+ protected function htmlFindEntryElements($page, $entry_selector, $url_selector, $url_pattern = '', $limit = 0)
+ {
+     if (is_string($page)) {
+         $page = getSimpleHTMLDOM($page);
+     }
+
+     $entryElements = $page->find($entry_selector);
+     if (empty($entryElements)) {
+         returnClientError('No entry elements for entry selector');
+     }
+
+     // Extract URIs with the associated entry element
+     $links_with_elements = [];
+     foreach ($entryElements as $entry) {
+         $url_element = $entry->find($url_selector, 0);
+         if (is_null($url_element)) {
+             // No link element found inside this entry: accept the entry itself when it is an `a`
+             if ($entry->tag == 'a') {
+                 $url_element = $entry;
+             } else {
+                 continue;
+             }
+         }
+
+         // An element without a usable href cannot identify an article; skip it rather than
+         // registering the entry under a meaningless array key.
+         $href = $url_element->href;
+         if (empty($href)) {
+             continue;
+         }
+
+         $links_with_elements[$href] = $entry;
+     }
+
+     if (empty($links_with_elements)) {
+         // Kept on one line so the message does not carry a line break and source indentation.
+         returnClientError('The provided URL selector matches some elements, but they do not contain links.');
+     }
+
+     // Filter using the URL pattern
+     $filtered_urls = $this->filterUrlList(array_keys($links_with_elements), $url_pattern, $limit);
+
+     if (empty($filtered_urls)) {
+         returnClientError('No results for URL pattern');
+     }
+
+     $items = [];
+     foreach ($filtered_urls as $link) {
+         $items[$link] = $links_with_elements[$link];
+     }
+
+     return $items;
+ }
+
+
+ /**
+  * Retrieve article element from its URL using content selector and return the DOM element
+  * @param string $entry_url URL to retrieve article from
+  * @param string $content_selector HTML selector for extracting content, e.g. "article.content"
+  * @return object Article DOM element, passed through defaultLinkTo() with the article URL
+  */
+ protected function fetchArticleElementFromPage($entry_url, $content_selector)
+ {
+     $entry_html = getSimpleHTMLDOMCached($entry_url);
+     $article_content = $entry_html->find($content_selector, 0);
+
+     if (is_null($article_content)) {
+         returnClientError('Could not find article content at URL: ' . $entry_url);
+     }
+
+     $article_content = defaultLinkTo($article_content, $entry_url);
+     return $article_content;
+ }
+
+ /**
+  * Parse a time string with the given format and convert it to a timestamp
+  * @param string $timeStr Time string to parse
+  * @param string $format Format passed to date_parse_from_format()
+  * @return int Timestamp built with mktime() from the parsed date parts
+  */
+ protected function parseTimeStrAsTimestamp($timeStr, $format)
+ {
+     $date = date_parse_from_format($format, $timeStr);
+     if ($date['error_count'] !== 0) {
+         returnClientError('Error while parsing time string');
+     }
+
+     $timestamp = mktime(
+         $date['hour'],
+         $date['minute'],
+         $date['second'],
+         $date['month'],
+         $date['day'],
+         $date['year']
+     );
+
+     // Strict comparison: a timestamp of 0 is a valid result and must not be treated as failure.
+     if ($timestamp === false) {
+         returnClientError('Error while creating timestamp');
+     }
+
+     return $timestamp;
+ }
+
+ /**
+  * Build a feed item from the DOM element that contains an article
+  * @param object $entry_html A DOM element containing the article
+  * @param string $title_selector A selector to the article title from the article
+  * @param string $author_selector A selector to find the article author
+  * @param string $category_selector A selector matching the article category elements
+  * @param string $time_selector A selector to get the article publication time.
+  * @param string $time_format The format to parse the time_selector.
+  * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads,
+  *                                div.comments"
+  * @param string $title_default Optional title to use when could not extract title reliably
+  * @param bool $remove_styling Whether to remove class and style attributes from the HTML
+  * @return array Entry data: title, content, categories, timestamp, author
+  */
+ protected function parseEntryElement(
+     $entry_html,
+     $title_selector = null,
+     $author_selector = null,
+     $category_selector = null,
+     $time_selector = null,
+     $time_format = null,
+     $content_cleanup = null,
+     $title_default = null,
+     $remove_styling = false
+ ) {
+     $article_content = convertLazyLoading($entry_html);
+
+     // Use the default title when no selector is given or when it matches nothing.
+     $article_title = $title_default;
+     if (!is_null($title_selector) && $title_selector != '') {
+         $title_element = $entry_html->find($title_selector, 0);
+         if (!is_null($title_element)) {
+             $article_title = trim($title_element->innertext);
+         }
+     }
+
+     $author = null;
+     if (!is_null($author_selector) && $author_selector != '') {
+         $author_element = $entry_html->find($author_selector, 0);
+         if (!is_null($author_element)) {
+             $author = trim($author_element->innertext);
+         }
+     }
+
+     $categories = [];
+     // The closing parenthesis of is_null() belongs right after the variable; wrapping the whole
+     // expression made the check always true.
+     if (!is_null($category_selector) && $category_selector != '') {
+         $category_elements = $entry_html->find($category_selector);
+         foreach ($category_elements as $category_element) {
+             $categories[] = trim($category_element->innertext);
+         }
+     }
+
+     $time = null;
+     if (!is_null($time_selector) && $time_selector != '') {
+         $time_element = $entry_html->find($time_selector, 0);
+         if (!is_null($time_element)) {
+             $time = $time_element->getAttribute('datetime');
+             // empty() covers every "no value" result (null, false, empty string) of the lookup.
+             if (empty($time)) {
+                 $time = $time_element->innertext;
+             }
+
+             // Keep the parsed timestamp; previously the result was discarded and the raw
+             // string ended up in the item.
+             $time = $this->parseTimeStrAsTimestamp($time, $time_format);
+         }
+     }
+
+     $article_content = $this->cleanArticleContent($article_content, $content_cleanup, $remove_styling);
+
+     $item = [];
+     $item['title'] = $article_title;
+     $item['content'] = $article_content;
+     $item['categories'] = $categories;
+     $item['timestamp'] = $time;
+     $item['author'] = $author;
+     return $item;
+ }
+}
From 0325c2414a6b00311abb291d1189c97f120053bd Mon Sep 17 00:00:00 2001
From: t0stiman <18124323+t0stiman@users.noreply.github.com>
Date: Fri, 25 Aug 2023 12:34:35 +0200
Subject: [PATCH 006/152] fix carthrottlebridge (#3633)
---
bridges/CarThrottleBridge.php | 51 ++++++++++++++++++-----------------
1 file changed, 26 insertions(+), 25 deletions(-)
diff --git a/bridges/CarThrottleBridge.php b/bridges/CarThrottleBridge.php
index 95641573f28..5b95dd288ab 100644
--- a/bridges/CarThrottleBridge.php
+++ b/bridges/CarThrottleBridge.php
@@ -1,44 +1,45 @@
collectExpandableDatas('https://www.carthrottle.com/rss', 10);
- }
+ $news = getSimpleHTMLDOMCached(self::URI . 'news')
+ or returnServerError('could not retrieve page');
- protected function parseItem($feedItem)
- {
- $item = parent::parseItem($feedItem);
+ $this->items[] = [];
- //fetch page
- $articlePage = getSimpleHTMLDOMCached($feedItem->link)
- or returnServerError('Could not retrieve ' . $feedItem->link);
+ //for each post
+ foreach ($news->find('div.cmg-card') as $post) {
+ $item = [];
- $subtitle = $articlePage->find('p.standfirst', 0);
- $article = $articlePage->find('div.content_field', 0);
+ $titleElement = $post->find('div.title a.cmg-link')[0];
+ $item['uri'] = self::URI . $titleElement->getAttribute('href');
+ $item['title'] = $titleElement->innertext;
- $item['content'] = str_get_html($subtitle . $article);
+ $articlePage = getSimpleHTMLDOMCached($item['uri'])
+ or returnServerError('could not retrieve page');
- //convert