From dda1b882254e1b41aac4686ea436fd15865b22ab Mon Sep 17 00:00:00 2001 From: Lars Stegman Date: Sun, 20 Aug 2023 17:28:50 +0200 Subject: [PATCH 1/8] [CssSelector2Bridge] Implement CSS Selector bridge 2 --- bridges/CssSelector2Bridge.php | 429 +++++++++++++++++++++++++++++++++ 1 file changed, 429 insertions(+) create mode 100644 bridges/CssSelector2Bridge.php diff --git a/bridges/CssSelector2Bridge.php b/bridges/CssSelector2Bridge.php new file mode 100644 index 00000000000..6f4b91503cb --- /dev/null +++ b/bridges/CssSelector2Bridge.php @@ -0,0 +1,429 @@ + [ + 'name' => 'Site URL: Page with latest articles', + 'exampleValue' => 'https://example.com/blog/', + 'required' => true + ], + 'cookie' => [ + 'name' => '[Optional] Cookie', + 'title' => << 'sessionId=deadb33f' + ], + 'entry_element_selector' => [ + 'name' => 'Selector for article entry elements', + 'title' => <<... on home page, each one being treated as a feed item. + + Use the URL selector option to select the `a` element with the + `href` to the article link. If this option is not configured, the first encountered + `a` element is used. + EOT, + 'exampleValue' => 'div.article', + 'required' => true + ], + 'url_selector' => [ + 'name' => 'Selector for link elements', + 'title' => << 'a.article', + 'defaultValue' => 'a' + ], + 'url_pattern' => [ + 'name' => '[Optional] Pattern for site URLs to keep in feed', + 'title' => 'Optionally filter items by applying a regular expression on their URL', + 'exampleValue' => '/blog/article/.*', + ], + 'limit' => self::LIMIT, + 'content_cleanup' => [ + 'name' => '[Optional] Content cleanup: List of items to remove', + 'title' => 'Selector for unnecessary elements to remove inside article contents.', + 'exampleValue' => 'div.ads, div.comments', + ], + 'title_selector' => [ + 'name' => '[Optional] Selector for the article title', + 'title' => 'Selector to select the article title', + 'defaultValue' => 'h1' + ], + 'title_cleanup' => [ + 'name' => '[Optional] Text to remove from feed title', + 'title' => << ' | BlogName', + ], + 'category_selector' => [ + 'name' => '[Optional] Categories', + 'title' => << 'span.category, #main-category' + ], + 'author_selector' => [ + 'name' => '[Optional] Author', + 'title' => << 'span#author' + ], + 'time_selector' => [ + 'name' => '[Optional] Time selector', + 'title' => << [ + 'name' => 'Load article from page', + 'title' => << 'checkbox' + ], + 'article_page_content_selector' => [ + 'name' => '[Optional] Selector to select article page content', + 'title' => 'Extract the article from its page using the provided selector', + 'exampleValue' => 'article.content', + ], + 'remove_styling' => [ + 'name' => 'Remove styling', + 'title' => 'Remove class and style tags from the page contents', + 'type' => 'checkbox' + ] + ] + ]; + + private $feedName = ''; + + public function getURI() + { + $url = $this->getInput('home_page'); + if (empty($url)) { + $url = parent::getURI(); + } + return $url; + } + + public function getName() + { + if (!empty($this->feedName)) { + return $this->feedName; + } + return parent::getName(); + } + + public function getHeaders() + { + $headers = array(); + $cookie = $this->getInput('cookie'); + if (!empty($cookie)) { + $headers[] = 'Cookie: ' . $cookie; + } + + return $headers; + } + + public function collectData() + { + $url = $this->getInput('home_page'); + $headers = $this->getHeaders(); + + $entry_element_selector = $this->getInput('entry_element_selector'); + $url_selector = $this->getInput('url_selector'); + $url_pattern = $this->getInput('url_pattern'); + $limit = $this->getInput('limit') ?? 10; + + $use_article_pages = $this->getInput('use_article_pages'); + $article_page_content_selector = + $this->getInput('article_page_content_selector'); + $content_cleanup = $this->getInput('content_cleanup'); + $title_selector = $this->getInput('title_selector'); + $title_cleanup = $this->getInput('title_cleanup'); + $time_selector = $this->getInput('time_selector'); + + $category_selector = $this->getInput('category_selector'); + $author_selector = $this->getInput('author_selector'); + $remove_styling = $this->getInput('remove_styling'); + + $html = defaultLinkTo(getSimpleHTMLDOM($url, $headers), $url); + $this->feedName = $this->getTitle($html, $title_cleanup); + $entry_elements = $this->htmlFindEntryElements($html, + $entry_element_selector, $url_selector, $url_pattern, $limit); + + if (empty($entry_elements)) { + return; + } + + // Fetch the elements from the article pages. + if ($use_article_pages) { + if (empty($article_page_content_selector)) { + returnClientError( + "`Article selector` is required when `Load article page` is enabled"); + } + + foreach (array_keys($entry_elements) as $uri) { + $entry_elements[$uri] = $this->fetchArticleElementFromPage( + $uri, $article_page_content_selector); + } + } + + foreach ($entry_elements as $uri => $element) { + $entry = $this->parseEntryElement( + $element, + $title_selector, + $author_selector, + $category_selector, + $time_selector, + $content_cleanup, + $remove_styling, + $this->feedName + ); + + $entry['uri'] = $uri; + $this->items[] = $entry; + } + } + + /** + * Filter a list of URLs using a pattern and limit + * @param array $links List of URLs + * @param string $url_pattern Pattern to look for in URLs + * @param int $limit Optional maximum amount of URLs to return + * @return array Array of URLs + */ + protected function filterUrlList($links, $url_pattern, $limit = 0) + { + if (!empty($url_pattern)) { + $url_pattern = '/' . str_replace('/', '\/', $url_pattern) . '/'; + $links = array_filter($links, function ($url) { + return preg_match($url_pattern, $url) === 1; + }); + } + + if ($limit > 0 && count($links) > $limit) { + $links = array_slice($links, 0, $limit); + } + + return $links; + } + + /** + * Retrieve title from webpage URL or DOM + * @param string|object $page URL or DOM to retrieve title from + * @param string $title_cleanup optional string to remove from webpage title, e.g. " | BlogName" + * @return string Webpage title + */ + protected function getTitle($page, $title_cleanup) + { + if (is_string($page)) { + $page = getSimpleHTMLDOMCached($page); + } + $title = html_entity_decode($page->find('title', 0)->plaintext); + if (!empty($title)) { + $title = trim(str_replace($title_cleanup, '', $title)); + } + + return $title; + } + + /** + * Remove all elements from HTML content matching cleanup selector + * @param string|object $content HTML content as HTML object or string + * @return string|object Cleaned content (same type as input) + */ + protected function cleanArticleContent($content, $cleanup_selector, $remove_styling) + { + $string_convert = false; + if (is_string($content)) { + $string_convert = true; + $content = str_get_html($content); + } + + if (!empty($cleanup_selector)) { + foreach ($content->find($cleanup_selector) as $item_to_clean) { + $item_to_clean->outertext = ''; + } + } + + if ($remove_styling) { + // Get rid of classes + $content = preg_replace('/(<[^>]+) class=".*?"/i', '$1', $content); + // Get rid of inline styling + $content = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $content); + } + + if ($string_convert) { + $content = $content->outertext; + } + return $content; + } + + + /** + * Retrieve first N link+element from webpage URL or DOM satisfying the specified criteria + * @param string|object $page URL or DOM to retrieve feed items from + * @param string $entry_selector DOM selector for matching HTML elements that contain article + * entries + * @param string $url_selector DOM selector for matching links + * @param string $url_pattern Optional filter to keep only links matching the pattern + * @param int $limit Optional maximum amount of URLs to return + * @return array of items { => } + */ + protected function htmlFindEntryElements($page, $entry_selector, $url_selector, + $url_pattern = '', $limit = 0) + { + if (is_string($page)) { + $page = getSimpleHTMLDOM($page); + } + + $entryElements = $page->find($entry_selector); + if (empty($entryElements)) { + returnClientError('No entry elements for entry selector'); + } + + // Extract URIs with the associated entry element + $links_with_elements = []; + foreach ($entryElements as $entry) { + $url_element = $entry->find($url_selector, 0); + if (is_null($url_element)) { + // No `a` element found in this entry + if ($entry->tag == 'a') { + $url_element = $entry; + } else { + continue; + } + } + + $links_with_elements[$url_element->href] = $entry;; + } + + if (empty($links_with_elements)) { + returnClientError('The provided URL selector matches some elements, but they do not + contain links.'); + } + + // Filter using the URL pattern + $filtered_urls = $this->filterUrlList(array_keys($links_with_elements), $url_pattern, $limit); + + if (empty($filtered_urls)) { + returnClientError('No results for URL pattern'); + } + + $items = []; + foreach ($filtered_urls as $link) { + $items[$link] = $links_with_elements[$link]; + } + + return $items; + } + + + /** + * Retrieve article element from its URL using content selector and return the DOM element + * @param string $entry_url URL to retrieve article from + * @param string $content_selector HTML selector for extracting content, e.g. "article.content" + * @return article DOM element + */ + protected function fetchArticleElementFromPage($entry_url, $content_selector) + { + $entry_html = getSimpleHTMLDOMCached($entry_url); + $article_content = $entry_html->find($content_selector, 0); + + if (is_null($article_content)) { + returnClientError('Could not article content at URL: ' . $entry_url); + } + + $article_content = defaultLinkTo($article_content, $entry_url); + return $article_content; + } + + /** + * Retrieve article content from its URL using content selector and return a feed item + * @param object $entry_html A DOM element containing the article + * @param string $title_selector A selector to the article title from the article + * @param string $author_selector A selector to find the article author + * @param string $time_selector A selector to get the article publication time. + * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, + * div.comments" + * @param string $title_default Optional title to use when could not extract title reliably + * @param bool $remove_styling Whether to remove class and style attributes from the HTML + * @return array Entry data: uri, title, content + */ + protected function parseEntryElement( + $entry_html, + $title_selector = null, + $author_selector = null, + $category_selector = null, + $time_selector = null, + $content_cleanup = null, + $title_default = null, + $remove_styling = false, + ) + { + $article_content = convertLazyLoading($entry_html); + + if (is_null($title_selector)) { + $article_title = $title_default; + } else { + $article_title = trim($entry_html->find($title_selector, 0)->innertext); + } + + $author = null; + if (is_null($author_selector)) { + $author = trim($entry_html->find($author_selector, 0)->innertext); + } + + $categories = []; + if (!is_null($category_selector)) { + $category_elements = $entry_html->find($category_selector); + foreach ($category_elements as $category_element) { + $categories[] = trim($category_element->innertext); + } + } + + $time = null; + if (!is_null($time_selector)) { + $time_element = $entry_html->find($time_selector, 0); + if (!is_null($time_element)) { + if ($time_element->tag == 'time') { + $time = $time_element->getAttribute('datetime'); + } else { + $time = $time_element->innertext; + } + } + } + + $article_content = $this->cleanArticleContent($article_content, $content_cleanup, + $remove_styling); + + $item = []; + $item['title'] = $article_title; + $item['content'] = $article_content; + $item['categories'] = $categories; + $item['timestamp'] = $time; + return $item; + } +} From 95ce250149bbd76e3400ee00bd9ae18242b959a2 Mon Sep 17 00:00:00 2001 From: Lars Stegman Date: Sun, 20 Aug 2023 17:56:46 +0200 Subject: [PATCH 2/8] [CssSelector2Bridge] Fix author not being loaded --- bridges/CssSelector2Bridge.php | 1 + 1 file changed, 1 insertion(+) diff --git a/bridges/CssSelector2Bridge.php b/bridges/CssSelector2Bridge.php index 6f4b91503cb..05f65f6ba0c 100644 --- a/bridges/CssSelector2Bridge.php +++ b/bridges/CssSelector2Bridge.php @@ -424,6 +424,7 @@ protected function parseEntryElement( $item['content'] = $article_content; $item['categories'] = $categories; $item['timestamp'] = $time; + $item['author'] = $author; return $item; } } From 3176e5afb8ce8befbd16e82281bea975f9a81012 Mon Sep 17 00:00:00 2001 From: Lars Stegman Date: Sun, 20 Aug 2023 17:59:28 +0200 Subject: [PATCH 3/8] [CssSelector2Bridge] Remove unneeded time nullcheck --- bridges/CssSelector2Bridge.php | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bridges/CssSelector2Bridge.php b/bridges/CssSelector2Bridge.php index 05f65f6ba0c..90e35c660c6 100644 --- a/bridges/CssSelector2Bridge.php +++ b/bridges/CssSelector2Bridge.php @@ -407,12 +407,10 @@ protected function parseEntryElement( $time = null; if (!is_null($time_selector)) { $time_element = $entry_html->find($time_selector, 0); - if (!is_null($time_element)) { - if ($time_element->tag == 'time') { - $time = $time_element->getAttribute('datetime'); - } else { - $time = $time_element->innertext; - } + if ($time_element->tag == 'time') { + $time = $time_element->getAttribute('datetime'); + } else { + $time = $time_element->innertext; } } From 43021f3184feb9d908e240dc1e6b1a00d84087d2 Mon Sep 17 00:00:00 2001 From: Lars Stegman Date: Mon, 21 Aug 2023 13:12:05 +0200 Subject: [PATCH 4/8] Fix linting --- bridges/CssSelector2Bridge.php | 43 ++++++++++++++-------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/bridges/CssSelector2Bridge.php b/bridges/CssSelector2Bridge.php index 90e35c660c6..0bbbdd70add 100644 --- a/bridges/CssSelector2Bridge.php +++ b/bridges/CssSelector2Bridge.php @@ -63,7 +63,7 @@ class CssSelector2Bridge extends BridgeAbstract 'title_selector' => [ 'name' => '[Optional] Selector for the article title', 'title' => 'Selector to select the article title', - 'defaultValue' => 'h1' + 'defaultValue' => 'h1' ], 'title_cleanup' => [ 'name' => '[Optional] Text to remove from feed title', @@ -135,9 +135,9 @@ public function getName() return parent::getName(); } - public function getHeaders() + public function getHeaders() { - $headers = array(); + $headers = []; $cookie = $this->getInput('cookie'); if (!empty($cookie)) { $headers[] = 'Cookie: ' . $cookie; @@ -155,10 +155,9 @@ public function collectData() $url_selector = $this->getInput('url_selector'); $url_pattern = $this->getInput('url_pattern'); $limit = $this->getInput('limit') ?? 10; - + $use_article_pages = $this->getInput('use_article_pages'); - $article_page_content_selector = - $this->getInput('article_page_content_selector'); + $article_page_content_selector = $this->getInput('article_page_content_selector'); $content_cleanup = $this->getInput('content_cleanup'); $title_selector = $this->getInput('title_selector'); $title_cleanup = $this->getInput('title_cleanup'); @@ -170,23 +169,20 @@ public function collectData() $html = defaultLinkTo(getSimpleHTMLDOM($url, $headers), $url); $this->feedName = $this->getTitle($html, $title_cleanup); - $entry_elements = $this->htmlFindEntryElements($html, - $entry_element_selector, $url_selector, $url_pattern, $limit); + $entry_elements = $this->htmlFindEntryElements($html, $entry_element_selector, $url_selector, $url_pattern, $limit); if (empty($entry_elements)) { return; } - + // Fetch the elements from the article pages. if ($use_article_pages) { if (empty($article_page_content_selector)) { - returnClientError( - "`Article selector` is required when `Load article page` is enabled"); + returnClientError('`Article selector` is required when `Load article page` is enabled'); } foreach (array_keys($entry_elements) as $uri) { - $entry_elements[$uri] = $this->fetchArticleElementFromPage( - $uri, $article_page_content_selector); + $entry_elements[$uri] = $this->fetchArticleElementFromPage($uri, $article_page_content_selector); } } @@ -202,7 +198,7 @@ public function collectData() $this->feedName ); - $entry['uri'] = $uri; + $entry['uri'] = $uri; $this->items[] = $entry; } } @@ -274,7 +270,7 @@ protected function cleanArticleContent($content, $cleanup_selector, $remove_styl // Get rid of inline styling $content = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $content); } - + if ($string_convert) { $content = $content->outertext; } @@ -285,15 +281,14 @@ protected function cleanArticleContent($content, $cleanup_selector, $remove_styl /** * Retrieve first N link+element from webpage URL or DOM satisfying the specified criteria * @param string|object $page URL or DOM to retrieve feed items from - * @param string $entry_selector DOM selector for matching HTML elements that contain article + * @param string $entry_selector DOM selector for matching HTML elements that contain article * entries * @param string $url_selector DOM selector for matching links * @param string $url_pattern Optional filter to keep only links matching the pattern * @param int $limit Optional maximum amount of URLs to return * @return array of items { => } */ - protected function htmlFindEntryElements($page, $entry_selector, $url_selector, - $url_pattern = '', $limit = 0) + protected function htmlFindEntryElements($page, $entry_selector, $url_selector, $url_pattern = '', $limit = 0) { if (is_string($page)) { $page = getSimpleHTMLDOM($page); @@ -317,7 +312,7 @@ protected function htmlFindEntryElements($page, $entry_selector, $url_selector, } } - $links_with_elements[$url_element->href] = $entry;; + $links_with_elements[$url_element->href] = $entry; } if (empty($links_with_elements)) { @@ -366,14 +361,14 @@ protected function fetchArticleElementFromPage($entry_url, $content_selector) * @param string $title_selector A selector to the article title from the article * @param string $author_selector A selector to find the article author * @param string $time_selector A selector to get the article publication time. - * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, + * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, * div.comments" * @param string $title_default Optional title to use when could not extract title reliably * @param bool $remove_styling Whether to remove class and style attributes from the HTML * @return array Entry data: uri, title, content */ protected function parseEntryElement( - $entry_html, + $entry_html, $title_selector = null, $author_selector = null, $category_selector = null, @@ -381,8 +376,7 @@ protected function parseEntryElement( $content_cleanup = null, $title_default = null, $remove_styling = false, - ) - { + ) { $article_content = convertLazyLoading($entry_html); if (is_null($title_selector)) { @@ -414,8 +408,7 @@ protected function parseEntryElement( } } - $article_content = $this->cleanArticleContent($article_content, $content_cleanup, - $remove_styling); + $article_content = $this->cleanArticleContent($article_content, $content_cleanup, $remove_styling); $item = []; $item['title'] = $article_title; From 31989544964e2fdb152faac1be85a13550aa40a7 Mon Sep 17 00:00:00 2001 From: Lars Stegman Date: Mon, 21 Aug 2023 13:41:43 +0200 Subject: [PATCH 5/8] Fix failing test --- bridges/CssSelector2Bridge.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bridges/CssSelector2Bridge.php b/bridges/CssSelector2Bridge.php index 0bbbdd70add..0787e588c4a 100644 --- a/bridges/CssSelector2Bridge.php +++ b/bridges/CssSelector2Bridge.php @@ -118,7 +118,7 @@ class CssSelector2Bridge extends BridgeAbstract private $feedName = ''; - public function getURI() + protected function getURI() { $url = $this->getInput('home_page'); if (empty($url)) { @@ -127,7 +127,7 @@ public function getURI() return $url; } - public function getName() + protected function getName() { if (!empty($this->feedName)) { return $this->feedName; @@ -135,7 +135,7 @@ public function getName() return parent::getName(); } - public function getHeaders() + protected function getHeaders() { $headers = []; $cookie = $this->getInput('cookie'); From 6a98ab80b88a61eeac1da69617672335a40b0a78 Mon Sep 17 00:00:00 2001 From: Lars Stegman Date: Mon, 21 Aug 2023 21:44:51 +0200 Subject: [PATCH 6/8] Implement PR fixes --- bridges/CssSelector2Bridge.php | 99 +++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 30 deletions(-) diff --git a/bridges/CssSelector2Bridge.php b/bridges/CssSelector2Bridge.php index 0787e588c4a..71c9055f00c 100644 --- a/bridges/CssSelector2Bridge.php +++ b/bridges/CssSelector2Bridge.php @@ -8,7 +8,7 @@ class CssSelector2Bridge extends BridgeAbstract const DESCRIPTION = << [ 'name' => '[Optional] Cookie', 'title' => << 'sessionId=deadb33f' ], @@ -40,7 +41,7 @@ class CssSelector2Bridge extends BridgeAbstract 'required' => true ], 'url_selector' => [ - 'name' => 'Selector for link elements', + 'name' => '[Optional] Selector for link elements', 'title' => << 'Optionally filter items by applying a regular expression on their URL', 'exampleValue' => '/blog/article/.*', ], + 'use_article_pages' => [ + 'name' => 'Load article from page', + 'title' => << 'checkbox' + ], + 'article_page_content_selector' => [ + 'name' => '[Optional] Selector to select article page content', + 'title' => 'Extract the article from its page using the provided selector', + 'exampleValue' => 'article.content', + ], 'limit' => self::LIMIT, 'content_cleanup' => [ 'name' => '[Optional] Content cleanup: List of items to remove', @@ -94,22 +109,16 @@ class CssSelector2Bridge extends BridgeAbstract is a `time` element, the value for the `datetime` attribute is used. EOT, ], - 'use_article_pages' => [ - 'name' => 'Load article from page', + 'time_format' => [ + 'name' => '[Optional] Format string for parsing time', 'title' => << 'checkbox' - ], - 'article_page_content_selector' => [ - 'name' => '[Optional] Selector to select article page content', - 'title' => 'Extract the article from its page using the provided selector', - 'exampleValue' => 'article.content', + The format to use to parse the timestamp. See + https://www.php.net/manual/en/datetimeimmutable.createfromformat.php + for the format specification. + EOT ], 'remove_styling' => [ - 'name' => 'Remove styling', + 'name' => '[Optional] Remove styling', 'title' => 'Remove class and style tags from the page contents', 'type' => 'checkbox' ] @@ -118,7 +127,7 @@ class CssSelector2Bridge extends BridgeAbstract private $feedName = ''; - protected function getURI() + public function getURI() { $url = $this->getInput('home_page'); if (empty($url)) { @@ -127,7 +136,7 @@ protected function getURI() return $url; } - protected function getName() + public function getName() { if (!empty($this->feedName)) { return $this->feedName; @@ -162,6 +171,7 @@ public function collectData() $title_selector = $this->getInput('title_selector'); $title_cleanup = $this->getInput('title_cleanup'); $time_selector = $this->getInput('time_selector'); + $time_format = $this->getInput('time_format'); $category_selector = $this->getInput('category_selector'); $author_selector = $this->getInput('author_selector'); @@ -193,9 +203,10 @@ public function collectData() $author_selector, $category_selector, $time_selector, + $time_format, $content_cleanup, - $remove_styling, - $this->feedName + $this->feedName, + $remove_styling ); $entry['uri'] = $uri; @@ -265,10 +276,11 @@ protected function cleanArticleContent($content, $cleanup_selector, $remove_styl } if ($remove_styling) { - // Get rid of classes - $content = preg_replace('/(<[^>]+) class=".*?"/i', '$1', $content); - // Get rid of inline styling - $content = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $content); + foreach (['class', 'style'] as $attribute_to_remove) { + foreach ($content->find('[' . $attribute_to_remove . ']') as $item_to_clean) { + $item_to_clean->removeAttribute($attribute_to_remove); + } + } } if ($string_convert) { @@ -355,12 +367,36 @@ protected function fetchArticleElementFromPage($entry_url, $content_selector) return $article_content; } + protected function parseTimeStrAsTimestamp($timeStr, $format) + { + $date = date_parse_from_format($format, $timeStr); + if ($date['error_count'] != 0) { + returnClientError('Error while parsing time string'); + } + + $timestamp = mktime( + $date['hour'], + $date['minute'], + $date['second'], + $date['month'], + $date['day'], + $date['year'] + ); + + if ($timestamp == false) { + returnClientError('Error while creating timestamp'); + } + + return $timestamp; + } + /** * Retrieve article content from its URL using content selector and return a feed item * @param object $entry_html A DOM element containing the article * @param string $title_selector A selector to the article title from the article * @param string $author_selector A selector to find the article author * @param string $time_selector A selector to get the article publication time. + * @param string $time_format The format to parse the time_selector. * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, * div.comments" * @param string $title_default Optional title to use when could not extract title reliably @@ -373,6 +409,7 @@ protected function parseEntryElement( $author_selector = null, $category_selector = null, $time_selector = null, + $time_format = null, $content_cleanup = null, $title_default = null, $remove_styling = false, @@ -386,12 +423,13 @@ protected function parseEntryElement( } $author = null; - if (is_null($author_selector)) { + if (!is_null($author_selector) && $author_selector != '') { + echo 'Extracting the author'; $author = trim($entry_html->find($author_selector, 0)->innertext); } $categories = []; - if (!is_null($category_selector)) { + if (!is_null($category_selector && $category_selector != '')) { $category_elements = $entry_html->find($category_selector); foreach ($category_elements as $category_element) { $categories[] = trim($category_element->innertext); @@ -399,13 +437,14 @@ protected function parseEntryElement( } $time = null; - if (!is_null($time_selector)) { + if (!is_null($time_selector) && $time_selector != '') { $time_element = $entry_html->find($time_selector, 0); - if ($time_element->tag == 'time') { - $time = $time_element->getAttribute('datetime'); - } else { + $time = $time_element->getAttribute('datetime'); + if (is_null($time)) { $time = $time_element->innertext; } + + $this->parseTimeStrAsTimestamp($time, $time_format); } $article_content = $this->cleanArticleContent($article_content, $content_cleanup, $remove_styling); From c899bb878fb62f8d46cff6cad9aba6eefcc7b0a3 Mon Sep 17 00:00:00 2001 From: Lars Stegman Date: Mon, 21 Aug 2023 22:18:13 +0200 Subject: [PATCH 7/8] Update bridges/CssSelector2Bridge.php Co-authored-by: ORelio --- bridges/CssSelector2Bridge.php | 1 - 1 file changed, 1 deletion(-) diff --git a/bridges/CssSelector2Bridge.php b/bridges/CssSelector2Bridge.php index 71c9055f00c..729bd2489a6 100644 --- a/bridges/CssSelector2Bridge.php +++ b/bridges/CssSelector2Bridge.php @@ -424,7 +424,6 @@ protected function parseEntryElement( $author = null; if (!is_null($author_selector) && $author_selector != '') { - echo 'Extracting the author'; $author = trim($entry_html->find($author_selector, 0)->innertext); } From 80ab273c3476dc3ebda5addb53abd098bfc01cbf Mon Sep 17 00:00:00 2001 From: Lars Stegman Date: Tue, 22 Aug 2023 21:04:08 +0200 Subject: [PATCH 8/8] Rename bridge and fix syntax error for php7 --- ...ridge.php => CssSelectorComplexBridge.php} | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) rename bridges/{CssSelector2Bridge.php => CssSelectorComplexBridge.php} (96%) diff --git a/bridges/CssSelector2Bridge.php b/bridges/CssSelectorComplexBridge.php similarity index 96% rename from bridges/CssSelector2Bridge.php rename to bridges/CssSelectorComplexBridge.php index 729bd2489a6..4d44f85325e 100644 --- a/bridges/CssSelector2Bridge.php +++ b/bridges/CssSelectorComplexBridge.php @@ -1,9 +1,9 @@ [ 'name' => '[Optional] Cookie', 'title' => << 'sessionId=deadb33f' ], + 'title_cleanup' => [ + 'name' => '[Optional] Text to remove from feed title', + 'title' => << ' | BlogName', + ], 'entry_element_selector' => [ 'name' => 'Selector for article entry elements', 'title' => << 'Optionally filter items by applying a regular expression on their URL', 'exampleValue' => '/blog/article/.*', ], + 'limit' => self::LIMIT, 'use_article_pages' => [ 'name' => 'Load article from page', 'title' => << 'checkbox' ], 'article_page_content_selector' => [ - 'name' => '[Optional] Selector to select article page content', + 'name' => '[Optional] Selector to select article element', 'title' => 'Extract the article from its page using the provided selector', 'exampleValue' => 'article.content', ], - 'limit' => self::LIMIT, 'content_cleanup' => [ - 'name' => '[Optional] Content cleanup: List of items to remove', + 'name' => '[Optional] Content cleanup: selector for items to remove', 'title' => 'Selector for unnecessary elements to remove inside article contents.', 'exampleValue' => 'div.ads, div.comments', ], @@ -80,13 +86,6 @@ class CssSelector2Bridge extends BridgeAbstract 'title' => 'Selector to select the article title', 'defaultValue' => 'h1' ], - 'title_cleanup' => [ - 'name' => '[Optional] Text to remove from feed title', - 'title' => << ' | BlogName', - ], 'category_selector' => [ 'name' => '[Optional] Categories', 'title' => << [ 'name' => '[Optional] Time selector', 'title' => << [ @@ -119,7 +118,7 @@ class CssSelector2Bridge extends BridgeAbstract ], 'remove_styling' => [ 'name' => '[Optional] Remove styling', - 'title' => 'Remove class and style tags from the page contents', + 'title' => 'Remove class and style attributes from the page elements', 'type' => 'checkbox' ] ] @@ -412,7 +411,7 @@ protected function parseEntryElement( $time_format = null, $content_cleanup = null, $title_default = null, - $remove_styling = false, + $remove_styling = false ) { $article_content = convertLazyLoading($entry_html);