Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BridgeXPathAbstract + BlizzardNewsBridge + XPathBridge] Add new abstract class + two example implementations #1671

Merged
merged 8 commits into from
Nov 8, 2020
60 changes: 60 additions & 0 deletions bridges/BlizzardNewsBridge.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<?php

class BlizzardNewsBridge extends XPathAbstract {

const NAME = 'Blizzard News';
const URI = 'https://news.blizzard.com';
const DESCRIPTION = 'Blizzard (game company) newsfeed';
const MAINTAINER = 'Niehztog';
const PARAMETERS = array(
'' => array(
'locale' => array(
'name' => 'Language',
'type' => 'list',
'values' => array(
'Deutsch' => 'de-de',
'English (EU)' => 'en-gb',
'English (US)' => 'en-us',
'Español (EU)' => 'es-es',
'Español (AL)' => 'es-mx',
'Français' => 'fr-fr',
'Italiano' => 'it-it',
'日本語' => 'ja-jp',
'한국어' => 'ko-kr',
'Polski' => 'pl-pl',
'Português (AL)' => 'pt-br',
'Русский' => 'ru-ru',
'ภาษาไทย' => 'th-th',
'简体中文' => 'zh-cn',
'繁體中文' => 'zh-tw'
),
'defaultValue' => 'en-us',
'title' => 'Select your language'
)
)
);
const CACHE_TIMEOUT = 3600;

const XPATH_EXPRESSION_ITEM = '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article';
const XPATH_EXPRESSION_ITEM_TITLE = './/div/div[2]/h2';
const XPATH_EXPRESSION_ITEM_CONTENT = './/div[@class="ArticleListItem-description"]/div[@class="h6"]';
const XPATH_EXPRESSION_ITEM_URI = './/a[@class="ArticleLink ArticleLink"]/@href';
const XPATH_EXPRESSION_ITEM_AUTHOR = '';
const XPATH_EXPRESSION_ITEM_TIMESTAMP = './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp';
const XPATH_EXPRESSION_ITEM_ENCLOSURES = './/div[@class="ArticleListItem-image"]/@style';
const XPATH_EXPRESSION_ITEM_CATEGORIES = './/div[@class="ArticleListItem-label"]';
const SETTING_FIX_ENCODING = true;

/**
* Source Web page URL (should provide either HTML or XML content)
* @return string
*/
protected function getSourceUrl(){

$locale = $this->getInput('locale');
if('zh-cn' === $locale) {
return 'https://cn.news.blizzard.com';
}
return 'https://news.blizzard.com/' . $locale;
}
}
251 changes: 251 additions & 0 deletions bridges/XPathBridge.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
<?php

class XPathBridge extends XPathAbstract {
const NAME = 'XPathBridge';
const URI = 'https://github.com/rss-bridge/rss-bridge';
const DESCRIPTION
= 'Parse any webpage using <a href="https://devhints.io/xpath" target="_blank">XPath expressions</a>';
const MAINTAINER = 'Niehztog';
const PARAMETERS = array(
'' => array(

'url' => array(
'name' => 'Enter web page URL',
'title' => <<<"EOL"
You can specify any website URL which serves data suited for display in RSS feeds
(for example a news blog).
EOL
, 'type' => 'text',
'exampleValue' => 'https://news.blizzard.com/en-en',
'defaultValue' => 'https://news.blizzard.com/en-en',
'required' => true
),

'item' => array(
'name' => 'Item selector',
'title' => <<<"EOL"
Enter an XPath expression matching a list of dom nodes, each node containing one
feed article item in total (usually a surrounding &lt;div&gt; or &lt;span&gt; tag). This will
be the context nodes for all of the following expressions. This expression usually
starts with a single forward slash.
EOL
, 'type' => 'text',
'exampleValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
'defaultValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
'required' => true
),

'title' => array(
'name' => 'Item title selector',
'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article headline. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article item node.
EOL
, 'type' => 'text',
'exampleValue' => './/div/div[2]/h2',
'defaultValue' => './/div/div[2]/h2',
'required' => true
),

'content' => array(
'name' => 'Item description selector',
'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article content or description. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of the
article item node.
EOL
, 'type' => 'text',
'exampleValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
'defaultValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
'required' => false
),

'uri' => array(
'name' => 'Item URL selector',
'title' => <<<"EOL"
This expression should match a node's attribute containing the article URL
(usually the href attribute of an &lt;a&gt; tag). It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL
, 'type' => 'text',
'exampleValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
'defaultValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
'required' => false
),

'author' => array(
'name' => 'Item author selector',
'title' => <<<"EOL"
This expression should match a node contained within each article item
node containing the article author's name. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node.
EOL
, 'type' => 'text',
'required' => false
),

'timestamp' => array(
'name' => 'Item date selector',
'title' => <<<"EOL"
This expression should match a node or node's attribute containing the
article timestamp or date (parsable by PHP's strtotime function). It
should start with a dot followed by two forward slashes, referring to
any descendant nodes of the article item node. Attributes can be
selected by prepending an @ char before the attributes name.
EOL
, 'type' => 'text',
'exampleValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
'defaultValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
'required' => false
),

'enclosures' => array(
'name' => 'Item image selector',
'title' => <<<"EOL"
This expression should match a node's attribute containing an article
image URL (usually the src attribute of an &lt;img&gt; tag or a style
attribute). It should start with a dot followed by two forward slashes,
referring to any descendant nodes of the article item node. Attributes
can be selected by prepending an @ char before the attributes name.
EOL
, 'type' => 'text',
'exampleValue' => './/div[@class="ArticleListItem-image"]/@style',
'defaultValue' => './/div[@class="ArticleListItem-image"]/@style',
'required' => false
),

'categories' => array(
'name' => 'Item category selector',
'title' => <<<"EOL"
This expression should match a node or node's attribute contained
within each article item node containing the article category. This
could be inside &lt;div&gt; or &lt;span&gt; tags or sometimes be hidden
in a data attribute. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article
item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL
, 'type' => 'text',
'exampleValue' => './/div[@class="ArticleListItem-label"]',
'defaultValue' => './/div[@class="ArticleListItem-label"]',
'required' => false
),

'fix_encoding' => array(
'name' => 'Fix encoding',
'title' => <<<"EOL"
Check this to fix feed encoding by invoking PHP's utf8_decode
function on all extracted texts. Try this in case you see "broken" or
"weird" characters in your feed where you'd normally expect umlauts
or any other non-ascii characters.
EOL
, 'type' => 'checkbox',
'required' => false
),

)
);

/**
* Source Web page URL (should provide either HTML or XML content)
* @return string
*/
protected function getSourceUrl(){
return $this->encodeUri($this->getInput('url'));
}

/**
* XPath expression for extracting the feed items from the source page
* @return string
*/
protected function getExpressionItem(){
return urldecode($this->getInput('item'));
}

/**
* XPath expression for extracting an item title from the item context
* @return string
*/
protected function getExpressionItemTitle(){
return urldecode($this->getInput('title'));
}

/**
* XPath expression for extracting an item's content from the item context
* @return string
*/
protected function getExpressionItemContent(){
return urldecode($this->getInput('content'));
}

/**
* XPath expression for extracting an item link from the item context
* @return string
*/
protected function getExpressionItemUri(){
return urldecode($this->getInput('uri'));
}

/**
* XPath expression for extracting an item author from the item context
* @return string
*/
protected function getExpressionItemAuthor(){
return urldecode($this->getInput('author'));
}

/**
* XPath expression for extracting an item timestamp from the item context
* @return string
*/
protected function getExpressionItemTimestamp(){
return urldecode($this->getInput('timestamp'));
}

/**
* XPath expression for extracting item enclosures (media content like
* images or movies) from the item context
* @return string
*/
protected function getExpressionItemEnclosures(){
return urldecode($this->getInput('enclosures'));
}

/**
* XPath expression for extracting an item category from the item context
* @return string
*/
protected function getExpressionItemCategories(){
return urldecode($this->getInput('categories'));
}

/**
* Fix encoding
* @return string
*/
protected function getSettingFixEncoding(){
return $this->getInput('fix_encoding');
}

/**
* Fixes URL encoding issues in input URL's
* @param $uri
* @return string|string[]
*/
private function encodeUri($uri)
{
if (strpos($uri, 'https%3A%2F%2F') === 0
|| strpos($uri, 'http%3A%2F%2F') === 0) {
$uri = urldecode($uri);
}

$uri = str_replace('|', '%7C', $uri);

return $uri;
}
}
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
},
"suggest": {
"ext-memcached": "Allows to use memcached as cache type",
"ext-sqlite3": "Allows to use an SQLite database for caching"
"ext-sqlite3": "Allows to use an SQLite database for caching",
"ext-dom": "Allows to use some bridges based on XPath expressions"
}
}
Loading