Skip to content

Commit

Permalink
[FuturaSciences] Improve content extraction (#2317)
Browse files: browse the repository at this point in the history
- Fix tracking removal in URL
- Fix images broken due to new lazy loading mechanism
- Remove headline, articles do not have it anymore
- Improve article cleanup
  • Loading branch information
ORelio authored Oct 29, 2021
1 parent 970bdd4 commit 547829f
Showing 1 changed file with 40 additions and 24 deletions.
64 changes: 40 additions & 24 deletions bridges/FuturaSciencesBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public function collectData(){

protected function parseItem($newsItem){
$item = parent::parseItem($newsItem);
$item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']);
$item['uri'] = str_replace('#xtor%3DRSS-8', '', $item['uri']);
$article = getSimpleHTMLDOMCached($item['uri'])
or returnServerError('Could not request Futura-Sciences: ' . $item['uri']);
$item['content'] = $this->extractArticleContent($article);
Expand All @@ -96,31 +96,47 @@ protected function parseItem($newsItem){
}

private function extractArticleContent($article){
$contents = $article->find('section.article-text', 1)->innertext;
$headline = trim($article->find('p.description', 0)->plaintext);
if(!empty($headline))
$headline = '<p><b>' . $headline . '</b></p>';
$contents = $article->find('section.article-text', 1);

foreach (array(
'<div class="clear',
'<div class="sharebar2',
'<div class="diaporamafullscreen"',
'<div class="module social-button',
'<div class="module social-share',
'<div style="margin-bottom:10px;" class="noprint"',
'<div class="ficheprevnext',
'<div class="bar noprint',
'<div class="toolbar noprint',
'<div class="addthis_toolbox',
'<div class="noprint',
'<div class="bg bglight border border-full noprint',
'<div class="httplogbar-wrapper noprint',
'<div id="forumcomments',
'<div ng-if="active"'
) as $div_start) {
$contents = stripRecursiveHTMLSection($contents, 'div', $div_start);
foreach($contents->find('img') as $img) {
if(!empty($img->getAttribute('data-src'))) {
$img->src = $img->getAttribute('data-src');
}
}

foreach($contents->find('a.tooltip-link') as $a) {
$a->outertext = $a->plaintext;
}

foreach(array(
'clear',
'sharebar2',
'diaporamafullscreen',
'module.social-button',
'module.social-share',
'ficheprevnext',
'addthis_toolbox',
'noprint',
'hubbottom',
'hubbottom2'
) as $div_class_remove) {
foreach($contents->find('div.' . $div_class_remove) as $div) {
$keep_div = false;
foreach(array(
'didyouknow'
) as $div_class_dont_remove) {
if(strpos($div->getAttribute('class'), $div_class_dont_remove) !== false) {
$keep_div = true;
}
}
if(!$keep_div) {
$div->outertext = '';
}
}
}

$contents = $contents->innertext;

$contents = stripWithDelimiters($contents, '<hr ', '/>');
$contents = stripWithDelimiters($contents, '<p class="content-date', '</p>');
$contents = stripWithDelimiters($contents, '<h1 class="content-title', '</h1>');
Expand All @@ -131,7 +147,7 @@ private function extractArticleContent($article){
$contents = stripWithDelimiters($contents, '<script ', '</script>');
$contents = stripWithDelimiters($contents, '<script>', '</script>');

return $headline . trim($contents);
return trim($contents);
}

// Extracts the author from an article or element
Expand Down

0 comments on commit 547829f

Please sign in to comment.