Skip to content
This repository has been archived by the owner on Feb 4, 2022. It is now read-only.

Commit

Permalink
#152 - Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom Janssens committed Apr 28, 2015
1 parent 84a175f commit f9ca357
Show file tree
Hide file tree
Showing 21 changed files with 1,050 additions and 0 deletions.
15 changes: 15 additions & 0 deletions application/admin/component/links/controller/link.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?php
/**
* Belgian Police Web Platform - Links Component
*
* @copyright Copyright (C) 2012 - 2013 Timble CVBA. (http://www.timble.net)
* @license GNU GPLv3 <http://www.gnu.org/licenses/gpl.html>
* @link https://github.com/belgianpolice/internet-platform
*/

use Nooku\Library;

/**
 * Link controller of the admin Links component.
 *
 * Declares no members of its own; all behaviour is inherited from Library\ControllerModel.
 */
class LinksControllerLink extends Library\ControllerModel
{

}
15 changes: 15 additions & 0 deletions application/admin/component/links/controller/permission/link.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?php
/**
* Belgian Police Web Platform - Links Component
*
* @copyright Copyright (C) 2012 - 2013 Timble CVBA. (http://www.timble.net)
* @license GNU GPLv3 <http://www.gnu.org/licenses/gpl.html>
* @link https://github.com/belgianpolice/internet-platform
*/

use Nooku\Library;

/**
 * Permission class for the link controller of the admin Links component.
 *
 * Declares no members of its own; all behaviour is inherited from
 * ApplicationControllerPermissionAbstract.
 */
class LinksControllerPermissionLink extends ApplicationControllerPermissionAbstract
{

}
29 changes: 29 additions & 0 deletions application/admin/component/links/view/link/html.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<?php
/**
* Belgian Police Web Platform - Links Component
*
* @copyright Copyright (C) 2012 - 2013 Timble CVBA. (http://www.timble.net)
* @license GNU GPLv3 <http://www.gnu.org/licenses/gpl.html>
* @link https://github.com/belgianpolice/internet-platform
*/

use Nooku\Library;

/**
 * Link HTML View
 *
 * @author Tom Janssens <http://nooku.assembla.com/profile/tomjanssens>
 * @package Component\Links
 */
class LinksViewLinkHtml extends Library\ViewHtml
{
    /**
     * Assign the relation rows of the current link to the view, then render it.
     *
     * The rowset is stored on the view as "childs", filtered on the id of the
     * row returned by the view's model.
     *
     * @return mixed Whatever the parent render() returns.
     */
    public function render()
    {
        $row = $this->getModel()->getRow();

        $relations = $this->getObject('com:links.model.relations');

        $this->childs = $relations->links_link_id($row->id)->getRowset();

        return parent::render();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
<?
/**
* Belgian Police Web Platform - Links Component
*
* @copyright Copyright (C) 2012 - 2013 Timble CVBA. (http://www.timble.net)
* @license GNU GPLv3 <http://www.gnu.org/licenses/gpl.html>
* @link https://github.com/belgianpolice/internet-platform
*/
?>

<script src="assets://js/koowa.js" />

<ktml:module position="actionbar">
    <ktml:toolbar type="actionbar">
</ktml:module>

<? /* Detail form for a single link: title and URL are shown in disabled inputs, followed by a table of the rows in $childs. */ ?>
<form action="" method="post" id="link-form" class="-koowa-form">
    <div class="main">
        <div class="title">
            <input disabled class="required" type="text" name="title" maxlength="255" value="<?= escape($link->title) ?>" placeholder="<?= translate('Title') ?>" />
            <div class="slug">
                <span class="add-on">URL</span>
                <input disabled type="text" name="slug" maxlength="255" value="<?= escape($link->url) ?>" />
            </div>
        </div>

        <div class="scrollable">
            <fieldset>
                <legend><?= translate('Mentioned on') ?></legend>
                <table class="table table--striped">
                    <thead>
                        <tr>
                            <th><?= translate('Title') ?></th>
                            <th><?= translate('Status') ?></th>
                        </tr>
                    </thead>
                    <tbody>
                    <? /* Child values are escaped before output, like the link fields above, so markup in them cannot reach the page. */ ?>
                    <? foreach($childs AS $child) : ?>
                        <tr>
                            <td><?= escape($child->child_title) ?><br />
                                <? /* rel="noopener noreferrer" keeps the opened page from reaching back through window.opener. */ ?>
                                <small><a target="_blank" rel="noopener noreferrer" href="<?= escape($child->child_url) ?>"><?= escape($child->child_url) ?></a></small></td>
                            <td><?= escape($child->child_status) ?></td>
                        </tr>
                    <? endforeach ?>
                    </tbody>
                </table>
            </fieldset>
        </div>
    </div>
</form>
6 changes: 6 additions & 0 deletions application/admin/component/links/view/links/metadata.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- View metadata: declares the title "Links" for this view; the message is empty. -->
<metadata>
<view title="Links">
<message><![CDATA[]]></message>
</view>
</metadata>
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
<?
/**
* Belgian Police Web Platform - Links Component
*
* @copyright Copyright (C) 2012 - 2013 Timble CVBA. (http://www.timble.net)
* @license GNU GPLv3 <http://www.gnu.org/licenses/gpl.html>
* @link https://github.com/belgianpolice/internet-platform
*/
?>

<script src="assets://js/koowa.js" />
<style src="assets://css/koowa.css" />

<? /* Sortable, paginated grid of links. Each row links to the detail view of that link. */ ?>
<form action="" method="get" class="-koowa-grid">
    <table>
        <thead>
            <tr>
                <th width="100%">
                    <?= helper('grid.sort', array('column' => 'url', 'title' => 'Link')) ?>
                </th>
                <th>
                    <?= helper('grid.sort', array('column' => 'links')) ?>
                </th>
                <th>
                    <?= helper('grid.sort', array('column' => 'status')) ?>
                </th>
                <th>
                    <?= helper('grid.sort', array('column' => 'last_crawled_on', 'title' => 'Last crawled on')) ?>
                </th>
                <th>
                    <?= helper('grid.sort', array('column' => 'last_checked_on', 'title' => 'Last checked on')) ?>
                </th>
            </tr>
        </thead>
        <tfoot>
            <tr>
                <? /* The grid has five columns, so the pager cell spans five. */ ?>
                <td colspan="5">
                    <?= helper('com:application.paginator.pagination', array('total' => $total)) ?>
                </td>
            </tr>
        </tfoot>
        <tbody>
        <? foreach ($links as $link) : ?>
            <tr>
                <td class="ellipsis" style="padding: 8px 10px">
                    <? /* Title, URL and status are escaped before output so markup in them cannot reach the page. */ ?>
                    <a href="<?= route('view=link&id='.$link->id); ?>"><?= escape($link->title) ?><br /><small><?= escape($link->url) ?></small></a>
                </td>
                <td>
                    <?= $link->links ?>
                </td>
                <td>
                    <?= escape($link->status) ?>
                </td>
                <td>
                    <? /* Falls back to the creation date when the link has no last_crawled_on value. */ ?>
                    <?= helper('date.format', array('date'=> $link->last_crawled_on ? $link->last_crawled_on : $link->created_on, 'format' => translate('D d.m.Y - G:i'))) ?>
                </td>
                <td>
                    <? if($link->last_checked_on) : ?>
                        <?= helper('date.format', array('date'=> $link->last_checked_on, 'format' => translate('D d.m.Y - G:i'))) ?>
                    <? endif ?>
                </td>
            </tr>
        <? endforeach; ?>
        </tbody>
    </table>
</form>
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Layout metadata: declares the title "Default layout" for this layout; the message is empty. -->
<metadata>
<layout title="Default layout">
<message>
<![CDATA[]]>
</message>
</layout>
</metadata>
155 changes: 155 additions & 0 deletions application/manager/component/links/controller/Crawler.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
<?php
/**
* @author Oscar Casajuana a.k.a. elboletaire <elboletaire {at} underave {dot} net>
*/
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/

/**
 * Small recursive link crawler.
 *
 * Starting from a URL, loads each page with DOMDocument, records the page URL
 * in the result list and follows its <a href> targets until the configured
 * depth is exhausted. Optionally restricts the crawl to the start URL's host.
 *
 * Results and the set of visited URLs are kept per instance, so repeated
 * crawl() calls on one instance accumulate results without revisiting pages,
 * while separate instances do not influence each other.
 */
class Crawler
{
    /** @var int Number of link levels to follow; values <= 0 crawl nothing. */
    private $depth = 2;

    /** @var string|null Start URL of the crawl. */
    private $url;

    /** @var array List of array('url' => string) entries, one per visited page. */
    private $results = array();

    /** @var bool When true, URLs on another host than $host are skipped. */
    private $same_host = false;

    /** @var string|null Host (as extracted by getHostFromUrl) used for the same-host check. */
    private $host;

    /** @var array Map of already visited URL => true, scoped to this instance. */
    private $seen = array();

    public function setDepth($depth) { $this->depth = $depth; }
    public function setHost($host) { $this->host = $host; }
    public function getResults() { return $this->results; }
    public function setSameHost($same_host) { $this->same_host = $same_host; }

    /**
     * Set the start URL and derive the host used for the same-host check from it.
     *
     * @param string $url
     */
    public function setUrl($url)
    {
        $this->url = $url;
        $this->setHost($this->getHostFromUrl($url));
    }

    /**
     * @param string|null $url       Start URL; may also be set later through setUrl().
     * @param int|null    $depth     Crawl depth; null keeps the default.
     * @param bool        $same_host Restrict the crawl to the start URL's host.
     */
    public function __construct($url = null, $depth = null, $same_host = false)
    {
        if (!empty($url)) {
            $this->setUrl($url);
        }
        if ($depth !== null) {
            $this->setDepth($depth);
        }
        $this->setSameHost($same_host);
    }

    /**
     * Run the crawl from the configured start URL.
     *
     * @return array The accumulated results, see getResults().
     * @throws Exception When no start URL has been set.
     */
    public function crawl()
    {
        if (empty($this->url)) {
            throw new Exception('URL must be set');
        }
        // Normalise the depth once so numeric strings behave like integers.
        $this->_crawl($this->url, (int) $this->depth);
        return $this->results;
    }

    /**
     * Visit one URL and recurse into the links found on it.
     *
     * @param string $url   Absolute or relative URL to visit.
     * @param int    $depth Remaining levels; nothing is loaded when <= 0.
     * @return string|null The normalised URL that was visited, null when skipped.
     */
    private function _crawl($url, $depth)
    {
        // "<= 0" rather than "=== 0": a negative depth must stop the crawl too.
        if (empty($url) || $depth <= 0) {
            return null;
        }

        if (!$url = $this->buildUrl($this->url, $url)) {
            return null;
        }

        if (isset($this->seen[$url])) {
            return null;
        }

        $this->seen[$url] = true;

        $dom = new DOMDocument('1.0');

        // Collect markup errors internally while loading, then drop them and
        // restore the caller's libxml setting so the buffer does not keep
        // growing and other libxml users are not affected.
        $previous = libxml_use_internal_errors(true);
        // "@" still hides stream warnings for unreachable pages; such pages are
        // recorded in the results but yield no anchors.
        @$dom->loadHTMLFile($url);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $this->results[] = array(
            'url' => $url,
        );

        foreach ($dom->getElementsByTagName('a') as $element)
        {
            if (!$href = $this->buildUrl($url, $element->getAttribute('href'))) {
                continue;
            }
            $this->_crawl($href, $depth - 1);
        }

        return $url;
    }

    /**
     * Turn an href found on a page into an absolute, normalised http(s) URL.
     *
     * Absolute http(s) URLs are kept. Protocol-relative URLs ("//host/path")
     * take the scheme of $url. "./"-prefixed hrefs are resolved against the
     * directory of $url; every other relative href is resolved against the
     * site root of $url. A trailing slash is stripped from the result.
     *
     * @param string $url  URL of the page the href was found on.
     * @param string $href Raw href value.
     * @return string|false The absolute URL, or false when the href is empty,
     *                      a fragment, uses another scheme (mailto:, tel:,
     *                      javascript:, ...), cannot be resolved, or fails the
     *                      same-host restriction.
     */
    private function buildUrl($url, $href)
    {
        $href = trim((string) $href);

        if (!preg_match('@^https?://@i', $href))
        {
            if ($href === '' || $href[0] === '#') {
                return false;
            }

            if (0 === strpos($href, '//'))
            {
                // Protocol-relative: reuse the scheme of the referring page.
                $scheme = parse_url($url, PHP_URL_SCHEME);
                if (!$scheme) {
                    return false;
                }
                $href = $scheme . ':' . $href;
            }
            elseif (preg_match('@^[a-z][a-z0-9+.-]*:@i', $href))
            {
                // Any other explicit scheme is not crawlable over http.
                return false;
            }
            else
            {
                $parts = parse_url($url);
                if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
                    return false;
                }

                $root = $this->buildUrlFromParts($parts);

                // Relative urls... (like ./viewforum.php)
                if (0 === strpos($href, './'))
                {
                    $directory = isset($parts['path']) ? $parts['path'] : '/';

                    // If the path isn't really a path (doesn't end with slash)
                    // drop its last segment to get the containing directory.
                    if ('/' !== substr($directory, -1)) {
                        $segments = explode('/', $directory);
                        array_pop($segments);
                        $directory = implode('/', $segments) . '/';
                    }

                    // Remove only the "./" prefix; ltrim() with './' would also
                    // eat leading dots of the file name itself.
                    $href = $root . $directory . substr($href, 2);
                }
                else
                {
                    $href = $root . '/' . ltrim($href, '/');
                }
            }
        }

        $href = rtrim($href, '/');

        if ($this->same_host && $this->host != $this->getHostFromUrl($href)) {
            return false;
        }

        return $href;
    }

    /**
     * Build the "scheme://[user:pass@]host[:port]" prefix from parse_url() parts.
     *
     * @param array $parts Result of parse_url(); must contain 'scheme' and 'host'.
     * @return string
     */
    private function buildUrlFromParts($parts)
    {
        $new_href = $parts['scheme'] . '://';
        if (isset($parts['user']) && isset($parts['pass'])) {
            $new_href .= $parts['user'] . ':' . $parts['pass'] . '@';
        }
        $new_href .= $parts['host'];
        if (isset($parts['port'])) {
            $new_href .= ':' . $parts['port'];
        }
        return $new_href;
    }

    /**
     * Extract the trailing "name.tld" (or "name.tld.xx") part of a URL's host.
     *
     * @param string $url
     * @return string|null The matched host part, or null when the URL has no
     *                     host or the host does not match the pattern.
     */
    private function getHostFromUrl($url)
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return null;
        }

        if (!preg_match("@([^/.]+)\.([^.]{2,6}(?:\.[^.]{2,3})?)$@", $host, $matches)) {
            return null;
        }

        return $matches[0];
    }
}
Loading

0 comments on commit f9ca357

Please sign in to comment.