
Commit a5452e4: linux path fix
Parent: ece721d

File tree: 7 files changed, +979 / -972 lines


LICENSE

Lines changed: 674 additions & 674 deletions
Large diffs are not rendered by default.

composer.json

Lines changed: 10 additions & 10 deletions
The old and new sides of this diff are identical in content, so the change affects only whitespace or line endings (consistent with the commit's "linux path fix"). File content:

{
    "name" : "szdk/phpwebcrawler",
    "description": "Browse/Download all pages from websites",
    "keywords" : [ "web", "pages", "crawler", "extractor"],
    "autoload": {
        "psr-4" : {
            "szdk\\PHPWebCrawler\\" : "src/"
        }
    }
}
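The psr-4 entry above maps the szdk\PHPWebCrawler\ namespace prefix to src/, so src/Crawler.php resolves to the szdk\PHPWebCrawler\Crawler class. A minimal sketch of loading it through Composer's autoloader (the URL is a placeholder):

<?php
// Sketch only: Composer's autoloader resolves szdk\PHPWebCrawler\Crawler to
// src/Crawler.php via the psr-4 mapping above; the start URL is a placeholder.
require __DIR__ . '/vendor/autoload.php';

$crawler = new szdk\PHPWebCrawler\Crawler('https://example.com');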

examples/emails.php

Lines changed: 29 additions & 30 deletions
The old and new sides match except that the commented-out line //$crawler->rootDir = 'php-docs'; (old line 29) is removed; the remaining changes are whitespace or line endings. New file content:

<?php
/**
 * example code to extract emails from website
 */
ob_implicit_flush();

require __DIR__ . "/../vendor/autoload.php";

$url = "https://example.com"; //or path to any local html file
$scraper = function ($content) {
    static $counter = 0;
    $counter++;
    echo $counter . '. ' . $content['url'] . " ";
    $content = $content['content'];

    //find emails
    preg_match_all('/[a-z0-9\-\_\.]+\@[a-z0-9\_\-\.]{4,}\.[a-z]{2,4}/i', $content, $matches);

    foreach ($matches[0] as $email) {
        file_put_contents('emails.txt', $email . "\n", FILE_APPEND);
        echo " => " . $email . "\n" ;
    }

    echo "\n";
    flush();
};

$crawler = new szdk\PHPWebCrawler\Crawler($url, true);
$crawler->depth = 0;
$crawler->run($scraper);
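The callback receives an array with 'url' and 'content' keys, as used above. As a hedged variation on the same contract, a scraper that records page titles instead of emails might look like this (titles.txt and the depth value are arbitrary choices for the sketch):

<?php
// Sketch only: same setup as examples/emails.php, but the callback extracts <title>.
require __DIR__ . "/../vendor/autoload.php";

$url = "https://example.com"; // placeholder start URL
$scraper = function ($content) {
    // $content['url'] is the page address, $content['content'] its raw HTML.
    if (preg_match('/<title>(.*?)<\/title>/is', $content['content'], $m)) {
        $title = trim($m[1]);
        file_put_contents('titles.txt', $content['url'] . " => " . $title . "\n", FILE_APPEND);
        echo $content['url'] . " => " . $title . "\n";
    }
};

$crawler = new szdk\PHPWebCrawler\Crawler($url, true);
$crawler->depth = 10; // example limit: stop after 10 pages
$crawler->run($scraper);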

src/Crawler.php

Lines changed: 110 additions & 110 deletions
All 110 lines are removed and re-added with identical content, so this change is also limited to whitespace or line endings. File content:

<?php
namespace szdk\PHPWebCrawler;


class Crawler
{
    use Queue;
    use Links;

    public $url; //url of webpage
    //public $rootDir = null; //top level dir (eg: https://example.com, root/)
    public $depth = 100; //scrap max 100 pages (put 0 for no limit)
    public $onlyChildren = true; //process only children urls?
    public $exceptions = []; //dont search through these urls, or there children directories

    private const REMOVE_ANCHOR = 1;
    private const REMOVE_QUERY = 2;
    private const REMOVE_SCHEME = 4;
    private const REMOVE_FILE_NAME = 8;
    private const LOWERCASE = 16;

    private $localFile = false;
    private $processedCount = 0;
    private $processed = [];
    private $queued = [];
    private $dirs = [
        'temp' => __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'temp' . DIRECTORY_SEPARATOR,
    ];
    private $files = [
        'processed' => __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'temp' . DIRECTORY_SEPARATOR . 'processed.json',
        'queued' => __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'temp' . DIRECTORY_SEPARATOR . 'queued.json',
    ];

    public function __construct(String $url, $clearURLs = false)
    {
        $this->url = \trim($url);
        if (\stripos($this->url, 'http://') !== 0 && \stripos($this->url, 'https://') !== 0) {
            $this->localFile = true;
            $this->url = \str_replace('\\', '/', \realpath($this->url));
        }
        foreach ($this->dirs as $dir) {
            if (!\is_dir($dir)) {
                @\mkdir($dir);
            }
        }
        if ($clearURLs) {
            $this->clearURLs();
        }
    }

    /*
     *the callable function passed must accept a webpage contents in first parameter
     */
    public function run(Callable $fnc)
    {
        $this->queued($this->url);
        $this->url = $this->getTrimmedURL($this->url, self::REMOVE_ANCHOR | self::REMOVE_FILE_NAME);


        while (!empty($this->queued) && ($this->depth === 0 || $this->processedCount < $this->depth)) {
            $url = \array_key_first($this->queued);

            if ($this->isURLProcessed($url)) {
                $this->processed($url);
                continue;
            }

            $this->processed($url);


            if ($this->onlyChildren && !$this->isChildrenURL($url, $this->url)) {
                continue;
            }

            foreach ($this->exceptions as $link) {
                if ($this->isChildrenURL($url, $link));
                continue;
            }


            try {
                $content = \file_get_contents(!$this->localFile ? $url : $this->getTrimmedURL($url, self::REMOVE_ANCHOR | self::REMOVE_QUERY));
            } catch (\Exception $e) {
                //Logger::log($e);
            }

            $fnc(['content' => $content, 'url' => $url]); //run user generated scraping function

            $newLinks = $this->extractLinks($content, $url);
            foreach ($newLinks as $link) {
                if (!$this->isURLProcessed($link) && (!$this->onlyChildren || $this->isChildrenURL($link, $this->url))) {
                    $this->queued(!$this->localFile ? $link : $this->getTrimmedURL($link, self::REMOVE_ANCHOR | self::REMOVE_QUERY));
                }
            }
            $this->processedCount++;
        }
        return true;
    }

    public function clearURLs()
    {
        foreach ($this->files as $file) {
            if (\is_readable($file)) {
                @\unlink($file);
            }
        }
        $this->processed = $this->queued = [];
    }
}
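A minimal usage sketch of the class as committed, assuming the package is installed via Composer; the start URL, depth, and exception path below are placeholder values:

<?php
require __DIR__ . '/vendor/autoload.php';

use szdk\PHPWebCrawler\Crawler;

// Placeholder start URL; passing true clears any previously saved URL state.
$crawler = new Crawler('https://example.com/docs/', true);
$crawler->depth = 25;          // stop after 25 pages (0 = no limit)
$crawler->onlyChildren = true; // stay under the start URL
$crawler->exceptions = ['https://example.com/docs/private/']; // skip this subtree

// run() calls the callback once per fetched page with its URL and raw content.
$crawler->run(function ($content) {
    echo $content['url'] . ' => ' . strlen($content['content']) . " bytes\n";
});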

0 commit comments
