<?php

namespace szdk\PHPWebCrawler;

class Crawler
{
    use Queue;
    use Links;

    public $url; // URL of the webpage to crawl
    //public $rootDir = null; // top-level dir (e.g. https://example.com, root/)
    public $depth = 100; // scrape at most 100 pages (set to 0 for no limit)
    public $onlyChildren = true; // process only child URLs?
    public $exceptions = []; // don't crawl these URLs or their child directories

    private const REMOVE_ANCHOR = 1;
    private const REMOVE_QUERY = 2;
    private const REMOVE_SCHEME = 4;
    private const REMOVE_FILE_NAME = 8;
    private const LOWERCASE = 16;

    private $localFile = false;
    private $processedCount = 0;
    private $processed = [];
    private $queued = [];
    private $dirs = [
        'temp' => __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'temp' . DIRECTORY_SEPARATOR,
    ];
    private $files = [
        'processed' => __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'temp' . DIRECTORY_SEPARATOR . 'processed.json',
        'queued' => __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'temp' . DIRECTORY_SEPARATOR . 'queued.json',
    ];

    public function __construct(string $url, $clearURLs = false)
    {
        $this->url = \trim($url);
        // Anything that is not an http(s) URL is treated as a local file path.
        if (\stripos($this->url, 'http://') !== 0 && \stripos($this->url, 'https://') !== 0) {
            $this->localFile = true;
            $this->url = \str_replace('\\', '/', \realpath($this->url));
        }
        foreach ($this->dirs as $dir) {
            if (!\is_dir($dir)) {
                @\mkdir($dir);
            }
        }
        if ($clearURLs) {
            $this->clearURLs();
        }
    }

    /**
     * The callable passed in must accept, as its first parameter, an array
     * with 'content' (the page body) and 'url' (the page address).
     */
    public function run(callable $fnc)
    {
        $this->queued($this->url);
        $this->url = $this->getTrimmedURL($this->url, self::REMOVE_ANCHOR | self::REMOVE_FILE_NAME);

        while (!empty($this->queued) && ($this->depth === 0 || $this->processedCount < $this->depth)) {
            $url = \array_key_first($this->queued);

            if ($this->isURLProcessed($url)) {
                $this->processed($url); // already handled; drop it from the queue
                continue;
            }

            $this->processed($url);

            if ($this->onlyChildren && !$this->isChildrenURL($url, $this->url)) {
                continue;
            }

            // Skip URLs that fall under any exception prefix; `continue 2`
            // moves on to the next queued URL, not the next exception.
            foreach ($this->exceptions as $link) {
                if ($this->isChildrenURL($url, $link)) {
                    continue 2;
                }
            }

            // file_get_contents() returns false with a warning on failure
            // rather than throwing, so test the result instead of relying
            // on a try/catch that would never fire.
            $content = @\file_get_contents(!$this->localFile ? $url : $this->getTrimmedURL($url, self::REMOVE_ANCHOR | self::REMOVE_QUERY));
            if ($content === false) {
                //Logger::log("failed to fetch $url");
                continue;
            }

            $fnc(['content' => $content, 'url' => $url]); // run the user-supplied scraping callback

            $newLinks = $this->extractLinks($content, $url);
            foreach ($newLinks as $link) {
                if (!$this->isURLProcessed($link) && (!$this->onlyChildren || $this->isChildrenURL($link, $this->url))) {
                    $this->queued(!$this->localFile ? $link : $this->getTrimmedURL($link, self::REMOVE_ANCHOR | self::REMOVE_QUERY));
                }
            }
            $this->processedCount++;
        }
        return true;
    }

    public function clearURLs()
    {
        foreach ($this->files as $file) {
            if (\is_readable($file)) {
                @\unlink($file);
            }
        }
        $this->processed = $this->queued = [];
    }
}
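
A minimal usage sketch. It assumes the Queue and Links traits supply the queued(), processed(), isURLProcessed(), isChildrenURL(), getTrimmedURL(), and extractLinks() methods used above; the autoloader path is hypothetical.

<?php

require __DIR__ . '/vendor/autoload.php'; // hypothetical Composer autoloader

use szdk\PHPWebCrawler\Crawler;

$crawler = new Crawler('https://example.com/', true); // true clears any cached URL lists
$crawler->depth = 10;                                 // stop after 10 pages
$crawler->exceptions = ['https://example.com/private/']; // skip this subtree

$crawler->run(function (array $page) {
    // $page['content'] is the raw HTML, $page['url'] the page address.
    echo $page['url'], ' (', \strlen($page['content']), " bytes)\n";
});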
|