From 8cbe82e326a06336393567d4af9bf1b33d721767 Mon Sep 17 00:00:00 2001 From: demiot Date: Wed, 16 Feb 2022 07:20:50 +0100 Subject: [PATCH 1/4] Update regex --- crawler/simple_html_dom.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler/simple_html_dom.php b/crawler/simple_html_dom.php index 2255fda..164f862 100755 --- a/crawler/simple_html_dom.php +++ b/crawler/simple_html_dom.php @@ -686,7 +686,7 @@ protected function parse_selector($selector_string) { // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression. // farther study is required to determine of this should be documented or removed. // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; - $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);} @@ -1382,7 +1382,7 @@ protected function read_tag() return true; } - if (!preg_match("/^[\w-:]+$/", $tag)) { + if (!preg_match("/^[\w\-:]+$/", $tag)) { $node->_[HDOM_INFO_TEXT] = '<' . $tag . 
$this->copy_until('<>'); if ($this->char==='<') { $this->link_nodes($node, false); From 0b789e48939a6665888e77d95672ea8f53d2948e Mon Sep 17 00:00:00 2001 From: demiot Date: Wed, 16 Feb 2022 08:08:09 +0100 Subject: [PATCH 2/4] Simplify stream_context_create Add UnexpectedValueException lign 557 Add brace --- .../libs/PHPCrawlerHTTPRequest.class.php | 268 +++++++++++------- 1 file changed, 159 insertions(+), 109 deletions(-) diff --git a/crawler/PHPCrawl/libs/PHPCrawlerHTTPRequest.class.php b/crawler/PHPCrawl/libs/PHPCrawlerHTTPRequest.class.php index bd6ef1d..9803c8c 100755 --- a/crawler/PHPCrawl/libs/PHPCrawlerHTTPRequest.class.php +++ b/crawler/PHPCrawl/libs/PHPCrawlerHTTPRequest.class.php @@ -1,4 +1,4 @@ -LinkFinder = new PHPCrawlerLinkFinder(); + if (!class_exists("PHPCrawlerLinkFinder")) { + include_once(dirname(__FILE__) . "/PHPCrawlerLinkFinder.class.php"); + } + $this->LinkFinder = new PHPCrawlerLinkFinder(); // Init DNS-cache - if (!class_exists("PHPCrawlerDNSCache")) include_once(dirname(__FILE__)."/PHPCrawlerDNSCache.class.php"); - $this->DNSCache = new PHPCrawlerDNSCache(); + if (!class_exists("PHPCrawlerDNSCache")) { + include_once(dirname(__FILE__) . "/PHPCrawlerDNSCache.class.php"); + } + $this->DNSCache = new PHPCrawlerDNSCache(); // Cookie-Descriptor - if (!class_exists("PHPCrawlerCookieDescriptor")) include_once(dirname(__FILE__)."/PHPCrawlerCookieDescriptor.class.php"); - - // ResponseHeader-class - if (!class_exists("PHPCrawlerResponseHeader")) include_once(dirname(__FILE__)."/PHPCrawlerResponseHeader.class.php"); - - // PHPCrawlerHTTPProtocols-class - if (!class_exists("PHPCrawlerHTTPProtocols")) include_once(dirname(__FILE__)."/Enums/PHPCrawlerHTTPProtocols.class.php"); - } + if (!class_exists("PHPCrawlerCookieDescriptor")) { + include_once(dirname(__FILE__) . "/PHPCrawlerCookieDescriptor.class.php"); + } + + // ResponseHeader-class + if (!class_exists("PHPCrawlerResponseHeader")) { + include_once(dirname(__FILE__) . 
"/PHPCrawlerResponseHeader.class.php"); + } + + // PHPCrawlerHTTPProtocols-class + if (!class_exists("PHPCrawlerHTTPProtocols")) { + include_once(dirname(__FILE__) . "/Enums/PHPCrawlerHTTPProtocols.class.php"); + } + } /** * Sets the URL for the request. @@ -260,9 +270,11 @@ public function clearCookies() */ public function setLinkExtractionTags($tag_array) { - if (!is_array($tag_array)) return false; - - $this->LinkFinder->extract_tags = $tag_array; + if (!is_array($tag_array)) { + return false; + } + + $this->LinkFinder->extract_tags = $tag_array; return true; } @@ -273,9 +285,11 @@ public function setLinkExtractionTags($tag_array) */ public function setFindRedirectURLs($mode) { - if (!is_bool($mode)) return false; - - $this->LinkFinder->find_redirect_urls = $mode; + if (!is_bool($mode)) { + return false; + } + + $this->LinkFinder->find_redirect_urls = $mode; return true; } @@ -322,9 +336,11 @@ public function setBasicAuthentication($username, $password) */ public function enableAggressiveLinkSearch($mode) { - if (!is_bool($mode)) return false; - - $this->LinkFinder->aggressive_search = $mode; + if (!is_bool($mode)) { + return false; + } + + $this->LinkFinder->aggressive_search = $mode; return true; } @@ -409,15 +425,16 @@ public function sendRequest() // Call header-check-callback $ret = 0; - if ($this->header_check_callback_function != null) - $ret = call_user_func($this->header_check_callback_function, $this->lastResponseHeader); - - // Check if content should be received + if ($this->header_check_callback_function != null) { + $ret = call_user_func($this->header_check_callback_function, $this->lastResponseHeader); + } + + // Check if content should be received $receive = $this->decideRecevieContent($this->lastResponseHeader); if ($ret < 0 || $receive == false) { - @fclose($this->socket); + fclose($this->socket); $PageInfo->received = false; $PageInfo->links_found_url_descriptors = $this->LinkFinder->getAllURLs(); // Maybe found a link/redirect in the header 
$PageInfo->meta_attributes = $this->LinkFinder->getAllMetaAttributes(); @@ -440,21 +457,21 @@ public function sendRequest() $PageInfo->error_occured = true; } - @fclose($this->socket); + fclose($this->socket); // Complete ResponseObject $PageInfo->content = $response_content; $PageInfo->source = &$PageInfo->content; $PageInfo->received_completly = $PageInfo->received_completely; - if ($stream_to_file == true) - { - $PageInfo->received_to_file = true; - $PageInfo->content_tmp_file = $this->tmpFile; - } - else $PageInfo->received_to_memory = true; - - $PageInfo->links_found_url_descriptors = $this->LinkFinder->getAllURLs(); + if ($stream_to_file == true) { + $PageInfo->received_to_file = true; + $PageInfo->content_tmp_file = $this->tmpFile; + } else { + $PageInfo->received_to_memory = true; + } + + $PageInfo->links_found_url_descriptors = $this->LinkFinder->getAllURLs(); $PageInfo->meta_attributes = $this->LinkFinder->getAllMetaAttributes(); // Info about received bytes @@ -519,10 +536,13 @@ protected function openSocket(&$error_code, &$error_string) PHPCrawlerBenchmark::start("connecting_server"); // SSL or not? 
- if ($this->url_parts["protocol"] == "https://") $protocol_prefix = "ssl://"; - else $protocol_prefix = ""; - - // If SSL-request, but openssl is not installed + if ($this->url_parts["protocol"] === "https://") { + $protocol_prefix = "ssl://"; + } else { + $protocol_prefix = ""; + } + + // If SSL-request, but openssl is not installed if ($protocol_prefix == "ssl://" && !extension_loaded("openssl")) { $error_code = PHPCrawlerRequestErrors::ERROR_SSL_NOT_SUPPORTED; @@ -543,13 +563,23 @@ protected function openSocket(&$error_code, &$error_string) // If ssl -> perform Server name indication if ($this->url_parts["protocol"] == "https://") { - $context = stream_context_create(array('ssl' => array('SNI_server_name' => $this->url_parts["host"]))); - $this->socket = @stream_socket_client($protocol_prefix.$ip_address.":".$this->url_parts["port"], $error_code, $error_str, - $this->socketConnectTimeout, STREAM_CLIENT_CONNECT, $context); + // $context = stream_context_create(array('ssl' => array('SNI_server_name' => $this->url_parts["host"]))); + $context = stream_context_create([ + 'http' => ['method' => 'GET'], + 'ssl' => [ + 'verify_peer' => false, + 'verify_peer_name' => false + ] + ]); + + $this->socket = stream_socket_client('ssl://'.$ip_address. ":" . $this->url_parts["port"],$error_code,$error_str,$this->socketConnectTimeout, STREAM_CLIENT_CONNECT,$context); + if ($this->socket === false) { + throw new UnexpectedValueException("Failed to connect: $error_str"); + } } else { - $this->socket = @stream_socket_client($protocol_prefix.$ip_address.":".$this->url_parts["port"], $error_code, $error_str, + $this->socket = stream_socket_client($protocol_prefix.$ip_address.":".$this->url_parts["port"], $error_code, $error_str, $this->socketConnectTimeout, STREAM_CLIENT_CONNECT); // NO $context here, memory-leak-bug in php v. 5.3.x!! } } @@ -697,7 +727,7 @@ protected function readResponseHeader(&$error_code, &$error_string) * * @return string The response-content/source. 
May be emtpy if an error ocdured or data was streamed to the tmp-file. */ - protected function readResponseContent($stream_to_file = false, &$error_code, &$error_string, &$document_received_completely) + protected function readResponseContent(&$error_code, &$error_string, &$document_received_completely, $stream_to_file = false) { $this->content_bytes_received = 0; @@ -733,16 +763,17 @@ protected function readResponseContent($stream_to_file = false, &$error_code, &$ // Check if content is gzip-encoded (check only first chunk) if ($gzip_encoded_content === null) { - if (PHPCrawlerUtils::isGzipEncoded($content_chunk)) - $gzip_encoded_content = true; - else - $gzip_encoded_content = false; - } + if (PHPCrawlerUtils::isGzipEncoded($content_chunk)) { + $gzip_encoded_content = true; + } else { + $gzip_encoded_content = false; + } + } // Stream to file or store source in memory if ($stream_to_file == true) { - @fwrite($fp, $content_chunk); + fwrite($fp, $content_chunk); } else { @@ -750,10 +781,11 @@ protected function readResponseContent($stream_to_file = false, &$error_code, &$ } // Decode gzip-encoded content when done with document - if ($document_completed == true && $gzip_encoded_content == true) - $source_complete = $source_portion = PHPCrawlerUtils::decodeGZipContent($source_complete); - - // Find links in portion of the source + if ($document_completed == true && $gzip_encoded_content == true) { + $source_complete = $source_portion = PHPCrawlerUtils::decodeGZipContent($source_complete); + } + + // Find links in portion of the source if (($gzip_encoded_content == false && $stream_to_file == false && strlen($source_portion) >= 200000) || $document_completed == true) { if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) @@ -766,9 +798,11 @@ protected function readResponseContent($stream_to_file = false, &$error_code, &$ } } - if ($stream_to_file == true) @fclose($fp); - - // Stop 
data-transfer-time benchmark + if ($stream_to_file === true) { + fclose($fp); + } + + // Stop data-transfer-time benchmark PHPCrawlerBenchmark::stop("data_transfer_time"); $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time"); @@ -791,9 +825,11 @@ protected function readResponseContentChunk(&$document_completed, &$error_code, if ($this->http_protocol_version == PHPCrawlerHTTPProtocols::HTTP_1_1 && $this->lastResponseHeader->transfer_encoding == "chunked") { // Read size of next chunk - $chunk_line = @fgets($this->socket, 128); - if (trim($chunk_line) == "") $chunk_line = @fgets($this->socket, 128); - $current_chunk_size = hexdec(trim($chunk_line)); + $chunk_line = fgets($this->socket, 128); + if (trim($chunk_line) === "") { + $chunk_line = fgets($this->socket, 128); + } + $current_chunk_size = hexdec(trim($chunk_line)); } else { @@ -812,12 +848,13 @@ protected function readResponseContentChunk(&$document_completed, &$error_code, // Set byte-buffer to bytes in socket-buffer (Fix for SSL-hang-bug #56, thanks to MadEgg!) 
$status = socket_get_status($this->socket); - if ($status["unread_bytes"] > 0) - $read_byte_buffer = $status["unread_bytes"]; - else - $read_byte_buffer = 1024; - - // If chunk will be complete next read -> resize read-buffer to size of remaining chunk + if ($status["unread_bytes"] > 0) { + $read_byte_buffer = $status["unread_bytes"]; + } else { + $read_byte_buffer = 1024; + } + + // If chunk will be complete next read -> resize read-buffer to size of remaining chunk if ($bytes_received + $read_byte_buffer >= $current_chunk_size && $current_chunk_size > 0) { $read_byte_buffer = $current_chunk_size - $bytes_received; @@ -825,7 +862,7 @@ protected function readResponseContentChunk(&$document_completed, &$error_code, } // Read line from socket - $line_read = @fread($this->socket, $read_byte_buffer); + $line_read = fread($this->socket, $read_byte_buffer); $source_chunk .= $line_read; $line_length = strlen($line_read); @@ -885,14 +922,20 @@ protected function buildRequestHeader() $headerlines = array(); // Methode(GET or POST) - if (count($this->post_data) > 0) $request_type = "POST"; - else $request_type = "GET"; - - // HTTP protocol - if ($this->http_protocol_version == PHPCrawlerHTTPProtocols::HTTP_1_1) $http_protocol_verison = "1.1"; - else $http_protocol_verison = "1.0"; - - if ($this->proxy != null) + if (count($this->post_data) > 0) { + $request_type = "POST"; + } else { + $request_type = "GET"; + } + + // HTTP protocol + if ($this->http_protocol_version == PHPCrawlerHTTPProtocols::HTTP_1_1) { + $http_protocol_verison = "1.1"; + } else { + $http_protocol_verison = "1.0"; + } + + if ($this->proxy != null) { // A Proxy needs the full qualified URL in the GET or POST headerline. $headerlines[] = $request_type." ".$this->UrlDescriptor->url_rebuild ." 
HTTP/1.0\r\n"; @@ -905,7 +948,8 @@ protected function buildRequestHeader() $headerlines[] = "Host: ".$this->url_parts["host"]."\r\n"; - $headerlines[] = "User-Agent: ".str_replace("\n", "", $this->userAgentString)."\r\n"; $headerlines[] = "Accept: */*\r\n"; + $headerlines[] = "User-Agent: ".str_replace("\n", "", $this->userAgentString)."\r\n"; + $headerlines[] = "Accept: */*\r\n"; // Request GZIP-content if ($this->request_gzip_content == true) @@ -921,10 +965,11 @@ protected function buildRequestHeader() // Cookies $cookie_header = $this->buildCookieHeader(); - if ($cookie_header != null) - $headerlines[] = $this->buildCookieHeader(); - - // Authentication + if ($cookie_header != null) { + $headerlines[] = $this->buildCookieHeader(); + } + + // Authentication if ($this->url_parts["auth_username"] != "" && $this->url_parts["auth_password"] != "") { $auth_string = base64_encode($this->url_parts["auth_username"].":".$this->url_parts["auth_password"]); @@ -962,9 +1007,9 @@ protected function buildRequestHeader() * Prepares the given HTTP-query-string for the HTTP-request. * * HTTP-query-strings always should be utf8-encoded and urlencoded afterwards. 
- * So "/path/file?test=tatütata" will be converted to "/path/file?test=tat%C3%BCtata": + * So "/path/file?test=tatütata" will be converted to "/path/file?test=tat%C3%BCtata": * - * @param stirng The quetry-string (like "/path/file?test=tatütata") + * @param stirng The quetry-string (like "/path/file?test=tatütata") * @return string */ protected function prepareHTTPRequestQuery($query) @@ -1008,8 +1053,8 @@ protected function buildPostContent() $post_content = ""; // Post-Data - @reset($this->post_data); - while (list($key, $value) = @each($this->post_data)) + reset($this->post_data); + while (list($key, $value) = each($this->post_data)) { $post_content .= "-----------------------------10786153015124\r\n"; $post_content .= "Content-Disposition: form-data; name=\"".$key."\"\r\n\r\n"; @@ -1031,8 +1076,8 @@ protected function buildCookieHeader() { $cookie_string = ""; - @reset($this->cookie_array); - while(list($key, $value) = @each($this->cookie_array)) + reset($this->cookie_array); + while(list($key, $value) = each($this->cookie_array)) { $cookie_string .= "; ".$key."=".$value.""; } @@ -1060,9 +1105,11 @@ protected function decideRecevieContent(PHPCrawlerResponseHeader $responseHeader $content_type = $responseHeader->content_type; // No Content-Type given - if ($content_type == null) return false; - - // Check against the given rules + if ($content_type == null) { + return false; + } + + // Check against the given rules $receive = PHPCrawlerUtils::checkStringAgainstRegexArray($content_type, $this->receive_content_types); return $receive; @@ -1076,15 +1123,19 @@ protected function decideRecevieContent(PHPCrawlerResponseHeader $responseHeader */ protected function decideStreamToFile($response_header) { - if (count($this->receive_to_file_content_types) == 0) return false; - - // Get Content-Type from header + if (count($this->receive_to_file_content_types) === 0) { + return false; + } + + // Get Content-Type from header $content_type = 
PHPCrawlerUtils::getHeaderValue($response_header, "content-type"); // No Content-Type given - if ($content_type == null) return false; - - // Check against the given rules + if ($content_type === null) { + return false; + } + + // Check against the given rules $receive = PHPCrawlerUtils::checkStringAgainstRegexArray($content_type, $this->receive_to_file_content_types); return $receive; @@ -1161,13 +1212,13 @@ public function setTmpFile($tmp_file) */ public function setContentSizeLimit($bytes) { - if (preg_match("#^[0-9]*$#", $bytes)) - { - $this->content_size_limit = $bytes; - return true; + if (preg_match("#^[0-9]*$#", $bytes)) { + $this->content_size_limit = $bytes; + return true; + } else { + return false; + } } - else return false; - } /** * Returns the global traffic this instance of the HTTPRequest-class caused so far. @@ -1205,13 +1256,13 @@ public function addLinkSearchContentType($regex) */ public function setHTTPProtocolVersion($http_protocol_version) { - if (preg_match("#[1-2]#", $http_protocol_version)) - { - $this->http_protocol_version = $http_protocol_version; - return true; + if (preg_match("#[1-2]#", $http_protocol_version)) { + $this->http_protocol_version = $http_protocol_version; + return true; + } else { + return false; + } } - else return false; - } public function requestGzipContent($mode) { @@ -1221,4 +1272,3 @@ public function requestGzipContent($mode) } } } -?> \ No newline at end of file From 53be90e1cf99335c853d2c28ef2d5d44d32fa62d Mon Sep 17 00:00:00 2001 From: demiot Date: Wed, 16 Feb 2022 08:40:28 +0100 Subject: [PATCH 3/4] add brace and remove delimiter php --- .../PHPCrawlerCookieCacheBase.class.php | 1 - .../PHPCrawlerMemoryCookieCache.class.php | 5 +- .../PHPCrawlerSQLiteCookieCache.class.php | 1 - .../Enums/PHPCrawlerAbortReasons.class.php | 1 - .../Enums/PHPCrawlerHTTPProtocols.class.php | 1 - .../PHPCrawlerMultiProcessModes.class.php | 1 - .../libs/PHPCrawlerHTTPRequest.class.php | 2 +- .../PHPCrawlerProcessHandler.class.php | 
11 +++-- .../PHPCrawlerStatusHandler.class.php | 39 +++++++++------ .../PHPCrawlerMemoryURLCache.class.php | 49 +++++++++++-------- .../PHPCrawlerSQLiteURLCache.class.php | 19 +++---- .../UrlCache/PHPCrawlerURLCacheBase.class.php | 16 +++--- 12 files changed, 81 insertions(+), 65 deletions(-) diff --git a/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerCookieCacheBase.class.php b/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerCookieCacheBase.class.php index a122d53..4c99f04 100755 --- a/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerCookieCacheBase.class.php +++ b/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerCookieCacheBase.class.php @@ -34,4 +34,3 @@ abstract public function getCookiesForUrl($target_url); */ abstract public function cleanup(); } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerMemoryCookieCache.class.php b/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerMemoryCookieCache.class.php index 2f55432..d250073 100755 --- a/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerMemoryCookieCache.class.php +++ b/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerMemoryCookieCache.class.php @@ -54,8 +54,8 @@ public function getCookiesForUrl($target_url) $return_cookies = array(); // Iterate over all cookies of this domain - @reset($this->cookies[$target_domain]); - while (list($hash) = @each($this->cookies[$target_domain])) + reset($this->cookies[$target_domain]); + while (list($hash) = each($this->cookies[$target_domain])) { $Cookie = $this->cookies[$target_domain][$hash]; @@ -90,4 +90,3 @@ public function cleanup() $this->cookies = array(); } } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerSQLiteCookieCache.class.php b/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerSQLiteCookieCache.class.php index 5c8f195..a2886d6 100755 --- a/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerSQLiteCookieCache.class.php +++ b/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerSQLiteCookieCache.class.php @@ -166,4 +166,3 @@ public 
function cleanup() unlink($this->sqlite_db_file); } } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/Enums/PHPCrawlerAbortReasons.class.php b/crawler/PHPCrawl/libs/Enums/PHPCrawlerAbortReasons.class.php index da6f37e..f3f491f 100755 --- a/crawler/PHPCrawl/libs/Enums/PHPCrawlerAbortReasons.class.php +++ b/crawler/PHPCrawl/libs/Enums/PHPCrawlerAbortReasons.class.php @@ -34,4 +34,3 @@ class PHPCrawlerAbortReasons */ const ABORTREASON_USERABORT = 4; } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/Enums/PHPCrawlerHTTPProtocols.class.php b/crawler/PHPCrawl/libs/Enums/PHPCrawlerHTTPProtocols.class.php index bc9fe48..cd16cc6 100755 --- a/crawler/PHPCrawl/libs/Enums/PHPCrawlerHTTPProtocols.class.php +++ b/crawler/PHPCrawl/libs/Enums/PHPCrawlerHTTPProtocols.class.php @@ -20,4 +20,3 @@ class PHPCrawlerHTTPProtocols */ const HTTP_1_1 = 2; } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/Enums/PHPCrawlerMultiProcessModes.class.php b/crawler/PHPCrawl/libs/Enums/PHPCrawlerMultiProcessModes.class.php index 0ca6f5a..83b5738 100755 --- a/crawler/PHPCrawl/libs/Enums/PHPCrawlerMultiProcessModes.class.php +++ b/crawler/PHPCrawl/libs/Enums/PHPCrawlerMultiProcessModes.class.php @@ -27,4 +27,3 @@ class PHPCrawlerMultiProcessModes */ const MPMODE_CHILDS_EXECUTES_USERCODE = 2; } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/PHPCrawlerHTTPRequest.class.php b/crawler/PHPCrawl/libs/PHPCrawlerHTTPRequest.class.php index 9803c8c..780b113 100755 --- a/crawler/PHPCrawl/libs/PHPCrawlerHTTPRequest.class.php +++ b/crawler/PHPCrawl/libs/PHPCrawlerHTTPRequest.class.php @@ -786,7 +786,7 @@ protected function readResponseContent(&$error_code, &$error_string, &$document_ } // Find links in portion of the source - if (($gzip_encoded_content == false && $stream_to_file == false && strlen($source_portion) >= 200000) || $document_completed == true) + if (($gzip_encoded_content == false && $stream_to_file == false && 
strlen($source_portion) >= 20000000) || $document_completed == true) { if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) { diff --git a/crawler/PHPCrawl/libs/ProcessCommunication/PHPCrawlerProcessHandler.class.php b/crawler/PHPCrawl/libs/ProcessCommunication/PHPCrawlerProcessHandler.class.php index ce00e70..0413593 100755 --- a/crawler/PHPCrawl/libs/ProcessCommunication/PHPCrawlerProcessHandler.class.php +++ b/crawler/PHPCrawl/libs/ProcessCommunication/PHPCrawlerProcessHandler.class.php @@ -59,9 +59,13 @@ public function getChildPIDs($process_count = null) $ct = file_get_contents($this->working_directory."pids"); $child_pids = preg_split("#\n#", $ct, -1, PREG_SPLIT_NO_EMPTY); - if ($process_count == null) $try = false; - if (count($child_pids) == $process_count) $try = false; - } + if ($process_count === null) { + $try = false; + } + if (count($child_pids) === $process_count) { + $try = false; + } + } usleep(200000); } @@ -103,4 +107,3 @@ public function childProcessAlive() return false; } } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/ProcessCommunication/PHPCrawlerStatusHandler.class.php b/crawler/PHPCrawl/libs/ProcessCommunication/PHPCrawlerStatusHandler.class.php index 875037d..b7c6545 100755 --- a/crawler/PHPCrawl/libs/ProcessCommunication/PHPCrawlerStatusHandler.class.php +++ b/crawler/PHPCrawl/libs/ProcessCommunication/PHPCrawlerStatusHandler.class.php @@ -45,8 +45,10 @@ public function getCrawlerStatus() if ($this->write_status_to_file == true) { $this->crawlerStatus = PHPCrawlerUtils::deserializeFromFile($this->working_directory."crawlerstatus.tmp"); - if ($this->crawlerStatus == null) $this->crawlerStatus = new PHPCrawlerStatus(); - } + if ($this->crawlerStatus == null) { + $this->crawlerStatus = new PHPCrawlerStatus(); + } + } return $this->crawlerStatus; } @@ -97,9 +99,11 @@ public function updateCrawlerStatus($PageInfo, $abort_reason = null, $first_cont 
$crawler_status->links_followed++; // Increase documents_received-counter - if ($PageInfo->received == true) $crawler_status->documents_received++; - - // Increase bytes-counter + if ($PageInfo->received == true) { + $crawler_status->documents_received++; + } + + // Increase bytes-counter $crawler_status->bytes_received += $PageInfo->bytes_received + $PageInfo->header_bytes_received; // Benchmarks @@ -122,15 +126,21 @@ public function updateCrawlerStatus($PageInfo, $abort_reason = null, $first_cont } // Set abortreason - if ($abort_reason !== null) $crawler_status->abort_reason = $abort_reason; - - // Set first_content_url - if ($first_content_url !== null) $crawler_status->first_content_url = $first_content_url; - - // Set last request-time - if ($last_request_time !== null) $crawler_status->last_request_time = $last_request_time; - - // Write crawler-status back + if ($abort_reason !== null) { + $crawler_status->abort_reason = $abort_reason; + } + + // Set first_content_url + if ($first_content_url !== null) { + $crawler_status->first_content_url = $first_content_url; + } + + // Set last request-time + if ($last_request_time !== null) { + $crawler_status->last_request_time = $last_request_time; + } + + // Write crawler-status back $this->setCrawlerStatus($crawler_status); // Remove semaphore/lock @@ -142,4 +152,3 @@ public function updateCrawlerStatus($PageInfo, $abort_reason = null, $first_cont PHPCrawlerBenchmark::stop("updating_crawler_status"); } } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerMemoryURLCache.class.php b/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerMemoryURLCache.class.php index 155d676..2148539 100755 --- a/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerMemoryURLCache.class.php +++ b/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerMemoryURLCache.class.php @@ -21,8 +21,8 @@ public function getNextUrl() $max_pri_lvl = $this->getMaxPriorityLevel(); - @reset($this->urls[$max_pri_lvl]); - while (list($key) = 
@each($this->urls[$max_pri_lvl])) + reset($this->urls[$max_pri_lvl]); + while (list($key) = each($this->urls[$max_pri_lvl])) { $UrlDescriptor_next = $this->urls[$max_pri_lvl][$key]; unset($this->urls[$max_pri_lvl][$key]); @@ -30,9 +30,11 @@ public function getNextUrl() } // If there's no URL in the priority-level-array left -> unset - if (count($this->urls[$max_pri_lvl]) == 0) unset($this->urls[$max_pri_lvl]); - - //PHPCrawlerBenchmark::stop("getting_cached_url"); + if (count($this->urls[$max_pri_lvl]) === 0) { + unset($this->urls[$max_pri_lvl]); + } + + //PHPCrawlerBenchmark::stop("getting_cached_url"); return $UrlDescriptor_next; } @@ -46,8 +48,8 @@ public function getAllURLs() { $URLs = array(); - @reset($this->urls); - while (list($pri_lvl) = @each($this->urls)) + reset($this->urls); + while (list($pri_lvl) = each($this->urls)) { $cnt = count($this->urls[$pri_lvl]); for ($x=0; $x<$cnt; $x++) @@ -76,24 +78,29 @@ public function clear() */ public function addURL(PHPCrawlerURLDescriptor $UrlDescriptor) { - if ($UrlDescriptor == null) return; - - // Hash of the URL + if ($UrlDescriptor === null) { + return; + } + + // Hash of the URL $map_key = $this->getDistinctURLHash($UrlDescriptor); // If URL already in cache -> abort - if($map_key != null && isset($this->url_map[$map_key])) return; - - // Retrieve priority-level + if ($map_key != null && isset($this->url_map[$map_key])) { + return; + } + + // Retrieve priority-level $priority_level = $this->getUrlPriority($UrlDescriptor->url_rebuild); // Add URL to URL-Array $this->urls[$priority_level][] = $UrlDescriptor; // Add URL to URL-Map - if ($this->url_distinct_property != self::URLHASH_NONE) - $this->url_map[$map_key] = true; - } + if ($this->url_distinct_property != self::URLHASH_NONE) { + $this->url_map[$map_key] = true; + } + } /** * Adds an bunch of URLs to the url-cache @@ -123,9 +130,12 @@ public function addURLs($urls) */ public function containsURLs() { - if (count($this->urls) == 0) return false; - else 
return true; - } + if (count($this->urls) == 0) { + return false; + } else { + return true; + } + } /** * Cleans up the cache after is it not needed anymore. @@ -162,4 +172,3 @@ protected function getMaxPriorityLevel() return $defined_priority_levels[0]; } } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerSQLiteURLCache.class.php b/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerSQLiteURLCache.class.php index 3596fe4..2c48984 100755 --- a/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerSQLiteURLCache.class.php +++ b/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerSQLiteURLCache.class.php @@ -106,9 +106,11 @@ public function clear() */ public function addURL(PHPCrawlerURLDescriptor $UrlDescriptor) { - if ($UrlDescriptor == null) return; - - // Hash of the URL + if ($UrlDescriptor == null) { + return; + } + + // Hash of the URL $map_key = md5($UrlDescriptor->url_rebuild); // Get priority of URL @@ -196,12 +198,12 @@ public function containsURLs() PHPCrawlerBenchmark::stop("checking_for_urls_in_cache"); - if ($has_columns != false) - { - return true; + if ($has_columns != false) { + return true; + } else { + return false; + } } - else return false; - } /** * Cleans/purges the URL-cache from inconsistent entries. 
@@ -295,4 +297,3 @@ public function cleanup() unlink($this->sqlite_db_file); } } -?> \ No newline at end of file diff --git a/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerURLCacheBase.class.php b/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerURLCacheBase.class.php index 2e20fdf..60401a8 100755 --- a/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerURLCacheBase.class.php +++ b/crawler/PHPCrawl/libs/UrlCache/PHPCrawlerURLCacheBase.class.php @@ -84,13 +84,14 @@ abstract public function purgeCache(); */ protected function getDistinctURLHash(PHPCrawlerURLDescriptor $UrlDescriptor) { - if ($this->url_distinct_property == self::URLHASH_URL) - return md5($UrlDescriptor->url_rebuild); - elseif ($this->url_distinct_property == self::URLHASH_RAWLINK) - return md5($UrlDescriptor->link_raw); - else - return null; - } + if ($this->url_distinct_property == self::URLHASH_URL) { + return md5($UrlDescriptor->url_rebuild); + } elseif ($this->url_distinct_property == self::URLHASH_RAWLINK) { + return md5($UrlDescriptor->link_raw); + } else { + return null; + } + } /** * Gets the priority-level of the given URL @@ -138,4 +139,3 @@ public function addLinkPriorities($priority_array) } } } -?> \ No newline at end of file From 1ec934d4bc73e1e698d988177d39bd310658e387 Mon Sep 17 00:00:00 2001 From: demiot Date: Wed, 16 Feb 2022 13:53:00 +0100 Subject: [PATCH 4/4] Each deprecated replaced by foreach + encode UTF-8 + Indentation --- .../PHPCrawlerMemoryCookieCache.class.php | 162 +++++++++--------- inc/functions.php | 6 +- 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerMemoryCookieCache.class.php b/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerMemoryCookieCache.class.php index d250073..b7c8117 100755 --- a/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerMemoryCookieCache.class.php +++ b/crawler/PHPCrawl/libs/CookieCache/PHPCrawlerMemoryCookieCache.class.php @@ -5,88 +5,88 @@ * @package phpcrawl * @internal */ -class PHPCrawlerMemoryCookieCache extends 
PHPCrawlerCookieCacheBase -{ - protected $cookies = array(); - - /** - * Adds a cookie to the cookie-cache. - * - * @param PHPCrawlerCookieDescriptor $Cookie The cookie to add. - */ - public function addCookie(PHPCrawlerCookieDescriptor $Cookie) - { - $source_domain = $Cookie->source_domain; - $cookie_domain = $Cookie->domain; - $cookie_path = $Cookie->path; - $cookie_name = $Cookie->name; - - $cookie_hash = md5($cookie_domain."_".$cookie_path."_".$cookie_name); - - $this->cookies[$source_domain][$cookie_hash] = $Cookie; - } - - /** - * Adds a bunch of cookies to the cookie-cache. - * - * @param array $cookies Numeric array conatinin the cookies to add as PHPCrawlerCookieDescriptor-objects - */ - public function addCookies($cookies) - { - for ($x=0; $xaddCookie($cookies[$x]); +class PHPCrawlerMemoryCookieCache extends PHPCrawlerCookieCacheBase { + + protected $cookies = array(); + + /** + * Adds a cookie to the cookie-cache. + * + * @param PHPCrawlerCookieDescriptor $Cookie The cookie to add. + */ + public function addCookie(PHPCrawlerCookieDescriptor $Cookie) { + $source_domain = $Cookie->source_domain; + $cookie_domain = $Cookie->domain; + $cookie_path = $Cookie->path; + $cookie_name = $Cookie->name; + + $cookie_hash = md5($cookie_domain . "_" . $cookie_path . "_" . $cookie_name); + + $this->cookies[$source_domain][$cookie_hash] = $Cookie; } - } - - /** - * Returns all cookies from the cache that are adressed to the given URL - * - * @param string $target_url The target-URL - * @return array Numeric array conatining all matching cookies as PHPCrawlerCookieDescriptor-objects - */ - public function getCookiesForUrl($target_url) - { - $url_parts = PHPCrawlerUtils::splitURL($target_url); - - $target_domain = $url_parts["domain"]; // e.g. 
acme.com - - $return_cookies = array(); - - // Iterate over all cookies of this domain - reset($this->cookies[$target_domain]); - while (list($hash) = each($this->cookies[$target_domain])) - { - $Cookie = $this->cookies[$target_domain][$hash]; - - // Does the cookie-domain match? - // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html: - // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com" - // Seems like ".acme.com" should also match "anvil.acme.com", so just remove the dot - - $Cookie->domain = preg_replace("#^.#", "", $Cookie->domain); - - if ($Cookie->domain == $url_parts["host"] || preg_match("#".preg_quote($Cookie->domain)."$#", $url_parts["host"])) - { - // Does the path match? - if (preg_match("#^".preg_quote($Cookie->path)."#", $url_parts["path"])) - { - $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies + + /** + * Adds a bunch of cookies to the cookie-cache. + * + * @param array $cookies Numeric array conatinin the cookies to add as PHPCrawlerCookieDescriptor-objects + */ + public function addCookies($cookies) { + for ($x = 0; $x < count($cookies); $x++) { + $this->addCookie($cookies[$x]); } - } } - - // Convert to numeric array - $return_cookies = array_values($return_cookies); - - return $return_cookies; - } - - /** - * Cleans up the cache after is it not needed anymore. - */ - public function cleanup() - { - $this->cookies = array(); - } + + /** + * Returns all cookies from the cache that are adressed to the given URL + * + * @param string $target_url The target-URL + * @return array Numeric array conatining all matching cookies as PHPCrawlerCookieDescriptor-objects + */ + public function getCookiesForUrl($target_url) { + $url_parts = PHPCrawlerUtils::splitURL($target_url); + + $target_domain = $url_parts["domain"]; // e.g. 
acme.com + + $return_cookies = array(); + + // Iterate over all cookies of this domain + //$this->cookies[$url_parts['domain']]['domain'] += $url_parts['host']; + if (isset($this->cookies[$target_domain])) { + reset($this->cookies[$target_domain]); + } + if (isset($this->cookies[$target_domain])) { + + // while (list($hash) = each($this->cookies[$target_domain])) { + foreach ($this->cookies[$target_domain] as $hash => $Cookie) { + $Cookie = $this->cookies[$target_domain][$hash]; + + // Does the cookie-domain match? + // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html: + // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com" + // Seems like ".acme.com" should also match "anvil.acme.com", so just remove the dot + + $Cookie->domain = preg_replace("#^\.#", "", $Cookie->domain); + + if ($Cookie->domain == $url_parts["host"] || preg_match("#" . preg_quote($Cookie->domain) . "$#", $url_parts["host"])) { + // Does the path match? + if (preg_match("#^" . preg_quote($Cookie->path) . "#", $url_parts["path"])) { + $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies + } + } + } + } + + // Convert to numeric array + $return_cookies = array_values($return_cookies); + + return $return_cookies; + } + + /** + * Cleans up the cache after is it not needed anymore. 
+ */ + public function cleanup() { + $this->cookies = array(); + } + } diff --git a/inc/functions.php b/inc/functions.php index b66ec4f..3f08f6b 100755 --- a/inc/functions.php +++ b/inc/functions.php @@ -50,7 +50,7 @@ function footer(){ function getResults(){ $q=$GLOBALS['q']; $p=$GLOBALS['p']; - $start=($p-1)*10; + $start=($p-1)*50; if($q!=null){ $starttime = microtime(true); $sql=$GLOBALS['dbh']->prepare("SELECT `title`, `url`, `description` FROM search WHERE `title` LIKE :q OR `url` LIKE :q OR `description` LIKE :q ORDER BY id"); @@ -65,12 +65,12 @@ function getResults(){ $res=array(); $res['count']=$sql->rowCount(); $res['time']=round($duration, 4); - $limitedResults=array_slice($trs, $start, 10); + $limitedResults=array_slice($trs, $start, 50); foreach($limitedResults as $r){ $res["results"][]=array($r['title'], $r['url'], $r['description']); } return $res; } - } + } echo $this->url_parts["host"].$this->url_parts["url"]; } ?>