diff --git a/composer.json b/composer.json index d0130874..afe4365f 100644 --- a/composer.json +++ b/composer.json @@ -19,6 +19,7 @@ "ext-curl": "*", "ext-tidy": "*", "fossar/htmlawed": "^1.2.8", + "guzzlehttp/promises": "^1.5", "guzzlehttp/psr7": "^2.0", "j0k3r/graby-site-config": "^1.0.147", "j0k3r/httplug-ssrf-plugin": "^3.0", diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php index c9ccb08d..cc68ca24 100644 --- a/src/Extractor/HttpClient.php +++ b/src/Extractor/HttpClient.php @@ -2,8 +2,11 @@ namespace Graby\Extractor; +use Exception; use Graby\HttpClient\Plugin\History; use Graby\HttpClient\Plugin\ServerSideRequestForgeryProtection\ServerSideRequestForgeryProtectionPlugin; +use GuzzleHttp\Promise\Create; +use GuzzleHttp\Promise\PromiseInterface; use GuzzleHttp\Psr7\Uri; use GuzzleHttp\Psr7\UriResolver; use Http\Client\Common\Exception\LoopException; @@ -78,6 +81,22 @@ public function setLogger(LoggerInterface $logger): void * @return array{effective_url: string, body: string, headers: array, status: int} */ public function fetch(string $url, bool $skipTypeVerification = false, array $httpHeader = []): array + { + return $this->fetchAsync($url, $skipTypeVerification, $httpHeader)->wait(); + } + + /** + * Grab informations from an url asynchronously: + * - final url (after potential redirection) + * - raw content + * - content type header. + * + * @param bool $skipTypeVerification Avoid mime detection which means, force GET instead of potential HEAD + * @param array $httpHeader Custom HTTP Headers from SiteConfig + * + * @return PromiseInterface that resolves to array{effective_url: string, body: string, headers: array, status: int} + */ + public function fetchAsync(string $url, bool $skipTypeVerification = false, array $httpHeader = []): PromiseInterface { $url = $this->cleanupUrl($url); @@ -106,137 +125,165 @@ public function fetch(string $url, bool $skipTypeVerification = false, array $ht $headers['Accept'] = $accept; } - try { - $request = $this->requestFactory->createRequest($method, $url); - foreach ($headers as $name => $value) { - $request = $request->withHeader($name, $value); - } - /** @var ResponseInterface $response */ - $response = $this->client->sendRequest($request); - } catch (LoopException $e) { - $this->logger->info('Endless redirect: ' . ($this->config->getMaxRedirect() + 1) . ' on "{url}"', ['url' => $url]); - - return [ - 'effective_url' => $url, - 'body' => '', - 'headers' => [], - // Too many Redirects - 'status' => 310, - ]; - } catch (TransferException $e) { - if (method_exists($e, 'getRequest')) { - $url = (string) $e->getRequest()->getUri(); - } + $request = $this->requestFactory->createRequest($method, $url); + foreach ($headers as $name => $value) { + $request = $request->withHeader($name, $value); + } + + return self::makePromise($this->client->sendAsyncRequest($request))->then( + function (ResponseInterface $response) use ($url, $httpHeader, $method) { + $effectiveUrl = $url; + if (null !== $this->responseHistory->getLastRequest()) { + $effectiveUrl = (string) $this->responseHistory->getLastRequest()->getUri(); + } - // no response attached to the exception, we won't be able to retrieve content from it - $data = [ - 'effective_url' => $url, - 'body' => '', - 'headers' => [], - 'status' => 500, - ]; - $message = 'Request throw exception (with no response): {error_message}'; - - if (method_exists($e, 'getResponse')) { - // exception has a response which means we might be able to retrieve content from it, log it and continue - $response = $e->getResponse(); $headers = $this->formatHeaders($response); - $data = [ - 'effective_url' => $url, - 'body' => (string) $response->getBody(), - 'headers' => $headers, - 'status' => $response->getStatusCode(), - ]; - $message = 'Request throw exception (with a response): {error_message}'; - } + // if response give us a refresh header it means we need to follow the given url + if (!empty($headers['refresh']) && 1 === preg_match('![0-9];\s*url=["\']?([^"\'>]+)!i', $headers['refresh'], $match)) { + return $this->fetchAsync($match[1], true, $httpHeader); + } - $this->logger->warning($message, ['error_message' => $e->getMessage()]); - $this->logger->info('Data fetched: {data}', ['data' => $data]); + // the response content-type did not match our 'header only' types, + // but we'd issues a HEAD request because we assumed it would. So + // let's queue a proper GET request for this item... + if ('head' === $method && !$this->headerOnlyType($headers)) { + return $this->fetchAsync($effectiveUrl, true, $httpHeader); + } - return $data; - } + $body = (string) $response->getBody(); - $effectiveUrl = $url; - if (null !== $this->responseHistory->getLastRequest()) { - $effectiveUrl = (string) $this->responseHistory->getLastRequest()->getUri(); - } + // be sure to remove conditional comments for IE around the html tag + // we only remove conditional comments until we found the tag + // they usually contains the tag which we try to found and replace the last occurence + // with the whole conditional comments + preg_match('/^\))*/mi', $body, $matchesConditional); - // be sure to remove conditional comments for IE around the html tag - // we only remove conditional comments until we found the tag - // they usually contains the tag which we try to found and replace the last occurence - // with the whole conditional comments - preg_match('/^\))*/mi', $body, $matchesConditional); + // remove utm parameters & fragment + $effectiveUrl = (string) $this->removeTrackersFromUrl(new Uri(str_replace('&', '&', $effectiveUrl))); - if (isset($matchesConditional[0]) && (is_countable($matchesConditional[0]) ? \count($matchesConditional[0]) : 0) > 1) { - foreach ($matchesConditional as $conditionalComment) { - $body = str_replace($conditionalComment, '', $body); - } - } + $this->logger->info('Data fetched: {data}', ['data' => [ + 'effective_url' => $effectiveUrl, + 'body' => '(only length for debug): ' . \strlen($body), + 'headers' => $headers, + 'status' => $response->getStatusCode(), + ]]); - if (null !== $this->extractor) { - $body = $this->extractor->processStringReplacements($body, $effectiveUrl); - } + return [ + 'effective_url' => $effectiveUrl, + 'body' => $body, + 'headers' => $headers, + 'status' => $response->getStatusCode(), + ]; + }, + function (\Exception $e) use ($url) { + if ($e instanceof LoopException) { + $this->logger->info('Endless redirect: ' . ($this->config->getMaxRedirect() + 1) . ' on "{url}"', ['url' => $url]); + + return [ + 'effective_url' => $url, + 'body' => '', + 'headers' => [], + // Too many Redirects + 'status' => 310, + ]; + } elseif ($e instanceof TransferException) { + if (method_exists($e, 'getRequest')) { + $url = (string) $e->getRequest()->getUri(); + } + + // no response attached to the exception, we won't be able to retrieve content from it + $data = [ + 'effective_url' => $url, + 'body' => '', + 'headers' => [], + 'status' => 500, + ]; + $message = 'Request throw exception (with no response): {error_message}'; + + if (method_exists($e, 'getResponse')) { + // exception has a response which means we might be able to retrieve content from it, log it and continue + $response = $e->getResponse(); + $headers = $this->formatHeaders($response); + + $data = [ + 'effective_url' => $url, + 'body' => (string) $response->getBody(), + 'headers' => $headers, + 'status' => $response->getStatusCode(), + ]; + $message = 'Request throw exception (with a response): {error_message}'; + } + + $this->logger->warning($message, ['error_message' => $e->getMessage()]); + $this->logger->info('Data fetched: {data}', ['data' => $data]); + + return $data; + } - // check for - // for AJAX sites, e.g. Blogger with its dynamic views templates. - // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification - if (false === strpos($effectiveUrl, '_escaped_fragment_')) { - $redirectURL = $this->getMetaRefreshURL($effectiveUrl, $body) ?: $this->getUglyURL($effectiveUrl, $body); + throw $e; + } + ); + } - if (false !== $redirectURL) { - return $this->fetch($redirectURL, true, $httpHeader); + /** + * @param mixed $thenable + */ + private static function makePromise($thenable): PromiseInterface + { + if ($thenable instanceof \Http\Client\Promise\HttpFulfilledPromise) { + // Calling then() on HttpFulfilledPromise would create another HttpFulfilledPromise, + // which cannot resolve to anything other then ResponseInterface. + return new \GuzzleHttp\Promise\FulfilledPromise($thenable->wait()); + } elseif ($thenable instanceof \Http\Client\Promise\HttpRejectedPromise) { + // Calling then() on HttpRejectedPromise would create HttpFulfilledPromise, + // which cannot resolve to anything other then ResponseInterface. + try { + // No other way to unpack the Exception than to have wait() throw it. + $thenable->wait(); + } catch (\Exception $exception) { + return new \GuzzleHttp\Promise\RejectedPromise($exception); } } - // remove utm parameters & fragment - $effectiveUrl = (string) $this->removeTrackersFromUrl(new Uri(str_replace('&', '&', $effectiveUrl))); - - $this->logger->info('Data fetched: {data}', ['data' => [ - 'effective_url' => $effectiveUrl, - 'body' => '(only length for debug): ' . \strlen($body), - 'headers' => $headers, - 'status' => $response->getStatusCode(), - ]]); - - return [ - 'effective_url' => $effectiveUrl, - 'body' => $body, - 'headers' => $headers, - 'status' => $response->getStatusCode(), - ]; + return Create::promiseFor($thenable); } /**