Skip to content

Commit

Permalink
HttpClient: Add fetchAsync method
Browse files Browse the repository at this point in the history
The diff is ugly but I just:

- removed the try block
- moved the code after the catch blocks inside the first closure argument of `then`
- changed the catch blocks into `if`s with `instanceof` checks
- moved them to the second closure argument of `then`
- replaced `fetch` with `fetchAsync`

Unfortunately, HTTPlug’s `HttpFulfilledPromise` is broken (it cannot be converted to another promise type and only supports resolving with `ResponseInterface`), so we need the ugly hack in `makePromise` to convert it into a promise object from a full-fledged promise library.
I considered ReactPHP’s promise library first, but those promises are not natively waitable in version 2.0, and using a custom busy-loop wait or a third-party wait (from choval/async) just seemed to block. In the end, I went with Guzzle’s promise library.
  • Loading branch information
jtojnar committed Apr 18, 2023
1 parent 64862a0 commit 5c6ba77
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 108 deletions.
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"ext-curl": "*",
"ext-tidy": "*",
"fossar/htmlawed": "^1.2.8",
"guzzlehttp/promises": "^1.5",
"guzzlehttp/psr7": "^2.0",
"j0k3r/graby-site-config": "^1.0.147",
"j0k3r/httplug-ssrf-plugin": "^3.0",
Expand Down
263 changes: 155 additions & 108 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

namespace Graby\Extractor;

use Exception;
use Graby\HttpClient\Plugin\History;
use Graby\HttpClient\Plugin\ServerSideRequestForgeryProtection\ServerSideRequestForgeryProtectionPlugin;
use GuzzleHttp\Promise\Create;
use GuzzleHttp\Promise\PromiseInterface;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\UriResolver;
use Http\Client\Common\Exception\LoopException;
Expand Down Expand Up @@ -78,6 +81,22 @@ public function setLogger(LoggerInterface $logger): void
* @return array{effective_url: string, body: string, headers: array<string, string>, status: int}
*/
public function fetch(string $url, bool $skipTypeVerification = false, array $httpHeader = []): array
{
    // Synchronous facade: start the asynchronous fetch, then block on the
    // returned promise until it settles and hand back its resolved value.
    $pendingFetch = $this->fetchAsync($url, $skipTypeVerification, $httpHeader);

    return $pendingFetch->wait();
}

/**
* Grab information from a URL asynchronously:
* - final url (after potential redirection)
* - raw content
* - content type header.
*
* @param bool $skipTypeVerification Avoid mime detection which means, force GET instead of potential HEAD
* @param array<string, string> $httpHeader Custom HTTP Headers from SiteConfig
*
* @return PromiseInterface that resolves to array{effective_url: string, body: string, headers: array<string, string>, status: int}
*/
public function fetchAsync(string $url, bool $skipTypeVerification = false, array $httpHeader = []): PromiseInterface
{
$url = $this->cleanupUrl($url);

Expand Down Expand Up @@ -106,137 +125,165 @@ public function fetch(string $url, bool $skipTypeVerification = false, array $ht
$headers['Accept'] = $accept;
}

try {
$request = $this->requestFactory->createRequest($method, $url);
foreach ($headers as $name => $value) {
$request = $request->withHeader($name, $value);
}
/** @var ResponseInterface $response */
$response = $this->client->sendRequest($request);
} catch (LoopException $e) {
$this->logger->info('Endless redirect: ' . ($this->config->getMaxRedirect() + 1) . ' on "{url}"', ['url' => $url]);

return [
'effective_url' => $url,
'body' => '',
'headers' => [],
// Too many Redirects
'status' => 310,
];
} catch (TransferException $e) {
if (method_exists($e, 'getRequest')) {
$url = (string) $e->getRequest()->getUri();
}
$request = $this->requestFactory->createRequest($method, $url);
foreach ($headers as $name => $value) {
$request = $request->withHeader($name, $value);
}

return self::makePromise($this->client->sendAsyncRequest($request))->then(
function (ResponseInterface $response) use ($url, $httpHeader, $method) {
$effectiveUrl = $url;
if (null !== $this->responseHistory->getLastRequest()) {
$effectiveUrl = (string) $this->responseHistory->getLastRequest()->getUri();
}

// no response attached to the exception, we won't be able to retrieve content from it
$data = [
'effective_url' => $url,
'body' => '',
'headers' => [],
'status' => 500,
];
$message = 'Request throw exception (with no response): {error_message}';

if (method_exists($e, 'getResponse')) {
// exception has a response which means we might be able to retrieve content from it, log it and continue
$response = $e->getResponse();
$headers = $this->formatHeaders($response);

$data = [
'effective_url' => $url,
'body' => (string) $response->getBody(),
'headers' => $headers,
'status' => $response->getStatusCode(),
];
$message = 'Request throw exception (with a response): {error_message}';
}
// if response give us a refresh header it means we need to follow the given url
if (!empty($headers['refresh']) && 1 === preg_match('![0-9];\s*url=["\']?([^"\'>]+)!i', $headers['refresh'], $match)) {
return $this->fetchAsync($match[1], true, $httpHeader);
}

$this->logger->warning($message, ['error_message' => $e->getMessage()]);
$this->logger->info('Data fetched: {data}', ['data' => $data]);
// the response content-type did not match our 'header only' types,
// but we'd issues a HEAD request because we assumed it would. So
// let's queue a proper GET request for this item...
if ('head' === $method && !$this->headerOnlyType($headers)) {
return $this->fetchAsync($effectiveUrl, true, $httpHeader);
}

return $data;
}
$body = (string) $response->getBody();

$effectiveUrl = $url;
if (null !== $this->responseHistory->getLastRequest()) {
$effectiveUrl = (string) $this->responseHistory->getLastRequest()->getUri();
}
// be sure to remove conditional comments for IE around the html tag
// we only remove conditional comments until we found the <head> tag
// they usually contains the <html> tag which we try to found and replace the last occurence
// with the whole conditional comments
preg_match('/^\<!--\[if(\X+)\<!\[endif\]--\>(\X+)\<head\>$/mi', $body, $matchesConditional);

$headers = $this->formatHeaders($response);
if (\count($matchesConditional) > 1) {
preg_match_all('/\<html([\sa-z0-9\=\"\"\-:\/\.\#]+)\>$/mi', $matchesConditional[0], $matchesHtml);

// if response give us a refresh header it means we need to follow the given url
if (!empty($headers['refresh']) && 1 === preg_match('![0-9];\s*url=["\']?([^"\'>]+)!i', $headers['refresh'], $match)) {
return $this->fetch($match[1], true, $httpHeader);
}
if (\count($matchesHtml) > 1) {
$htmlTag = end($matchesHtml[0]);

// the response content-type did not match our 'header only' types,
// but we'd issues a HEAD request because we assumed it would. So
// let's queue a proper GET request for this item...
if ('head' === $method && !$this->headerOnlyType($headers)) {
return $this->fetch($effectiveUrl, true, $httpHeader);
}
if (!empty($htmlTag)) {
$body = str_replace($matchesConditional[0], $htmlTag . '<head>', $body);
}
}
}

$body = (string) $response->getBody();
// be sure to remove ALL other conditional comments for IE
// (regex found here: https://stackoverflow.com/a/137831/569101)
preg_match_all('/<!--\[if\s(?:[^<]+|<(?!!\[endif\]-->))*<!\[endif\]-->/mi', $body, $matchesConditional);

// be sure to remove conditional comments for IE around the html tag
// we only remove conditional comments until we found the <head> tag
// they usually contains the <html> tag which we try to found and replace the last occurence
// with the whole conditional comments
preg_match('/^\<!--\[if(\X+)\<!\[endif\]--\>(\X+)\<head\>$/mi', $body, $matchesConditional);
if (isset($matchesConditional[0]) && (is_countable($matchesConditional[0]) ? \count($matchesConditional[0]) : 0) > 1) {
foreach ($matchesConditional as $conditionalComment) {
$body = str_replace($conditionalComment, '', $body);
}
}

if (\count($matchesConditional) > 1) {
preg_match_all('/\<html([\sa-z0-9\=\"\"\-:\/\.\#]+)\>$/mi', $matchesConditional[0], $matchesHtml);
if (null !== $this->extractor) {
$body = $this->extractor->processStringReplacements($body, $effectiveUrl);
}

if (\count($matchesHtml) > 1) {
$htmlTag = end($matchesHtml[0]);
// check for <meta name='fragment' content='!'/>
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (false === strpos($effectiveUrl, '_escaped_fragment_')) {
$redirectURL = $this->getMetaRefreshURL($effectiveUrl, $body) ?: $this->getUglyURL($effectiveUrl, $body);

if (!empty($htmlTag)) {
$body = str_replace($matchesConditional[0], $htmlTag . '<head>', $body);
if (false !== $redirectURL) {
return $this->fetchAsync($redirectURL, true, $httpHeader);
}
}
}
}

// be sure to remove ALL other conditional comments for IE
// (regex found here: https://stackoverflow.com/a/137831/569101)
preg_match_all('/<!--\[if\s(?:[^<]+|<(?!!\[endif\]-->))*<!\[endif\]-->/mi', $body, $matchesConditional);
// remove utm parameters & fragment
$effectiveUrl = (string) $this->removeTrackersFromUrl(new Uri(str_replace('&amp;', '&', $effectiveUrl)));

if (isset($matchesConditional[0]) && (is_countable($matchesConditional[0]) ? \count($matchesConditional[0]) : 0) > 1) {
foreach ($matchesConditional as $conditionalComment) {
$body = str_replace($conditionalComment, '', $body);
}
}
$this->logger->info('Data fetched: {data}', ['data' => [
'effective_url' => $effectiveUrl,
'body' => '(only length for debug): ' . \strlen($body),
'headers' => $headers,
'status' => $response->getStatusCode(),
]]);

if (null !== $this->extractor) {
$body = $this->extractor->processStringReplacements($body, $effectiveUrl);
}
return [
'effective_url' => $effectiveUrl,
'body' => $body,
'headers' => $headers,
'status' => $response->getStatusCode(),
];
},
function (\Exception $e) use ($url) {
if ($e instanceof LoopException) {
$this->logger->info('Endless redirect: ' . ($this->config->getMaxRedirect() + 1) . ' on "{url}"', ['url' => $url]);

return [
'effective_url' => $url,
'body' => '',
'headers' => [],
// Too many Redirects
'status' => 310,
];
} elseif ($e instanceof TransferException) {
if (method_exists($e, 'getRequest')) {
$url = (string) $e->getRequest()->getUri();
}

// no response attached to the exception, we won't be able to retrieve content from it
$data = [
'effective_url' => $url,
'body' => '',
'headers' => [],
'status' => 500,
];
$message = 'Request throw exception (with no response): {error_message}';

if (method_exists($e, 'getResponse')) {
// exception has a response which means we might be able to retrieve content from it, log it and continue
$response = $e->getResponse();
$headers = $this->formatHeaders($response);

$data = [
'effective_url' => $url,
'body' => (string) $response->getBody(),
'headers' => $headers,
'status' => $response->getStatusCode(),
];
$message = 'Request throw exception (with a response): {error_message}';
}

$this->logger->warning($message, ['error_message' => $e->getMessage()]);
$this->logger->info('Data fetched: {data}', ['data' => $data]);

return $data;
}

// check for <meta name='fragment' content='!'/>
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (false === strpos($effectiveUrl, '_escaped_fragment_')) {
$redirectURL = $this->getMetaRefreshURL($effectiveUrl, $body) ?: $this->getUglyURL($effectiveUrl, $body);
throw $e;
}
);
}

if (false !== $redirectURL) {
return $this->fetch($redirectURL, true, $httpHeader);
/**
* @param mixed $thenable
*/
private static function makePromise($thenable): PromiseInterface
{
if ($thenable instanceof \Http\Client\Promise\HttpFulfilledPromise) {
// Calling then() on HttpFulfilledPromise would create another HttpFulfilledPromise,
// which cannot resolve to anything other then ResponseInterface.
return new \GuzzleHttp\Promise\FulfilledPromise($thenable->wait());
} elseif ($thenable instanceof \Http\Client\Promise\HttpRejectedPromise) {
// Calling then() on HttpRejectedPromise would create HttpFulfilledPromise,
// which cannot resolve to anything other then ResponseInterface.
try {
// No other way to unpack the Exception than to have wait() throw it.
$thenable->wait();
} catch (\Exception $exception) {
return new \GuzzleHttp\Promise\RejectedPromise($exception);
}
}

// remove utm parameters & fragment
$effectiveUrl = (string) $this->removeTrackersFromUrl(new Uri(str_replace('&amp;', '&', $effectiveUrl)));

$this->logger->info('Data fetched: {data}', ['data' => [
'effective_url' => $effectiveUrl,
'body' => '(only length for debug): ' . \strlen($body),
'headers' => $headers,
'status' => $response->getStatusCode(),
]]);

return [
'effective_url' => $effectiveUrl,
'body' => $body,
'headers' => $headers,
'status' => $response->getStatusCode(),
];
return Create::promiseFor($thenable);
}

/**
Expand Down

0 comments on commit 5c6ba77

Please sign in to comment.