From 3bd25551ae3676e08c6c42b001d922b1e798076b Mon Sep 17 00:00:00 2001 From: Sophia Antipenko Date: Sat, 11 Jun 2022 14:43:32 +0200 Subject: [PATCH] Revert encoding fix #482 (#495) --- lib/config/defaults.js | 3 ++- lib/plugins/save-resource-to-fs-plugin.js | 3 +-- lib/request.js | 23 +++++--------------- lib/scraper.js | 4 ++-- test/functional/encoding/hieroglyphs.test.js | 3 ++- test/unit/scraper-init-test.js | 4 ++-- 6 files changed, 15 insertions(+), 25 deletions(-) diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 8780e83e..1e79fe63 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -48,7 +48,8 @@ const config = { ], request: { throwHttpErrors: false, - responseType: 'buffer', + encoding: 'binary', + //cookieJar: true, decompress: true, headers: { 'user-agent': defaultRequestUserAgent diff --git a/lib/plugins/save-resource-to-fs-plugin.js b/lib/plugins/save-resource-to-fs-plugin.js index dfb5e3ba..0e7be17c 100644 --- a/lib/plugins/save-resource-to-fs-plugin.js +++ b/lib/plugins/save-resource-to-fs-plugin.js @@ -20,8 +20,7 @@ class SaveResourceToFileSystemPlugin { registerAction('saveResource', async ({resource}) => { const filename = path.join(absoluteDirectoryPath, resource.getFilename()); const text = resource.getText(); - const encoding = typeof text === 'string' ? 'utf-8' : 'binary'; - await fs.outputFile(filename, text, { encoding }); + await fs.outputFile(filename, text, { encoding: 'binary' }); loadedResources.push(resource); }); diff --git a/lib/request.js b/lib/request.js index 8d993093..4ea4e76b 100644 --- a/lib/request.js +++ b/lib/request.js @@ -1,24 +1,18 @@ import got from 'got'; import logger from './logger.js'; -import types from './config/resource-types.js'; -import { extend, isPlainObject, getTypeByMime } from './utils/index.js'; - -const TEXT_RESOURCE_TYPES = [types.html, types.css]; +import { extend, isPlainObject } from './utils/index.js'; function getMimeType (contentType) { return contentType ? contentType.split(';')[0] : null; } -function defaultResponseHandler ({response, type}) { - if (TEXT_RESOURCE_TYPES.includes(type)) { - return response.body.toString(); - } - return response.body; +function defaultResponseHandler ({response}) { + return Promise.resolve(response.body); } function transformResult (result) { switch (true) { - case typeof result === 'string' || Buffer.isBuffer(result): + case typeof result === 'string': return { body: result, metadata: null @@ -47,19 +41,14 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR const response = await got(requestOptions); logger.debug(`[request] received response for ${response.url}, statusCode ${response.statusCode}`); - - const mimeType = getMimeType(response.headers['content-type']); - const resourceType = getTypeByMime(mimeType); - - const responseHandlerResult = transformResult(await afterResponse({ response, type: resourceType })); + const responseHandlerResult = transformResult(await afterResponse({response})); if (!responseHandlerResult) { return null; } return { url: response.url, - type: resourceType, - mimeType, + mimeType: getMimeType(response.headers['content-type']), body: responseHandlerResult.body, metadata: responseHandlerResult.metadata }; diff --git a/lib/scraper.js b/lib/scraper.js index 339aa301..040a9cd9 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -13,7 +13,7 @@ import { } from './plugins/index.js'; import * as utils from './utils/index.js'; -const { extend, union, urlsEqual, getTypeByFilename, series } = utils; +const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils; import NormalizedUrlMap from './utils/normalized-url-map.js'; const actionNames = [ @@ -170,7 +170,7 @@ class Scraper { self.requestedResourcePromises.set(responseData.url, requestPromise); } - resource.setType(responseData.type); + resource.setType(getTypeByMime(responseData.mimeType)); const { filename } = await self.runActions('generateFilename', { resource, responseData }); resource.setFilename(filename); diff --git a/test/functional/encoding/hieroglyphs.test.js b/test/functional/encoding/hieroglyphs.test.js index 3081df2b..1ee31973 100644 --- a/test/functional/encoding/hieroglyphs.test.js +++ b/test/functional/encoding/hieroglyphs.test.js @@ -6,7 +6,8 @@ import scrape from 'website-scraper'; const testDirname = './test/functional/encoding/.tmp'; const mockDirname = './test/functional/encoding/mocks'; -describe('Functional: Korean characters are properly encoded/decoded', function() { +// TODO: enable test when encoding issue is fixed +xdescribe('Functional: Korean characters are properly encoded/decoded', function() { const options = { urls: [ 'http://example.com/', diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 080c5976..9612e180 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -121,7 +121,7 @@ describe('Scraper initialization', function () { s.options.request.should.containEql({ throwHttpErrors: false, - responseType: 'buffer', + encoding: 'binary', decompress: true, https: { rejectUnauthorized: false @@ -143,7 +143,7 @@ describe('Scraper initialization', function () { s.options.request.should.eql({ throwHttpErrors: true, - responseType: 'buffer', + encoding: 'binary', decompress: true, https: { rejectUnauthorized: false