Skip to content

Commit

Permalink
fix: non-english char encoding, close #454, close #466 (#496)
Browse files Browse the repository at this point in the history
* fix: non-english char encoding

* fix: optional chaining not supported

* fix: various tests

not finished yet

* fix: tests now pass

* fix: testing and code complexity

* fix: code climate

* fix: change comments

* fix: simplify code complexity

* refactor: simplify logic and add more testing

* fix: reduce cognitive complexity

* fix: more cognitive complexity

* chore: damn space

* fix: broke a test

* chore: update docs

* test: re-enable utf8 test

* chore: update request test
  • Loading branch information
phawxby authored Jun 23, 2022
1 parent 3bd2555 commit e8c6f9d
Show file tree
Hide file tree
Showing 11 changed files with 256 additions and 42 deletions.
11 changes: 8 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,8 +326,12 @@ Parameters - object which includes:

Should return resolved `Promise` if resource should be saved or rejected with Error `Promise` if it should be skipped.
Promise should be resolved with:
* `string` which contains response body
* or object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result.
* the `response` object with the `body` modified in place as necessary.
* or object with properties
* `body` (response body, string)
* `encoding` (`binary` or `utf8`) used to save the file, binary used by default.
* `metadata` (object) - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result.
* a binary `string`. This is advised against because of the binary assumption being made can foul up saving of `utf8` responses to the filesystem.

If multiple actions `afterResponse` added - scraper will use result from last one.
```javascript
Expand All @@ -342,7 +346,8 @@ registerAction('afterResponse', ({response}) => {
metadata: {
headers: response.headers,
someOtherData: [ 1, 2, 3 ]
}
},
encoding: 'utf8'
}
}
});
Expand Down
2 changes: 1 addition & 1 deletion lib/config/defaults.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ const config = {
],
request: {
throwHttpErrors: false,
encoding: 'binary',
responseType: 'buffer',
//cookieJar: true,
decompress: true,
headers: {
Expand Down
2 changes: 1 addition & 1 deletion lib/plugins/save-resource-to-fs-plugin.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class SaveResourceToFileSystemPlugin {
registerAction('saveResource', async ({resource}) => {
const filename = path.join(absoluteDirectoryPath, resource.getFilename());
const text = resource.getText();
await fs.outputFile(filename, text, { encoding: 'binary' });
await fs.outputFile(filename, text, { encoding: resource.getEncoding() });
loadedResources.push(resource);
});

Expand Down
87 changes: 68 additions & 19 deletions lib/request.js
Original file line number Diff line number Diff line change
@@ -1,32 +1,78 @@
import got from 'got';
import logger from './logger.js';
import { extend, isPlainObject } from './utils/index.js';
import { extend } from './utils/index.js';

function getMimeType (contentType) {
return contentType ? contentType.split(';')[0] : null;
}

function defaultResponseHandler ({response}) {
return Promise.resolve(response.body);
return Promise.resolve(response);
}

function extractEncodingFromHeader (headers) {
const contentTypeHeader = headers['content-type'];

return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary';
}

function getEncoding (response) {
if (response && typeof response === 'object') {
if (response.headers && typeof response.headers === 'object') {
return extractEncodingFromHeader(response.headers);
} else if (response.encoding) {
return response.encoding;
}
}

return 'binary';
}

function throwTypeError (result) {
let type = typeof result;

if (result instanceof Error) {
throw result;
} else if (type === 'object' && Array.isArray(result)) {
type = 'array';
}

throw new Error(`Wrong response handler result. Expected string or object, but received ${type}`);
}

function getData (result) {
let data = result;
if (result && typeof result === 'object' && 'body' in result) {
data = result.body;
}

return data;
}

function transformResult (result) {
switch (true) {
case typeof result === 'string':
return {
body: result,
metadata: null
};
case isPlainObject(result):
return {
body: result.body,
metadata: result.metadata || null
};
case result === null:
return null;
default:
throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
const encoding = getEncoding(result);
const data = getData(result);

// Check for no data
if (data === null || data === undefined) {
return null;
}

// Then stringify it.
let body = null;
if (data instanceof Buffer) {
body = data.toString(encoding);
} else if (typeof data === 'string') {
body = data;
} else {
throwTypeError(result);
}

return {
body,
encoding,
metadata: result.metadata || data.metadata || null
};
}

async function getRequest ({url, referer, options = {}, afterResponse = defaultResponseHandler}) {
Expand All @@ -50,10 +96,13 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR
url: response.url,
mimeType: getMimeType(response.headers['content-type']),
body: responseHandlerResult.body,
metadata: responseHandlerResult.metadata
metadata: responseHandlerResult.metadata,
encoding: responseHandlerResult.encoding
};
}

export default {
get: getRequest
get: getRequest,
getEncoding,
transformResult
};
9 changes: 9 additions & 0 deletions lib/resource.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class Resource {
this.children = [];

this.saved = false;
this.encoding = 'binary';
}

createChild (url, filename) {
Expand Down Expand Up @@ -69,6 +70,14 @@ class Resource {
return this.type;
}

setEncoding (encoding) {
this.encoding = encoding;
}

getEncoding () {
return this.encoding;
}

isHtml () {
return this.getType() === types.html;
}
Expand Down
1 change: 1 addition & 0 deletions lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ class Scraper {
self.requestedResourcePromises.set(responseData.url, requestPromise);
}

resource.setEncoding(responseData.encoding);
resource.setType(getTypeByMime(responseData.mimeType));

const { filename } = await self.runActions('generateFilename', { resource, responseData });
Expand Down
28 changes: 14 additions & 14 deletions test/functional/encoding/hieroglyphs.test.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import '../../utils/assertions.js';
import nock from 'nock';
import fs from 'fs-extra';
import fs from 'fs/promises';
import scrape from 'website-scraper';

const testDirname = './test/functional/encoding/.tmp';
const mockDirname = './test/functional/encoding/mocks';

// TODO: enable test when encoding issue is fixed
xdescribe('Functional: Korean characters are properly encoded/decoded', function() {
describe('Functional: UTF8 characters are properly encoded/decoded', () => {
const options = {
urls: [
'http://example.com/',
Expand All @@ -16,27 +15,28 @@ xdescribe('Functional: Korean characters are properly encoded/decoded', function
ignoreErrors: false
};

beforeEach(function() {
beforeEach(() => {
nock.cleanAll();
nock.disableNetConnect();
});

afterEach(function() {
afterEach(async () => {
nock.cleanAll();
nock.enableNetConnect();
fs.removeSync(testDirname);
await fs.rm(testDirname, { recursive: true, force: true });
});

beforeEach(() => {
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'});
});

it('should save the page in the same data as it was originally', () => {
return scrape(options).then(function(result) {
const scrapedIndex = fs.readFileSync(testDirname + '/index.html').toString();
scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
});
it('should save the page in the same data as it was originally', async () => {
await scrape(options);

const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' });
scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Обладнання та ПЗ</div>');
});
});
1 change: 1 addition & 0 deletions test/functional/encoding/mocks/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@
<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>
<div id="special-characters-ukrainian">Слава Україні!</div>
<div id="special-characters-chinese">加入网站</div>
<div id="special-characters-ukrainian">Обладнання та ПЗ</div>
</body>
</html>
Loading

0 comments on commit e8c6f9d

Please sign in to comment.