Skip to content

Commit

Permalink
Fix encoding issue for non-English websites, closes website-scraper#454
Browse files Browse the repository at this point in the history
  • Loading branch information
aivus authored Jun 9, 2022
1 parent d80e9b0 commit 5a58f48
Show file tree
Hide file tree
Showing 19 changed files with 183 additions and 39 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ package-lock.json
npm-debug.log
coverage
test/e2e/results
.nyc-output
3 changes: 1 addition & 2 deletions lib/config/defaults.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ const config = {
],
request: {
throwHttpErrors: false,
encoding: 'binary',
//cookieJar: true,
responseType: 'buffer',
decompress: true,
headers: {
'user-agent': defaultRequestUserAgent
Expand Down
3 changes: 2 additions & 1 deletion lib/plugins/save-resource-to-fs-plugin.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ class SaveResourceToFileSystemPlugin {
registerAction('saveResource', async ({resource}) => {
const filename = path.join(absoluteDirectoryPath, resource.getFilename());
const text = resource.getText();
await fs.outputFile(filename, text, { encoding: 'binary' });
const encoding = typeof text === 'string' ? 'utf-8' : 'binary';
await fs.outputFile(filename, text, { encoding });
loadedResources.push(resource);
});

Expand Down
23 changes: 17 additions & 6 deletions lib/request.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
import got from 'got';
import logger from './logger.js';
import { extend, isPlainObject } from './utils/index.js';
import types from './config/resource-types.js';
import { extend, isPlainObject, getTypeByMime } from './utils/index.js';

const TEXT_RESOURCE_TYPES = [types.html, types.css];

/**
 * Extracts the bare media type from a Content-Type header value.
 *
 * Everything from the first `;` onwards (parameters such as `charset=utf-8`)
 * is dropped, and whitespace around the media type is trimmed so that a header
 * like `text/html ; charset=utf-8` yields `text/html` instead of `text/html `
 * (a trailing space would not compare equal to the plain media type).
 *
 * @param {string|undefined|null} contentType - raw Content-Type header value
 * @returns {string|null} the media type, or null when the header is missing or empty
 */
function getMimeType (contentType) {
	return contentType ? contentType.split(';')[0].trim() : null;
}

function defaultResponseHandler ({response}) {
return Promise.resolve(response.body);
function defaultResponseHandler ({response, type}) {
if (TEXT_RESOURCE_TYPES.includes(type)) {
return response.body.toString();
}
return response.body;
}

function transformResult (result) {
switch (true) {
case typeof result === 'string':
case typeof result === 'string' || Buffer.isBuffer(result):
return {
body: result,
metadata: null
Expand Down Expand Up @@ -41,14 +47,19 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR

const response = await got(requestOptions);
logger.debug(`[request] received response for ${response.url}, statusCode ${response.statusCode}`);
const responseHandlerResult = transformResult(await afterResponse({response}));

const mimeType = getMimeType(response.headers['content-type']);
const resourceType = getTypeByMime(mimeType);

const responseHandlerResult = transformResult(await afterResponse({ response, type: resourceType }));

if (!responseHandlerResult) {
return null;
}
return {
url: response.url,
mimeType: getMimeType(response.headers['content-type']),
type: resourceType,
mimeType,
body: responseHandlerResult.body,
metadata: responseHandlerResult.metadata
};
Expand Down
4 changes: 2 additions & 2 deletions lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import {
} from './plugins/index.js';

import * as utils from './utils/index.js';
const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils;
const { extend, union, urlsEqual, getTypeByFilename, series } = utils;
import NormalizedUrlMap from './utils/normalized-url-map.js';

const actionNames = [
Expand Down Expand Up @@ -170,7 +170,7 @@ class Scraper {
self.requestedResourcePromises.set(responseData.url, requestPromise);
}

resource.setType(getTypeByMime(responseData.mimeType));
resource.setType(responseData.type);

const { filename } = await self.runActions('generateFilename', { resource, responseData });
resource.setFilename(filename);
Expand Down
8 changes: 4 additions & 4 deletions test/functional/base/base.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,15 @@ describe('Functional: base', function() {
nock('http://blog.example.com/').get('/').replyWithFile(200, mockDirname + '/blog.html', {'content-type': 'text/html'});

// mock sources for index.html
nock('http://example.com/').get('/index.css').replyWithFile(200, mockDirname + '/index.css');
nock('http://example.com/').get('/index.css').replyWithFile(200, mockDirname + '/index.css', {'content-type': 'text/css'});
nock('http://example.com/').get('/background.png').reply(200, 'OK');
nock('http://example.com/').get('/cat.jpg').reply(200, 'OK');
nock('http://example.com/').get('/script.min.js').reply(200, 'OK');

// mock sources for index.css
nock('http://example.com/').get('/files/index-import-1.css').reply(200, 'OK');
nock('http://example.com/').get('/files/index-import-2.css').replyWithFile(200, mockDirname + '/index-import-2.css');
nock('http://example.com/').get('/files/index-import-3.css').reply(200, 'OK');
nock('http://example.com/').get('/files/index-import-1.css').reply(200, 'OK', {'content-type': 'text/css'});
nock('http://example.com/').get('/files/index-import-2.css').replyWithFile(200, mockDirname + '/index-import-2.css', {'content-type': 'text/css'});
nock('http://example.com/').get('/files/index-import-3.css').reply(200, 'OK', {'content-type': 'text/css'});
nock('http://example.com/').get('/files/index-image-1.png').reply(200, 'OK');
nock('http://example.com/').get('/files/index-image-2.png').reply(200, 'OK');

Expand Down
69 changes: 69 additions & 0 deletions test/functional/binary-resources/images.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import should from 'should';
import '../../utils/assertions.js';
import nock from 'nock';
import fs from 'fs-extra';
import cheerio from 'cheerio';
import scrape from 'website-scraper';

const testDirname = './test/functional/binary-resources/.tmp';
const mockDirname = './test/functional/binary-resources/mocks';

// Functional suite: images referenced from a scraped page must be downloaded
// into the configured subdirectory and stored with exactly the bytes served.
describe('Functional: images', () => {
	const imageSubdirectory = { directory: 'img', extensions: ['.jpg', '.png'] };
	const imageSource = { selector: 'img', attr: 'src' };

	const options = {
		urls: [ 'http://example.com/' ],
		directory: testDirname,
		subdirectories: [ imageSubdirectory ],
		sources: [ imageSource ],
		ignoreErrors: false
	};

	// Reset interceptors and block real network access before each test.
	beforeEach(() => {
		nock.cleanAll();
		nock.disableNetConnect();
	});

	// Restore network access and remove everything the scraper wrote.
	afterEach(() => {
		nock.cleanAll();
		nock.enableNetConnect();
		fs.removeSync(testDirname);
	});

	// Register the mocked responses: the page itself and the two images it references.
	beforeEach(() => {
		const htmlHeaders = {'content-type': 'text/html'};
		const pngHeaders = {'content-type': 'image/png'};
		const jpgHeaders = {'content-type': 'image/jpeg'};

		// mock base urls
		nock('http://example.com/').get('/').replyWithFile(200, `${mockDirname}/index.html`, htmlHeaders);

		// mock sources for index.html
		nock('http://example.com/').get('/test-image.png').replyWithFile(200, `${mockDirname}/test-image.png`, pngHeaders);
		nock('http://example.com/').get('/test-image.jpg').replyWithFile(200, `${mockDirname}/test-image.jpg`, jpgHeaders);
	});

	it('should load images and save content correctly', async () => {
		await scrape(options);

		const savedPngPath = `${testDirname}/img/test-image.png`;
		const savedJpgPath = `${testDirname}/img/test-image.jpg`;

		// should create directory and subdirectories
		fs.existsSync(testDirname).should.be.eql(true);
		fs.existsSync(`${testDirname}/img`).should.be.eql(true);

		// should contain all sources found in index.html
		fs.existsSync(savedPngPath).should.be.eql(true);
		fs.existsSync(savedJpgPath).should.be.eql(true);

		// all sources in index.html should be replaced with local paths
		const savedIndexHtml = fs.readFileSync(`${testDirname}/index.html`).toString();
		const $ = cheerio.load(savedIndexHtml);
		$('img.png').attr('src').should.be.eql('img/test-image.png');
		$('img.jpg').attr('src').should.be.eql('img/test-image.jpg');

		// content of downloaded images should equal original images
		// (read without an encoding, so the comparison is made on Buffers)
		const originalPng = fs.readFileSync(`${mockDirname}/test-image.png`);
		const originalJpg = fs.readFileSync(`${mockDirname}/test-image.jpg`);
		const resultPng = fs.readFileSync(savedPngPath);
		const resultJpg = fs.readFileSync(savedJpgPath);

		should(resultPng).be.eql(originalPng);
		should(resultJpg).be.eql(originalJpg);
	});
});
11 changes: 11 additions & 0 deletions test/functional/binary-resources/mocks/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Index</title>
</head>
<body>
<img class="jpg" src="/test-image.jpg" />
<img class="png" src="/test-image.png" />
</body>
</html>
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 0 additions & 1 deletion test/functional/callbacks/callbacks.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import sinon from 'sinon';
import scrape from 'website-scraper';

const testDirname = './test/functional/callbacks/.tmp';
const mockDirname = './test/functional/base/mocks';

describe('Functional: onResourceSaved and onResourceError callbacks in plugin', () => {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ describe('Functional circular dependencies', function() {
]
};

nock('http://example.com/').get('/index.html').replyWithFile(200, mockDirname + '/index.html');
nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css');
nock('http://example.com/').get('/style2.css').replyWithFile(200, mockDirname + '/style2.css');
nock('http://example.com/').get('/index.html').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html', {'content-type': 'text/html'});
nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'});
nock('http://example.com/').get('/style2.css').replyWithFile(200, mockDirname + '/style2.css', {'content-type': 'text/css'});

return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
Expand Down
8 changes: 4 additions & 4 deletions test/functional/css-handling/css-handling.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ describe('Functional: css handling', function() {
});

it('should correctly handle css files, style tags and style attributes and ignore css-like text inside common html tags', function() {
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css');
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'});

nock('http://example.com/').get('/style-import-1.css').reply(200, 'style-import-1.css');
nock('http://example.com/').get('/style-import-2.css').reply(200, 'style-import-2.css');
nock('http://example.com/').get('/style-import-1.css').reply(200, 'style-import-1.css', {'content-type': 'text/css'});
nock('http://example.com/').get('/style-import-2.css').reply(200, 'style-import-2.css', {'content-type': 'text/css'});
nock('http://example.com/').get('/style-tag.png').reply(200, 'style-tag.png');
nock('http://example.com/').get('/style-attr.png').reply(200, 'style-attr.png');
nock('http://example.com/').get('/css-like-text-in-html.png').reply(200, 'css-like-text-in-html.png');
Expand Down
41 changes: 41 additions & 0 deletions test/functional/encoding/hieroglyphs.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import '../../utils/assertions.js';
import nock from 'nock';
import fs from 'fs-extra';
import scrape from 'website-scraper';

const testDirname = './test/functional/encoding/.tmp';
const mockDirname = './test/functional/encoding/mocks';

// Functional suite: a page containing non-Latin text (Korean, Ukrainian and
// Chinese samples) must be written to disk with those characters intact.
describe('Functional: Korean characters are properly encoded/decoded', () => {
	const options = {
		urls: [
			'http://example.com/',
		],
		directory: testDirname,
		ignoreErrors: false
	};

	// Reset interceptors and block real network access before each test.
	beforeEach(() => {
		nock.cleanAll();
		nock.disableNetConnect();
	});

	// Restore network access and remove everything the scraper wrote.
	afterEach(() => {
		nock.cleanAll();
		nock.enableNetConnect();
		fs.removeSync(testDirname);
	});

	// Serve the mock page as text/html for the single configured url.
	beforeEach(() => {
		const htmlHeaders = {'content-type': 'text/html'};
		nock('http://example.com/').get('/').replyWithFile(200, `${mockDirname}/index.html`, htmlHeaders);
	});

	it('should save the page in the same data as it was originally', async () => {
		await scrape(options);

		const scrapedIndex = fs.readFileSync(`${testDirname}/index.html`).toString();

		// each fragment must appear verbatim in the saved file
		const expectedFragments = [
			'<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>',
			'<div id="special-characters-ukrainian">Слава Україні!</div>',
			'<div id="special-characters-chinese">加入网站</div>'
		];

		for (const fragment of expectedFragments) {
			scrapedIndex.should.be.containEql(fragment);
		}
	});
});
12 changes: 12 additions & 0 deletions test/functional/encoding/mocks/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test</title>
</head>
<body>
<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>
<div id="special-characters-ukrainian">Слава Україні!</div>
<div id="special-characters-chinese">加入网站</div>
</body>
</html>
4 changes: 2 additions & 2 deletions test/functional/html-entities/html-entities.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ describe('Functional: html entities', function() {
});

it('should decode all html-entities found in html files and not encode entities from css file', function() {
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css');
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'});

// in index.html
// /fonts?family=Myriad&amp;v=2 => /fonts?family=Myriad&v=2
Expand Down
12 changes: 6 additions & 6 deletions test/functional/redirect/redirect.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,16 @@ describe('Functional redirects', function() {
});

it('should follow redirects and save resource once if it has different urls', function() {
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
// true page - ok
nock('http://example.com/').get('/true-page.html').reply(200, '<html><head></head><body>true page 1</body></html>');
nock('http://example.com/').get('/true-page.html').reply(200, '<html><head></head><body>true page 1</body></html>', {'content-type': 'text/html'});
// duplicating page - redirect to true page
nock('http://example.com/').get('/duplicating-page.html').reply(302, '', {'Location': 'http://example.com/true-page.html'});
nock('http://example.com/').get('/true-page.html').reply(200, 'true page 2');
// duplicating site - redirect to duplicating page, then redirect to true page
nock('http://duplicating.another-site.com/').get('/').reply(302, '', {'Location': 'http://example.com/duplicating-page.html'});
nock('http://example.com/').get('/duplicating-page.html').reply(302, '', {'Location': 'http://example.com/true-page.html'});
nock('http://example.com/').get('/true-page.html').reply(200, 'true page 3');
nock('http://example.com/').get('/true-page.html').reply(200, 'true page 3', {'content-type': 'text/html'});

const options = {
urls: [ 'http://example.com/' ],
Expand Down Expand Up @@ -79,11 +79,11 @@ describe('Functional redirects', function() {
]
};

nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/relative-resources-index.html');
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/relative-resources-index.html', {'content-type': 'text/html'});
nock('http://example.com/').get('/about').reply(301, '', {'Location': 'http://example.com/about/'});
nock('http://example.com/').get('/about/').replyWithFile(200, mockDirname + '/relative-resources-about.html', {'content-type': 'text/html'});
nock('http://example.com/').get('/style.css').reply(200, 'style.css');
nock('http://example.com/').get('/about/style.css').reply(200, 'about/style.css');
nock('http://example.com/').get('/style.css').reply(200, 'style.css', {'content-type': 'text/css'});
nock('http://example.com/').get('/about/style.css').reply(200, 'about/style.css', {'content-type': 'text/css'});

return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
Expand Down
4 changes: 2 additions & 2 deletions test/unit/scraper-init-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ describe('Scraper initialization', function () {

s.options.request.should.containEql({
throwHttpErrors: false,
encoding: 'binary',
responseType: 'buffer',
decompress: true,
https: {
rejectUnauthorized: false
Expand All @@ -143,7 +143,7 @@ describe('Scraper initialization', function () {

s.options.request.should.eql({
throwHttpErrors: true,
encoding: 'binary',
responseType: 'buffer',
decompress: true,
https: {
rejectUnauthorized: false
Expand Down
10 changes: 5 additions & 5 deletions test/unit/scraper-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com/a.png');
rr.getFilename().should.be.not.empty();
rr.getText().should.be.eql('OK');
rr.getText().should.be.not.empty();
});

it('should return null if the urlFilter returns false', async () =>{
Expand Down Expand Up @@ -138,7 +138,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com');
rr.getFilename().should.be.not.empty();
rr.getText().should.be.eql('OK');
rr.getText().should.be.not.empty();
});
});

Expand All @@ -160,7 +160,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com/a.png');
rr.getFilename().should.be.not.empty();
rr.getText().should.be.eql('OK');
rr.getText().should.be.not.empty();
});

it('should request the resource if maxDepth is set and resource depth is less than maxDept', async () =>{
Expand All @@ -181,7 +181,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com/a.png');
rr.getFilename().should.be.not.empty();
rr.getText().should.be.eql('OK');
rr.getText().should.be.not.empty();
});

it('should request the resource if maxDepth is set and resource depth is equal to maxDept', async () =>{
Expand All @@ -201,7 +201,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com/a.png');
rr.getFilename().should.be.not.empty();
rr.getText().should.be.eql('OK');
rr.getText().should.be.not.empty();
});

it('should return null if maxDepth is set and resource depth is greater than maxDepth', async () =>{
Expand Down

0 comments on commit 5a58f48

Please sign in to comment.