forked from website-scraper/node-website-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix encoding issue for non-English websites, closes website-scraper#454
- Loading branch information
Showing
19 changed files
with
183 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,3 +5,4 @@ package-lock.json | |
npm-debug.log | ||
coverage | ||
test/e2e/results | ||
.nyc-output |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import should from 'should'; | ||
import '../../utils/assertions.js'; | ||
import nock from 'nock'; | ||
import fs from 'fs-extra'; | ||
import cheerio from 'cheerio'; | ||
import scrape from 'website-scraper'; | ||
|
||
const testDirname = './test/functional/binary-resources/.tmp'; | ||
const mockDirname = './test/functional/binary-resources/mocks'; | ||
|
||
/**
 * Functional test: images referenced by a page are downloaded into the
 * configured `img` subdirectory and their bytes match the fixture files.
 *
 * Requests to http://example.com/ are answered by nock from the files in
 * `mockDirname`; the scraper writes into `testDirname`, which is removed
 * after each test.
 */
describe('Functional: images', () => {
	const baseUrl = 'http://example.com/';
	const imageNames = ['test-image.png', 'test-image.jpg'];

	const options = {
		urls: [ baseUrl ],
		directory: testDirname,
		subdirectories: [
			{ directory: 'img', extensions: ['.jpg', '.png'] }
		],
		sources: [
			{ selector: 'img', attr: 'src' }
		],
		ignoreErrors: false
	};

	// Registers a nock interceptor answering `GET urlPath` with a fixture file from mockDirname.
	const serveFixture = (urlPath, fixtureName, contentType) =>
		nock(baseUrl).get(urlPath).replyWithFile(200, `${mockDirname}/${fixtureName}`, {'content-type': contentType});

	beforeEach(() => {
		// reset interceptors and forbid real network connections
		nock.cleanAll();
		nock.disableNetConnect();

		// mock base urls
		serveFixture('/', 'index.html', 'text/html');

		// mock sources for index.html
		serveFixture('/test-image.png', 'test-image.png', 'image/png');
		serveFixture('/test-image.jpg', 'test-image.jpg', 'image/jpeg');
	});

	afterEach(() => {
		nock.cleanAll();
		nock.enableNetConnect();
		fs.removeSync(testDirname);
	});

	it('should load images and save content correctly', async () => {
		await scrape(options);

		// should create directory and subdirectories
		for (const dir of [testDirname, `${testDirname}/img`]) {
			fs.existsSync(dir).should.be.eql(true);
		}

		// should contain all sources found in index.html
		for (const imageName of imageNames) {
			fs.existsSync(`${testDirname}/img/${imageName}`).should.be.eql(true);
		}

		// all sources in index.html should be replaced with local paths
		const savedHtml = fs.readFileSync(`${testDirname}/index.html`).toString();
		const $page = cheerio.load(savedHtml);
		$page('img.png').attr('src').should.be.eql('img/test-image.png');
		$page('img.jpg').attr('src').should.be.eql('img/test-image.jpg');

		// content of downloaded images should equal original images
		for (const imageName of imageNames) {
			const original = fs.readFileSync(`${mockDirname}/${imageName}`);
			const downloaded = fs.readFileSync(`${testDirname}/img/${imageName}`);
			should(downloaded).be.eql(original);
		}
	});
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<meta charset="UTF-8"> | ||
<title>Index</title> | ||
</head> | ||
<body> | ||
<img class="jpg" src="/test-image.jpg" /> | ||
<img class="png" src="/test-image.png" /> | ||
</body> | ||
</html> |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import '../../utils/assertions.js'; | ||
import nock from 'nock'; | ||
import fs from 'fs-extra'; | ||
import scrape from 'website-scraper'; | ||
|
||
const testDirname = './test/functional/encoding/.tmp'; | ||
const mockDirname = './test/functional/encoding/mocks'; | ||
|
||
/**
 * Functional test: non-ASCII text (Korean, Ukrainian and Chinese samples) in a
 * scraped page is still present, unchanged, in the saved index.html.
 *
 * The page is served by nock from `mockDirname`; output goes to `testDirname`,
 * which is removed after each test.
 */
describe('Functional: Korean characters are properly encoded/decoded', () => {
	const baseUrl = 'http://example.com/';

	const options = {
		urls: [
			baseUrl,
		],
		directory: testDirname,
		ignoreErrors: false
	};

	// Markup fragments that must appear verbatim in the saved page.
	const expectedFragments = [
		'<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>',
		'<div id="special-characters-ukrainian">Слава Україні!</div>',
		'<div id="special-characters-chinese">加入网站</div>',
	];

	beforeEach(() => {
		// reset interceptors and forbid real network connections
		nock.cleanAll();
		nock.disableNetConnect();

		// serve the fixture page for the only url being scraped
		nock(baseUrl).get('/').replyWithFile(200, `${mockDirname}/index.html`, {'content-type': 'text/html'});
	});

	afterEach(() => {
		nock.cleanAll();
		nock.enableNetConnect();
		fs.removeSync(testDirname);
	});

	it('should save the page in the same data as it was originally', async () => {
		await scrape(options);

		const scrapedIndex = fs.readFileSync(`${testDirname}/index.html`).toString();
		for (const fragment of expectedFragments) {
			scrapedIndex.should.be.containEql(fragment);
		}
	});
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<meta charset="UTF-8"> | ||
<title>Test</title> | ||
</head> | ||
<body> | ||
<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div> | ||
<div id="special-characters-ukrainian">Слава Україні!</div> | ||
<div id="special-characters-chinese">加入网站</div> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters