Skip to content

Commit

Permalink
Add children resources to output #19
Browse files Browse the repository at this point in the history
Add children resources to output #19
  • Loading branch information
s0ph1e committed Apr 6, 2016
1 parent d6a4dd4 commit 92eb86f
Show file tree
Hide file tree
Showing 9 changed files with 96 additions and 35 deletions.
19 changes: 10 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,23 @@ Makes requests to `urls` and saves all files found with `sources` to `directory`

**options** - object containing the following options:

- `urls:` array of urls to load and filenames for them *(required, see example below)*
- `directory:` path to save loaded files *(required)*
- `defaultFilename:` filename for index page *(optional, default: 'index.html')*
- `sources:` array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)*
- `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- `urls`: array of urls to load and filenames for them *(required, see example below)*
- `directory`: path to save loaded files *(required)*
- `defaultFilename`: filename for index page *(optional, default: 'index.html')*
- `sources`: array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)*
- `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
- `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
- `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*


**callback** - callback function *(optional)*, includes the following parameters:

- `error:` if error - `Error` object, if success - `null`
- `result:` if error - `null`, if success - array if objects containing:
- `url:` url of loaded page
- `filename:` filename where page was saved (relative to `directory`)
- `error`: if error - `Error` object, if success - `null`
- `result`: if error - `null`, if success - array of objects containing:
- `url`: url of loaded page
- `filename`: filename where page was saved (relative to `directory`)
- `assets`: array of children resources (each of them contains `url`, `filename`, `assets`)


## Examples
Expand Down
8 changes: 5 additions & 3 deletions lib/file-handlers/css.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@ function loadCss (context, resource) {
var cssUrls = getCssUrls(text);

var promises = _.map(cssUrls, function loadResourceFromCssUrl (cssUrl) {
var resourceUrl = utils.getUrl(url, cssUrl);
var cssResource = resource.createChild(resourceUrl);
var childUrl = utils.getUrl(url, cssUrl);
var childResource = resource.createChild(childUrl);

return context.loadResource(childResource).then(function handleLoadedSource (loadedResource) {
resource.updateChild(childResource, loadedResource);

return context.loadResource(cssResource).then(function handleLoadedSource (loadedResource) {
var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
text = text.replace(cssUrl, relativePath);
return Promise.resolve();
Expand Down
10 changes: 6 additions & 4 deletions lib/file-handlers/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,13 @@ function loadResources (context, resource, source) {
var attr = el.attr(source.attr);

if (attr) {
var resourceUrl = utils.getUrl(url, attr);
var htmlResource = resource.createChild(resourceUrl);
htmlResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr });
var childUrl = utils.getUrl(url, attr);
var childResource = resource.createChild(childUrl);
childResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr });

return context.loadResource(childResource).then(function handleLoadedSource (loadedResource) {
resource.updateChild(childResource, loadedResource);

return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
var hash = utils.getHashFromUrl(attr);

Expand Down
14 changes: 14 additions & 0 deletions lib/resource.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ function getTypeByHtmlData (htmlData) {
/**
 * Constructor for a resource node: stores the url and filename it was given
 * and starts with an empty list of child resources.
 *
 * @param {*} url - stored as-is on `this.url` (presumably a url string; confirm against callers)
 * @param {*} filename - stored as-is on `this.filename`
 */
function Resource (url, filename) {
this.url = url;
this.filename = filename;
// Fresh array per instance, so children are never shared between resources.
this.children = [];
}

Resource.prototype.createChild = function createChild (url, filename) {
Expand All @@ -23,9 +24,22 @@ Resource.prototype.createChild = function createChild (url, filename) {
child.setParent(this);
child.setDepth(++currentDepth);

this.children.push(child);

return child;
};

/**
 * Swap one tracked child for another resource, keeping its position in the
 * children list. Does nothing when `oldChild` is not currently in the list.
 *
 * @param {Object} oldChild - entry currently stored in `this.children`
 * @param {Object} newChild - value that takes over the same slot
 */
Resource.prototype.updateChild = function updateChild (oldChild, newChild) {
	var position = _.indexOf(this.children, oldChild);

	// Unknown child: leave the list untouched.
	if (position < 0) {
		return;
	}

	this.children[position] = newChild;
};

/**
 * Return the list of child resources tracked by this resource.
 * The internal array itself is returned (not a copy), so mutations by the
 * caller are visible on this resource.
 *
 * @returns {Array} the `children` array of this resource
 */
Resource.prototype.getChildren = function getChildren () {
return this.children;
};

/**
 * Return the value currently stored in `this.url`.
 *
 * @returns {*} the url of this resource
 */
Resource.prototype.getUrl = function getUrl () {
return this.url;
};
Expand Down
16 changes: 4 additions & 12 deletions lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,9 @@ Scraper.prototype.loadResource = function loadResource (resource) {
// try to find already loaded
var loaded = self.getLoadedResource(resource);

var url = resource.getUrl();
var filename;
var handleFile;

if (!loaded) {
filename = self.generateFilename(resource);
var url = resource.getUrl();
var filename = self.generateFilename(resource);
resource.setFilename(filename);

self.addLoadedResource(resource);
Expand All @@ -116,7 +113,7 @@ Scraper.prototype.loadResource = function loadResource (resource) {
return self.makeRequest(url).then(function requestCompleted (data) {
resource.setUrl(data.url); // Url may be changed in redirects
resource.setText(data.body);
handleFile = self.getResourceHandler(resource);
var handleFile = self.getResourceHandler(resource);
return handleFile(self, resource);
}).then(function fileHandled () {
var filename = path.join(self.options.directory, resource.getFilename());
Expand Down Expand Up @@ -162,12 +159,7 @@ Scraper.prototype.prepare = function prepare () {
Scraper.prototype.load = function load () {
var self = this;
return Promise.map(self.originalResources, function loadPage (po) {
return self.loadResource(po).then(function pageLoaded (loaded) {
return Promise.resolve({
url: loaded.getUrl(),
filename: loaded.getFilename()
});
});
return self.loadResource(po).then(utils.createOutputObject);
});
};

Expand Down
17 changes: 16 additions & 1 deletion lib/utils.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
var url = require('url');
var path = require('path');
var _ = require('underscore');
var Promise = require('bluebird');

function isUrl (path) {
Expand Down Expand Up @@ -40,12 +41,26 @@ function waitAllFulfilled (promises) {
}));
}

/**
 * Convert a resource (and, recursively, its children) into the plain object
 * returned to the caller: `{ url, filename, assets }`.
 *
 * A child resource that appears several times in `getChildren()` (the very
 * same object referenced more than once) is reported only once, at the
 * position of its first occurrence.
 *
 * NOTE(review): the recursion has no cycle guard. If a resource can ever be
 * reachable from its own descendants, this will not terminate - confirm
 * against the code that fills the children lists.
 *
 * @param {Object} resource - object exposing getUrl(), getFilename() and getChildren()
 * @returns {{url: *, filename: *, assets: Array}} plain, serialisable description of the resource tree
 */
function createOutputObject (resource) {
	// Tolerate a missing children list by treating it as "no children".
	var children = resource.getChildren() || [];

	// Deduplicate the resources themselves. Doing it after mapping is a no-op:
	// every mapped value is a freshly created object literal, so an
	// identity-based uniqueness check can never find two equal entries.
	var uniqueChildren = children.filter(function isFirstOccurrence (child, index) {
		return children.indexOf(child) === index;
	});

	var assets = uniqueChildren.map(function toOutputObject (child) {
		return createOutputObject(child);
	});

	return {
		url: resource.getUrl(),
		filename: resource.getFilename(),
		assets: assets
	};
}

module.exports = {
isUrl: isUrl,
getUrl: getUrl,
getUnixPath: getUnixPath,
getRelativePath: getRelativePath,
getFilenameFromUrl: getFilenameFromUrl,
getHashFromUrl: getHashFromUrl,
waitAllFulfilled: waitAllFulfilled
waitAllFulfilled: waitAllFulfilled,
createOutputObject: createOutputObject
};
15 changes: 12 additions & 3 deletions test/functional/base-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,18 @@ describe('Functional base', function() {
scrape(options).then(function(result) {
// should return right result
result.should.be.instanceOf(Array).and.have.length(3);
result.should.containEql({ url: 'http://example.com/', filename: 'index.html' });
result.should.containEql({ url: 'http://example.com/about', filename: 'about.html' });
result.should.containEql({ url: 'http://blog.example.com/', filename: 'blog.html' }); // url after redirect

result[0].should.have.properties({ url: 'http://example.com/', filename: 'index.html' });
result[0].should.have.properties('assets');
result[0].assets.should.be.instanceOf(Array).and.have.length(4);

result[1].should.have.properties({ url: 'http://example.com/about', filename: 'about.html' });
result[1].should.have.properties('assets');
result[1].assets.should.be.instanceOf(Array).and.have.length(4);

result[2].should.have.properties({ url: 'http://blog.example.com/', filename: 'blog.html' }); // url after redirect
result[2].should.have.properties('assets');
result[2].assets.should.be.instanceOf(Array).and.have.length(1);

// should create directory and subdirectories
fs.existsSync(testDirname).should.be.eql(true);
Expand Down
6 changes: 3 additions & 3 deletions test/unit/scraper-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ describe('Scraper', function () {
}).catch(done);
});

it('should return array of objects with url and filename', function(done) {
it('should return array of objects with url, filename and assets', function(done) {
nock('http://first-url.com').get('/').reply(200, 'OK');
nock('http://second-url.com').get('/').reply(500);

Expand All @@ -225,8 +225,8 @@ describe('Scraper', function () {
s.prepare().bind(s).then(s.load).then(function(res) {
res.should.be.instanceOf(Array);
res.should.have.length(2);
res[0].should.have.properties(['url', 'filename']);
res[1].should.have.properties(['url', 'filename']);
res[0].should.have.properties(['url', 'filename', 'assets']);
res[1].should.have.properties(['url', 'filename', 'assets']);
done();
}).catch(done);
});
Expand Down
26 changes: 26 additions & 0 deletions test/unit/utils-test.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
require('should');
var utils = require('../../lib/utils');
var Resource = require('../../lib/resource');

describe('Common utils', function () {
describe('#isUrl(url)', function () {
Expand Down Expand Up @@ -78,4 +79,29 @@ describe('Common utils', function () {
utils.getRelativePath('css/1.css', 'css/2.css').should.be.equal('2.css');
});
});

// Verifies that utils.createOutputObject turns a Resource tree into nested
// plain objects of the form { url, filename, assets }.
describe('#createOutputObject', function () {
it('should create output object recursively', function() {
// Root resource with two direct children; the children get no children of their own.
var root = new Resource('http://google.com', 'google.html');
root.createChild('http://child-one.com', 'child.html');
root.createChild('http://child-two.com', 'child2.html');

// Children are expected in creation order, each with an empty `assets` array.
var expected = {
url: 'http://google.com',
filename: 'google.html',
assets: [{
url: 'http://child-one.com',
filename: 'child.html',
assets: []
}, {
url: 'http://child-two.com',
filename: 'child2.html',
assets: []
}]
};

var result = utils.createOutputObject(root);
// Deep equality check on the whole tree, not just the top level.
result.should.eql(expected);
});
});
});

0 comments on commit 92eb86f

Please sign in to comment.