Skip to content

Commit

Permalink
fix(perf): reduce total memory consumption (#153)
Browse files: browse the repository at this point in the history
  • Loading branch information
JustinBeckwith authored Feb 28, 2020
1 parent 420d103 commit aa139da
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 26 deletions.
24 changes: 14 additions & 10 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -245,17 +245,21 @@ export class LinkChecker extends EventEmitter {
} catch {}
}

opts.queue.add(async () => {
await this.crawl({
url: result.url!,
crawl,
cache: opts.cache,
results: opts.results,
checkOptions: opts.checkOptions,
queue: opts.queue,
parent: opts.url.href,
// Ensure the url hasn't already been touched, largely to avoid a
// very large queue length and runaway memory consumption
if (!opts.cache.has(result.url.href)) {
opts.queue.add(async () => {
await this.crawl({
url: result.url!,
crawl,
cache: opts.cache,
results: opts.results,
checkOptions: opts.checkOptions,
queue: opts.queue,
parent: opts.url.href,
});
});
});
}
}
}
}
Expand Down
33 changes: 17 additions & 16 deletions src/links.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,16 @@ export interface ParsedUrl {

export function getLinks(source: string, baseUrl: string): ParsedUrl[] {
const $ = cheerio.load(source);
const links = new Array<string>();
Object.keys(linksAttr).forEach(attr => {
let realBaseUrl = baseUrl;
const base = $('base[href]');
if (base.length) {
// only first <base by specification
const htmlBaseUrl = base.first().attr('href')!;
realBaseUrl = getBaseUrl(htmlBaseUrl, baseUrl);
}
const links = new Array<ParsedUrl>();
const attrs = Object.keys(linksAttr);
for (const attr of attrs) {
const elements = linksAttr[attr].map(tag => `${tag}[${attr}]`).join(',');
$(elements).each((i, element) => {
const values = parseAttr(attr, element.attribs[attr]);
Expand All @@ -48,22 +56,15 @@ export function getLinks(source: string, baseUrl: string): ParsedUrl[] {
) {
return;
}
links.push(...values);
for (const v of values) {
if (!!v) {
const link = parseLink(v, realBaseUrl);
links.push(link);
}
}
});
});

let realBaseUrl = baseUrl;
const base = $('base[href]');
if (base.length) {
// only first <base by specification
const htmlBaseUrl = base.first().attr('href')!;
realBaseUrl = getBaseUrl(htmlBaseUrl, baseUrl);
}

const sanitized = links
.filter(link => !!link)
.map(link => parseLink(link, realBaseUrl));
return sanitized;
return links;
}

function getBaseUrl(htmlBaseUrl: string, oldBaseUrl: string): string {
Expand Down

0 comments on commit aa139da

Please sign in to comment.