feat: Scrape users and comments
* Save links as json file

* Scrape tags

* Scrape users

* Scrape comments
jtiala authored May 29, 2023
1 parent 93d292a commit 377a8c9
Showing 3 changed files with 219 additions and 0 deletions.
98 changes: 98 additions & 0 deletions src/scrapers/comments.js
@@ -0,0 +1,98 @@
import chalk from "chalk";
import { mkdir, writeFile } from "fs/promises";
import { cleanDir } from "../utils/fs.js";
import { info, success } from "../utils/log.js";
import {
  filterHtml,
  formatObjectAsJson,
  formatStringAsHtml,
  getLinks,
  getMetadata,
  paginatedScrape,
} from "../utils/scraping.js";

export async function scrapeComments({
  apiUrl,
  dataDir,
  classFilters,
  idFilters,
  elementFilters,
  jsonFilters,
  removeAttributes,
  removeAllAttributes,
  removeEmptyElements,
  limitPages,
}) {
  info(`Scraping ${chalk.blue("comments")}...`, true);

  const commentsApiUrl = `${apiUrl}/comments`;
  const commentsDir = `${dataDir}/comments`;

  await mkdir(commentsDir, { recursive: true });

  // Keep an unmodified copy of the rendered HTML only when some filtering
  // option would actually alter it.
  const saveUnmodifiedHtml =
    classFilters.length > 0 ||
    idFilters.length > 0 ||
    elementFilters.length > 0 ||
    removeAttributes.length > 0 ||
    removeAllAttributes ||
    removeEmptyElements;

  await paginatedScrape(commentsApiUrl, limitPages, async (comments) => {
    if (!Array.isArray(comments) || comments.length === 0) {
      info("No comments found.");
      await cleanDir(commentsDir, true);

      return;
    }

    for (const comment of comments) {
      // Comment directories are keyed by both post and comment: <post>-<id>.
      const commentIdentifier = `${comment.post}-${comment.id}`;
      const commentDir = `${commentsDir}/${commentIdentifier}`;

      info(`Scraping comment ${chalk.blue(commentIdentifier)}...`);

      await cleanDir(commentDir, true, true);

      await writeFile(
        `${commentDir}/full-data.json`,
        formatObjectAsJson(comment)
      );

      await writeFile(
        `${commentDir}/meta-data.json`,
        formatObjectAsJson(getMetadata(comment, jsonFilters))
      );

      await writeFile(
        `${commentDir}/links.json`,
        formatObjectAsJson(getLinks(comment))
      );

      await writeFile(
        `${commentDir}/rendered-content.html`,
        formatStringAsHtml(
          filterHtml(comment.content.rendered, {
            classFilters,
            idFilters,
            elementFilters,
            removeAttributes,
            removeAllAttributes,
            removeEmptyElements,
          })
        )
      );

      if (saveUnmodifiedHtml) {
        await writeFile(
          `${commentDir}/rendered-content-unmodified.html`,
          formatStringAsHtml(comment.content.rendered)
        );
      }

      success("Done.", true);
    }
  });

  success("Done scraping comments.", true);
}
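
Note on pagination: both new scrapers delegate page handling to the `paginatedScrape` helper from `src/utils/scraping.js`, which is not part of this diff. As a rough sketch of the contract the scrapers rely on — assuming the standard WordPress REST API convention of a `page` query parameter and an `X-WP-TotalPages` response header; the repository's actual implementation may differ:

// Hypothetical sketch, not the repository's actual implementation.
export async function paginatedScrape(url, limitPages, handlePage) {
  let page = 1;
  let totalPages = 1;

  do {
    // The URL may already carry a query string (e.g. ?author=5).
    const separator = url.includes("?") ? "&" : "?";
    const response = await fetch(`${url}${separator}page=${page}`);

    // WordPress reports the total page count in this header.
    totalPages = Number(response.headers.get("X-WP-TotalPages") ?? 1);

    await handlePage(await response.json());

    page += 1;
  } while (page <= totalPages && (!limitPages || page <= limitPages));
}

The callback receives one page of results at a time, which is why each scraper re-checks `Array.isArray` on every invocation.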
87 changes: 87 additions & 0 deletions src/scrapers/users.js
@@ -0,0 +1,87 @@
import chalk from "chalk";
import { mkdir, writeFile } from "fs/promises";
import { cleanDir } from "../utils/fs.js";
import { info, success } from "../utils/log.js";
import {
  formatObjectAsJson,
  getLinks,
  getMetadata,
  paginatedScrape,
} from "../utils/scraping.js";

export async function scrapeUsers({
  apiUrl,
  dataDir,
  jsonFilters,
  limitPages,
}) {
  info(`Scraping ${chalk.blue("users")}...`, true);

  const usersApiUrl = `${apiUrl}/users`;
  const postsApiUrl = `${apiUrl}/posts`;
  const usersDir = `${dataDir}/users`;

  await mkdir(usersDir, { recursive: true });

  await paginatedScrape(usersApiUrl, limitPages, async (users) => {
    if (!Array.isArray(users) || users.length === 0) {
      info("No users found.");
      await cleanDir(usersDir, true);

      return;
    }

    for (const user of users) {
      const userIdentifier = `${user.id}-${user.slug}`;
      const userDir = `${usersDir}/${userIdentifier}`;

      info(`Scraping user ${chalk.blue(userIdentifier)}...`);

      await cleanDir(userDir, true, true);

      await writeFile(`${userDir}/full-data.json`, formatObjectAsJson(user));

      await writeFile(
        `${userDir}/meta-data.json`,
        formatObjectAsJson(getMetadata(user, jsonFilters))
      );

      await writeFile(
        `${userDir}/links.json`,
        formatObjectAsJson(getLinks(user))
      );

      // Collect the IDs of all posts authored by this user.
      let postIds = [];

      info(
        `Scraping ${chalk.blue("posts")} for user ${chalk.blue(
          userIdentifier
        )}...`
      );

      await paginatedScrape(
        `${postsApiUrl}?author=${user.id}`,
        limitPages,
        async (posts) => {
          if (!Array.isArray(posts) || posts.length === 0) {
            info("No posts found for the user.");

            return;
          }

          for (const post of posts) {
            postIds = [...postIds, post.id];
          }
        }
      );

      if (postIds.length > 0) {
        await writeFile(`${userDir}/posts.json`, formatObjectAsJson(postIds));
      }

      success("Done.", true);
    }
  });

  success("Done scraping users.", true);
}
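
Each user ends up in a directory named `<id>-<slug>`. For a hypothetical user with ID 5 and slug `jane`, and `dataDir` resolving to `data/example.com`, the layout would look like:

data/example.com/users/5-jane/
├── full-data.json    // the raw user object from the API
├── meta-data.json    // metadata, shaped by jsonFilters
├── links.json        // links extracted from the user object
└── posts.json        // IDs of authored posts, e.g. [12, 34, 56]

`posts.json` is only written when the author query returns at least one post.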
34 changes: 34 additions & 0 deletions src/wpdl.js
@@ -4,10 +4,12 @@ import process from "node:process";
import { hideBin } from "yargs/helpers";
import yargs from "yargs/yargs";
import { scrapeCategories } from "./scrapers/categories.js";
import { scrapeComments } from "./scrapers/comments.js";
import { scrapeMedia } from "./scrapers/media.js";
import { scrapePages } from "./scrapers/pages.js";
import { scrapePosts } from "./scrapers/posts.js";
import { scrapeTags } from "./scrapers/tags.js";
import { scrapeUsers } from "./scrapers/users.js";
import { cleanDir, createDir } from "./utils/fs.js";
import { error, info } from "./utils/log.js";
import { getSiteNameFromUrl, isValidUrl } from "./utils/url.js";
@@ -27,6 +29,10 @@ const argv = yargs(hideBin(process.argv))
    type: "boolean",
    description: "Scrape posts",
  })
  .option("comments", {
    type: "boolean",
    description: "Scrape comments",
  })
  .option("media", {
    type: "boolean",
    description: "Scrape media",
@@ -39,6 +45,10 @@ const argv = yargs(hideBin(process.argv))
    type: "boolean",
    description: "Scrape categories",
  })
  .option("users", {
    type: "boolean",
    description: "Scrape users",
  })
  .option("targetDir", {
    alias: "t",
    type: "string",
@@ -174,6 +184,21 @@ if (argv.posts) {
  });
}

if (argv.comments) {
  await scrapeComments({
    apiUrl,
    dataDir,
    classFilters,
    idFilters,
    elementFilters,
    jsonFilters,
    removeAttributes,
    removeAllAttributes: argv.removeAllAttributes,
    removeEmptyElements: argv.removeEmptyElements,
    limitPages: argv.limitPages,
  });
}

if (argv.media) {
  await scrapeMedia({
    apiUrl,
@@ -200,3 +225,12 @@ if (argv.categories) {
    limitPages: argv.limitPages,
  });
}

if (argv.users) {
  await scrapeUsers({
    apiUrl,
    dataDir,
    jsonFilters,
    limitPages: argv.limitPages,
  });
}

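With the new flags wired up, comments and users can be scraped alongside the existing resource types, and each scraper is awaited in turn, so resource types are fetched sequentially. A hypothetical invocation — the site-URL argument and the exact spelling of the page-limit flag aren't visible in these hunks, so everything below other than --comments, --users, and -t is an assumption:

node src/wpdl.js https://example.com --comments --users -t ./scraped --limitPages 2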