Commit
* Save links as json file
* Scrape tags
* Scrape users
* Scrape comments
Showing 3 changed files with 219 additions and 0 deletions.
@@ -0,0 +1,98 @@
import chalk from "chalk";
import { mkdir, writeFile } from "fs/promises";
import { cleanDir } from "../utils/fs.js";
import { info, success } from "../utils/log.js";
import {
  filterHtml,
  formatObjectAsJson,
  formatStringAsHtml,
  getLinks,
  getMetadata,
  paginatedScrape,
} from "../utils/scraping.js";

export async function scrapeComments({
  apiUrl,
  dataDir,
  classFilters,
  idFilters,
  elementFilters,
  jsonFilters,
  removeAttributes,
  removeAllAttributes,
  removeEmptyElements,
  limitPages,
}) {
  info(`Scraping ${chalk.blue("comments")}...`, true);

  const commentsApiUrl = `${apiUrl}/comments`;
  const commentsDir = `${dataDir}/comments`;

  await mkdir(commentsDir, { recursive: true });

  const saveUnmodifiedHtml =
    classFilters.length > 0 ||
    idFilters.length > 0 ||
    elementFilters.length > 0 ||
    removeAttributes.length > 0 ||
    removeAllAttributes ||
    removeEmptyElements;

  await paginatedScrape(commentsApiUrl, limitPages, async (comments) => {
    if (!Array.isArray(comments) || comments.length === 0) {
      info("No comments found.");
      cleanDir(commentsDir, true);

      return;
    }

    for (const comment of comments) {
      const commentIdentifier = `${comment.post}-${comment.id}`;
      const commentDir = `${commentsDir}/${commentIdentifier}`;

      info(`Scraping comment ${chalk.blue(commentIdentifier)}...`);

      await cleanDir(commentDir, true, true);

      await writeFile(
        `${commentDir}/full-data.json`,
        formatObjectAsJson(comment)
      );

      await writeFile(
        `${commentDir}/meta-data.json`,
        formatObjectAsJson(getMetadata(comment, jsonFilters))
      );

      await writeFile(
        `${commentDir}/links.json`,
        formatObjectAsJson(getLinks(comment))
      );

      await writeFile(
        `${commentDir}/rendered-content.html`,
        formatStringAsHtml(
          filterHtml(comment.content.rendered, {
            classFilters,
            idFilters,
            elementFilters,
            removeAttributes,
            removeAllAttributes,
            removeEmptyElements,
          })
        )
      );

      if (saveUnmodifiedHtml) {
        await writeFile(
          `${commentDir}/rendered-content-unmodified.html`,
          formatStringAsHtml(comment.content.rendered)
        );
      }

      success("Done.", true);
    }
  });

  success("Done scraping comments.", true);
}
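For context, here is a minimal sketch of how scrapeComments might be invoked. The option names mirror the destructured parameters above; the import path, API URL, output directory, and filter values are illustrative assumptions, not part of this commit.

// Hypothetical caller; the import path and all option values below are
// placeholder assumptions, not taken from this repository.
import { scrapeComments } from "./commands/comments.js";

await scrapeComments({
  apiUrl: "https://example.com/wp-json/wp/v2", // assumed WordPress REST API root
  dataDir: "./data",
  classFilters: [],
  idFilters: [],
  elementFilters: [],
  jsonFilters: [],
  removeAttributes: [],
  removeAllAttributes: false,
  removeEmptyElements: false,
  limitPages: 1, // only fetch the first page of comments
});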
@@ -0,0 +1,87 @@
import chalk from "chalk";
import { mkdir, writeFile } from "fs/promises";
import { cleanDir } from "../utils/fs.js";
import { info, success } from "../utils/log.js";
import {
  formatObjectAsJson,
  getLinks,
  getMetadata,
  paginatedScrape,
} from "../utils/scraping.js";

export async function scrapeUsers({
  apiUrl,
  dataDir,
  jsonFilters,
  limitPages,
}) {
  info(`Scraping ${chalk.blue("users")}...`, true);

  const usersApiUrl = `${apiUrl}/users`;
  const postsApiUrl = `${apiUrl}/posts`;
  const usersDir = `${dataDir}/users`;

  await mkdir(usersDir, { recursive: true });

  await paginatedScrape(usersApiUrl, limitPages, async (users) => {
    if (!Array.isArray(users) || users.length === 0) {
      info("No users found.");
      cleanDir(usersDir, true);

      return;
    }

    for (const user of users) {
      const userIdentifier = `${user.id}-${user.slug}`;
      const userDir = `${usersDir}/${userIdentifier}`;

      info(`Scraping user ${chalk.blue(userIdentifier)}...`);

      await cleanDir(userDir, true, true);

      await writeFile(`${userDir}/full-data.json`, formatObjectAsJson(user));

      await writeFile(
        `${userDir}/meta-data.json`,
        formatObjectAsJson(getMetadata(user, jsonFilters))
      );

      await writeFile(
        `${userDir}/links.json`,
        formatObjectAsJson(getLinks(user))
      );

      let postIds = [];

      info(
        `Scraping ${chalk.blue("posts")} for user ${chalk.blue(
          userIdentifier
        )}...`
      );

      await paginatedScrape(
        `${postsApiUrl}?author=${user.id}`,
        limitPages,
        async (posts) => {
          if (!Array.isArray(posts) || posts.length === 0) {
            info("No posts found for the user.");

            return;
          }

          for (const post of posts) {
            postIds = [...postIds, post.id];
          }
        }
      );

      if (postIds.length > 0) {
        await writeFile(`${userDir}/posts.json`, formatObjectAsJson(postIds));
      }

      success("Done.", true);
    }
  });

  success("Done scraping users.", true);
}
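Both scrapeComments and scrapeUsers delegate pagination to paginatedScrape from ../utils/scraping.js, which is not part of this diff. Inferring only from the call sites above (a URL, a page limit, and an async callback that receives each page's array), the contract could look roughly like the sketch below; the per_page/page query parameters and the X-WP-TotalPages header are assumptions based on common WordPress REST API pagination, not code from this repository.

// Assumed shape of paginatedScrape, inferred from how it is called above.
// The query parameters and response header are WordPress-style assumptions.
export async function paginatedScrape(url, limitPages, handlePage) {
  const separator = url.includes("?") ? "&" : "?";
  let page = 1;
  let totalPages = 1;

  while (page <= totalPages && (!limitPages || page <= limitPages)) {
    const response = await fetch(`${url}${separator}per_page=100&page=${page}`);
    totalPages = Number(response.headers.get("x-wp-totalpages")) || 1;

    await handlePage(await response.json()); // hand each page's array to the caller

    page += 1;
  }
}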