Skip to content

Commit

Permalink
feat(core): add article-extractor for common website
Browse files Browse the repository at this point in the history
  • Loading branch information
LarchLiu committed Jun 30, 2023
1 parent 9eb3a7b commit c7406f8
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 29 deletions.
3 changes: 3 additions & 0 deletions packages/webHub/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
"optional": false
}
},
"dependencies": {
"@extractus/article-extractor": "v7.2.16"
},
"devDependencies": {
"@stargram/generate-routes": "workspace:*"
}
Expand Down
7 changes: 7 additions & 0 deletions packages/webHub/src/utils/article-extractor/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import { extract } from '@extractus/article-extractor'

/**
 * Thin wrapper around `extract` from `@extractus/article-extractor`.
 *
 * @param url - Address of the page to hand to the extractor.
 * @returns Whatever `extract` resolves with, passed through unchanged.
 *   NOTE(review): callers appear to treat a falsy result as "nothing
 *   extracted" — confirm against the library's documented return type.
 */
export async function articleExtractor(url: string) {
  // Awaited inside the wrapper so a rejection from `extract` surfaces here.
  return await extract(url)
}
36 changes: 27 additions & 9 deletions packages/webHub/src/website/common/paths/any.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { CommonMeta, PathInfo, WebInfoData, WebLoaderParams, WebLoaderUrls } from '@stargram/core'
import { getDomain } from '@stargram/core/utils'
import { unfurl } from '../../../utils/unfurl'
import { articleExtractor } from '../../../utils/article-extractor'

function filter(urls: WebLoaderUrls): WebLoaderUrls | undefined {
return urls
Expand All @@ -19,22 +20,39 @@ async function getWebInfo(params: WebLoaderParams): Promise<WebInfoData> {
meta.siteName = siteName
meta.prompts = `Website info of ${url}`
// fetch webinfo
const webJson = await unfurl(url, { browserlessToken: params.browserlessToken })
const webJson = await articleExtractor(url)
if (webJson) {
const readability = webJson.content
const openGraph = webJson.open_graph
const faviconPath = webJson.favicon?.split('/')
const favicon = (faviconPath && !faviconPath[faviconPath.length - 1].includes('.ico')) ? webJson.favicon : ''
meta.favicon = favicon
const openGraph = webJson.image
// TODO: wait for article-extractor to export favicon
// const faviconPath = webJson.favicon?.split('/')
// const favicon = (faviconPath && !faviconPath[faviconPath.length - 1].includes('.ico')) ? webJson.favicon : ''
meta.favicon = ''
title = webJson.title || ''
content = webJson.description || ''
content = `${title}\n${webJson.description || ''}`
if (readability)
content = readability
content = `${title}\n${readability.replace(/(<[^>]+>|\{[^}]+\})/g, '')}`

if (openGraph && openGraph.images)
meta.ogImage = openGraph.images[0].url
if (openGraph)
meta.ogImage = openGraph
}
else if (params.browserlessToken) {
const webJson = await unfurl(url, { browserlessToken: params.browserlessToken })
if (webJson) {
const readability = webJson.content
const openGraph = webJson.open_graph
const faviconPath = webJson.favicon?.split('/')
const favicon = (faviconPath && !faviconPath[faviconPath.length - 1].includes('.ico')) ? webJson.favicon : ''
meta.favicon = favicon
title = webJson.title || ''
content = webJson.description || ''
if (readability)
content = readability

if (openGraph && openGraph.images)
meta.ogImage = openGraph.images[0].url
}
}
if (!title || !content)
throw new Error('Not Supported Website')

Expand Down
6 changes: 0 additions & 6 deletions packages/webHub/vite.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,12 @@ export default defineConfig({
// into your library
external: [
'@stargram/core',
'@mozilla/readability',
'htmlparser2',
'jsdom',
],
output: {
// Provide global variables to use in the UMD build
// for externalized deps
globals: {
'@stargram/core': '@stargram/core',
'@mozilla/readability': '@mozilla/readability',
'htmlparser2': 'htmlparser2',
'jsdom': 'jsdom',
},
},
},
Expand Down
77 changes: 63 additions & 14 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 comment on commit c7406f8

@vercel
Copy link

@vercel vercel bot commented on c7406f8 Jun 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

star-nexus – ./

star-nexus-git-main-larchliu.vercel.app
star-nexus.vercel.app
star-nexus-larchliu.vercel.app

Please sign in to comment.