-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: update generate-embeddings script
- Loading branch information
1 parent
6f22a95
commit b14d373
Showing
3 changed files
with
78 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,43 +1,84 @@ | ||
// import path from 'path'; | ||
// import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'; | ||
// import { TextLoader } from 'langchain/document_loaders/fs/text'; | ||
// import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; | ||
import path from 'path'; | ||
import { openai } from '@ai-sdk/openai'; | ||
import { embedMany } from 'ai'; | ||
import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'; | ||
import { TextLoader } from 'langchain/document_loaders/fs/text'; | ||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; | ||
|
||
// import { db } from '../db'; | ||
// import { documents as documentsTable } from '../db/schema'; | ||
// import { generateEmbeddings } from './utils'; | ||
import { db } from '~/lib/db'; | ||
import { documents as documentsTable } from '~/lib/db/schema'; | ||
|
||
// function getLoader() { | ||
// const slug = process.argv.at(2); | ||
// const CONTENT_DIR = path.join(process.cwd(), 'content'); | ||
// if (slug) { | ||
// return new TextLoader(path.join(CONTENT_DIR, `${slug}.mdx`)); | ||
// } | ||
// Startup banner: a top-level statement, so it is logged when the module is evaluated, before main() is invoked.
console.log('🚀 Starting the script...'); | ||
|
||
// return new DirectoryLoader(CONTENT_DIR, { | ||
// '.mdx': (path) => new TextLoader(path), | ||
// }); | ||
// } | ||
/**
 * Embeds a list of text chunks with OpenAI's `text-embedding-ada-002` model
 * through the AI SDK's `embedMany` helper.
 *
 * @param chunks - Plain-text chunks to embed.
 * @returns The `embeddings` array produced by `embedMany`.
 * @throws Re-throws whatever the embedding call throws, after logging it.
 */
async function generateEmbeddings(chunks: string[]) {
  console.log(`🤖 Generating embeddings for ${chunks.length} chunks...`);

  let response;
  try {
    response = await embedMany({
      model: openai.embedding('text-embedding-ada-002'),
      values: chunks,
    });
  } catch (cause) {
    // Log here for context, then let the caller decide how to abort.
    console.error('❌ Error generating embeddings:', cause);
    throw cause;
  }

  console.log('✅ Embeddings generated successfully!');
  return response.embeddings;
}
|
||
// const loader = getLoader(); | ||
/**
 * Chooses the document loader for this run.
 *
 * The first CLI argument (`process.argv[2]`), when present, is treated as a
 * slug and only `content/<slug>.mdx` is loaded. Without it, a directory
 * loader is returned that handles every `.mdx` file under `content/` with a
 * `TextLoader`.
 *
 * @returns A `TextLoader` for a single file or a `DirectoryLoader` for the
 *   whole content directory (resolved against the current working directory).
 */
function getLoader() {
  const contentDir = path.join(process.cwd(), 'content');
  const [, , slug] = process.argv;
  console.log(`📂 Using content directory: ${contentDir}`);

  if (!slug) {
    console.log('📚 Loading all files from directory...');
    return new DirectoryLoader(contentDir, {
      '.mdx': (filePath) => new TextLoader(filePath),
    });
  }

  console.log(`📄 Loading single file for slug: ${slug}`);
  const singleFile = path.join(contentDir, `${slug}.mdx`);
  return new TextLoader(singleFile);
}
|
||
// const splittedDocuments = await markdownSplitter.splitDocuments(content); | ||
/**
 * Runs the embedding pipeline end to end: loads the MDX content, splits it
 * into markdown-aware chunks, embeds every chunk and stores the embedding,
 * chunk text and metadata in the documents table.
 *
 * Any failure is logged and terminates the process with exit code 1, so the
 * returned promise does not reject.
 */
async function main() {
  // Rows are written in sequential multi-row batches instead of one
  // concurrent INSERT per chunk, so the number of in-flight queries and the
  // size of each statement stay bounded regardless of how much content exists.
  const INSERT_BATCH_SIZE = 100;

  try {
    console.log('🔧 Initializing loader...');
    const loader = getLoader();

    console.log('📥 Loading content...');
    const content = await loader.load();
    console.log(`✅ Loaded ${content.length} document(s).`);

    console.log('✂️ Splitting documents...');
    const markdownSplitter =
      RecursiveCharacterTextSplitter.fromLanguage('markdown');
    const splittedDocuments = await markdownSplitter.splitDocuments(content);
    console.log(`✅ Split into ${splittedDocuments.length} chunks.`);

    // Nothing to embed or insert: skip the embedding call and the database.
    if (splittedDocuments.length === 0) {
      console.warn('⚠️ No chunks to embed; nothing was inserted.');
      return;
    }

    const chunks = splittedDocuments.map((document) => document.pageContent);

    console.log('🧠 Generating embeddings...');
    const embeddings = await generateEmbeddings(chunks);

    // Rows are paired with chunks by index, so a count mismatch would store
    // embeddings against the wrong text (or crash on an undefined chunk).
    if (embeddings.length !== splittedDocuments.length) {
      throw new Error(
        `Expected ${splittedDocuments.length} embeddings but received ${embeddings.length}.`
      );
    }

    const rows = embeddings.map((embedding, i) => {
      const document = splittedDocuments[i];
      if (document === undefined) {
        throw new Error(`No chunk found for embedding at index ${i}.`);
      }
      return {
        embedding,
        content: document.pageContent,
        metadata: document.metadata,
      };
    });

    console.log('💾 Inserting embeddings into the database...');
    for (let start = 0; start < rows.length; start += INSERT_BATCH_SIZE) {
      const batch = rows.slice(start, start + INSERT_BATCH_SIZE);
      await db.insert(documentsTable).values(batch);
    }
    console.log('✅ Data inserted successfully into the database!');
  } catch (error) {
    console.error('🔥 Error occurred during execution:', error);
    process.exit(1);
  }
}
|
||
// Kick off the pipeline; once main() resolves, report success and exit
// explicitly with code 0.
void (async () => {
  await main();
  console.log('🎉 Script completed successfully!');
  process.exit(0);
})();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
b14d373
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Successfully deployed to the following URLs:
nikhilsnayak.dev – ./
nikhilsnayak.dev
nikhilsnayakdev-nikhilsnayak-projects.vercel.app
www.nikhilsnayak.dev
nikhilsnayakdev-git-main-nikhilsnayak-projects.vercel.app
nikhilsnayak.vercel.app