Skip to content

Commit

Permalink
fix: update generate-embeddings script
Browse files Browse the repository at this point in the history
  • Loading branch information
nikhilsnayak committed Nov 30, 2024
1 parent 6f22a95 commit b14d373
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 43 deletions.
10 changes: 1 addition & 9 deletions features/ai/functions/queries.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import 'server-only';

import { unstable_cache } from 'next/cache';
import { openai } from '@ai-sdk/openai';
import { embed, embedMany, generateObject } from 'ai';
import { embed, generateObject } from 'ai';
import { cosineDistance, desc, gt, sql } from 'drizzle-orm';
import { z } from 'zod';

Expand Down Expand Up @@ -39,14 +39,6 @@ export const getSuggestedQuestions = unstable_cache(
}
);

/**
 * Embed a batch of text chunks with OpenAI's ada-002 embedding model.
 * Returns one embedding vector per input chunk, in input order.
 */
export async function generateEmbeddings(chunks: string[]) {
  const response = await embedMany({
    model: openai.embedding('text-embedding-ada-002'),
    values: chunks,
  });
  return response.embeddings;
}

async function generateEmbedding(value: string) {
const input = value.replaceAll('\\n', ' ');
const { embedding } = await embed({
Expand Down
107 changes: 74 additions & 33 deletions scripts/generate-embeddings.ts
Original file line number Diff line number Diff line change
@@ -1,43 +1,84 @@
// import path from 'path';
// import { DirectoryLoader } from 'langchain/document_loaders/fs/directory';
// import { TextLoader } from 'langchain/document_loaders/fs/text';
// import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import path from 'path';
import { openai } from '@ai-sdk/openai';
import { embedMany } from 'ai';
import { DirectoryLoader } from 'langchain/document_loaders/fs/directory';
import { TextLoader } from 'langchain/document_loaders/fs/text';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';

// import { db } from '../db';
// import { documents as documentsTable } from '../db/schema';
// import { generateEmbeddings } from './utils';
import { db } from '~/lib/db';
import { documents as documentsTable } from '~/lib/db/schema';

// function getLoader() {
// const slug = process.argv.at(2);
// const CONTENT_DIR = path.join(process.cwd(), 'content');
// if (slug) {
// return new TextLoader(path.join(CONTENT_DIR, `${slug}.mdx`));
// }
// Announce start-up immediately so the operator sees the script is running.
console.log('🚀 Starting the script...');

// return new DirectoryLoader(CONTENT_DIR, {
// '.mdx': (path) => new TextLoader(path),
// });
// }
/**
 * Embed a batch of text chunks with OpenAI's ada-002 embedding model.
 * Logs progress on both sides of the call; on failure the error is
 * logged and rethrown so the caller's error handling can take over.
 */
async function generateEmbeddings(chunks: string[]) {
  console.log(`🤖 Generating embeddings for ${chunks.length} chunks...`);
  try {
    const result = await embedMany({
      model: openai.embedding('text-embedding-ada-002'),
      values: chunks,
    });
    console.log('✅ Embeddings generated successfully!');
    return result.embeddings;
  } catch (error) {
    console.error('❌ Error generating embeddings:', error);
    throw error;
  }
}

// const loader = getLoader();
/**
 * Build a document loader over the `content` directory.
 * If a slug was passed on the command line, load only that single
 * `<slug>.mdx` file; otherwise load every `.mdx` file in the directory.
 */
function getLoader() {
  const contentDir = path.join(process.cwd(), 'content');
  const slug = process.argv.at(2);
  console.log(`📂 Using content directory: ${contentDir}`);

  if (slug) {
    console.log(`📄 Loading single file for slug: ${slug}`);
    return new TextLoader(path.join(contentDir, `${slug}.mdx`));
  } else {
    console.log('📚 Loading all files from directory...');
    // `filePath` avoids shadowing the imported `path` module.
    return new DirectoryLoader(contentDir, {
      '.mdx': (filePath) => new TextLoader(filePath),
    });
  }
}

// const splittedDocuments = await markdownSplitter.splitDocuments(content);
/**
 * Pipeline entry point: load MDX content, split it into markdown-aware
 * chunks, embed each chunk, and persist everything to the database.
 * Any failure is logged and terminates the process with exit code 1.
 */
async function main() {
  try {
    console.log('🔧 Initializing loader...');
    const loader = getLoader();

    console.log('📥 Loading content...');
    const docs = await loader.load();
    console.log(`✅ Loaded ${docs.length} document(s).`);

    console.log('✂️ Splitting documents...');
    const splitter = RecursiveCharacterTextSplitter.fromLanguage('markdown');
    const splitDocs = await splitter.splitDocuments(docs);
    console.log(`✅ Split into ${splitDocs.length} chunks.`);

    console.log('🧠 Generating embeddings...');
    const embeddings = await generateEmbeddings(
      splitDocs.map((doc) => doc.pageContent)
    );

    console.log('💾 Inserting embeddings into the database...');
    // One insert per chunk, issued concurrently; the i-th embedding
    // corresponds to the i-th split document.
    const inserts = embeddings.map((embedding, i) =>
      db.insert(documentsTable).values({
        embedding,
        content: splitDocs[i].pageContent,
        metadata: splitDocs[i].metadata,
      })
    );
    await Promise.all(inserts);
    console.log('✅ Data inserted successfully into the database!');
  } catch (error) {
    console.error('🔥 Error occurred during execution:', error);
    process.exit(1);
  }
}

// Run the pipeline; report success and exit cleanly once it resolves.
// (Failures inside main() already log and exit with code 1.)
(async () => {
  await main();
  console.log('🎉 Script completed successfully!');
  process.exit(0);
})();
4 changes: 3 additions & 1 deletion turbo.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
"KV_REST_API_READ_ONLY_TOKEN",
"AUTH_SECRET",
"AUTH_GITHUB_ID",
"AUTH_GITHUB_SECRET"
"AUTH_GITHUB_SECRET",
"HASH_SECRET",
"GITHUB_PERSONAL_ACCESS_TOKEN"
]
},
"type-check": {
Expand Down

1 comment on commit b14d373

@vercel
Copy link

@vercel vercel bot commented on b14d373 Nov 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.