Add OpenAI keyword extraction and concurrent Gladia transcription to scripts/youtubeSubtitleEdit.js.

NOTE(review): this patch was restored from a copy whose line breaks and indentation had been
collapsed. Indentation and blank context lines are a best-effort reconstruction, and the
`index` hashes predate the review fixes; verify against the pre-image before applying.

diff --git a/scripts/.gitignore b/scripts/.gitignore
index d4705ac..82ab10b 100644
--- a/scripts/.gitignore
+++ b/scripts/.gitignore
@@ -2,3 +2,4 @@ client_secret.json
 openplanner.json
 miniature/
 out_srt/
+out_keywords/
diff --git a/scripts/youtubeSubtitleEdit.js b/scripts/youtubeSubtitleEdit.js
index 01d942d..baf9841 100644
--- a/scripts/youtubeSubtitleEdit.js
+++ b/scripts/youtubeSubtitleEdit.js
@@ -2,40 +2,68 @@ import { getVideosLast72Hours, initYoutube, updateVideo, updateVideoThumbnail }
 import fs from 'fs'
 import axios from 'axios'
 import path from 'path'
+import os from 'os'
+import { PromisePool } from '@supercharge/promise-pool'
 
-const GLADIA_API_KEY = '25f0e29c-1c88-4533-bca7-b096c074dd74'
-const GLADIA_TRANSCRIPTION_ENDPOINT = 'https://api.gladia.io/v2/transcription'
+// os.homedir() rather than process.env.HOME: HOME can be unset (e.g. on Windows),
+// and path.resolve(undefined, ...) throws.
+const GLADIA_KEY_PATH = path.resolve(os.homedir(), '.credentials', 'gladia_api.key')
+const OPENAI_KEY_PATH = path.resolve(os.homedir(), '.credentials', 'openai_api.key')
 const POLLING_INTERVAL = 5000 // 5 seconds
+const CONCURRENT_JOBS = 10
+
+/**
+ * Reads an API key from `filePath` and returns it with surrounding whitespace trimmed.
+ * Prints instructions and exits the process with code 1 when the file does not exist.
+ */
+const getApiKey = (filePath) => {
+    if (!fs.existsSync(filePath)) {
+        console.error(`❌ Error: API key file not found at: ${filePath}`)
+        console.log('🔄 Please create a file at the above location with your API key.')
+        process.exit(1)
+    }
+    return fs.readFileSync(filePath, 'utf-8').trim()
+}
+
+const GLADIA_API_KEY = getApiKey(GLADIA_KEY_PATH)
+const OPENAI_API_KEY = getApiKey(OPENAI_KEY_PATH)
+const GLADIA_TRANSCRIPTION_ENDPOINT = 'https://api.gladia.io/v2/transcription'
 
 const joinYoutubeAndOpenPlannerData = (youtubeVideos, openPlannerData) => {
-    // Check if all videos exist in OpenPlanner
     const videosWithOpenPlannerData = youtubeVideos.map((video) => {
         const videoTitle = video.snippet.title
 
-        // find session details in openplanner.json
         const session = openPlannerData.sessions.find(
             (session) => videoTitle.includes(session.title) || session.title.includes(videoTitle)
         )
 
         return {
-            // ...video,
+            // `snippet` is not copied over, so keep the title for the "non matching" log below.
+            title: videoTitle,
             videoId: video.contentDetails.videoId,
             publishedAt: video.contentDetails.videoPublishedAt,
             session,
         }
     })
+
     const videosWithValidSession = videosWithOpenPlannerData.filter((video) => video.session)
 
-    console.log('Matching videos: ' + videosWithValidSession.length)
+    console.log(`ℹī¸ Matching videos: ${videosWithValidSession.length}`)
     console.log(
-        'Non matching video title or no speakers: ' +
-            videosWithOpenPlannerData.filter((video) => !video.session).map((video) => video.snippet.title)
+        `ℹī¸ Non matching video title or no speakers: ${videosWithOpenPlannerData
+            .filter((video) => !video.session)
+            .map((video) => video.title)
+            .join(', ')}`
     )
 
     return videosWithValidSession
 }
 
-async function getTranscriptionIdFromGladia(audioUrl) {
+/**
+ * Starts a Gladia transcription for `audioUrl`, sending `customVocabulary` as `custom_vocabulary`.
+ * Returns the transcription ID; throws when the request fails or the response has no ID.
+ */
+async function getTranscriptionIdFromGladia(audioUrl, customVocabulary) {
     const headers = {
         'Content-Type': 'application/json',
         'x-gladia-key': GLADIA_API_KEY,
@@ -47,22 +75,36 @@ async function getTranscriptionIdFromGladia(audioUrl) {
         subtitles_config: {
             formats: ['srt'],
         },
+        custom_vocabulary: customVocabulary,
     }
 
-    const response = await axios.post(GLADIA_TRANSCRIPTION_ENDPOINT, payload, { headers })
+    try {
+        const response = await axios.post(GLADIA_TRANSCRIPTION_ENDPOINT, payload, { headers })
 
-    if (response.status !== 201) {
-        console.log(response.data)
-        console.log(response.status)
-        throw new Error('Failed to initiate transcription')
-    }
+        if (response.status !== 201) {
+            console.error(`❌ Failed to initiate transcription for URL: ${audioUrl}`)
+            console.error(response.data)
+            throw new Error('Failed to initiate transcription')
+        }
 
-    const transcriptionId = response.data.id
-    if (!transcriptionId) {
-        throw new Error('Transcription ID not found in response')
-    }
+        const transcriptionId = response.data.id
+        if (!transcriptionId) {
+            throw new Error('Transcription ID not found in response')
+        }
 
-    return transcriptionId
+        return transcriptionId
+    } catch (error) {
+        // Log the response body or message only: the full axios error embeds the request
+        // config, which includes the `x-gladia-key` header.
+        console.error(
+            `❌ Failed to initiate transcription for URL: ${audioUrl}: `,
+            payload,
+            error.response?.data ?? error.message
+        )
+        // Rethrow: swallowing the error would make this function return undefined, which the
+        // caller would then pass on to the polling request.
+        throw error
+    }
 }
 
 function saveSubtitlesToSrt(subtitles, filename) {
@@ -86,10 +128,8 @@ async function getFullTranscriptionFromGladia(transcriptionId) {
 
         if (response.data.status === 'done') {
             isCompleted = true
-            console.log(JSON.stringify(response.data.result.transcription.subtitles[0].subtitles))
             subtitles = response.data.result.transcription.subtitles[0].subtitles
         } else {
-            console.log('Transcription not ready yet. Waiting...')
             await new Promise((resolve) => setTimeout(resolve, POLLING_INTERVAL))
         }
     }
@@ -101,56 +141,156 @@ async function getFullTranscriptionFromGladia(transcriptionId) {
     throw new Error('Subtitles not found in response')
 }
 
+/**
+ * Asks the OpenAI chat completions API to extract technology keywords from `session.abstract`.
+ * Returns the parsed keyword array, or an empty array when the abstract is missing, the request
+ * fails, or the reply is not a JSON array.
+ */
+async function generateKeywords(session) {
+    if (!session.abstract) {
+        // Nothing to extract from: skip the API call rather than send "undefined" as the abstract.
+        return []
+    }
+
+    const prompt = `Extract 10 technology-related keywords from the following abstract. Keywords shouldn't be french words, but rather technology names or methods. Give me a json list raw:\n\n${session.abstract}`
+    let rawKeywords
+    try {
+        const response = await axios.post(
+            'https://api.openai.com/v1/chat/completions',
+            {
+                model: 'gpt-3.5-turbo',
+                messages: [
+                    { role: 'system', content: 'You are a helpful assistant.' },
+                    { role: 'user', content: prompt },
+                ],
+                max_tokens: 500,
+                temperature: 0,
+            },
+            {
+                headers: {
+                    Authorization: `Bearer ${OPENAI_API_KEY}`,
+                    'Content-Type': 'application/json',
+                },
+            }
+        )
+
+        rawKeywords = response.data?.choices?.[0]?.message?.content
+        if (typeof rawKeywords !== 'string') {
+            throw new Error('No keywords found in response')
+        }
+
+        // Drop Markdown code-fence backticks, then a leading "json" language tag only
+        // (an unanchored replace would also mangle a keyword containing "json").
+        const cleaned = rawKeywords.replaceAll('`', '').trim().replace(/^json\s*/i, '')
+        const keywords = JSON.parse(cleaned)
+        if (!Array.isArray(keywords)) {
+            throw new Error('Keywords response is not a JSON array')
+        }
+        return keywords
+    } catch (error) {
+        console.error(`❌ Error generating keywords for session: ${session.title}`, error.message, rawKeywords)
+        // Always return an array: the caller checks `.length` and forwards the value to Gladia.
+        return []
+    }
+}
+
+/**
+ * Writes `keywords` as pretty-printed JSON to `<outKeywordsDir>/<videoId>.json`,
+ * creating the directory when it does not exist.
+ */
+function saveKeywordsToJson(keywords, videoId, outKeywordsDir = './out_keywords') {
+    fs.mkdirSync(outKeywordsDir, { recursive: true })
+
+    const jsonFilename = path.join(outKeywordsDir, `${videoId}.json`)
+    fs.writeFileSync(jsonFilename, JSON.stringify(keywords, null, 2))
+}
+
+/**
+ * Loads or generates the keywords of one video, then requests a Gladia transcription using
+ * them as custom vocabulary and saves the subtitles to `<outSrtDir>/<videoId>.srt`.
+ * Skips the transcription when the SRT file already exists; transcription errors are logged.
+ */
+const processVideo = async (video, outSrtDir, outKeywordsDir) => {
+    const srtFilename = path.join(outSrtDir, `${video.videoId}.srt`)
+    const jsonFilename = path.join(outKeywordsDir, `${video.videoId}.json`)
+
+    // Check if subtitles and keywords already exist
+    const srtExists = fs.existsSync(srtFilename)
+    const keywordsExist = fs.existsSync(jsonFilename)
+    let customVocabulary = []
+
+    if (keywordsExist) {
+        customVocabulary = JSON.parse(fs.readFileSync(jsonFilename))
+        console.log(`ℹī¸ Keywords JSON file already exists for video ID: ${video.videoId}, using existing keywords.`)
+    } else {
+        const keywords = await generateKeywords(video.session)
+        if (keywords.length > 0) {
+            saveKeywordsToJson(keywords, video.videoId, outKeywordsDir)
+            customVocabulary = keywords
+            console.log(
+                `✅ Generated and saved keywords for session title: ${video.session.title} (ID: ${video.videoId})`
+            )
+        }
+    }
+
+    if (srtExists) {
+        console.log(`ℹī¸ SRT file already exists for video ID: ${video.videoId}, skipping transcription...`)
+        return
+    }
+
+    try {
+        const audioUrl = `https://www.youtube.com/watch?v=${video.videoId}`
+        console.log(`🚀 Initiating transcription for ${video.session.title} (ID: ${video.videoId})`)
+
+        const transcriptionId = await getTranscriptionIdFromGladia(audioUrl, customVocabulary)
+
+        // Guard against any empty ID (undefined, null or '') before polling.
+        if (!transcriptionId) {
+            return
+        }
+        console.log(`🚀 Awaiting transcription results for ${video.session.title} (ID: ${video.videoId})`)
+        const subtitles = await getFullTranscriptionFromGladia(transcriptionId)
+
+        saveSubtitlesToSrt(subtitles, srtFilename)
+        console.log(`✅ Processed and saved SRT for ${video.session.title} (ID: ${video.videoId})`)
+    } catch (error) {
+        console.error(`❌ Failed to process video ID: ${video.videoId}`, error.message)
+    }
+}
+
 const main = async () => {
     const { auth, channelId } = await initYoutube()
 
     const playlistId = 'PLz7aCyCbFOu8_3w6EydaKkjHDiZ9Az1XR'
-    const videoCategoryId = '27' // use await listVideoCategories(auth)
     const openPlannerFileName = 'openplanner.json'
     const openPlannerContent = JSON.parse(fs.readFileSync(openPlannerFileName))
 
-    // Generate thumbnails using https://fill-my-slides.web.app/
-    // return formatFillMySlidesData(openPlannerContent)
-
     const videos = await getVideosLast72Hours(auth, channelId, playlistId)
-
-    console.log('Retrieved videos: ' + videos.length)
+    console.log('ℹī¸ Retrieved videos: ' + videos.length)
 
     const videosWithValidSession = joinYoutubeAndOpenPlannerData(videos, openPlannerContent)
 
-    // Create output directory if it doesn't exist
     const outSrtDir = './out_srt'
     if (!fs.existsSync(outSrtDir)) {
         fs.mkdirSync(outSrtDir)
     }
 
-    // Process each video for transcription
-    for (const video of videosWithValidSession) {
-        const srtFilename = path.join(outSrtDir, `${video.videoId}.srt`)
-
-        // Skip if SRT file already exists
-        if (fs.existsSync(srtFilename)) {
-            console.log(`SRT file already exists for video ID: ${video.videoId}, skipping...`)
-            continue
-        }
-        try {
-            // const audioUrl = `https://www.youtube.com/watch?v=${video.videoId}`
-            const audioUrl = `https://www.youtube.com/watch?v=W1WMcg4dFj0`
-
-            // Request transcription and get the ID
-            const transcriptionId = await getTranscriptionIdFromGladia(audioUrl)
+    const outKeywordsDir = './out_keywords'
+    if (!fs.existsSync(outKeywordsDir)) {
+        fs.mkdirSync(outKeywordsDir)
+    }
 
-            // // Fetch the full transcription using the ID
-            const subtitles = await getFullTranscriptionFromGladia(transcriptionId)
+    const { errors } = await PromisePool.withConcurrency(CONCURRENT_JOBS)
+        .for(videosWithValidSession)
+        .process((video) => processVideo(video, outSrtDir, outKeywordsDir))
 
-            // Save subtitles to an SRT file
-            saveSubtitlesToSrt(subtitles, srtFilename)
-            console.log(`Processed video ID: ${video.videoId}`)
-        } catch (error) {
-            console.error(`Failed to process video ID: ${video.videoId}`, error)
-        }
-        break
-    }
+    // PromisePool collects task rejections instead of throwing; report them so that
+    // failures outside processVideo's own try/catch are not silent.
+    for (const error of errors) {
+        console.error(`❌ Unhandled error while processing a video: ${error.message}`)
+    }
+
+    console.log('🏁 Completed all video processing.')
 }
 
 main()