feat: add youtube subtitles script

Signed-off-by: AlexandreBrg <burgoni@pm.me>
HugoGresse · Jul 25, 2024 · 2980e8a · 2980e8a
1 parent e91d5bf
commit 2980e8a
Show file tree

Hide file tree

Showing 2 changed files with 147 additions and 55 deletions.
diff --git a/scripts/.gitignore b/scripts/.gitignore
@@ -2,3 +2,4 @@ client_secret.json
 openplanner.json
 miniature/
 out_srt/
+out_keywords
diff --git a/scripts/youtubeSubtitleEdit.js b/scripts/youtubeSubtitleEdit.js
@@ -2,40 +2,56 @@ import { getVideosLast72Hours, initYoutube, updateVideo, updateVideoThumbnail }
 import fs from 'fs'
 import axios from 'axios'
 import path from 'path'
+import { PromisePool } from '@supercharge/promise-pool'
 
-const GLADIA_API_KEY = '25f0e29c-1c88-4533-bca7-b096c074dd74'
-const GLADIA_TRANSCRIPTION_ENDPOINT = 'https://api.gladia.io/v2/transcription'
+// Key files live under the user's home directory. `HOME` is unset on Windows, where the profile
+// directory is exposed as `USERPROFILE`. With neither set the paths resolve against the working
+// directory, so getApiKey reports the missing file instead of path.resolve throwing on undefined.
+const HOME_DIR = process.env.HOME ?? process.env.USERPROFILE ?? ''
+const GLADIA_KEY_PATH = path.resolve(HOME_DIR, '.credentials', 'gladia_api.key')
+const OPENAI_KEY_PATH = path.resolve(HOME_DIR, '.credentials', 'openai_api.key')
 const POLLING_INTERVAL = 5000 // 5 seconds
+const CONCURRENT_JOBS = 10
+
+/**
+ * Reads an API key from a text file and returns it with surrounding whitespace removed.
+ * When the file does not exist, prints where it was expected and exits the process with code 1.
+ *
+ * @param {string} filePath Location of the key file.
+ * @returns {string} Trimmed file contents.
+ */
+const getApiKey = (filePath) => {
+    const keyFileMissing = !fs.existsSync(filePath)
+    if (keyFileMissing) {
+        console.error(`❌ Error: API key file not found at: ${filePath}`)
+        console.log('🔄 Please create a file at the above location with your API key.')
+        process.exit(1)
+    }
+
+    const rawKey = fs.readFileSync(filePath, 'utf-8')
+    return rawKey.trim()
+}
+
+// Both keys are read once, when the module is loaded; getApiKey exits the process if a key file is missing.
+const GLADIA_API_KEY = getApiKey(GLADIA_KEY_PATH)
+const OPENAI_API_KEY = getApiKey(OPENAI_KEY_PATH)
+// URL that transcription requests are POSTed to.
+const GLADIA_TRANSCRIPTION_ENDPOINT = 'https://api.gladia.io/v2/transcription'
 
 const joinYoutubeAndOpenPlannerData = (youtubeVideos, openPlannerData) => {
- // Check if all videos exist in OpenPlanner
  const videosWithOpenPlannerData = youtubeVideos.map((video) => {
  const videoTitle = video.snippet.title
 
- // find session details in openplanner.json
  const session = openPlannerData.sessions.find(
  (session) => videoTitle.includes(session.title) || session.title.includes(videoTitle)
  )
 
  return {
- // ...video,
  videoId: video.contentDetails.videoId,
  publishedAt: video.contentDetails.videoPublishedAt,
  session,
  }
  })
+
  const videosWithValidSession = videosWithOpenPlannerData.filter((video) => video.session)
 
- console.log('Matching videos: ' + videosWithValidSession.length)
+ console.log(`ℹ️ Matching videos: ${videosWithValidSession.length}`)
  console.log(
- 'Non matching video title or no speakers: ' +
- videosWithOpenPlannerData.filter((video) => !video.session).map((video) => video.snippet.title)
+ `ℹ️ Non matching video title or no speakers: ${videosWithOpenPlannerData
+ .filter((video) => !video.session)
+ .map((video) => video.snippet.title)
+ .join(', ')}`
  )
 
  return videosWithValidSession
 }
 
-async function getTranscriptionIdFromGladia(audioUrl) {
+async function getTranscriptionIdFromGladia(audioUrl, customVocabulary) {
  const headers = {
  'Content-Type': 'application/json',
  'x-gladia-key': GLADIA_API_KEY,
@@ -47,22 +63,28 @@ async function getTranscriptionIdFromGladia(audioUrl) {
  subtitles_config: {
  formats: ['srt'],
  },
+ custom_vocabulary: customVocabulary,
  }
 
- const response = await axios.post(GLADIA_TRANSCRIPTION_ENDPOINT, payload, { headers })
+ let response = {}
+ try {
+ response = await axios.post(GLADIA_TRANSCRIPTION_ENDPOINT, payload, { headers })
 
- if (response.status !== 201) {
- console.log(response.data)
- console.log(response.status)
- throw new Error('Failed to initiate transcription')
- }
+  if (response.status !== 201) {
+  console.error(`❌ Failed to initiate transcription for URL: ${audioUrl}`)
+  console.error(response.data)
+  throw new Error('Failed to initiate transcription')
+  }
 
- const transcriptionId = response.data.id
- if (!transcriptionId) {
- throw new Error('Transcription ID not found in response')
- }
+  const transcriptionId = response.data.id
+  if (!transcriptionId) {
+  throw new Error('Transcription ID not found in response')
+  }
 
- return transcriptionId
+ return transcriptionId
+ } catch (error) {
+ console.error(`❌ Failed to initiate transcription for URL: ${audioUrl}: `, payload, error)
+ }
 }
 
 function saveSubtitlesToSrt(subtitles, filename) {
@@ -86,10 +108,8 @@ async function getFullTranscriptionFromGladia(transcriptionId) {
 
  if (response.data.status === 'done') {
  isCompleted = true
- console.log(JSON.stringify(response.data.result.transcription.subtitles[0].subtitles))
  subtitles = response.data.result.transcription.subtitles[0].subtitles
  } else {
- console.log('Transcription not ready yet. Waiting...')
  await new Promise((resolve) => setTimeout(resolve, POLLING_INTERVAL))
  }
  }
@@ -101,56 +121,127 @@ async function getFullTranscriptionFromGladia(transcriptionId) {
  throw new Error('Subtitles not found in response')
 }
 
+/**
+ * Asks the OpenAI chat completions API for technology keywords describing a session abstract.
+ *
+ * @param {Object} session Session whose `abstract` is embedded in the prompt; `title` is only
+ * used in the error log.
+ * @returns {Promise<Array>} The JSON array parsed from the model's reply, or an empty array when
+ * the request fails, the reply has no choices, or the reply does not parse to a JSON array.
+ */
+async function generateKeywords(session) {
+    const prompt = `Extract 10 technology-related keywords from the following abstract. Keywords shouldn't be french words, but rather technology names or methods. Give me a json list raw:\n\n${session.abstract}`
+    // Declared outside the try block so the catch handler can log the reply that failed to parse.
+    let rawKeywords = ''
+    try {
+        const response = await axios.post(
+            'https://api.openai.com/v1/chat/completions',
+            {
+                model: 'gpt-3.5-turbo',
+                messages: [
+                    { role: 'system', content: 'You are a helpful assistant.' },
+                    { role: 'user', content: prompt },
+                ],
+                max_tokens: 500,
+                temperature: 0,
+            },
+            {
+                headers: {
+                    Authorization: `Bearer ${OPENAI_API_KEY}`,
+                    'Content-Type': 'application/json',
+                },
+            }
+        )
+
+        if (!response.data || !response.data.choices || response.data.choices.length === 0) {
+            throw new Error('No keywords found in response')
+        }
+
+        rawKeywords = response.data.choices[0].message.content
+        // Remove every backtick and the first "json" before parsing (presumably a Markdown
+        // code fence wrapped around the reply).
+        const keywords = JSON.parse(rawKeywords.replaceAll('`', '').replace('json', ''))
+        if (!Array.isArray(keywords)) {
+            throw new Error('Keywords response is not a JSON array')
+        }
+        return keywords
+    } catch (error) {
+        console.error(`❌ Error generating keywords for session: ${session.title}`, error.message, rawKeywords)
+        // The caller tests `.length > 0`; a reply string returned from here would pass that test
+        // and be saved and sent as the vocabulary, so every failure yields an empty list.
+        return []
+    }
+}
+
+/**
+ * Writes `keywords` as two-space indented JSON to `./out_keywords/<videoId>.json`,
+ * creating the `./out_keywords` directory first when it does not exist.
+ *
+ * @param {*} keywords Value to serialise.
+ * @param {string} videoId Base name of the output file.
+ */
+function saveKeywordsToJson(keywords, videoId) {
+    const targetDir = './out_keywords'
+    const targetDirMissing = !fs.existsSync(targetDir)
+    if (targetDirMissing) {
+        fs.mkdirSync(targetDir)
+    }
+
+    const targetFile = path.join(targetDir, `${videoId}.json`)
+    const serialized = JSON.stringify(keywords, null, 2)
+    fs.writeFileSync(targetFile, serialized)
+}
+
+/**
+ * Resolves the custom vocabulary for one video: the cached keyword file when it holds a JSON
+ * array, otherwise freshly generated keywords (saved for the next run), otherwise an empty list.
+ */
+const resolveCustomVocabulary = async (video, jsonFilename) => {
+    if (fs.existsSync(jsonFilename)) {
+        try {
+            const cachedKeywords = JSON.parse(fs.readFileSync(jsonFilename, 'utf-8'))
+            if (Array.isArray(cachedKeywords)) {
+                console.log(`ℹ️ Keywords JSON file already exists for video ID: ${video.videoId}, using existing keywords.`)
+                return cachedKeywords
+            }
+            console.error(`❌ Keywords JSON file for video ID: ${video.videoId} is not a list, regenerating keywords.`)
+        } catch (error) {
+            // An unreadable or corrupt cache file is treated as absent rather than failing the video.
+            console.error(
+                `❌ Could not read keywords JSON file for video ID: ${video.videoId}, regenerating keywords.`,
+                error.message
+            )
+        }
+    }
+
+    const keywords = await generateKeywords(video.session)
+    if (!Array.isArray(keywords) || keywords.length === 0) {
+        return []
+    }
+
+    // NOTE(review): saveKeywordsToJson always writes under './out_keywords'; it lines up with
+    // jsonFilename only while the caller passes that same directory as outKeywordsDir.
+    saveKeywordsToJson(keywords, video.videoId)
+    console.log(`✅ Generated and saved keywords for session title: ${video.session.title} (ID: ${video.videoId})`)
+    return keywords
+}
+
+/**
+ * Produces the keyword file and the SRT subtitle file for one video.
+ *
+ * Keywords are resolved first, even when the SRT already exists, so the keyword file is created
+ * either way. Transcription is skipped when `<outSrtDir>/<videoId>.srt` is already present.
+ * Every failure is logged and swallowed, so the returned promise does not reject for a single
+ * video's error.
+ *
+ * @param {Object} video Joined video; `videoId`, `session` and `session.title` are read.
+ * @param {string} outSrtDir Directory holding the `.srt` files.
+ * @param {string} outKeywordsDir Directory holding the cached keyword `.json` files.
+ * @returns {Promise<void>}
+ */
+const processVideo = async (video, outSrtDir, outKeywordsDir) => {
+    const srtFilename = path.join(outSrtDir, `${video.videoId}.srt`)
+    const jsonFilename = path.join(outKeywordsDir, `${video.videoId}.json`)
+
+    try {
+        const customVocabulary = await resolveCustomVocabulary(video, jsonFilename)
+
+        if (fs.existsSync(srtFilename)) {
+            console.log(`ℹ️ SRT file already exists for video ID: ${video.videoId}, skipping transcription...`)
+            return
+        }
+
+        const audioUrl = `https://www.youtube.com/watch?v=${video.videoId}`
+        console.log(`🚀 Initiating transcription for ${video.session.title} (ID: ${video.videoId})`)
+
+        const transcriptionId = await getTranscriptionIdFromGladia(audioUrl, customVocabulary)
+
+        // getTranscriptionIdFromGladia logs its own failure and then resolves with undefined,
+        // which a comparison against '' does not catch; any missing ID stops the video here.
+        if (!transcriptionId) {
+            return
+        }
+
+        console.log(`🚀 Awaiting transcription results for ${video.session.title} (ID: ${video.videoId})`)
+        const subtitles = await getFullTranscriptionFromGladia(transcriptionId)
+
+        saveSubtitlesToSrt(subtitles, srtFilename)
+        console.log(`✅ Processed and saved SRT for ${video.session.title} (ID: ${video.videoId})`)
+    } catch (error) {
+        console.error(`❌ Failed to process video ID: ${video.videoId}`, error.message)
+    }
+}
+
 const main = async () => {
  const { auth, channelId } = await initYoutube()
-
  const playlistId = 'PLz7aCyCbFOu8_3w6EydaKkjHDiZ9Az1XR'
- const videoCategoryId = '27' // use await listVideoCategories(auth)
  const openPlannerFileName = 'openplanner.json'
  const openPlannerContent = JSON.parse(fs.readFileSync(openPlannerFileName))
 
- // Generate thumbnails using https://fill-my-slides.web.app/
- // return formatFillMySlidesData(openPlannerContent)
-
  const videos = await getVideosLast72Hours(auth, channelId, playlistId)
-
- console.log('Retrieved videos: ' + videos.length)
+ console.log('ℹ️ Retrieved videos: ' + videos.length)
 
  const videosWithValidSession = joinYoutubeAndOpenPlannerData(videos, openPlannerContent)
 
- // Create output directory if it doesn't exist
  const outSrtDir = './out_srt'
  if (!fs.existsSync(outSrtDir)) {
  fs.mkdirSync(outSrtDir)
  }
 
- // Process each video for transcription
- for (const video of videosWithValidSession) {
- const srtFilename = path.join(outSrtDir, `${video.videoId}.srt`)
-
- // Skip if SRT file already exists
- if (fs.existsSync(srtFilename)) {
- console.log(`SRT file already exists for video ID: ${video.videoId}, skipping...`)
- continue
- }
- try {
- // const audioUrl = `https://www.youtube.com/watch?v=${video.videoId}`
- const audioUrl = `https://www.youtube.com/watch?v=W1WMcg4dFj0`
-
- // Request transcription and get the ID
- const transcriptionId = await getTranscriptionIdFromGladia(audioUrl)
+ const outKeywordsDir = './out_keywords'
+ if (!fs.existsSync(outKeywordsDir)) {
+ fs.mkdirSync(outKeywordsDir)
+ }
 
- // // Fetch the full transcription using the ID
- const subtitles = await getFullTranscriptionFromGladia(transcriptionId)
+ await PromisePool.withConcurrency(CONCURRENT_JOBS)
+ .for(videosWithValidSession)
+ .process(async (video, index, pool) => {
+ await processVideo(video, outSrtDir, outKeywordsDir)
+ })
 
- // Save subtitles to an SRT file
- saveSubtitlesToSrt(subtitles, srtFilename)
- console.log(`Processed video ID: ${video.videoId}`)
- } catch (error) {
- console.error(`Failed to process video ID: ${video.videoId}`, error)
- }
- break
- }
+ console.log('🏁 Completed all video processing.')
 }
 
 main()