-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector_gen.py
76 lines (62 loc) · 2.64 KB
/
vector_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import numpy as np
from gensim.downloader import load
import json
def create_minimal_vectors(*, word_model=None, model_name='glove-twitter-25',
                           output_path='word_vectors_mini.json'):
    """Write a small JSON file of word vectors for a fixed topical vocabulary.

    A hand-picked set of category seed words (entertainment, technology,
    education, gaming, sports) plus some YouTube-specific words is looked up
    in a word-vector model. Every word the model contains is written to
    ``output_path`` as ``{word: [float, ...]}``, with each component rounded
    to 6 decimal places. Words missing from the model are skipped.

    Args:
        word_model: Optional mapping-like object supporting ``word in model``
            and ``model[word]`` (returning an iterable of numbers). When
            ``None``, the model is fetched with ``load(model_name)``.
        model_name: Name passed to ``load`` when ``word_model`` is ``None``.
        output_path: Destination of the JSON file.

    Returns:
        None. The result is the file written to ``output_path``; progress and
        the final file size are printed to stdout.
    """
    # Kept local so the function carries its own dependency on ``os``.
    import os

    if word_model is None:
        print("Loading word vectors...")
        word_model = load(model_name)

    categories = {
        'entertainment': [
            'movie', 'film', 'music', 'game', 'play', 'fun', 'dance', 'sing',
            'concert', 'theater', 'show', 'series', 'comedy', 'drama', 'art',
            'entertainment', 'performance', 'actor', 'actress', 'celebrity',
        ],
        'technology': [
            'tech', 'computer', 'software', 'programming', 'code', 'developer',
            'digital', 'internet', 'app', 'gadget', 'hardware', 'AI', 'data',
            'robot', 'smart', 'device', 'innovation', 'engineering', 'science',
        ],
        'education': [
            'learn', 'study', 'teach', 'school', 'university', 'college',
            'education', 'course', 'tutorial', 'guide', 'lesson', 'lecture',
            'professor', 'student', 'academic', 'research', 'knowledge',
        ],
        'gaming': [
            'game', 'gaming', 'playthrough', 'walkthrough', 'stream', 'console',
            'player', 'minecraft', 'fortnite', 'gameplay', 'gamer', 'esports',
            'nintendo', 'xbox', 'playstation', 'multiplayer', 'rpg',
        ],
        'sports': [
            'sports', 'football', 'basketball', 'soccer', 'baseball', 'tennis',
            'game', 'match', 'player', 'team', 'score', 'win', 'championship',
            'league', 'athlete', 'fitness', 'workout', 'exercise',
        ],
    }

    # Common YouTube-specific words, added on top of the category seeds.
    youtube_words = [
        'video', 'channel', 'subscribe', 'like', 'comment', 'watch',
        'youtube', 'live', 'stream', 'vlog', 'review', 'tutorial',
        'reaction', 'compilation', 'viral', 'trending',
    ]

    # Lower-case every seed before lookup: all seeds but 'AI' are already
    # lower-case, and the Twitter GloVe vocabulary is presumably lower-cased
    # too (TODO confirm), in which case 'AI' would be silently skipped.
    # The set also removes words shared between categories.
    relevant_words = {
        word.lower()
        for category_words in categories.values()
        for word in category_words
    }
    relevant_words.update(word.lower() for word in youtube_words)

    # Iterate in sorted order so the JSON key order is identical on every run;
    # plain set iteration order varies with string hash randomisation.
    # Rounding to 6 decimal places keeps the output file small.
    vectors = {
        word: [round(float(x), 6) for x in word_model[word]]
        for word in sorted(relevant_words)
        if word in word_model
    }
    print(f"Created vectors for {len(vectors)} words")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(vectors, f)
    print(f"Word vectors saved to {output_path}")

    size_kb = os.path.getsize(output_path) / 1024
    print(f"File size: {size_kb:.2f} KB")
# Script entry point: build the vector file only when this module is executed
# directly, not when it is imported.
if __name__ == '__main__':
    create_minimal_vectors()