-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport-comments.py
164 lines (128 loc) · 4.81 KB
/
import-comments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
This is a script to import comments or generate comments for testing purposes.
Usage:
python import-comments.py <input_file>
input_file: metadata.info.json file generated by yt-dlp containing comments.
"""
import os
import sys
import re
import json
from datetime import datetime
from dotenv import load_dotenv
from elasticsearch import Elasticsearch, helpers
import logging
# Set the "elasticsearch" logger's threshold to DEBUG.
es_logger = logging.getLogger("elasticsearch")
es_logger.setLevel(logging.DEBUG)
# Must run before the os.getenv() calls below; presumably loads ES_HOST,
# ES_USER and ES_PASSWORD from a local .env file into the environment.
load_dotenv()
# Shared client used by main(). ignore_status=404 is set so that a 404
# response (e.g. deleting a document that does not exist) is presumably
# tolerated instead of raised -- confirm against the client documentation.
es = Elasticsearch(
[os.getenv("ES_HOST")], basic_auth=(os.getenv("ES_USER"), os.getenv("ES_PASSWORD"))
).options(ignore_status=404)
def format_comments(comments_raw):
    """Clean every raw comment and keep only the usable ones.

    Each entry of ``comments_raw`` is passed through ``clean_comment``;
    entries for which it returns a falsy value are left out. A falsy
    ``comments_raw`` (``None``, empty list) yields an empty list.
    """
    # `or ()` mirrors the original guard: nothing is iterated when the
    # input is falsy.
    cleaned = (clean_comment(raw) for raw in comments_raw or ())
    return [entry for entry in cleaned if entry]
def clean_comment(comment):
    """Parse metadata from a raw comment for indexing.

    Args:
        comment: one raw comment dict. "id", "timestamp", "author_id",
            "author_thumbnail" and "parent" are read directly; "text",
            "author", "like_count", "is_favorited" and "author_is_uploader"
            are read with fallbacks. The dict is not modified.

    Returns:
        A dict of ``comment_*`` fields, or ``False`` when the comment has
        no (or empty) text.

    Raises:
        KeyError: if one of the directly-read keys is missing.
    """
    # Local import: the module itself only imports `datetime` from datetime.
    from datetime import timezone

    if not comment.get("text"):
        return False

    # Go through a timezone-aware UTC datetime. A naive datetime from
    # utcfromtimestamp() is interpreted as *local* time by .timestamp(),
    # which shifts the value by the machine's UTC offset.
    time_text_datetime = datetime.fromtimestamp(
        comment["timestamp"], tz=timezone.utc
    )
    time_text = time_text_datetime.timestamp()

    # Fall back to the author id (or "Unknown") without mutating the
    # caller's dict.
    author = comment.get("author") or comment.get("author_id", "Unknown")

    return {
        "comment_id": comment["id"],
        "comment_text": comment["text"].replace("\xa0", ""),
        "comment_timestamp": comment["timestamp"],
        "comment_time_text": time_text,
        "comment_likecount": comment.get("like_count", None),
        "comment_is_favorited": comment.get("is_favorited", False),
        "comment_author": author,
        "comment_author_id": comment["author_id"],
        "comment_author_thumbnail": comment["author_thumbnail"],
        "comment_author_is_uploader": comment.get("author_is_uploader", False),
        "comment_parent": comment["parent"],
    }
def extract_video_id(filename):
    """Return the 11-character video ID found in square brackets.

    The final extension is stripped from ``filename`` first; the first
    ``[...]`` group of exactly 11 characters from ``[a-zA-Z0-9_-]`` in the
    remainder is returned. Returns ``None`` when there is no such group.
    """
    stem = os.path.splitext(filename)[0]
    match = re.search(r"\[([a-zA-Z0-9_-]{11})\]", stem)
    return match.group(1) if match else None
def main():
    """Replace the indexed comments of one video with those from a file.

    Reads the file named by the first command-line argument, takes the
    video ID from the bracketed part of the file name and the channel ID
    and comments from the JSON content, asks for confirmation, deletes the
    existing ``ta_comment`` document for that video and bulk-indexes a new
    one.

    Exits with status 1 when the argument is missing, when no video ID or
    channel ID can be determined, or when the user does not confirm.

    Raises:
        KeyError: if the JSON has no "comments" key.
    """
    if len(sys.argv) < 2:
        # The argument is a single info.json file, not a directory.
        print(f"Usage: python {os.path.basename(__file__)} <input_file>")
        sys.exit(1)

    filename = sys.argv[1]
    youtube_id = extract_video_id(filename)
    if not youtube_id:
        print(f"Could not extract video ID from {filename}")
        sys.exit(1)

    with open(filename, "r", encoding="utf-8") as f:
        comments_json = json.load(f)

    channel_id = comments_json.get("channel_id")
    if not channel_id:
        print(f"Could not extract channel ID from {filename}. Bad JSON?")
        sys.exit(1)

    comments_data = comments_json["comments"]
    formatted_comments = format_comments(comments_data)

    # uncomment me for testing
    # for i in range(100000):
    #     formatted_comments.append(
    #         {
    #             "comment_id": f"comment_id_{i}",
    #             "comment_text": f"comment_text_{i}",
    #             "comment_timestamp": i,
    #             "comment_time_text": i,
    #             "comment_likecount": i,
    #             "comment_is_favorited": False,
    #             "comment_author": f"comment_author_{i}",
    #             "comment_author_id": f"comment_author_id_{i}",
    #             "comment_author_thumbnail": f"comment_author_thumbnail_{i}",
    #             "comment_author_is_uploader": False,
    #             "comment_parent": f"comment_parent_{i}",
    #         }
    #     )

    print(
        f"Importing comments will delete all comments for video {youtube_id} and replace them with the new comments."
    )
    confirm = input("Continue? (y/n): ")
    if confirm.lower() != "y":
        print("Aborting")
        sys.exit(1)

    # delete the comments
    es.delete(index="ta_comment", id=youtube_id, refresh=True)
    # f-string so the actual video ID is printed, not the literal placeholder.
    print(f"Deleted comments for video {youtube_id}")

    # Prepare bulk insert: all comments of a video live in one document
    # keyed by the video ID.
    actions = [
        {
            "_index": "ta_comment",
            "_id": youtube_id,
            "_source": {
                "youtube_id": youtube_id,
                "comment_last_refresh": int(datetime.now().timestamp()),
                "comment_channel_id": channel_id,
                "comment_comments": formatted_comments,
            },
        }
    ]

    print("Importing comments now")
    # Perform bulk insert of new comments
    success, errors = helpers.bulk(es, actions)
    if success == 1:
        print(f"Successfully imported comments for video {youtube_id}")
    if errors:
        print("Errors encountered during import!")
        print(errors)


if __name__ == "__main__":
    main()