-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvideos-without-comments.py
63 lines (51 loc) · 1.83 KB
/
videos-without-comments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
This script finds all videos without comments
and writes their IDs to the file video_ids_without_comments.txt
"""
import os
from dotenv import load_dotenv
from elasticsearch import Elasticsearch, helpers
import logging
import dateparser
# Pull settings from a local .env file (if any) into the process environment
# before the connection parameters below are read.
load_dotenv()

# Lower the threshold of the Elasticsearch client's logger to DEBUG.
es_logger = logging.getLogger("elasticsearch")
es_logger.setLevel(logging.DEBUG)

# Connection parameters come from ES_HOST / ES_USER / ES_PASSWORD.
es_host = os.getenv("ES_HOST")
es_credentials = (os.getenv("ES_USER"), os.getenv("ES_PASSWORD"))

# ignore_status=404: a 404 from the cluster is returned as a normal response
# body instead of raising an exception.
es = Elasticsearch([es_host], basic_auth=es_credentials).options(
    ignore_status=404
)
# Clauses a document must NOT match. A document is dropped as soon as it
# matches any one of them.
# NOTE(review): the range clause can only match documents that have a
# comment_count field, and those are already dropped by the exists clause, so
# documents with comment_count == 0 are excluded too -- confirm that is the
# intended meaning of "without comments".
_must_not_clauses = [
    {"exists": {"field": "comment_count"}},
    {"range": {"comment_count": {"gt": 0}}},
]

# Clauses a document MUST match. The commented-out entries are optional
# filters that can be enabled by hand.
_must_clauses = [
    {"term": {"vid_type": "videos"}},
    # {
    #     "range": {
    #         "vid_last_refresh": {
    #             "gte": str(int(dateparser.parse("2024-01-01").timestamp())),
    #             "lte": str(int(dateparser.parse("2024-12-31").timestamp())),
    #         }
    #     }
    # },
    # {"range": {"stats.view_count": {"gt": 1000}}},
]

# Search body: request up to 10000 hits per response and return only the
# listed _source fields for each hit.
es_query = {
    "size": 10000,
    "query": {
        "bool": {
            "must_not": _must_not_clauses,
            "must": _must_clauses,
        }
    },
    "_source": ["youtube_id", "published", "vid_type", "title"],
}
# Run the query against the ta_video index with a scroll context that is kept
# alive for 5 minutes between page requests, and collect every hit in
# `results`.
#
# The client is configured to ignore 404s, so a missing index (or an expired
# scroll context) yields an error body without "_scroll_id" / "hits" rather
# than an exception; such a response is treated as "no more hits".
response = es.search(index="ta_video", body=es_query, scroll="5m")
scroll_id = response.get("_scroll_id")
results = []
try:
    page = response.get("hits", {}).get("hits", [])
    while page:
        results.extend(page)
        response = es.scroll(scroll_id=scroll_id, scroll="5m")
        # The scroll id may change between pages; always use the latest one.
        scroll_id = response.get("_scroll_id", scroll_id)
        page = response.get("hits", {}).get("hits", [])
finally:
    # Release the server-side scroll context instead of letting it linger
    # until the 5 minute keep-alive expires.
    if scroll_id:
        es.clear_scroll(scroll_id=scroll_id)
# Collect the YouTube ID stored in each hit's _source.
video_ids = []
for hit in results:
    video_ids.append(hit["_source"]["youtube_id"])

# Write one ID per line, replacing any previous contents of the output file.
with open("video_ids_without_comments.txt", "w") as file:
    file.writelines(f"{video_id}\n" for video_id in video_ids)

# Report what was written.
print(f"Total videos without comments: {len(video_ids)}")
print("Video IDs have been written to video_ids_without_comments.txt")