-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinstagram_crawler.py
124 lines (104 loc) · 3.8 KB
/
instagram_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import instaloader
import json
import boto3
from itertools import takewhile, dropwhile
from datetime import datetime, timedelta
# Target S3 bucket for all crawled Instagram data (profile + post pages).
bucket_name = "yourssu-community-instagram"
def save_posts_to_s3(posts_json, bucket_name, object_key):
    """Upload a JSON string to S3.

    Args:
        posts_json: JSON-encoded text (str); stored UTF-8.
        bucket_name: destination S3 bucket name.
        object_key: key (path) of the object inside the bucket.
    """
    # TODO: no AWS profile needs to be applied here (uses the default session).
    s3_client = boto3.Session().client('s3')
    s3_client.put_object(
        Bucket=bucket_name,
        Key=object_key,
        Body=posts_json.encode('utf-8'),
        ContentType='application/json',
    )
def list_instagram_posts_by_username(username, url):
    """Crawl an Instagram profile and its recent posts and upload them to S3.

    Fetches the profile metadata and every post newer than the UNTIL cutoff,
    serializes them to JSON, and uploads one profile object plus one object
    per page of up to 10 posts under ``instagram/{username}/...``.

    Args:
        username: Instagram username to crawl.
        url: external URL stored alongside the profile metadata.

    Returns:
        The number of post pages uploaded.
    """
    loader = instaloader.Instaloader()
    # NOTE(review): hard-coded login credentials were previously committed here
    # in commented-out code. They have been removed; if authenticated crawling
    # is needed, load credentials from the environment or a secrets manager.

    profile = instaloader.Profile.from_username(loader.context, username)
    posts = profile.get_posts()

    profile_info = {
        "userId": profile.userid,
        "username": profile.username,
        "externalUrl": profile.external_url,
        "followees": profile.followees,
        "followers": profile.followers,
        "url": url,
    }

    # Date window: posts iterate newest-first. SINCE is far in the future so
    # dropwhile skips nothing; takewhile keeps everything newer than UNTIL.
    SINCE = datetime(2099, 12, 31)
    UNTIL = datetime(2022, 12, 23)

    instagram_posts = []
    for post in takewhile(lambda p: p.date > UNTIL,
                          dropwhile(lambda p: p.date > SINCE, posts)):
        is_video = post.is_video
        if is_video:
            media_urls = [post.video_url]
        else:
            # Carousel posts expose each image as a sidecar node; a
            # single-image post has none, so fall back to the post's own URL.
            media_urls = [pic.display_url for pic in post.get_sidecar_nodes()]
            if not media_urls:
                media_urls = [post.url]
        instagram_posts.append({
            "postId": post.mediaid,
            "caption": post.caption,
            "location": post.location,
            "mediaUrls": media_urls,
            "isVideo": is_video,
            # date_local may carry a "+HH:MM" UTC-offset suffix; keep only the
            # naive timestamp part.
            "postedAt": str(post.date_local).split('+')[0],
            "commentCount": post.comments,
            "likeCount": post.likes,
        })

    # Paginate into chunks of 10. BUG FIX: the original used
    # len(instagram_posts) // 10 full slices only, silently dropping the
    # trailing partial page (e.g. posts 21-25 of 25 were never uploaded).
    pages = [instagram_posts[i:i + 10]
             for i in range(0, len(instagram_posts), 10)]
    if not pages:
        # Preserve the original behavior of emitting one (empty) page when
        # there are no posts in the date window.
        pages.append(instagram_posts)

    profile_json = json.dumps({"profile": profile_info},
                              ensure_ascii=False, indent=4)
    posts_json = [json.dumps({"posts": page}, ensure_ascii=False, indent=4)
                  for page in pages]

    # BUG FIX: the original `return` came BEFORE the upload code, so nothing
    # ever reached S3; and the unreachable loop indexed
    # posts_json[len(posts_json)], which would raise IndexError.
    object_key = f"instagram/{username}/profile/profile.json"
    save_posts_to_s3(profile_json, bucket_name, object_key)
    for page_no, page_json in enumerate(posts_json, start=1):
        save_posts_to_s3(page_json, bucket_name,
                         f"instagram/{username}/post/posts{page_no}.json")

    return len(pages)