main.py
"""Scrapes Sprinklr photos from Facebook, Twitter, and Instagram
TODO: facebook scraper doesnt scrape the correct image
"""
import logging
import csv
import argparse
from pathlib import Path
from urllib.request import urlopen, urlretrieve
import pandas as pd
from bs4 import BeautifulSoup
import instaloader
from tqdm import tqdm
def scrape_facebook_photo(url, photo_folder):
    """Scrapes photos from Facebook (not implemented yet, see module TODO)."""
    pass
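

# A minimal, untested sketch of one possible Facebook scraper, kept separate
# from the stub above and not wired into SO_ME_FUNCS. It assumes the post page
# is publicly readable and exposes its image through an `og:image` meta tag;
# the function name is hypothetical, not the original implementation.
def scrape_facebook_photo_sketch(post_url, photo_folder):
    """Hypothetical example: grab the og:image of a public Facebook post."""
    try:
        post_html = urlopen(post_url)
        soup = BeautifulSoup(post_html, 'lxml')
        # The Open Graph tag usually points at the post's preview image
        img_url = soup.find('meta', {'property': 'og:image'})['content']
        # Slashes aren't allowed in filenames, so we change / to -
        filename = post_url.split('.com/')[1].replace('/', '-') + '.jpg'
        urlretrieve(img_url, Path(photo_folder) / filename)
    except Exception:
        logging.exception(f'{post_url} was not scraped from Facebook')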
def scrape_twitter_photo(tweet_url, tweet_photo_folder):
    """Scrapes photos from Twitter with bs4"""
    try:
        # Open post and make soup
        tweet_html = urlopen(tweet_url)
        soup = BeautifulSoup(tweet_html, 'lxml')
        # Find image using platform specific tag
        img = soup.find(
            "div", {"class": 'AdaptiveMedia-photoContainer js-adaptive-photo'})
        # Get image url using platform specific attribute
        img_url = img.get('data-image-url')
        # Slashes aren't allowed in filenames, so we change / to -
        filename = tweet_url.split('.com/')[1].replace('/', '-') + '.jpg'
        # Append to a log of renamed files (append mode so earlier rows survive)
        with open('twitter_name_log.csv', 'a', newline='') as log_file:
            writer = csv.writer(log_file)
            writer.writerow([img_url, filename])
        # Scrape photo
        filepath = Path(tweet_photo_folder) / filename
        urlretrieve(img_url, filepath)
    except Exception:
        logging.exception(f'{tweet_url} was not scraped from Twitter')
def scrape_instagram_photo(instagram_post_url, instagram_photo_folder):
    """Scrapes photos from Instagram with instaloader"""
    try:
        # Shortcode is the second-to-last part of the instagram post url
        shortcode = instagram_post_url.split('/')[-2]
        post = instaloader.Post.from_shortcode(L.context, shortcode)
        L.download_post(post, target=Path(instagram_photo_folder))
    except Exception:
        logging.exception(
            f'{instagram_post_url} was not scraped from Instagram')
# Set up argparse
PARSER = argparse.ArgumentParser(description='Scrape IBN data.')
PARSER.add_argument('datafile', help='Path to your data file')
ARGS = PARSER.parse_args()
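# Example invocation (the CSV name below is illustrative; any Sprinklr export
# with 'SocialNetwork' and 'Permalink' columns should work):
#
#     python main.py sprinklr_export.csv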
# Read data
DATA = pd.read_csv(ARGS.datafile, low_memory=False)
# Set up logging
logging.basicConfig(filename='app.log', filemode='w',
format='%(name)s - %(levelname)s - %(message)s')
# Initialize Instaloader
L = instaloader.Instaloader()
# Social media platforms and their associated scraping functions
SO_ME_FUNCS = {'FACEBOOK': scrape_facebook_photo,
'TWITTER': scrape_twitter_photo,
'INSTAGRAM': scrape_instagram_photo}
for so_me, scrape_func in SO_ME_FUNCS.items():
    print(f'Scraping {so_me}...')
    # Get the post urls for this SoMe
    links = DATA[DATA['SocialNetwork'] == so_me]['Permalink']
    # Make a folder to put photos for this SoMe
    photo_folder = Path('photos') / f'{so_me}_imgs'
    photo_folder.mkdir(parents=True, exist_ok=True)
    # Scrape photos
    for url in tqdm(links):
        scrape_func(url, photo_folder)