combined_crawler.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import ssl
import chromedriver_autoinstaller
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import os
import json
import time
# Default request headers for requests/BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
# List page URLs
BASE_URL = 'https://www.albamon.com/alba-talk/experience'
TARGET_PAGE_URL = "https://www.albamon.com/alba-talk/experience?pageIndex={page}&searchKeyword=&sortType=CREATED_DATE"
TARGET_PAGE = 1330
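# Example list-page URL produced by TARGET_PAGE_URL.format(page=1):
#   https://www.albamon.com/alba-talk/experience?pageIndex=1&searchKeyword=&sortType=CREATED_DATE
# TARGET_PAGE is the last list page visited; crawl_talk_no(TARGET_PAGE) below
# uses its first post to find the talkNo at which crawl_post_detail stops.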
# List for storing the crawled results
post_data = []
'''
ChromeDriver setup
'''
ssl._create_default_https_context = ssl._create_unverified_context  # Disable SSL certificate verification
# Install a matching ChromeDriver automatically
chromedriver_autoinstaller.install()
# Selenium launch options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run without opening a browser window
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Launch the Chrome WebDriver
service = Service()
driver = webdriver.Chrome(service=service, options=chrome_options)
def get_soup(target_url, talkNo, headers):
    url = f'{target_url}/{talkNo}?sortType=CREATED_DATE'
    # Request the page and build a BeautifulSoup object
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup
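# Example detail URL built by get_soup (the talkNo value is illustrative):
#   https://www.albamon.com/alba-talk/experience/1234567?sortType=CREATED_DATE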
def parse_comment_items(comment_list):
    # List to collect comment texts
    comments_data = []
    for comment in comment_list:
        # Extract the comment text
        comment_text_element = comment.select_one('.comment-list__text-override')
        comment_text = comment_text_element.get_text(strip=True) if comment_text_element else "N/A"
        # Append to the result list
        comments_data.append(comment_text)
    return comments_data
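# parse_comment_items returns a plain list of comment strings, e.g.
# ["first comment", "second comment"] (the values are illustrative).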
'''
Extract only the talkNo of a post from a list page
'''
def crawl_talk_no(pageIndex):
    print(f"🚀 Starting to crawl page {pageIndex}...")
    driver.get(TARGET_PAGE_URL.format(page=pageIndex))
    time.sleep(3)  # Wait for the page to load
    buttons = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".Button_button__S9rjD.Button_text__5x_Cn.Button_large___Kecx.tertiary")))
    # Click via JavaScript (avoids click-interception errors)
    driver.execute_script("arguments[0].scrollIntoView();", buttons[0])
    driver.execute_script("arguments[0].click();", buttons[0])
    time.sleep(2)  # Wait for the page to load
    # Extract the talkNo from the current page URL
    current_url = driver.current_url
    talk_no = current_url.split("/")[-1].split("?")[0]  # extract talkNo
    print(f"🔹 Post URL: {current_url}")
    return talk_no
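# How the talkNo extraction above works (URL shape follows BASE_URL; the number is illustrative):
#   "https://www.albamon.com/alba-talk/experience/1234567?sortType=CREATED_DATE"
#   .split("/")[-1] -> "1234567?sortType=CREATED_DATE", then .split("?")[0] -> "1234567"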
'''
Crawl posts one by one, from the most recent talkNo down to the last one
'''
def crawl_post_detail(START_TALK_NO, LAST_TALK_NO):
    for talk_no in range(START_TALK_NO, LAST_TALK_NO, -1):
        soup = get_soup(BASE_URL, talk_no, headers)
        # List for parsed comments
        parsed_comments = []
        # Content elements (post title, body, date, view count)
        title_element = soup.select_one('.DetailTitle_detail__header--title__Bbp40')
        content_element = soup.select_one('.Detail_content__content__hJ5M7')
        date_element = soup.select_one('.CommonInfos_info__wrapper__aGcEl > div:nth-child(2)')
        view_count_element = soup.select_one('.experience__span--view')
        title = title_element.get_text(strip=True) if title_element else "N/A"
        contents = content_element.get_text(strip=True) if content_element else "N/A"
        content_date = date_element.get_text(strip=True) if date_element else "N/A"
        view_count = view_count_element.get_text(strip=True) if view_count_element else "N/A"
        # Skip the post if a required value is missing or an empty string
        if not title_element or not title_element.get_text(strip=True) or not contents or not content_date:
            continue
        # Parse the comment list
        comment_list = soup.select('.CommentList_comment-contents__YVrtF > ul li')
        parsed_comments = parse_comment_items(comment_list)
        # Append the post data to the result list
        post_data.append({
            "talkNo": talk_no,
            "Title": title,
            "Contents": contents,
            "Date": content_date,
            "ViewCount": view_count,
            "Comments": parsed_comments
        })
        print(f"talkNo: {talk_no}")
        print(f"Title: {title}")
        print(f"Contents: {contents}")
        print(f"Date: {content_date}")
        print(f"comments : {parsed_comments}")
        print(f"ViewCount: {view_count}")
        print("-" * 80)
    return post_data
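# Determine the crawl range: the first post on page 1 gives the newest talkNo,
# and the first post on page TARGET_PAGE gives the talkNo at which to stop.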
START_TALK_NO = int(crawl_talk_no(1))
LAST_TALK_NO = int(crawl_talk_no(TARGET_PAGE))
data = crawl_post_detail(START_TALK_NO, LAST_TALK_NO)
# Close the browser
driver.quit()
# Build a DataFrame from the crawled posts
df = pd.DataFrame(data)
# Create the output directory
output_dir = "crawling_result"
os.makedirs(output_dir, exist_ok=True)
# Save to a CSV file
csv_file = os.path.join(output_dir, "crawling_combined_result.csv")
df.to_csv(csv_file, index=False, encoding='utf-8-sig')  # Save with UTF-8 encoding (utf-8-sig adds a BOM)
print(f"Data saved to {csv_file}")