-
Notifications
You must be signed in to change notification settings - Fork 0
/
naverWork.py
62 lines (51 loc) · 1.92 KB
/
naverWork.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import requests
import datetime
import json
import selenium
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from elasticsearch import Elasticsearch, helpers
# Label stored in the "portal" field of every indexed document, and the page
# that is scraped (also stored in each document's "source" field).
portal = "naver"
target_source = "https://m.naver.com/naverapp/?cmd=onMenu&version=3&menuCode=NEWS"

# Elasticsearch client configured with a single host entry: localhost, port 9200.
es = Elasticsearch(hosts=[dict(host="localhost", port="9200")])
# Build the Chrome options used for the scraping session (headless run).
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# options.add_argument('window-size=1200x600')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# If chromedriver is not available through the PATH environment variable,
# its location is given explicitly with the executable_path option.
# `options=` is the supported keyword; `chrome_options=` is deprecated.
# NOTE(review): `executable_path` is deprecated in Selenium 4 as well (a
# Service object replaces it); it is kept here so the script still runs on
# the Selenium version it was written for -- confirm the pinned version.
driver = webdriver.Chrome(options=options, executable_path="/usr/local/bin/chromedriver")
try:
    # Implicit wait of 20 seconds applied to subsequent element lookups.
    driver.implicitly_wait(time_to_wait=20)
    driver.get(url=target_source)
except Exception:
    # Do not leave the headless browser / chromedriver running when the
    # initial page load fails; shut it down and let the error propagate.
    driver.quit()
    raise
# Scrape every journal box on the loaded page, turn each one into a bulk-index
# action for the "naver-news-articles-v1" index, send them to Elasticsearch,
# and always shut the browser down afterwards.
documents = []
try:
    # By-based locators replace the deprecated find_element(s)_by_class_name
    # helpers.
    boxes = driver.find_elements(By.CLASS_NAME, 'ccj_journal_box')
    for box in boxes:
        # Headline text is read from the 'ut_t' element inside the box.
        title = str(
            box.find_element(By.CLASS_NAME, 'ut_t').get_attribute('innerText'))
        # Publisher comes from the subscribe button's data-press attribute.
        provider = box.find_element(
            By.CLASS_NAME, 'ccj_btn_subscribe').get_attribute('data-press')
        # Article id comes from the 'ut_a' element's data-aid attribute.
        news_id = box.find_element(
            By.CLASS_NAME, 'ut_a').get_attribute('data-aid')
        documents.append(
            {
                '_index': "naver-news-articles-v1",
                '_source': {
                    "portal": portal,
                    "title": title,
                    "source": target_source,
                    "provider": provider,
                    # NOTE(review): naive local time; confirm whether the
                    # index expects UTC / timezone-aware timestamps.
                    "timestamp": datetime.datetime.now(),
                    "newsID": news_id
                }
            }
        )
    helpers.bulk(es, documents)
    print(documents)
finally:
    # quit() ends the whole WebDriver session (browser and chromedriver),
    # whereas close() only closes the current window; running it in
    # `finally` releases the browser even when scraping or indexing fails.
    driver.quit()