-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSpiderOne.py
106 lines (103 loc) · 2.98 KB
/
SpiderOne.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
'''
Created on 2016-09-18
@author: su
'''
#git config core.autocrlf false
import datetime
import io
import os
import pickle
import time
from multiprocessing import Pool

import bs4
import requests
# Base address of the site being scraped; page URLs are built on top of it.
root_url = 'http://wufazhuce.com'
# Local wall-clock time captured once at import; used as the git commit message.
todaytime = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
def save_img():
    """Placeholder that does nothing yet and returns ``None``."""
    return None
def get_url(num, string):
    """Build the page address ``<root_url>/<string>/<num>``.

    Args:
        num: Page number; converted with ``str()``.
        string: Path segment placed between the site root and the number.

    Returns:
        The three parts joined with single slashes.
    """
    parts = [root_url, string, str(num)]
    return '/'.join(parts)
def get_img_data(url, timeout=30):
    """Fetch a picture page and extract its image URL, caption and description.

    The description is the ``content`` of the last ``<meta>`` tag whose
    ``name`` is ``description``; the caption is the stripped text of the last
    element with class ``one-titulo``; the image URL is the ``src`` of the
    second ``<img>`` element in the document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        An ``(imgUrl, titulo, title)`` tuple, or ``None`` when the request
        fails or any of the three pieces is missing from the page.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    title = None
    for meta in soup.select('meta'):
        if meta.get('name') == 'description':
            title = meta.get('content')

    titulo = None
    for node in soup.select('.one-titulo'):
        titulo = node.get_text().strip()

    images = soup.find_all('img')
    if title is None or titulo is None or len(images) < 2:
        return None
    img_url = images[1].get('src')
    if img_url is None:
        return None
    return img_url, titulo, title
def get_article_data(url, timeout=30):
    """Fetch an article page and extract its quote, title, author and body.

    Each value is the text of the last element matching the CSS classes
    ``comilla-cerrar``, ``articulo-titulo``, ``articulo-autor`` and
    ``articulo-contenido`` respectively. The first three are stripped of
    surrounding whitespace; the body text is returned as-is.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(cerrar, titulo, autor, contenido)`` tuple, or ``None`` when the
        request fails or any of the four elements is missing from the page.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # (selector, strip surrounding whitespace?) in the order of the result.
    wanted = (
        ('.comilla-cerrar', True),
        ('.articulo-titulo', True),
        ('.articulo-autor', True),
        ('.articulo-contenido', False),
    )
    fields = []
    for selector, strip in wanted:
        matches = soup.select(selector)
        if not matches:
            return None
        text = matches[-1].get_text()
        fields.append(text.strip() if strip else text)
    return tuple(fields)
def write_article_file(num):
    """Download article ``num`` and save it as ``ONE-ESSAY/<title>.md``.

    The file holds the quote as a Markdown block quote, the title and author
    as headings, and then the article text. Pages that cannot be fetched or
    parsed are skipped. Any other failure for this page is reported on
    standard output and does not stop the caller.

    Args:
        num: Article number used to build the page URL.
    """
    url = get_url(num, 'article')
    print(url)
    try:
        data = get_article_data(url)
        if data is None:
            # Nothing usable was scraped for this number.
            return
        cerrar, titulo, autor, contenido = data
        folder = 'ONE-ESSAY'
        if not os.path.isdir(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, titulo + '.md')
        # Explicit UTF-8 so non-ASCII text is written regardless of the
        # platform's default codec; ``with`` closes the file on any error.
        with io.open(filename, 'w', encoding='utf-8') as out:
            out.write('> ' + cerrar + '\n\n')
            out.write('###' + titulo + '\n')
            out.write('####' + autor + '\n')
            out.write(contenido)
    except Exception as err:
        # Per-page boundary of a best-effort crawl: report and move on.
        print('skipped ' + url + ': ' + repr(err))
def write_img_file(num):
    """Download picture page ``num`` and save it as ``ONE-IMG/<caption>.md``.

    The file holds a Markdown image link, the caption as a heading, and the
    page description. Pages that cannot be fetched or parsed are skipped. Any
    other failure for this page is reported on standard output and does not
    stop the caller.

    Args:
        num: Page number used to build the page URL.
    """
    url = get_url(num, 'one')
    try:
        data = get_img_data(url)
        if data is None:
            # Nothing usable was scraped for this number.
            return
        imgUrl, titulo, title = data
        folder = 'ONE-IMG'
        if not os.path.isdir(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, titulo + '.md')
        # Explicit UTF-8 so non-ASCII text is written regardless of the
        # platform's default codec; ``with`` closes the file on any error.
        with io.open(filename, 'w', encoding='utf-8') as out:
            out.write('![one](' + imgUrl + ')' + '\n')
            out.write('#' + titulo + '\n')
            out.write(title)
    except Exception as err:
        # Per-page boundary of a best-effort crawl: report and move on.
        print('skipped ' + url + ': ' + repr(err))
def push_data():
    """Stage, commit and push the working tree with one shell command.

    The commit message is the module-level ``todaytime`` string. The three
    git steps are chained with ``&&``, so a failing step stops the ones
    after it.
    """
    steps = (
        'git add -A',
        'git commit -m "' + todaytime + '"',
        'git push origin master',
    )
    os.system(' && '.join(steps))
def write_pkl(begin):
    """Pickle ``begin`` into ``data.pkl`` in the current directory.

    Args:
        begin: Value to store; ``read_pkl`` returns it on the next run.
    """
    # ``with`` guarantees the handle is closed even if pickling fails.
    with open('data.pkl', 'wb') as output:
        pickle.dump(begin, output)
def read_pkl():
    """Load the value stored in ``data.pkl``, defaulting to ``1``.

    Returns:
        The unpickled object, or ``1`` when the file is missing, unreadable
        or does not contain a valid pickle.
    """
    # NOTE: pickle can execute arbitrary code while loading; only safe as
    # long as data.pkl is a file this script wrote itself.
    try:
        with open('data.pkl', 'rb') as pkl_file:
            return pickle.load(pkl_file)
    except (IOError, OSError):
        # No checkpoint yet (or it cannot be opened): start from the default.
        return 1
    except (pickle.UnpicklingError, EOFError, AttributeError,
            ImportError, IndexError, ValueError):
        # Truncated or corrupted checkpoint: start from the default.
        return 1
if __name__ == '__main__':
    # Crawl every page number from the saved checkpoint up to the number of
    # days elapsed since 2012-09-15, then commit and push the results.
    # NOTE(review): the original indentation was lost; the checkpoint write
    # and the push are assumed to run once, after the loop -- confirm.
    start = datetime.date(2012, 9, 15)
    days = datetime.date.today() - start
    begin = read_pkl()
    try:
        for i in range(begin, days.days):
            begin = i
            write_img_file(i)
            write_article_file(i)
    finally:
        # Persist the last attempted number even if the crawl is aborted,
        # so the next run resumes from there instead of starting over.
        write_pkl(begin)
    push_data()