Skip to content

Commit

Permalink
add utils history url
Browse files Browse the repository at this point in the history
  • Loading branch information
wnma3mz committed Dec 27, 2020
1 parent 05c0d4b commit cf8a30b
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 7 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setuptools.setup(
name="wechatarticles",
version="0.5.5",
version="0.5.6",
author="wnma3mz",
author_email="wnma3mz@gmail.com",
description="wechat articles scrapy",
Expand Down
53 changes: 47 additions & 6 deletions wechatarticles/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup as bs

from .GetUrls import PCUrls

# Column-name groups. Judging by the names only, these look like article
# metadata, read/like counters and comment fields -- confirm against callers.
base_columns = [
    "url",
    "title",
    "date",
    "headlines",
    "copyright",
]
A_columns = [
    "read_num",
    "old_like_num",
    "like_num",
]
B_columns = [
    "comments_num",
    "comments_content",
    "comments_like_num",
]
Expand Down Expand Up @@ -127,16 +130,54 @@ def copyright_num(copyright_stat):
def copyright_num_detailed(copyright_stat):
    """Map a raw ``copyright_stat`` code to a detailed category number.

    Args:
        copyright_stat: Raw status code to classify.

    Returns:
        1 for code 11, 0 for code 100, 2 for code 101, 3 for code 0,
        4 for code 1, 5 for codes 14, 12 and 201, and ``None`` for any
        other value.
    """
    if copyright_stat == 11:
        return 1  # marked as original
    if copyright_stat == 100:
        return 0  # account recommendation
    if copyright_stat == 101:
        return 2  # repost
    if copyright_stat == 0:
        return 3  # source is not a WeChat article
    if copyright_stat == 1:
        return 4  # adjective ("xxx's official account")
    if copyright_stat in (14, 12, 201):
        return 5
    return None


def read_nickname(fname):
    """Read a UTF-8 text file and split every line on ``', '``.

    Args:
        fname: Path of the file to read.

    Returns:
        A list with one entry per line, in file order; each entry is the
        list of fields obtained by splitting that line on ``', '``.
    """
    with open(fname, 'r', encoding='utf-8') as f:
        # Strip the line terminator before splitting; otherwise the last
        # field of every row would carry a trailing newline character.
        return [line.rstrip('\r\n').split(', ') for line in f]


def get_history_urls(biz,
                     uin,
                     key,
                     lst=None,
                     start_timestamp=0,
                     count=10,
                     endcount=99999,
                     sleep_seconds=5):
    """Collect pages of history entries through ``PCUrls.get_urls``.

    Pages are requested repeatedly, advancing the offset by 10 each time,
    until an empty page comes back, the last entry of a page is not newer
    than ``start_timestamp``, or the offset reaches ``endcount``.

    Args:
        biz: Passed through to ``PCUrls`` as ``biz``.
        uin: Passed through to ``PCUrls`` as ``uin``.
        key: Passed to every ``get_urls`` call.
        lst: Optional list to append pages to. A fresh list is created when
            omitted, so results are never shared between calls.
        start_timestamp: Stop once the ``comm_msg_info.datetime`` of the last
            entry of a page is less than or equal to this value.
        count: Offset used for the first request.
        endcount: Stop once the advanced offset reaches this value.
        sleep_seconds: Pause between two consecutive requests.

    Returns:
        ``lst`` with every fetched page appended as one element. Whatever
        was collected so far is also returned when the loop is interrupted
        with Ctrl-C or a request fails; the failure is only printed.
    """
    # A mutable default ([]) would be shared by every call of this function.
    if lst is None:
        lst = []
    t = PCUrls(biz=biz, uin=uin, cookie='')
    try:
        while True:
            res = t.get_urls(key, offset=count)
            if not res:
                break
            count += 10
            print(count)
            lst.append(res)
            dt = res[-1]["comm_msg_info"]["datetime"]
            if dt <= start_timestamp or count >= endcount:
                break
            time.sleep(sleep_seconds)
    except KeyboardInterrupt:
        print('程序手动中断')
    except Exception as e:
        # Best effort: report the failure and fall through to return the
        # partial result. The original `assert 1 == 2` was silently discarded
        # by a `finally: return`, so it never aborted anything.
        print(e)
        print("获取文章链接失败。。。退出程序")
    return lst

0 comments on commit cf8a30b

Please sign in to comment.