crawl user tweets within a specified time period #308
nghuyong committed Dec 12, 2023
1 parent 150332b commit fd47ba3
Showing 2 changed files with 15 additions and 5 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -372,6 +372,7 @@ python run_spider.py tweet_by_keyword

## Changelog

+- 2023.11: Support crawling a user's tweets within a specified time period [#308](https://github.com/nghuyong/WeiboSpider/issues/308)
- 2023.04: Support crawling tweets by tweet ID [#272](https://github.com/nghuyong/WeiboSpider/issues/272)
- 2022.11: Support retrieving more than 1200 pages of single-day search results for a single keyword [#257](https://github.com/nghuyong/WeiboSpider/issues/257)
- 2022.11: Support fetching the full text of long Weibo posts
19 changes: 14 additions & 5 deletions weibospider/spiders/tweet_by_user_id.py
@@ -5,7 +5,10 @@
Mail: nghuyong@163.com
Created Time: 2020/4/14
"""
+import datetime
import json
+import re
+
from scrapy import Spider
from scrapy.http import Request
from spiders.common import parse_tweet_info, parse_long_tweet
@@ -16,16 +19,23 @@ class TweetSpiderByUserID(Spider):
    User tweet data collection
    """
    name = "tweet_spider_by_user_id"
    base_url = "https://weibo.cn"

    def start_requests(self):
        """
        Spider entry point
        """
        # Replace user_ids with the actual user IDs to crawl
        user_ids = ['1087770692']
+        # Replace these times with the period actually needed; to crawl all of
+        # a user's tweets, set is_split_by_hour to False
+        is_split_by_hour = True
+        start_time = datetime.datetime(year=2022, month=1, day=1)
+        end_time = datetime.datetime(year=2023, month=1, day=1)
        for user_id in user_ids:
-            url = f"https://weibo.com/ajax/statuses/mymblog?uid={user_id}&page=1"
+            url = f"https://weibo.com/ajax/statuses/searchProfile?uid={user_id}&page=1&hasori=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1&hasret=1"
+            if is_split_by_hour:
+                # Converting inline via int(...) avoids rebinding start_time/end_time,
+                # which would break the conversion for the second and later user_ids
+                url += f'&starttime={int(start_time.timestamp())}&endtime={int(end_time.timestamp())}'
            yield Request(url, callback=self.parse, meta={'user_id': user_id, 'page_num': 1})

    def parse(self, response, **kwargs):
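For reference, the `starttime`/`endtime` values appended above are plain Unix epoch seconds. A minimal standalone sketch of how a crawl window becomes the query string (`build_window_url` is a hypothetical helper for illustration, not part of the spider):

```python
import datetime

def build_window_url(uid: str, start: datetime.datetime, end: datetime.datetime) -> str:
    """Hypothetical helper: a searchProfile URL restricted to [start, end)."""
    base = f"https://weibo.com/ajax/statuses/searchProfile?uid={uid}&page=1"
    # .timestamp() interprets naive datetimes as local time; pass timezone-aware
    # datetimes if the window boundaries must be exact
    return base + f"&starttime={int(start.timestamp())}&endtime={int(end.timestamp())}"

print(build_window_url('1087770692',
                       datetime.datetime(year=2022, month=1, day=1),
                       datetime.datetime(year=2023, month=1, day=1)))
```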
@@ -44,6 +54,5 @@ def parse(self, response, **kwargs):
                yield item
        if tweets:
            user_id, page_num = response.meta['user_id'], response.meta['page_num']
-            page_num += 1
-            url = f"https://weibo.com/ajax/statuses/mymblog?uid={user_id}&page={page_num}"
-            yield Request(url, callback=self.parse, meta={'user_id': user_id, 'page_num': page_num})
+            url = response.url.replace(f'page={page_num}', f'page={page_num + 1}')
+            yield Request(url, callback=self.parse, meta={'user_id': user_id, 'page_num': page_num + 1})
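One consequence of deriving the next page from `response.url` with a string replace, rather than rebuilding the URL from scratch, is that every query parameter on the first request, including `starttime` and `endtime`, carries over to later pages automatically. A quick standalone illustration of the same operation, using made-up parameter values:

```python
# Example URL in the same shape the spider requests (values are made up)
url = ("https://weibo.com/ajax/statuses/searchProfile"
       "?uid=1087770692&page=3&starttime=1640995200&endtime=1672531200")
page_num = 3

# Only the page parameter changes; the time window survives untouched.
# This relies on 'page=3' occurring exactly once in the URL, which holds
# for these parameters.
next_url = url.replace(f'page={page_num}', f'page={page_num + 1}')
print(next_url)
```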
