Skip to content

Commit

Permalink
Merge pull request #461 from FloRainRJY/xiaohongshu_comment_number_restrict
Browse files (browse the repository at this point in the history)

feat: xhs增加最大评论数量限制
  • Loading branch information
NanmiCoder authored Oct 23, 2024
2 parents 0bb9298 + 19269c6 commit fa2bcc4
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 3 deletions.
5 changes: 5 additions & 0 deletions config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@
# 是否开启爬评论模式, 默认开启爬评论
ENABLE_GET_COMMENTS = True

# 爬取一级评论的数量控制(单视频/帖子)
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10



# 是否开启爬二级评论模式, 默认不开启爬二级评论
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
ENABLE_GET_SUB_COMMENTS = False
Expand Down
7 changes: 5 additions & 2 deletions media_platform/xhs/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ async def get_note_sub_comments(self, note_id: str, root_comment_id: str, num: i
return await self.get(uri, params)

async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
callback: Optional[Callable] = None) -> List[Dict]:
callback: Optional[Callable] = None,
max_count: int = 10) -> List[Dict]:
"""
获取指定笔记下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
Args:
Expand All @@ -302,7 +303,7 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
result = []
comments_has_more = True
comments_cursor = ""
while comments_has_more:
while comments_has_more and len(result) < max_count:
comments_res = await self.get_note_comments(note_id, comments_cursor)
comments_has_more = comments_res.get("has_more", False)
comments_cursor = comments_res.get("cursor", "")
Expand All @@ -311,6 +312,8 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
break
comments = comments_res["comments"]
if len(result) + len(comments) > max_count:
comments = comments[:max_count - len(result)]
if callback:
await callback(note_id, comments)
await asyncio.sleep(crawl_interval)
Expand Down
4 changes: 3 additions & 1 deletion media_platform/xhs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import config
from base.base_crawler import AbstractCrawler
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
from model.m_xiaohongshu import NoteUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
Expand Down Expand Up @@ -263,7 +264,8 @@ async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
await self.xhs_client.get_note_all_comments(
note_id=note_id,
crawl_interval=random.random(),
callback=xhs_store.batch_update_xhs_note_comments
callback=xhs_store.batch_update_xhs_note_comments,
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
)

@staticmethod
Expand Down

0 comments on commit fa2bcc4

Please sign in to comment.