diff --git a/config/base_config.py b/config/base_config.py index 52c6fc91..a78ab5e7 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -59,6 +59,11 @@ # 是否开启爬评论模式, 默认开启爬评论 ENABLE_GET_COMMENTS = True +# 爬取一级评论的数量控制(单视频/帖子) +CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10 + + + # 是否开启爬二级评论模式, 默认不开启爬二级评论 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 ENABLE_GET_SUB_COMMENTS = False diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index d0882349..ee7f82e3 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -288,7 +288,8 @@ async def get_note_sub_comments(self, note_id: str, root_comment_id: str, num: i return await self.get(uri, params) async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[Dict]: + callback: Optional[Callable] = None, + max_count: int = 10) -> List[Dict]: """ 获取指定笔记下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息 Args: @@ -302,7 +303,7 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, result = [] comments_has_more = True comments_cursor = "" - while comments_has_more: + while comments_has_more and len(result) < max_count: comments_res = await self.get_note_comments(note_id, comments_cursor) comments_has_more = comments_res.get("has_more", False) comments_cursor = comments_res.get("cursor", "") @@ -311,6 +312,8 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}") break comments = comments_res["comments"] + if len(result) + len(comments) > max_count: + comments = comments[:max_count - len(result)] if callback: await callback(note_id, comments) await asyncio.sleep(crawl_interval) diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index e51ff9b4..ef1011d8 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -21,6 +21,7 @@ import config from base.base_crawler import AbstractCrawler +from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES from model.m_xiaohongshu import NoteUrlInfo from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from store import xhs as xhs_store @@ -263,7 +264,8 @@ async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore): await self.xhs_client.get_note_all_comments( note_id=note_id, crawl_interval=random.random(), - callback=xhs_store.batch_update_xhs_note_comments + callback=xhs_store.batch_update_xhs_note_comments, + max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES ) @staticmethod