Skip to content

Commit

Permalink
feat: 知乎支持详情模式
Browse files Browse the repository at this point in the history
  • Loading branch information
NanmiCoder committed Dec 26, 2024
1 parent dc9116e commit ea5223c
Show file tree
Hide file tree
Showing 6 changed files with 239 additions and 17 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
| B 站 ||||||||
| 微博 ||||||||
| 贴吧 ||||||||
| 知乎 || ||||||
| 知乎 || ||||||

### MediaCrawlerPro重磅发布啦!!!
> 主打学习成熟项目的架构设计,不仅仅是爬虫,Pro中的其他代码设计思路也是值得学习,欢迎大家关注!!!
Expand Down Expand Up @@ -111,7 +111,9 @@
> [MediaCrawler在线文档](https://nanmicoder.github.io/MediaCrawler/)
>
# 知识付费服务
# 作者提供的知识服务
> 如果想快速入门和学习该项目的使用、源码架构设计等、学习编程技术、亦或者想了解MediaCrawlerPro的源代码设计可以看下我的知识付费栏目。
[作者的知识付费栏目介绍](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)

# 项目微信交流群
Expand Down
7 changes: 7 additions & 0 deletions config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,13 @@
# ........................
]

# List of Zhihu post URLs to crawl when CRAWLER_TYPE is "detail".
# Three URL shapes are supported (answer / article / video), matched by
# judge_zhihu_url in media_platform/zhihu/help.py.
ZHIHU_SPECIFIED_ID_LIST = [
    "https://www.zhihu.com/question/826896610/answer/4885821440",  # answer
    "https://zhuanlan.zhihu.com/p/673461588",  # article
    "https://www.zhihu.com/zvideo/1539542068422144000"  # video
]

# 词云相关
# 是否开启生成评论词云图
ENABLE_GET_WORDCLOUD = False
Expand Down
4 changes: 3 additions & 1 deletion constant/zhihu.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@

# -*- coding: utf-8 -*-
# Base URL for the main zhihu.com site (answers, questions, zvideos).
ZHIHU_URL = "https://www.zhihu.com"
# Base URL for column articles ("/p/..." paths live on zhuanlan.zhihu.com).
ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"

# Content-type names used to classify a note URL and to build API paths.
ANSWER_NAME = "answer"
ARTICLE_NAME = "article"
VIDEO_NAME = "zvideo"

67 changes: 59 additions & 8 deletions media_platform/zhihu/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,12 @@ async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, st
if isinstance(params, dict):
final_uri += '?' + urlencode(params)
headers = await self._pre_headers(final_uri)
return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers, **kwargs)
base_url = (
zhihu_constant.ZHIHU_URL
if "/p/" not in uri
else zhihu_constant.ZHIHU_ZHUANLAN_URL
)
return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)

async def pong(self) -> bool:
"""
Expand Down Expand Up @@ -209,7 +214,7 @@ async def get_note_by_keyword(
return self._extractor.extract_contents_from_search(search_res)

async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10,
order_by: str = "sort") -> Dict:
order_by: str = "score") -> Dict:
"""
获取内容的一级评论
Args:
Expand All @@ -222,13 +227,16 @@ async def get_root_comments(self, content_id: str, content_type: str, offset: st
Returns:
"""
uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
params = {
"order": order_by,
"offset": offset,
"limit": limit
}
uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
params = {"order": order_by, "offset": offset, "limit": limit}
return await self.get(uri, params)
# uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
# params = {
# "order": order_by,
# "offset": offset,
# "limit": limit
# }
# return await self.get(uri, params)

async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10,
order_by: str = "sort") -> Dict:
Expand Down Expand Up @@ -496,3 +504,46 @@ async def get_all_videos_by_creator(self, creator: ZhihuCreator, crawl_interval:
offset += limit
await asyncio.sleep(crawl_interval)
return all_contents


async def get_answer_info(
    self, question_id: str, answer_id: str
) -> Optional[ZhihuContent]:
    """
    Fetch the HTML page of a single answer and extract its content.
    Args:
        question_id: ID of the question the answer belongs to
        answer_id: ID of the answer itself
    Returns:
        Parsed ZhihuContent, or None when extraction fails
    """
    page_html = await self.get(
        f"/question/{question_id}/answer/{answer_id}", return_response=True
    )
    return self._extractor.extract_answer_content_from_html(page_html)

async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
    """
    Fetch the HTML page of a column article and extract its content.
    Args:
        article_id: ID of the zhuanlan article
    Returns:
        Parsed ZhihuContent, or None when extraction fails
    """
    page_html = await self.get(f"/p/{article_id}", return_response=True)
    return self._extractor.extract_article_content_from_html(page_html)

async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
    """
    Fetch the HTML page of a zvideo and extract its content.
    Args:
        video_id: ID of the zvideo
    Returns:
        Parsed ZhihuContent, or None when extraction fails
    """
    page_html = await self.get(f"/zvideo/{video_id}", return_response=True)
    return self._extractor.extract_zvideo_content_from_html(page_html)
77 changes: 74 additions & 3 deletions media_platform/zhihu/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@
import os
import random
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, cast

from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright)

import config
from constant import zhihu as constant
from base.base_crawler import AbstractCrawler
from model.m_zhihu import ZhihuContent, ZhihuCreator
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
Expand All @@ -29,7 +30,7 @@

from .client import ZhiHuClient
from .exception import DataFetchError
from .help import ZhihuExtractor
from .help import ZhihuExtractor, judge_zhihu_url
from .login import ZhiHuLogin


Expand Down Expand Up @@ -96,7 +97,7 @@ async def start(self) -> None:
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
raise NotImplementedError
await self.get_specified_notes()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments
await self.get_creators_and_notes()
Expand Down Expand Up @@ -226,6 +227,76 @@ async def get_creators_and_notes(self) -> None:
# Get all comments of the creator's contents
await self.batch_get_content_comments(all_content_list)

async def get_note_detail(
    self, full_note_url: str, semaphore: asyncio.Semaphore
) -> Optional[ZhihuContent]:
    """
    Get the detail of one specified note (answer / article / zvideo).
    Args:
        full_note_url: full note URL, query string already stripped by caller
        semaphore: concurrency limiter shared by all detail tasks
    Returns:
        Parsed ZhihuContent, or None when the URL type is not recognized
        or extraction fails.
    """
    async with semaphore:
        utils.logger.info(
            f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
        )
        # judge note type from the URL shape
        note_type: str = judge_zhihu_url(full_note_url)
        # hoist the split: every branch parses IDs from the same path segments
        url_parts = full_note_url.split("/")
        if note_type == constant.ANSWER_NAME:
            # e.g. https://www.zhihu.com/question/<qid>/answer/<aid>
            question_id = url_parts[-3]
            answer_id = url_parts[-1]
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
            )
            return await self.zhihu_client.get_answer_info(question_id, answer_id)

        elif note_type == constant.ARTICLE_NAME:
            article_id = url_parts[-1]
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
            )
            return await self.zhihu_client.get_article_info(article_id)

        elif note_type == constant.VIDEO_NAME:
            video_id = url_parts[-1]
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
            )
            return await self.zhihu_client.get_video_info(video_id)

        # Previously an unrecognized URL fell through and returned an
        # implicit None with no trace; log it so bad config entries surface.
        utils.logger.warning(
            f"[ZhihuCrawler.get_specified_notes] Unknown note type for url: {full_note_url}"
        )
        return None

async def get_specified_notes(self):
    """
    Get the information and comments of the posts listed in
    config.ZHIHU_SPECIFIED_ID_LIST.
    Returns:
    """
    # One semaphore shared by ALL detail tasks. The original code created a
    # fresh Semaphore(MAX_CONCURRENCY_NUM) per task inside the loop, so each
    # task had its own private counter and the concurrency limit was never
    # actually enforced.
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    get_note_detail_task_list = []
    for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST:
        # remove query params
        full_note_url = full_note_url.split("?")[0]
        crawler_task = self.get_note_detail(
            full_note_url=full_note_url,
            semaphore=semaphore,
        )
        get_note_detail_task_list.append(crawler_task)

    need_get_comment_notes: List[ZhihuContent] = []
    note_details = await asyncio.gather(*get_note_detail_task_list)
    for index, note_detail in enumerate(note_details):
        if not note_detail:
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found"
            )
            continue

        note_detail = cast(ZhihuContent, note_detail)  # only for type check
        need_get_comment_notes.append(note_detail)
        await zhihu_store.update_zhihu_content(note_detail)

    await self.batch_get_content_comments(need_get_comment_notes)

@staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
Expand Down
95 changes: 92 additions & 3 deletions media_platform/zhihu/help.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,15 +159,13 @@ def _extract_zvideo_content(self, zvideo: Dict) -> ZhihuContent:
res = ZhihuContent()

if "video" in zvideo and isinstance(zvideo.get("video"), dict): # 说明是从创作者主页的视频列表接口来的
res.content_id = zvideo.get("video").get("video_id")
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
res.created_time = zvideo.get("published_at")
res.updated_time = zvideo.get("updated_at")
else:
res.content_id = zvideo.get("zvideo_id")
res.content_url = zvideo.get("video_url")
res.created_time = zvideo.get("created_at")

res.content_id = zvideo.get("id")
res.content_type = zvideo.get("type")
res.title = extract_text_from_html(zvideo.get("title"))
res.desc = extract_text_from_html(zvideo.get("description"))
Expand Down Expand Up @@ -369,3 +367,94 @@ def extract_content_list_from_creator(self, anwser_list: List[Dict]) -> List[Zhi
return []

return self._extract_content_list(anwser_list)




def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract a Zhihu answer from the JSON state embedded in its HTML page.
    Args:
        html_content: raw HTML of the answer page
    Returns:
        Parsed ZhihuContent, or None when the embedded state or the
        answers entity table is missing
    """
    raw_state = (
        Selector(text=html_content)
        .xpath("//script[@id='js-initialData']/text()")
        .get(default="")
    )
    if not raw_state:
        return None
    entities: Dict = json.loads(raw_state).get("initialState", {}).get("entities", {})
    answers: Dict = entities.get("answers", {})
    if not answers:
        return None

    # The page embeds a single answer; take the first (only) entry.
    first_answer = next(iter(answers.values()))
    return self._extract_answer_content(first_answer)

def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract a Zhihu column article from the JSON state embedded in its HTML page.
    Args:
        html_content: raw HTML of the article page
    Returns:
        Parsed ZhihuContent, or None when the embedded state or the
        articles entity table is missing
    """
    raw_state = (
        Selector(text=html_content)
        .xpath("//script[@id='js-initialData']/text()")
        .get(default="")
    )
    if not raw_state:
        return None
    entities: Dict = json.loads(raw_state).get("initialState", {}).get("entities", {})
    articles: Dict = entities.get("articles", {})
    if not articles:
        return None

    # The page embeds a single article; take the first (only) entry.
    first_article = next(iter(articles.values()))
    return self._extract_article_content(first_article)

def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract a Zhihu zvideo from the JSON state embedded in its HTML page.
    Args:
        html_content: raw HTML of the zvideo page
    Returns:
        Parsed ZhihuContent, or None when the embedded state or the
        zvideos entity table is missing
    """
    raw_state = (
        Selector(text=html_content)
        .xpath("//script[@id='js-initialData']/text()")
        .get(default="")
    )
    if not raw_state:
        return None
    entities: Dict = json.loads(raw_state).get("initialState", {}).get("entities", {})
    zvideos: Dict = entities.get("zvideos", {})
    users: Dict = entities.get("users", {})
    if not zvideos:
        return None

    # handler user info and video info
    video_detail_info: Dict = next(iter(zvideos.values()))
    if not video_detail_info:
        return None
    # "author" may be stored as a string key referencing the users table;
    # resolve it to the full user dict before extraction.
    if isinstance(video_detail_info.get("author"), str):
        author_name: str = video_detail_info.get("author")
        video_detail_info["author"] = users.get(author_name)

    return self._extract_zvideo_content(video_detail_info)


def judge_zhihu_url(note_detail_url: str) -> str:
    """
    Classify a Zhihu note URL by the content type its path encodes.
    Args:
        note_detail_url:
            eg1: https://www.zhihu.com/question/123456789/answer/123456789 # answer
            eg2: https://www.zhihu.com/p/123456789 # article
            eg3: https://www.zhihu.com/zvideo/123456789 # zvideo
    Returns:
        One of the zhihu_constant *_NAME values, or "" when unrecognized.
    """
    marker_to_type = (
        ("/answer/", zhihu_constant.ANSWER_NAME),
        ("/p/", zhihu_constant.ARTICLE_NAME),
        ("/zvideo/", zhihu_constant.VIDEO_NAME),
    )
    for marker, note_type in marker_to_type:
        if marker in note_detail_url:
            return note_type
    return ""

0 comments on commit ea5223c

Please sign in to comment.