Skip to content

Commit

Permalink
feat: 知乎支持详情模式
Browse files Browse the repository at this point in the history
  • Loading branch information
NanmiCoder committed Dec 26, 2024
1 parent dc9116e commit ea5223c
Show file tree
Hide file tree
Showing 6 changed files with 239 additions and 17 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
| B 站 ||||||||
| 微博 ||||||||
| 贴吧 ||||||||
| 知乎 || ||||||
| 知乎 || ||||||

### MediaCrawlerPro重磅发布啦!!!
> 主打学习成熟项目的架构设计,不仅仅是爬虫,Pro中的其他代码设计思路也是值得学习,欢迎大家关注!!!
Expand Down Expand Up @@ -111,7 +111,9 @@
> [MediaCrawler在线文档](https://nanmicoder.github.io/MediaCrawler/)
>
# 知识付费服务
# 作者提供的知识服务
> 如果想快速入门和学习该项目的使用、源码架构设计等、学习编程技术、亦或者想了解MediaCrawlerPro的源代码设计可以看下我的知识付费栏目。
[作者的知识付费栏目介绍](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)

# 项目微信交流群
Expand Down
7 changes: 7 additions & 0 deletions config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,13 @@
# ........................
]

# List of Zhihu post URLs to crawl when CRAWLER_TYPE is "detail".
# Three URL shapes are supported (answer / article / video), matched by
# judge_zhihu_url in media_platform/zhihu/help.py.
ZHIHU_SPECIFIED_ID_LIST = [
    "https://www.zhihu.com/question/826896610/answer/4885821440",  # answer
    "https://zhuanlan.zhihu.com/p/673461588",  # article
    "https://www.zhihu.com/zvideo/1539542068422144000"  # video
]

# 词云相关
# 是否开启生成评论词云图
ENABLE_GET_WORDCLOUD = False
Expand Down
4 changes: 3 additions & 1 deletion constant/zhihu.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@

# -*- coding: utf-8 -*-
# Base URL for the main zhihu.com site (answers, questions, zvideos).
ZHIHU_URL = "https://www.zhihu.com"
# Base URL for column articles ("/p/..." paths live on zhuanlan.zhihu.com).
ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"

# Content-type names used to classify a note URL and to build API paths.
ANSWER_NAME = "answer"
ARTICLE_NAME = "article"
VIDEO_NAME = "zvideo"

67 changes: 59 additions & 8 deletions media_platform/zhihu/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,12 @@ async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, st
if isinstance(params, dict):
final_uri += '?' + urlencode(params)
headers = await self._pre_headers(final_uri)
return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers, **kwargs)
base_url = (
zhihu_constant.ZHIHU_URL
if "/p/" not in uri
else zhihu_constant.ZHIHU_ZHUANLAN_URL
)
return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)

async def pong(self) -> bool:
"""
Expand Down Expand Up @@ -209,7 +214,7 @@ async def get_note_by_keyword(
return self._extractor.extract_contents_from_search(search_res)

async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10,
order_by: str = "sort") -> Dict:
order_by: str = "score") -> Dict:
"""
获取内容的一级评论
Args:
Expand All @@ -222,13 +227,16 @@ async def get_root_comments(self, content_id: str, content_type: str, offset: st
Returns:
"""
uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
params = {
"order": order_by,
"offset": offset,
"limit": limit
}
uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
params = {"order": order_by, "offset": offset, "limit": limit}
return await self.get(uri, params)
# uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
# params = {
# "order": order_by,
# "offset": offset,
# "limit": limit
# }
# return await self.get(uri, params)

async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10,
order_by: str = "sort") -> Dict:
Expand Down Expand Up @@ -496,3 +504,46 @@ async def get_all_videos_by_creator(self, creator: ZhihuCreator, crawl_interval:
offset += limit
await asyncio.sleep(crawl_interval)
return all_contents


async def get_answer_info(
    self, question_id: str, answer_id: str
) -> Optional[ZhihuContent]:
    """
    Fetch the HTML page of a single answer and extract its content.
    Args:
        question_id: ID of the question the answer belongs to
        answer_id: ID of the answer itself
    Returns:
        Parsed ZhihuContent, or None when extraction fails
    """
    page_html = await self.get(
        f"/question/{question_id}/answer/{answer_id}", return_response=True
    )
    return self._extractor.extract_answer_content_from_html(page_html)

async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
    """
    Fetch the HTML page of a column article and extract its content.
    Args:
        article_id: ID of the zhuanlan article
    Returns:
        Parsed ZhihuContent, or None when extraction fails
    """
    page_html = await self.get(f"/p/{article_id}", return_response=True)
    return self._extractor.extract_article_content_from_html(page_html)

async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
    """
    Fetch the HTML page of a zvideo and extract its content.
    Args:
        video_id: ID of the zvideo
    Returns:
        Parsed ZhihuContent, or None when extraction fails
    """
    page_html = await self.get(f"/zvideo/{video_id}", return_response=True)
    return self._extractor.extract_zvideo_content_from_html(page_html)
77 changes: 74 additions & 3 deletions media_platform/zhihu/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@
import os
import random
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, cast

from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright)

import config
from constant import zhihu as constant
from base.base_crawler import AbstractCrawler
from model.m_zhihu import ZhihuContent, ZhihuCreator
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
Expand All @@ -29,7 +30,7 @@

from .client import ZhiHuClient
from .exception import DataFetchError
from .help import ZhihuExtractor
from .help import ZhihuExtractor, judge_zhihu_url
from .login import ZhiHuLogin


Expand Down Expand Up @@ -96,7 +97,7 @@ async def start(self) -> None:
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
raise NotImplementedError
await self.get_specified_notes()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments
await self.get_creators_and_notes()
Expand Down Expand Up @@ -226,6 +227,76 @@ async def get_creators_and_notes(self) -> None:
# Get all comments of the creator's contents
await self.batch_get_content_comments(all_content_list)

async def get_note_detail(
    self, full_note_url: str, semaphore: asyncio.Semaphore
) -> Optional[ZhihuContent]:
    """
    Get the detail of one specified note (answer / article / zvideo).
    Args:
        full_note_url: full note URL, query string already stripped by caller
        semaphore: concurrency limiter shared by all detail tasks
    Returns:
        Parsed ZhihuContent, or None when the URL type is not recognized
        or extraction fails.
    """
    async with semaphore:
        utils.logger.info(
            f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
        )
        # judge note type from the URL shape
        note_type: str = judge_zhihu_url(full_note_url)
        # hoist the split: every branch parses IDs from the same path segments
        url_parts = full_note_url.split("/")
        if note_type == constant.ANSWER_NAME:
            # e.g. https://www.zhihu.com/question/<qid>/answer/<aid>
            question_id = url_parts[-3]
            answer_id = url_parts[-1]
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
            )
            return await self.zhihu_client.get_answer_info(question_id, answer_id)

        elif note_type == constant.ARTICLE_NAME:
            article_id = url_parts[-1]
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
            )
            return await self.zhihu_client.get_article_info(article_id)

        elif note_type == constant.VIDEO_NAME:
            video_id = url_parts[-1]
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
            )
            return await self.zhihu_client.get_video_info(video_id)

        # Previously an unrecognized URL fell through and returned an
        # implicit None with no trace; log it so bad config entries surface.
        utils.logger.warning(
            f"[ZhihuCrawler.get_specified_notes] Unknown note type for url: {full_note_url}"
        )
        return None

async def get_specified_notes(self):
    """
    Get the information and comments of the posts listed in
    config.ZHIHU_SPECIFIED_ID_LIST.
    Returns:
    """
    # One semaphore shared by ALL detail tasks. The original code created a
    # fresh Semaphore(MAX_CONCURRENCY_NUM) per task inside the loop, so each
    # task had its own private counter and the concurrency limit was never
    # actually enforced.
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    get_note_detail_task_list = []
    for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST:
        # remove query params
        full_note_url = full_note_url.split("?")[0]
        crawler_task = self.get_note_detail(
            full_note_url=full_note_url,
            semaphore=semaphore,
        )
        get_note_detail_task_list.append(crawler_task)

    need_get_comment_notes: List[ZhihuContent] = []
    note_details = await asyncio.gather(*get_note_detail_task_list)
    for index, note_detail in enumerate(note_details):
        if not note_detail:
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found"
            )
            continue

        note_detail = cast(ZhihuContent, note_detail)  # only for type check
        need_get_comment_notes.append(note_detail)
        await zhihu_store.update_zhihu_content(note_detail)

    await self.batch_get_content_comments(need_get_comment_notes)

@staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
Expand Down
95 changes: 92 additions & 3 deletions media_platform/zhihu/help.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,15 +159,13 @@ def _extract_zvideo_content(self, zvideo: Dict) -> ZhihuContent:
res = ZhihuContent()

if "video" in zvideo and isinstance(zvideo.get("video"), dict): # 说明是从创作者主页的视频列表接口来的
res.content_id = zvideo.get("video").get("video_id")
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
res.created_time = zvideo.get("published_at")
res.updated_time = zvideo.get("updated_at")
else:
res.content_id = zvideo.get("zvideo_id")
res.content_url = zvideo.get("video_url")
res.created_time = zvideo.get("created_at")

res.content_id = zvideo.get("id")
res.content_type = zvideo.get("type")
res.title = extract_text_from_html(zvideo.get("title"))
res.desc = extract_text_from_html(zvideo.get("description"))
Expand Down Expand Up @@ -369,3 +367,94 @@ def extract_content_list_from_creator(self, anwser_list: List[Dict]) -> List[Zhi
return []

return self._extract_content_list(anwser_list)




def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract a Zhihu answer from the JSON state embedded in its HTML page.
    Args:
        html_content: raw HTML of the answer page
    Returns:
        Parsed ZhihuContent, or None when the embedded state or the
        answers entity table is missing
    """
    raw_state = (
        Selector(text=html_content)
        .xpath("//script[@id='js-initialData']/text()")
        .get(default="")
    )
    if not raw_state:
        return None
    entities: Dict = json.loads(raw_state).get("initialState", {}).get("entities", {})
    answers: Dict = entities.get("answers", {})
    if not answers:
        return None

    # The page embeds a single answer; take the first (only) entry.
    first_answer = next(iter(answers.values()))
    return self._extract_answer_content(first_answer)

def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract a Zhihu column article from the JSON state embedded in its HTML page.
    Args:
        html_content: raw HTML of the article page
    Returns:
        Parsed ZhihuContent, or None when the embedded state or the
        articles entity table is missing
    """
    raw_state = (
        Selector(text=html_content)
        .xpath("//script[@id='js-initialData']/text()")
        .get(default="")
    )
    if not raw_state:
        return None
    entities: Dict = json.loads(raw_state).get("initialState", {}).get("entities", {})
    articles: Dict = entities.get("articles", {})
    if not articles:
        return None

    # The page embeds a single article; take the first (only) entry.
    first_article = next(iter(articles.values()))
    return self._extract_article_content(first_article)

def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract a Zhihu zvideo from the JSON state embedded in its HTML page.
    Args:
        html_content: raw HTML of the zvideo page
    Returns:
        Parsed ZhihuContent, or None when the embedded state or the
        zvideos entity table is missing
    """
    raw_state = (
        Selector(text=html_content)
        .xpath("//script[@id='js-initialData']/text()")
        .get(default="")
    )
    if not raw_state:
        return None
    entities: Dict = json.loads(raw_state).get("initialState", {}).get("entities", {})
    zvideos: Dict = entities.get("zvideos", {})
    users: Dict = entities.get("users", {})
    if not zvideos:
        return None

    # handler user info and video info
    video_detail_info: Dict = next(iter(zvideos.values()))
    if not video_detail_info:
        return None
    # "author" may be stored as a string key referencing the users table;
    # resolve it to the full user dict before extraction.
    if isinstance(video_detail_info.get("author"), str):
        author_name: str = video_detail_info.get("author")
        video_detail_info["author"] = users.get(author_name)

    return self._extract_zvideo_content(video_detail_info)


def judge_zhihu_url(note_detail_url: str) -> str:
    """
    Classify a Zhihu note URL by the content type its path encodes.
    Args:
        note_detail_url:
            eg1: https://www.zhihu.com/question/123456789/answer/123456789 # answer
            eg2: https://www.zhihu.com/p/123456789 # article
            eg3: https://www.zhihu.com/zvideo/123456789 # zvideo
    Returns:
        One of the zhihu_constant *_NAME values, or "" when unrecognized.
    """
    marker_to_type = (
        ("/answer/", zhihu_constant.ANSWER_NAME),
        ("/p/", zhihu_constant.ARTICLE_NAME),
        ("/zvideo/", zhihu_constant.VIDEO_NAME),
    )
    for marker, note_type in marker_to_type:
        if marker in note_detail_url:
            return note_type
    return ""

0 comments on commit ea5223c

Please sign in to comment.