change to httpx
MorvanZhou committed Nov 29, 2023
1 parent 6cd0d58 commit e0c2805
Showing 8 changed files with 165 additions and 53 deletions.
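
This commit replaces the blocking requests client with the asynchronous httpx client in the file-fetching and link-preview code, strips HTML tags before indexing and snippet generation, and updates the tests to mock httpx instead of hitting the network. A minimal sketch of the requests-to-httpx migration pattern applied here (illustrative function and error handling, not the exact project code):

import httpx


async def fetch_bytes(url: str) -> bytes:
    # Old blocking style was roughly: response = requests.get(url, timeout=5)
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(url, timeout=5.0)
        except httpx.HTTPError as e:
            # Base class covering ConnectTimeout, ReadTimeout, ConnectError, etc.
            raise RuntimeError(f"failed to get {url}: {e}") from e
    if response.status_code != 200:
        raise RuntimeError(f"unexpected status {response.status_code} for {url}")
    return response.content
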
6 changes: 0 additions & 6 deletions src/rethink/dist-local/css/app.09f0dec8.css
@@ -3175,7 +3175,6 @@ img[data-v-4f42ee16] {
.at-search-result[data-v-5527cb1e]:hover {
background-color: #eeeeee;
}

.result-hl[data-v-5527cb1e] {
font-size: 0.8em;
color: #666;
@@ -3196,15 +3195,12 @@ img[data-v-4f42ee16] {
height: 100%;
margin: 0 auto;
}

.circle-bg[data-v-1bc9a6f2] {
background: #FEFEFE;
}

.circle[data-v-1bc9a6f2] {
height: 50px;
}

.dots[data-v-1bc9a6f2] {
width: 60px;
}
@@ -3219,11 +3215,9 @@ img[data-v-4f42ee16] {
max-height: 400px;
overflow-y: auto;
}

.at-search-height-sm[data-v-e789ed84] {
max-height: 600px !important;
}

.at-search-result-group-label[data-v-e789ed84] {
font-weight: 500;
font-size: 0.9em;
10 changes: 5 additions & 5 deletions src/rethink/dist-local/js/app.js

Large diffs are not rendered by default.

42 changes: 28 additions & 14 deletions src/rethink/models/files/upload.py
@@ -7,8 +7,8 @@
import zipfile
from typing import List, Tuple, Optional

import httpx
import pymongo.errors
import requests
from bson import ObjectId
from bson.tz_util import utc
from fastapi import UploadFile
@@ -630,20 +630,34 @@ async def fetch_image_vditor(uid: str, url: str) -> Tuple[str, const.Code]:
return "", code
if await models.user.user_space_not_enough(u=u):
return "", const.Code.USER_SPACE_NOT_ENOUGH
async with httpx.AsyncClient() as client:
try:
response = await client.get(
url=url,
headers=models.utils.ASYNC_CLIENT_HEADERS,
timeout=5.
)
except (
httpx.ConnectTimeout,
RuntimeError,
httpx.ConnectError,
httpx.ReadTimeout,
httpx.HTTPError
) as e:
logger.info(f"failed to get {url}: {e}")
return "", const.Code.FILE_OPEN_ERROR
if response.status_code != 200:
return "", const.Code.FILE_OPEN_ERROR

content = response.content

file = UploadFile(
filename=url.split("/")[-1],
file=io.BytesIO(content),
headers=Headers(response.headers),
size=len(content)
)

try:
r = requests.get(url)
except requests.exceptions.RequestException:
return url, const.Code.OK

if r.status_code != 200:
return "", const.Code.FILE_OPEN_ERROR
file = UploadFile(
filename=url.split("/")[-1],
file=io.BytesIO(r.content),
headers=Headers(r.headers),
size=len(r.content)
)
res = await file_ops.save_upload_files(
uid=uid,
files=[file],
9 changes: 9 additions & 0 deletions src/rethink/models/search_engine/engine.py
@@ -4,6 +4,7 @@
from typing import List, Tuple, Sequence

from rethink import const
from rethink.models.utils import strip_html_tags


@dataclass
@@ -12,6 +13,10 @@ class SearchDoc:
title: str
body: str

def __post_init__(self):
self.title = strip_html_tags(self.title)
self.body = strip_html_tags(self.body)


@dataclass
class RestoreSearchDoc:
@@ -23,6 +28,10 @@ class RestoreSearchDoc:
disabled: bool
inTrash: bool

def __post_init__(self):
self.title = strip_html_tags(self.title)
self.body = strip_html_tags(self.body)


@dataclass
class SearchResult:
46 changes: 37 additions & 9 deletions src/rethink/models/utils.py
@@ -3,6 +3,8 @@
import math
import re
import uuid
from html.parser import HTMLParser
from io import StringIO
from typing import Tuple

import httpx
@@ -68,7 +70,7 @@ def preprocess_md(md: str, snippet_len: int = 200) -> Tuple[str, str, str]:
title, body = split_title_body(fulltext=md)
title = md2txt(title.strip())
body = md2txt(body.strip())
snippet = body[:snippet_len]
snippet = strip_html_tags(body)[:snippet_len]
return title, body, snippet


@@ -156,14 +158,14 @@ def contain_only_http_link(md: str) -> str:

async def get_title_description_from_link(url: str, language: str) -> Tuple[str, str]:
if language == const.Language.ZH.value:
title = "网址没发现标题"
description = "网址没发现描述"
no_title = "网址没发现标题"
no_description = "网址没发现描述"
elif language == const.Language.EN.value:
title = "No title found"
description = "No description found"
no_title = "No title found"
no_description = "No description found"
else:
title = "No title found"
description = "No description found"
no_title = "No title found"
no_description = "No description found"
async with httpx.AsyncClient() as client:
try:
response = await client.get(
@@ -179,14 +181,15 @@ async def get_title_description_from_link(url: str, language: str) -> Tuple[str,
httpx.HTTPError
) as e:
logger.info(f"failed to get {url}: {e}")
return title, description
return no_title, no_description
if response.status_code in [302, 301]:
url = response.headers["Location"]
return await get_title_description_from_link(url=url, language=language)
if response.status_code != 200:
return title, description
return no_title, no_description
html = response.text

title, description = "", ""
found = re.search(r'<meta[^>]*name="title"[^>]*content="([^"]*)"[^>]*>', html, re.DOTALL)
if found is None:
found = re.search(r'<meta[^>]*content="([^"]*)"[^>]*name="title"[^>]*>', html, re.DOTALL)
@@ -200,4 +203,29 @@ async def get_title_description_from_link(url: str, language: str) -> Tuple[str,
found = re.search(r'<meta[^>]*content="([^"]*)"[^>]*name="description"[^>]*>', html, re.DOTALL)
if found:
description = found.group(1).strip()[:400]
if title == "":
title = no_title
if description == "":
description = no_description
return title, description


class MLStripper(HTMLParser):
def __init__(self):
super().__init__()
self.reset()
self.strict = False
self.convert_charrefs = True
self.text = StringIO()

def handle_data(self, d):
self.text.write(d)

def get_data(self):
return self.text.getvalue()


def strip_html_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
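
The strip_html_tags helper above is what SearchDoc, RestoreSearchDoc, and the snippet generation now call to drop inline markup before text reaches the search index. A minimal usage sketch (illustrative input string):

from rethink.models.utils import strip_html_tags

# Only the text nodes survive; tags and attributes are discarded.
assert strip_html_tags("<p>Hello <b>world</b>!</p>") == "Hello world!"
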
7 changes: 6 additions & 1 deletion tests/test_api.py
@@ -6,6 +6,7 @@
import unittest
from pathlib import Path
from typing import Dict
from unittest.mock import patch
from zipfile import ZipFile

from PIL import Image
@@ -513,7 +514,11 @@ def test_upload_image(self):
f1.close()
shutil.rmtree("temp", ignore_errors=True)

def test_put_quick_node(self):
@patch(
"rethink.models.utils.httpx.AsyncClient.get",
return_value=Response(200, content="<title>百度一下</title>".encode("utf-8"))
)
def test_put_quick_node(self, mocker):
resp = self.client.put(
"/api/node/quick",
json={
25 changes: 18 additions & 7 deletions tests/test_models_local.py
@@ -4,8 +4,9 @@
from io import BytesIO
from pathlib import Path
from textwrap import dedent
from unittest.mock import patch

import requests
import httpx
from PIL import Image
from bson import ObjectId
from bson.tz_util import utc
@@ -351,7 +352,7 @@ async def test_files_upload_process(self):
"startAt": now,
"running": True,
"obsidian": {},
"problemFiles": [],
"msg": "",
"code": const.Code.OK.value,
}
res = await models.database.COLL.import_data.insert_one(doc)
@@ -409,22 +410,32 @@ async def test_upload_image_vditor(self):
u, code = await models.user.get(self.uid)
self.assertEqual(used_space + size, u["usedSpace"])

async def test_fetch_image_vditor(self):
@patch(
"rethink.models.files.upload.httpx.AsyncClient.get",
)
async def test_fetch_image_vditor(self, mock_get):
f = open(Path(__file__).parent.parent / "img" / "phone-notes.png", "rb")
mock_get.return_value = httpx.Response(
200,
content=f.read(),
headers={"content-type": "image/png"}
)

u, code = await models.user.get(self.uid)
used_space = u["usedSpace"]

url = "https://rethink.run/favicon.ico"
url = "https://rethink.run/favicon.png"
new_url, code = await models.files.fetch_image_vditor(self.uid, url)
self.assertEqual(const.Code.OK, code)
self.assertTrue(new_url.endswith(".ico"))
self.assertTrue(new_url.endswith(".png"))
self.assertTrue(new_url.startswith("/"))
local_file = Path(__file__).parent / "tmp" / ".data" / new_url[1:]
self.assertTrue(local_file.exists())
local_file.unlink()

u, code = await models.user.get(self.uid)
r = requests.get(url)
self.assertEqual(used_space + len(r.content), u["usedSpace"])
self.assertEqual(used_space + f.tell(), u["usedSpace"])
f.close()

async def test_update_used_space(self):
u, code = await models.user.get(self.uid)
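
The tests above patch httpx.AsyncClient.get and return a prebuilt httpx.Response, so no network access is needed. The same pattern reduced to a self-contained example (hypothetical fetch_text helper; relies on unittest.mock substituting an AsyncMock for the async method on Python 3.8+):

import asyncio
from unittest.mock import patch

import httpx


async def fetch_text(url: str) -> str:
    async with httpx.AsyncClient() as client:
        resp = await client.get(url)
        return resp.text


@patch("httpx.AsyncClient.get", return_value=httpx.Response(200, content=b"ok"))
def test_fetch_text(mock_get):
    # The patched async method returns the canned response without any I/O.
    assert asyncio.run(fetch_text("https://example.com")) == "ok"
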
73 changes: 62 additions & 11 deletions tests/test_models_utils.py
@@ -1,5 +1,8 @@
import unittest
from textwrap import dedent
from unittest.mock import patch

import httpx

from rethink import const, config
from rethink.models import utils
@@ -92,18 +95,66 @@ def setUpClass(cls) -> None:
def tearDownClass(cls) -> None:
config.get_settings.cache_clear()

@unittest.skip("skip outer connection test")
async def test_get_title_description_from_link(self):
for url, res in [
("https://github.com/MorvanZhou/rethink", True),
# ("https://zhuanlan.zhihu.com/p/610939462?utm_id=0", True),
("https://waqwe12f2f2fa.fffffffff", False),
("https://baidu.com", True),
("https://rethink.run", True),
("https://rethink.run/about", True),
("https://baidu.com/wqwqqqqq", False),
("https://mp.weixin.qq.com/s/jbB0GXbjHpFR8m1-6TSASw", True),
# @unittest.skip("skip outer connection test")
@patch(
"rethink.models.utils.httpx.AsyncClient.get",
)
async def test_get_title_description_from_link(self, mock_get):
for url, content, res in [
(
"https://github.com/MorvanZhou/rethink",
"<title>MorvanZhou/rethink: Rethink: a note taking web app</title>"
"""<meta name="description" content="Rethink: a note taking web app. Contribute to
MorvanZhou/rethink development by creating an account on GitHub.">""",
True
),
(
"https://zhuanlan.zhihu.com/p/610939462?utm_id=0",
"""<head>
<meta charSet="utf-8"/>
<title data-rh="true">python的httpx库如何使用 - 知乎</title>
<meta data-rh="true" name="description" content="httpx是一个基于Python的异步HTTP客户端库,
可以用于发送HTTP请求和接收HTTP响应。以下是一些httpx库的基本使用方法:
发送HTTP GET请求import httpx async with httpx.AsyncClient() as client: response = await…"/>""",
True
),
(
"https://waqwe12f2f2fa.fffffffff",
"",
False
),
(
"https://baidu.com",
"""<title>百度一下,你就知道</title>
<meta name="description" content="全球领先的中文搜索引擎、
致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。">""",
True
),
(
"https://rethink.run",
"""<meta content="Rethink" name="title"><title>rethink</title><meta content="Rethink: think differently" name="description">""",
True
),
(
"https://baidu.com/wqwqqqqq",
"",
False
),
(
"https://mp.weixin.qq.com/s/jbB0GXbjHpFR8m1-6TSASw",
"""<title></title><meta name="description" content="" />""",
False),
]:
if res:
mock_get.return_value = httpx.Response(
status_code=200,
content=content.encode("utf-8"),
)
else:
mock_get.return_value = httpx.Response(
status_code=404,
content=content.encode("utf-8"),
)
title, desc = await utils.get_title_description_from_link(
url, language=const.Language.EN.value)
if res:
