Merge pull request #180 from NekoAria/2.0
V2.4.0
Quan authored Sep 12, 2021
2 parents 9584803 + a709b73 commit 98f5ff7
Showing 17 changed files with 268 additions and 186 deletions.
.env: 2 changes (1 addition & 1 deletion)
@@ -1,2 +1,2 @@
ENVIRONMENT=prod
VERSION='v2.3.1'
VERSION='v2.4.0'
.env.dev: 1 change (0 additions & 1 deletion)
@@ -10,7 +10,6 @@ RSS_PROXY="127.0.0.1:7890" # Proxy address
RSSHUB="https://rsshub.app" # RSSHub subscription URL
RSSHUB_BACKUP=[] # Backup RSSHub URLs, e.g. ["https://rsshub.app","https://rsshub.app"]; double quotes are required!
DB_CACHE_EXPIRE=30 # Number of days before records in the deduplication database are cleaned up
LIMIT=50 # Number of cached RSS entries

# Image compression
ZIP_SIZE=2048 # Maximum width/height of a non-GIF image after compression, in px
pyproject.toml: 3 changes (2 additions & 1 deletion)
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ELF_RSS"
version = "2.3.1"
version = "2.4.0"
description = "ELF_RSS"
authors = ["Quan666"]
license = "GPL v3"
@@ -31,6 +31,7 @@ pydantic = "~=1.8.1"
pyquery = "~=1.4.3"
python-qbittorrent = "~=0.4.2"
tenacity = "~=7.0.0"
tinydb = "~=4.5.1"
typing-extensions = "~=3.7.4.3"

[tool.poetry.dev-dependencies]
requirements.txt: 3 changes (2 additions & 1 deletion)
@@ -17,4 +17,5 @@ nonebot-adapter-cqhttp~=2.0.0a15
ImageHash~=4.2.0
pydantic~=1.8.1
tenacity~=7.0.0
bbcode~=1.1.0
bbcode~=1.1.0
tinydb~=4.5.1
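
The newly added tinydb dependency backs the per-feed JSON caches (data/<rss name>.json) that the parsing routes now read and write. A minimal sketch of the usage pattern, with a hypothetical cache file name:

    from tinydb import TinyDB, Query

    # Hypothetical cache file; the plugin itself uses data/<rss.name>.json
    db = TinyDB(
        "data/example.json",
        encoding="utf-8",
        sort_keys=True,
        indent=4,
        ensure_ascii=False,
    )

    # Cache a parsed entry, then look it up by its hash
    db.insert({"hash": "0123abcd", "title": "example entry"})
    print(db.search(Query().hash == "0123abcd"))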
setup.py: 2 changes (1 addition & 1 deletion)
@@ -7,7 +7,7 @@

setuptools.setup(
name="ELF_RSS",
version="2.3.1",
version="2.4.0",
author="Quan666",
author_email="i@oy.mk",
description="QQ机器人 RSS订阅 插件,订阅源建议选择 RSSHub",
src/plugins/ELF_RSS2/RSS/routes/Parsing/__init__.py: 45 changes (28 additions & 17 deletions)
@@ -11,7 +11,12 @@

from . import check_update, send_message
from .download_torrent import down_torrent
from .duplicate_filter import cache_db_manage, duplicate_exists, insert_into_cache_db
from .duplicate_filter import (
cache_db_manage,
cache_json_manage,
duplicate_exists,
insert_into_cache_db,
)
from .handle_html_tag import handle_bbcode
from .handle_html_tag import handle_html_tag
from .handle_images import handle_img
@@ -193,15 +198,13 @@ def __init__(self, rss: Rss):
]

# Start parsing
async def start(self, new_rss: dict, old_data: list):
# new_data is the complete parsed RSS dict; old_data is a list
async def start(self, new_rss: dict):
# new_data is the complete parsed RSS dict
# Preprocessing
self.state.update(
{
"rss_title": new_rss.get("feed").get("title"),
"new_rss": new_rss,
"new_data": new_rss.get("entries"),
"old_data": old_data,
"change_data": [], # 更新的消息列表
"conn": None, # 数据库连接
}
@@ -251,46 +254,44 @@ async def start(self, new_rss: dict, old_data: list):
# Check for updates
@ParsingBase.append_before_handler(priority=10)
async def handle_check_update(rss: Rss, state: dict):
change_data = await check_update.check_update(
state.get("new_data"), state.get("old_data")
)
_file = FILE_PATH + (rss.name + ".json")
change_data = await check_update.check_update(_file, state.get("new_data"))
return {"change_data": change_data}


# Check whether the push conditions are met
@ParsingBase.append_before_handler(priority=11)
async def handle_check_update(rss: Rss, state: dict):
change_data = state.get("change_data")
new_rss = state.get("new_rss")
for item in change_data.copy():
summary = get_summary(item)
# Check for blocked words
if config.black_word and re.findall("|".join(config.black_word), summary):
logger.info("内含屏蔽词,已经取消推送该消息")
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
change_data.remove(item)
continue
# Check for keyword matches using the down_torrent_keyword field; the name is a historical leftover, it actually acts as whitelist keywords
if rss.down_torrent_keyword and not re.search(
rss.down_torrent_keyword, summary
):
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
change_data.remove(item)
continue
# Check for blacklist keyword matches using the black_keyword field
if rss.black_keyword and (
re.search(rss.black_keyword, item["title"])
or re.search(rss.black_keyword, summary)
):
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
change_data.remove(item)
continue
# Check whether to push only messages that contain images
if (rss.only_pic or rss.only_has_pic) and not re.search(
r"<img.+?>|\[img]", summary
):
logger.info(f"{rss.name} 已开启仅图片/仅含有图片,该消息没有图片,将跳过")
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
change_data.remove(item)

return {"change_data": change_data}
@@ -300,7 +301,6 @@ async def handle_check_update(rss: Rss, state: dict):
@ParsingBase.append_before_handler(priority=12)
async def handle_check_update(rss: Rss, state: dict):
change_data = state.get("change_data")
new_rss = state.get("new_rss")
conn = state.get("conn")

# Check whether deduplication is enabled, using the duplicate_filter_mode field
@@ -312,6 +312,7 @@ async def handle_check_update(rss: Rss, state: dict):
conn.set_trace_callback(logger.debug)

await cache_db_manage(conn)
await cache_json_manage(FILE_PATH + (rss.name + ".json"))

delete = []
for index, item in enumerate(change_data):
@@ -324,7 +325,7 @@ async def handle_check_update(rss: Rss, state: dict):
summary=summary,
)
if is_duplicate:
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
delete.append(index)
else:
change_data[index]["image_hash"] = str(image_hash)
@@ -480,7 +481,7 @@ async def handle_torrent(
async def handle_date(
rss: Rss, state: dict, item: dict, item_msg: str, tmp: str, tmp_state: dict
) -> str:
date = (
date = tuple(
item.get("updated_parsed")
if item.get("updated_parsed")
else item.get("published_parsed")
@@ -503,7 +504,10 @@ async def handle_message(
) -> str:
# Send the message and write it to the file
if await send_message.send_msg(rss=rss, msg=item_msg, item=item):
write_item(rss=rss, new_rss=state.get("new_rss"), new_item=item)
if item.get("to_send"):
item.pop("to_send")
item.pop("count")
write_item(name=rss.name, new_item=item)

if rss.duplicate_filter_mode:
image_hash = item["image_hash"]
Expand All @@ -512,6 +516,13 @@ async def handle_message(
)

state["item_count"] += 1
else:
item["to_send"] = True
if not item.get("count"):
item["count"] = 1
else:
item["count"] += 1
write_item(name=rss.name, new_item=item)

return ""

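The new else branch above records failed sends so that later runs can retry them: the item is flagged with to_send, its count is incremented, and check_update re-queues flagged items until count exceeds 3. A small stand-alone sketch of that bookkeeping (mark_failed_send is a hypothetical stand-in for the write_item call, which is defined elsewhere in this module):

    def mark_failed_send(item: dict) -> dict:
        # Flag the entry for retry and bump its attempt counter; check_update
        # only re-queues entries with to_send set and count <= 3.
        item["to_send"] = True
        item["count"] = item.get("count", 0) + 1
        return item

    entry = {"title": "example entry", "hash": "0123abcd"}
    for _ in range(4):
        entry = mark_failed_send(entry)
    print(entry["count"])  # 4 -> dropped by check_update's count <= 3 filter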
src/plugins/ELF_RSS2/RSS/routes/Parsing/check_update.py: 48 changes (24 additions & 24 deletions)
@@ -1,13 +1,15 @@
import hashlib
import json
import time

from tinydb import TinyDB, Query
from typing import Dict, Any


# Handle dates
async def handle_date(date=None) -> str:
if date:
if not isinstance(date, tuple):
date = tuple(date)
rss_time = time.mktime(date)
# Time zone offset handling; needs improvement
if rss_time + 28800.0 < time.time():
@@ -18,46 +20,44 @@ async def handle_date(date=None) -> str:
return "日期:" + time.strftime("%m月%d日 %H:%M:%S", time.localtime())


# Convert the dict object to a JSON string, then compute its hash for later comparison
# Compute a hash of the dict object for later comparison
def dict_hash(dictionary: Dict[str, Any]) -> str:
keys = ["id", "link", "published", "updated", "title"]
dictionary_temp = {k: dictionary[k] for k in keys if k in dictionary}
d_hash = hashlib.md5()
encoded = json.dumps(dictionary_temp, sort_keys=True).encode()
d_hash.update(encoded)
return d_hash.hexdigest()
string = "|".join([dictionary[k] for k in keys if k in dictionary])
result = hashlib.md5(string.encode())
return result.hexdigest()


# Check for updates
async def check_update(new: list, old: list) -> list:
# Some feeds may have no entries (e.g. the live status of a Bilibili live room); skip them directly
if not new:
async def check_update(_file: str, new: list) -> list:
db = TinyDB(
_file,
encoding="utf-8",
sort_keys=True,
indent=4,
ensure_ascii=False,
)
# Messages that have failed to send more than 3 times are no longer sent
to_send_list = db.search(
(Query().to_send.exists()) & (Query().count.test(lambda x: x <= 3))
)

if not new and not to_send_list:
return []

old_hash_list = [dict_hash(i) if not i.get("hash") else i.get("hash") for i in old]
# Compare the local message cache with the fetched messages; new entries get a stored hash, gradually replacing cached records without hashes as update checks repeat
temp = []
hash_list = []
old_hash_list = [r.get("hash") for r in db.all()]
for i in new:
hash_temp = dict_hash(i)
if hash_temp not in old_hash_list:
i["hash"] = hash_temp
temp.append(i)
hash_list.append(hash_temp)

# Deduplicate the results to avoid sending duplicate messages
result = [
value
for index, value in enumerate(temp)
if value["hash"] not in hash_list[index + 1 :]
]
to_send_list.append(i)

# Sort the results by publication time
result_with_date = [
(await handle_date(i.get("updated_parsed")), i)
if i.get("updated_parsed")
else (await handle_date(i.get("published_parsed")), i)
for i in result
for i in to_send_list
]
result_with_date.sort(key=lambda tup: tup[0])
result = [i for key, i in result_with_date]
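For clarity, the reworked dict_hash no longer hashes a JSON dump of the selected fields; it joins the raw field values with "|" and hashes that string (the values are assumed to be strings, as they are in parsed feed entries). A runnable excerpt of the new version:

    import hashlib
    from typing import Any, Dict

    def dict_hash(dictionary: Dict[str, Any]) -> str:
        # Same logic as the new check_update.dict_hash shown above
        keys = ["id", "link", "published", "updated", "title"]
        string = "|".join([dictionary[k] for k in keys if k in dictionary])
        return hashlib.md5(string.encode()).hexdigest()

    entry = {
        "id": "https://example.com/post/1",
        "link": "https://example.com/post/1",
        "title": "example entry",
    }
    print(dict_hash(entry))  # stable digest used to detect already-seen entries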
src/plugins/ELF_RSS2/RSS/routes/Parsing/duplicate_filter.py: 37 changes (36 additions & 1 deletion)
@@ -1,15 +1,22 @@
import sqlite3
import datetime
import imagehash
import os
import sqlite3
import time

from PIL import Image, UnidentifiedImageError
from io import BytesIO
from nonebot.log import logger
from pathlib import Path
from pyquery import PyQuery as Pq
from tinydb import TinyDB, Query

from .handle_images import download_image
from ... import rss_class
from ....config import config

FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep)


# Manage the deduplication database
async def cache_db_manage(conn: sqlite3.connect) -> None:
@@ -38,6 +45,34 @@ async def cache_db_manage(conn: sqlite3.connect) -> None:
conn.commit()


# Manage the JSON cache
async def cache_json_manage(_file: str) -> None:
db = TinyDB(
_file,
encoding="utf-8",
sort_keys=True,
indent=4,
ensure_ascii=False,
)
expired_date = datetime.datetime.utcnow() - datetime.timedelta(
days=config.db_cache_expire
)
expired_timestamp = datetime.datetime.timestamp(expired_date)
# Remove records older than config.db_cache_expire days
db.remove(
(
Query().published_parsed.test(
lambda x: time.mktime(tuple(x)) <= expired_timestamp
)
)
| (
Query().updated_parsed.test(
lambda x: time.mktime(tuple(x)) <= expired_timestamp
)
)
)


# Duplicate check
async def duplicate_exists(
rss: rss_class.Rss, conn: sqlite3.connect, link: str, title: str, summary: str
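One detail worth noting in cache_json_manage: TinyDB stores the feedparser time tuples as JSON lists, which is why they are wrapped in tuple() before being passed to time.mktime (the same conversion was added to handle_date in check_update.py). A small worked example of the expiry arithmetic, assuming DB_CACHE_EXPIRE=30:

    import datetime
    import time

    DB_CACHE_EXPIRE = 30  # days, mirroring the .env setting

    expired_date = datetime.datetime.utcnow() - datetime.timedelta(days=DB_CACHE_EXPIRE)
    expired_timestamp = datetime.datetime.timestamp(expired_date)

    # A struct_time-style value as feedparser's published_parsed round-trips
    # through the JSON cache (a list, hence the tuple() call)
    published_parsed = [2021, 8, 1, 12, 0, 0, 6, 213, 0]
    is_expired = time.mktime(tuple(published_parsed)) <= expired_timestamp
    print(is_expired)  # True when run after 2021-08-31, e.g. at the time of this commit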