From c1edd3a6fb0d5f0c8d36cbf374620f1c822def51 Mon Sep 17 00:00:00 2001 From: NekoAria <990879119@qq.com> Date: Sat, 11 Sep 2021 11:01:55 +0800 Subject: [PATCH 1/4] =?UTF-8?q?:recycle:=20=E9=87=8D=E6=9E=84=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=20-=20=E5=B0=86=20`rss.json`=20=E6=94=B9=E9=80=A0?= =?UTF-8?q?=E4=B8=BA=20`tinydb`=20=E6=95=B0=E6=8D=AE=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 1 + requirements.txt | 3 +- src/plugins/ELF_RSS2/RSS/rss_class.py | 114 ++++++++++++------------ src/plugins/ELF_RSS2/RSS/rss_parsing.py | 4 - src/plugins/ELF_RSS2/change_dy.py | 14 ++- src/plugins/ELF_RSS2/del_dy.py | 9 +- src/plugins/ELF_RSS2/start.py | 41 ++++++++- 7 files changed, 114 insertions(+), 72 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d50dcf3c..e8a4f4f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ pydantic = "~=1.8.1" pyquery = "~=1.4.3" python-qbittorrent = "~=0.4.2" tenacity = "~=7.0.0" +tinydb = "~=4.5.1" typing-extensions = "~=3.7.4.3" [tool.poetry.dev-dependencies] diff --git a/requirements.txt b/requirements.txt index 14298a95..511ee16f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,4 +17,5 @@ nonebot-adapter-cqhttp~=2.0.0a15 ImageHash~=4.2.0 pydantic~=1.8.1 tenacity~=7.0.0 -bbcode~=1.1.0 \ No newline at end of file +bbcode~=1.1.0 +tinydb~=4.5.1 \ No newline at end of file diff --git a/src/plugins/ELF_RSS2/RSS/rss_class.py b/src/plugins/ELF_RSS2/RSS/rss_class.py index 88fc4572..d0d9a42a 100644 --- a/src/plugins/ELF_RSS2/RSS/rss_class.py +++ b/src/plugins/ELF_RSS2/RSS/rss_class.py @@ -1,10 +1,11 @@ -import codecs import json import os import re -from pathlib import Path from nonebot.log import logger +from pathlib import Path +from tinydb import TinyDB, Query +from tinydb.operations import set from ..config import config @@ -104,46 +105,21 @@ def read_rss() -> list: if not os.path.isfile(str(FILE_PATH + "rss.json")): return [] rss_list = [] - with codecs.open(str(FILE_PATH + "rss.json"), "r", "utf-8") as load_f: - rss_list_json = json.load(load_f) - for rss_one in rss_list_json: - tmp_rss = Rss("", "", "-1", "-1") - if not isinstance(rss_one, str): - rss_one = json.dumps(rss_one) - tmp_rss.__dict__ = json.loads(rss_one) - rss_list.append(tmp_rss) + db = TinyDB( + str(FILE_PATH + "rss.json"), + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + for rss in db.all(): + tmp_rss = Rss("", "", "-1", "-1") + if not isinstance(rss, str): + rss = json.dumps(rss) + tmp_rss.__dict__ = json.loads(rss) + rss_list.append(tmp_rss) return rss_list - # 写入记录,传入rss list,不传就把当前 self 写入 - def write_rss(self, rss_new: list = None): - # 先读取订阅记录 - rss_old = self.read_rss() - # 把当前 self 写入 - if not rss_new: - rss_new = [self] - - for tmp_new in rss_new: - flag = True - for index, i_old in enumerate(rss_old): - # 如果有记录 就修改记录,没有就添加 - if i_old.name == tmp_new.name: - rss_old[index] = tmp_new - flag = False - break - if flag: - rss_old.append(tmp_new) - rss_json = [] - for rss_one in rss_old: - tmp = {} - tmp.update(rss_one.__dict__) - rss_json.append(tmp) - if not os.path.isdir(FILE_PATH): - os.makedirs(FILE_PATH) - with codecs.open(str(FILE_PATH + "rss.json"), "w", "utf-8") as dump_f: - dump_f.write( - json.dumps(rss_json, sort_keys=True, indent=4, ensure_ascii=False) - ) - # 查找是否存在当前订阅名 rss 要转换为 rss_ def find_name(self, name: str): # 过滤特殊字符 @@ -161,37 +137,54 @@ def add_user(self, user: str): if str(user) in self.user_id: return self.user_id.append(str(user)) - self.write_rss() + db = TinyDB( + str(FILE_PATH + "rss.json"), + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + db.upsert(self.__dict__, Query().name == self.name) # 添加订阅 群组 def add_group(self, group: str): if str(group) in self.group_id: return self.group_id.append(str(group)) - self.write_rss() + db = TinyDB( + str(FILE_PATH + "rss.json"), + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + db.upsert(self.__dict__, Query().name == self.name) # 删除订阅 群组 def delete_group(self, group: str) -> bool: if not str(group) in self.group_id: return False self.group_id.remove(str(group)) - self.write_rss() + db = TinyDB( + str(FILE_PATH + "rss.json"), + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + db.update(set("group_id", self.group_id), Query().name == self.name) return True # 删除整个订阅 - def delete_rss(self, delrss): - rss_old = self.read_rss() - rss_json = [] - for rss_one in rss_old: - if rss_one.name != delrss.name: - rss_json.append(rss_one.__dict__) - - if not os.path.isdir(FILE_PATH): - os.makedirs(FILE_PATH) - with codecs.open(str(FILE_PATH + "rss.json"), "w", "utf-8") as dump_f: - dump_f.write( - json.dumps(rss_json, sort_keys=True, indent=4, ensure_ascii=False) - ) + def delete_rss(self): + db = TinyDB( + str(FILE_PATH + "rss.json"), + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + db.remove(Query().name == self.name) self.delete_file() # 删除订阅json文件 @@ -225,7 +218,14 @@ def set_cookies(self, cookies_str: str) -> bool: name, value = line.strip().split("=") cookies[name] = value self.cookies = cookies - self.write_rss() + db = TinyDB( + str(FILE_PATH + "rss.json"), + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + db.update(set("cookies", cookies), Query().name == self.name) return True else: self.cookies = None diff --git a/src/plugins/ELF_RSS2/RSS/rss_parsing.py b/src/plugins/ELF_RSS2/RSS/rss_parsing.py index 903adeb0..de67a3f9 100644 --- a/src/plugins/ELF_RSS2/RSS/rss_parsing.py +++ b/src/plugins/ELF_RSS2/RSS/rss_parsing.py @@ -3,11 +3,9 @@ import asyncio import feedparser import httpx -import os.path import re from nonebot.log import logger -from pathlib import Path from tenacity import retry, stop_after_attempt, stop_after_delay, RetryError, TryAgain from . import rss_class @@ -15,8 +13,6 @@ from .routes.Parsing.read_or_write_rss_data import read_rss, write_rss from ..config import config -FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep) - STATUS_CODE = [200, 301, 302] # 去掉烦人的 returning true from eof_received() has no effect when using ssl httpx 警告 diff --git a/src/plugins/ELF_RSS2/change_dy.py b/src/plugins/ELF_RSS2/change_dy.py index 31396c99..5c0cd8dd 100644 --- a/src/plugins/ELF_RSS2/change_dy.py +++ b/src/plugins/ELF_RSS2/change_dy.py @@ -1,4 +1,5 @@ import copy +import os import re from nonebot import on_command @@ -7,11 +8,15 @@ from nonebot.adapters.cqhttp import Bot, Event, GroupMessageEvent, permission, unescape from nonebot.log import logger from nonebot.rule import to_me +from pathlib import Path +from tinydb import TinyDB, Query from .RSS import rss_class from .RSS import my_trigger as tr scheduler = require("nonebot_plugin_apscheduler").scheduler +# 存储目录 +FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep) RSS_CHANGE = on_command( "change", @@ -181,7 +186,14 @@ async def handle_rss_change(bot: Bot, event: Event, state: dict): else: setattr(rss, "content_to_remove", rm_list) # 参数解析完毕,写入 - rss.write_rss() + db = TinyDB( + str(FILE_PATH + "rss.json"), + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + db.update(rss.__dict__, Query().name == name) # 加入定时任务 if not rss.stop: await tr.add_job(rss) diff --git a/src/plugins/ELF_RSS2/del_dy.py b/src/plugins/ELF_RSS2/del_dy.py index 17e99257..78e91a91 100644 --- a/src/plugins/ELF_RSS2/del_dy.py +++ b/src/plugins/ELF_RSS2/del_dy.py @@ -1,6 +1,3 @@ -import os -from pathlib import Path - from nonebot import on_command from nonebot import permission as su from nonebot import require @@ -11,8 +8,6 @@ from .RSS import my_trigger as tr SCHEDULER = require("nonebot_plugin_apscheduler").scheduler -# 存储目录 -FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep) RSS_DELETE = on_command( "deldy", @@ -47,7 +42,7 @@ async def handle_rss_delete(bot: Bot, event: Event, state: dict): if group_id: if rss.delete_group(group=group_id): if not rss.group_id and not rss.user_id: - rss.delete_rss(rss) + rss.delete_rss() await tr.delete_job(rss) else: await tr.add_job(rss) @@ -55,6 +50,6 @@ async def handle_rss_delete(bot: Bot, event: Event, state: dict): else: await RSS_DELETE.send(f"❌ 当前群组没有订阅: {rss.name} !") else: - rss.delete_rss(rss) + rss.delete_rss() await tr.delete_job(rss) await RSS_DELETE.send(f"👏 订阅 {rss.name} 删除成功!") diff --git a/src/plugins/ELF_RSS2/start.py b/src/plugins/ELF_RSS2/start.py index e8c554b4..dd8d7211 100644 --- a/src/plugins/ELF_RSS2/start.py +++ b/src/plugins/ELF_RSS2/start.py @@ -1,10 +1,13 @@ import codecs +import json import nonebot import os import re + from nonebot import logger, on_metaevent from nonebot.adapters.cqhttp import Bot, Event, LifecycleMetaEvent from pathlib import Path +from tinydb import TinyDB from .config import config from .RSS import rss_class @@ -14,6 +17,7 @@ def hash_clear(): + json_paths = list(Path(FILE_PATH).glob("*.json")) for j in [str(i) for i in json_paths if i != "rss.json"]: @@ -27,6 +31,40 @@ def hash_clear(): f.write(line) +# 将 rss.json 改造为 tinydb 数据库 +def change_rss_json(): + + db = TinyDB( + str(FILE_PATH + "rss.json"), + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + + try: + db.all() + except TypeError: + + with codecs.open(str(FILE_PATH + "rss.json"), "r", "utf-8") as f: + rss_list_json = json.load(f) + + os.remove(str(FILE_PATH + "rss.json")) + + db = TinyDB( + str(FILE_PATH + "rss.json"), + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + + for rss_json in rss_list_json: + if not isinstance(rss_json, str): + rss_json = json.dumps(rss_json) + db.insert(json.loads(rss_json)) + + async def start(): (bot,) = nonebot.get_bots().values() @@ -49,8 +87,6 @@ async def start(): ), ) logger.info("ELF_RSS 订阅器启动成功!") - if config.version == "v2.2.7": - hash_clear() except Exception as e: await bot.send_msg( message_type="private", @@ -64,6 +100,7 @@ async def start(): ) logger.info("第一次启动,你还没有订阅,记得添加哟!") logger.debug(e) + raise async def check_first_connect(bot: Bot, event: Event, state: dict) -> bool: From 687ccdc238ebecbe40fc2255977ac5793cc82320 Mon Sep 17 00:00:00 2001 From: NekoAria <990879119@qq.com> Date: Sat, 11 Sep 2021 11:05:44 +0800 Subject: [PATCH 2/4] =?UTF-8?q?:bookmark:=20=E5=8F=91=E5=B8=83=E4=B8=80?= =?UTF-8?q?=E4=B8=AA=E7=89=88=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env | 2 +- pyproject.toml | 2 +- setup.py | 2 +- src/plugins/ELF_RSS2/start.py | 3 +++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.env b/.env index b887a7a5..a989df06 100644 --- a/.env +++ b/.env @@ -1,2 +1,2 @@ ENVIRONMENT=prod -VERSION='v2.3.1' +VERSION='v2.4.0' diff --git a/pyproject.toml b/pyproject.toml index e8a4f4f0..8b8bc314 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ELF_RSS" -version = "2.3.1" +version = "2.4.0" description = "ELF_RSS" authors = ["Quan666"] license = "GPL v3" diff --git a/setup.py b/setup.py index f6bf3ab0..43d4b006 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setuptools.setup( name="ELF_RSS", - version="2.3.1", + version="2.4.0", author="Quan666", author_email="i@oy.mk", description="QQ机器人 RSS订阅 插件,订阅源建议选择 RSSHub", diff --git a/src/plugins/ELF_RSS2/start.py b/src/plugins/ELF_RSS2/start.py index dd8d7211..c2ea9686 100644 --- a/src/plugins/ELF_RSS2/start.py +++ b/src/plugins/ELF_RSS2/start.py @@ -68,6 +68,9 @@ def change_rss_json(): async def start(): (bot,) = nonebot.get_bots().values() + if config.version == "v2.4.0": + change_rss_json() + try: rss = rss_class.Rss("", "", "-1", "-1") rss_list = rss.read_rss() # 读取list From e0209a4c56bbc2facbb49f774a6478419f9f08a5 Mon Sep 17 00:00:00 2001 From: NekoAria <990879119@qq.com> Date: Sat, 11 Sep 2021 11:27:07 +0800 Subject: [PATCH 3/4] =?UTF-8?q?:fire:=20=E7=A7=BB=E9=99=A4=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E6=88=96=E6=96=87=E4=BB=B6=20-=20=E7=A7=BB=E9=99=A4?= =?UTF-8?q?=E6=B2=A1=E7=94=A8=E5=88=B0=E7=9A=84=E5=B1=9E=E6=80=A7=20`sum`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/plugins/ELF_RSS2/RSS/rss_class.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/plugins/ELF_RSS2/RSS/rss_class.py b/src/plugins/ELF_RSS2/RSS/rss_class.py index d0d9a42a..bea811a8 100644 --- a/src/plugins/ELF_RSS2/RSS/rss_class.py +++ b/src/plugins/ELF_RSS2/RSS/rss_class.py @@ -20,7 +20,6 @@ class Rss: user_id = [] # 订阅用户(qq) -1 为空 group_id = [] # 订阅群组 img_proxy = False - sum = 20 # 加载条数 time = "5" # 更新频率 分钟/次 translation = False # 翻译 only_title = False # 仅标题 From aaf3644ab434c8b01aa47f806940ef6711eda916 Mon Sep 17 00:00:00 2001 From: NekoAria <990879119@qq.com> Date: Sun, 12 Sep 2021 16:25:52 +0800 Subject: [PATCH 4/4] =?UTF-8?q?:recycle:=20=E9=87=8D=E6=9E=84=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=20-=20=E5=B0=86=20`=E7=BC=93=E5=AD=98=20json`=20?= =?UTF-8?q?=E6=94=B9=E9=80=A0=E4=B8=BA=20`tinydb`=20=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=BA=93=20-=20=E9=87=8D=E6=9E=84=20`=E6=A3=80=E6=9F=A5?= =?UTF-8?q?=E6=9B=B4=E6=96=B0`=20=E7=9B=B8=E5=85=B3=E9=80=BB=E8=BE=91=20-?= =?UTF-8?q?=20=E9=87=8D=E6=9E=84=20`=E7=BC=93=E5=AD=98json=E7=9A=84?= =?UTF-8?q?=E8=AF=BB=E5=86=99`=20=E7=9B=B8=E5=85=B3=E9=80=BB=E8=BE=91=20-?= =?UTF-8?q?=20=E9=87=8D=E6=9E=84=20`dict=5Fhash()`=20=E7=9A=84=E9=80=BB?= =?UTF-8?q?=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.dev | 1 - .../ELF_RSS2/RSS/routes/Parsing/__init__.py | 45 ++++++++----- .../RSS/routes/Parsing/check_update.py | 48 +++++++------- .../RSS/routes/Parsing/duplicate_filter.py | 37 ++++++++++- .../routes/Parsing/read_or_write_rss_data.py | 58 ++++++----------- src/plugins/ELF_RSS2/RSS/routes/nga.py | 9 ++- src/plugins/ELF_RSS2/RSS/rss_class.py | 44 +++++-------- src/plugins/ELF_RSS2/RSS/rss_parsing.py | 24 +++++-- src/plugins/ELF_RSS2/add_dy.py | 4 +- src/plugins/ELF_RSS2/config.py | 1 - src/plugins/ELF_RSS2/start.py | 65 ++++++++++--------- 11 files changed, 187 insertions(+), 149 deletions(-) diff --git a/.env.dev b/.env.dev index a6c677ef..ba74455d 100644 --- a/.env.dev +++ b/.env.dev @@ -10,7 +10,6 @@ RSS_PROXY="127.0.0.1:7890" # 代理地址 RSSHUB="https://rsshub.app" # rsshub订阅地址 RSSHUB_BACKUP=[] # 备用rsshub地址 填写示例 ["https://rsshub.app","https://rsshub.app"] 务必使用双引号!!! DB_CACHE_EXPIRE=30 # 去重数据库的记录清理限定天数 -LIMIT=50 # 缓存rss条数 # 图片压缩 ZIP_SIZE=2048 # 非 GIF 图片压缩后的最大长宽值,单位 px diff --git a/src/plugins/ELF_RSS2/RSS/routes/Parsing/__init__.py b/src/plugins/ELF_RSS2/RSS/routes/Parsing/__init__.py index 4f6bc4fd..b7e9a70a 100644 --- a/src/plugins/ELF_RSS2/RSS/routes/Parsing/__init__.py +++ b/src/plugins/ELF_RSS2/RSS/routes/Parsing/__init__.py @@ -11,7 +11,12 @@ from . import check_update, send_message from .download_torrent import down_torrent -from .duplicate_filter import cache_db_manage, duplicate_exists, insert_into_cache_db +from .duplicate_filter import ( + cache_db_manage, + cache_json_manage, + duplicate_exists, + insert_into_cache_db, +) from .handle_html_tag import handle_bbcode from .handle_html_tag import handle_html_tag from .handle_images import handle_img @@ -193,15 +198,13 @@ def __init__(self, rss: Rss): ] # 开始解析 - async def start(self, new_rss: dict, old_data: list): - # new_data 是完整的 rss 解析后的 dict,old_data 是 list + async def start(self, new_rss: dict): + # new_data 是完整的 rss 解析后的 dict # 前置处理 self.state.update( { "rss_title": new_rss.get("feed").get("title"), - "new_rss": new_rss, "new_data": new_rss.get("entries"), - "old_data": old_data, "change_data": [], # 更新的消息列表 "conn": None, # 数据库连接 } @@ -251,9 +254,8 @@ async def start(self, new_rss: dict, old_data: list): # 检查更新 @ParsingBase.append_before_handler(priority=10) async def handle_check_update(rss: Rss, state: dict): - change_data = await check_update.check_update( - state.get("new_data"), state.get("old_data") - ) + _file = FILE_PATH + (rss.name + ".json") + change_data = await check_update.check_update(_file, state.get("new_data")) return {"change_data": change_data} @@ -261,20 +263,19 @@ async def handle_check_update(rss: Rss, state: dict): @ParsingBase.append_before_handler(priority=11) async def handle_check_update(rss: Rss, state: dict): change_data = state.get("change_data") - new_rss = state.get("new_rss") for item in change_data.copy(): summary = get_summary(item) # 检查是否包含屏蔽词 if config.black_word and re.findall("|".join(config.black_word), summary): logger.info("内含屏蔽词,已经取消推送该消息") - write_item(rss=rss, new_rss=new_rss, new_item=item) + write_item(name=rss.name, new_item=item) change_data.remove(item) continue # 检查是否匹配关键词 使用 down_torrent_keyword 字段,命名是历史遗留导致,实际应该是白名单关键字 if rss.down_torrent_keyword and not re.search( rss.down_torrent_keyword, summary ): - write_item(rss=rss, new_rss=new_rss, new_item=item) + write_item(name=rss.name, new_item=item) change_data.remove(item) continue # 检查是否匹配黑名单关键词 使用 black_keyword 字段 @@ -282,7 +283,7 @@ async def handle_check_update(rss: Rss, state: dict): re.search(rss.black_keyword, item["title"]) or re.search(rss.black_keyword, summary) ): - write_item(rss=rss, new_rss=new_rss, new_item=item) + write_item(name=rss.name, new_item=item) change_data.remove(item) continue # 检查是否只推送有图片的消息 @@ -290,7 +291,7 @@ async def handle_check_update(rss: Rss, state: dict): r"|\[img]", summary ): logger.info(f"{rss.name} 已开启仅图片/仅含有图片,该消息没有图片,将跳过") - write_item(rss=rss, new_rss=new_rss, new_item=item) + write_item(name=rss.name, new_item=item) change_data.remove(item) return {"change_data": change_data} @@ -300,7 +301,6 @@ async def handle_check_update(rss: Rss, state: dict): @ParsingBase.append_before_handler(priority=12) async def handle_check_update(rss: Rss, state: dict): change_data = state.get("change_data") - new_rss = state.get("new_rss") conn = state.get("conn") # 检查是否启用去重 使用 duplicate_filter_mode 字段 @@ -312,6 +312,7 @@ async def handle_check_update(rss: Rss, state: dict): conn.set_trace_callback(logger.debug) await cache_db_manage(conn) + await cache_json_manage(FILE_PATH + (rss.name + ".json")) delete = [] for index, item in enumerate(change_data): @@ -324,7 +325,7 @@ async def handle_check_update(rss: Rss, state: dict): summary=summary, ) if is_duplicate: - write_item(rss=rss, new_rss=new_rss, new_item=item) + write_item(name=rss.name, new_item=item) delete.append(index) else: change_data[index]["image_hash"] = str(image_hash) @@ -480,7 +481,7 @@ async def handle_torrent( async def handle_date( rss: Rss, state: dict, item: dict, item_msg: str, tmp: str, tmp_state: dict ) -> str: - date = ( + date = tuple( item.get("updated_parsed") if item.get("updated_parsed") else item.get("published_parsed") @@ -503,7 +504,10 @@ async def handle_message( ) -> str: # 发送消息并写入文件 if await send_message.send_msg(rss=rss, msg=item_msg, item=item): - write_item(rss=rss, new_rss=state.get("new_rss"), new_item=item) + if item.get("to_send"): + item.pop("to_send") + item.pop("count") + write_item(name=rss.name, new_item=item) if rss.duplicate_filter_mode: image_hash = item["image_hash"] @@ -512,6 +516,13 @@ async def handle_message( ) state["item_count"] += 1 + else: + item["to_send"] = True + if not item.get("count"): + item["count"] = 1 + else: + item["count"] += 1 + write_item(name=rss.name, new_item=item) return "" diff --git a/src/plugins/ELF_RSS2/RSS/routes/Parsing/check_update.py b/src/plugins/ELF_RSS2/RSS/routes/Parsing/check_update.py index c2aba3e4..55cb2846 100644 --- a/src/plugins/ELF_RSS2/RSS/routes/Parsing/check_update.py +++ b/src/plugins/ELF_RSS2/RSS/routes/Parsing/check_update.py @@ -1,13 +1,15 @@ import hashlib -import json import time +from tinydb import TinyDB, Query from typing import Dict, Any # 处理日期 async def handle_date(date=None) -> str: if date: + if not isinstance(date, tuple): + date = tuple(date) rss_time = time.mktime(date) # 时差处理,待改进 if rss_time + 28800.0 < time.time(): @@ -18,46 +20,44 @@ async def handle_date(date=None) -> str: return "日期:" + time.strftime("%m月%d日 %H:%M:%S", time.localtime()) -# 将 dict 对象转换为 json 字符串后,计算哈希值,供后续比较 +# 对 dict 对象计算哈希值,供后续比较 def dict_hash(dictionary: Dict[str, Any]) -> str: keys = ["id", "link", "published", "updated", "title"] - dictionary_temp = {k: dictionary[k] for k in keys if k in dictionary} - d_hash = hashlib.md5() - encoded = json.dumps(dictionary_temp, sort_keys=True).encode() - d_hash.update(encoded) - return d_hash.hexdigest() + string = "|".join([dictionary[k] for k in keys if k in dictionary]) + result = hashlib.md5(string.encode()) + return result.hexdigest() # 检查更新 -async def check_update(new: list, old: list) -> list: - # 有些订阅可能存在没有 entries 的情况,比如 Bilibili 直播间开播状态,直接跳过 - if not new: +async def check_update(_file: str, new: list) -> list: + db = TinyDB( + _file, + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + # 发送失败超过 3 次的消息不再发送 + to_send_list = db.search( + (Query().to_send.exists()) & (Query().count.test(lambda x: x <= 3)) + ) + + if not new and not to_send_list: return [] - old_hash_list = [dict_hash(i) if not i.get("hash") else i.get("hash") for i in old] - # 对比本地消息缓存和获取到的消息,新的存入 hash ,随着检查更新的次数增多,逐步替换原来没存 hash 的缓存记录 - temp = [] - hash_list = [] + old_hash_list = [r.get("hash") for r in db.all()] for i in new: hash_temp = dict_hash(i) if hash_temp not in old_hash_list: i["hash"] = hash_temp - temp.append(i) - hash_list.append(hash_temp) - - # 将结果进行去重,避免消息重复发送 - result = [ - value - for index, value in enumerate(temp) - if value["hash"] not in hash_list[index + 1 :] - ] + to_send_list.append(i) # 对结果按照发布时间排序 result_with_date = [ (await handle_date(i.get("updated_parsed")), i) if i.get("updated_parsed") else (await handle_date(i.get("published_parsed")), i) - for i in result + for i in to_send_list ] result_with_date.sort(key=lambda tup: tup[0]) result = [i for key, i in result_with_date] diff --git a/src/plugins/ELF_RSS2/RSS/routes/Parsing/duplicate_filter.py b/src/plugins/ELF_RSS2/RSS/routes/Parsing/duplicate_filter.py index fcb76c0f..3f35fd38 100644 --- a/src/plugins/ELF_RSS2/RSS/routes/Parsing/duplicate_filter.py +++ b/src/plugins/ELF_RSS2/RSS/routes/Parsing/duplicate_filter.py @@ -1,15 +1,22 @@ -import sqlite3 +import datetime import imagehash +import os +import sqlite3 +import time from PIL import Image, UnidentifiedImageError from io import BytesIO from nonebot.log import logger +from pathlib import Path from pyquery import PyQuery as Pq +from tinydb import TinyDB, Query from .handle_images import download_image from ... import rss_class from ....config import config +FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep) + # 对去重数据库进行管理 async def cache_db_manage(conn: sqlite3.connect) -> None: @@ -38,6 +45,34 @@ async def cache_db_manage(conn: sqlite3.connect) -> None: conn.commit() +# 对缓存 json 进行管理 +async def cache_json_manage(_file: str) -> None: + db = TinyDB( + _file, + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + expired_date = datetime.datetime.utcnow() - datetime.timedelta( + days=config.db_cache_expire + ) + expired_timestamp = datetime.datetime.timestamp(expired_date) + # 移除超过 config.db_cache_expire 天的记录 + db.remove( + ( + Query().published_parsed.test( + lambda x: time.mktime(tuple(x)) <= expired_timestamp + ) + ) + | ( + Query().updated_parsed.test( + lambda x: time.mktime(tuple(x)) <= expired_timestamp + ) + ) + ) + + # 去重判断 async def duplicate_exists( rss: rss_class.Rss, conn: sqlite3.connect, link: str, title: str, summary: str diff --git a/src/plugins/ELF_RSS2/RSS/routes/Parsing/read_or_write_rss_data.py b/src/plugins/ELF_RSS2/RSS/routes/Parsing/read_or_write_rss_data.py index bd5e5703..fc3d7970 100644 --- a/src/plugins/ELF_RSS2/RSS/routes/Parsing/read_or_write_rss_data.py +++ b/src/plugins/ELF_RSS2/RSS/routes/Parsing/read_or_write_rss_data.py @@ -1,48 +1,32 @@ -import codecs -import json import os from pathlib import Path - -from ....RSS import rss_class -from ....config import config +from tinydb import TinyDB, Query FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep) # 读取记录 -def read_rss(name) -> dict: - # 检查是否存在rss记录 - json_path = FILE_PATH + (name + ".json") - if not os.path.isfile(json_path) or os.stat(json_path).st_size == 0: - return {} - with codecs.open(json_path, "r", "utf-8") as load_f: - load_dict = json.load(load_f) - return load_dict - - -# 写入记录 -def write_rss(name: str, new_rss: dict, new_item: list = None): - if new_item: - max_length = len(new_rss.get("entries")) - # 防止 rss 超过设置的缓存条数 - if max_length >= config.limit: - limit = max_length + config.limit - else: - limit = config.limit - old = read_rss(name) - for tmp in new_item: - old["entries"].insert(0, tmp) - old["entries"] = old["entries"][0:limit] - else: - old = new_rss - if not os.path.isdir(FILE_PATH): - os.makedirs(FILE_PATH) - with codecs.open(FILE_PATH + (name + ".json"), "w", "utf-8") as dump_f: - dump_f.write(json.dumps(old, sort_keys=True, indent=4, ensure_ascii=False)) +def read_rss(name: str) -> list: + _file = FILE_PATH + (name + ".json") + db = TinyDB( + _file, + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + return db.all() # 写入单条消息 -def write_item(rss: rss_class.Rss, new_rss: dict, new_item: dict): - tmp = [new_item] - write_rss(name=rss.name, new_rss=new_rss, new_item=tmp) +def write_item(name: str, new_item: dict): + _file = FILE_PATH + (name + ".json") + db = TinyDB( + _file, + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + db.upsert(new_item, Query().hash == str(new_item.get("hash"))) diff --git a/src/plugins/ELF_RSS2/RSS/routes/nga.py b/src/plugins/ELF_RSS2/RSS/routes/nga.py index c35b9880..4ccb23d4 100644 --- a/src/plugins/ELF_RSS2/RSS/routes/nga.py +++ b/src/plugins/ELF_RSS2/RSS/routes/nga.py @@ -1,9 +1,13 @@ +import os import re -from .Parsing import ParsingBase, check_update +from pathlib import Path +from .Parsing import ParsingBase, check_update from ..rss_class import Rss +FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep) + # 检查更新 @ParsingBase.append_before_handler(rex="nga", priority=10) @@ -17,5 +21,6 @@ async def handle_check_update(rss: Rss, state: dict): for i in old_data: i["link"] = re.sub(r"&rand=\d+", "", i["link"]) - change_data = await check_update.check_update(new_data, old_data) + _file = FILE_PATH + (rss.name + ".json") + change_data = await check_update.check_update(_file, new_data) return {"change_data": change_data} diff --git a/src/plugins/ELF_RSS2/RSS/rss_class.py b/src/plugins/ELF_RSS2/RSS/rss_class.py index bea811a8..3497b4db 100644 --- a/src/plugins/ELF_RSS2/RSS/rss_class.py +++ b/src/plugins/ELF_RSS2/RSS/rss_class.py @@ -11,6 +11,7 @@ # 存储目录 FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep) +JSON_PATH = FILE_PATH + "rss.json" class Rss: @@ -101,11 +102,11 @@ def get_url(self, rsshub: str = config.rsshub) -> str: @staticmethod def read_rss() -> list: # 如果文件不存在 - if not os.path.isfile(str(FILE_PATH + "rss.json")): + if not os.path.isfile(JSON_PATH): return [] rss_list = [] db = TinyDB( - str(FILE_PATH + "rss.json"), + JSON_PATH, encoding="utf-8", sort_keys=True, indent=4, @@ -131,27 +132,18 @@ def find_name(self, name: str): return feed return None - # 添加订阅 QQ - def add_user(self, user: str): - if str(user) in self.user_id: - return - self.user_id.append(str(user)) - db = TinyDB( - str(FILE_PATH + "rss.json"), - encoding="utf-8", - sort_keys=True, - indent=4, - ensure_ascii=False, - ) - db.upsert(self.__dict__, Query().name == self.name) - - # 添加订阅 群组 - def add_group(self, group: str): - if str(group) in self.group_id: - return - self.group_id.append(str(group)) + # 添加订阅 + def add_user_or_group(self, user: str = None, group: str = None): + if user: + if str(user) in self.user_id: + return + self.user_id.append(str(user)) + else: + if str(group) in self.group_id: + return + self.group_id.append(str(group)) db = TinyDB( - str(FILE_PATH + "rss.json"), + JSON_PATH, encoding="utf-8", sort_keys=True, indent=4, @@ -165,7 +157,7 @@ def delete_group(self, group: str) -> bool: return False self.group_id.remove(str(group)) db = TinyDB( - str(FILE_PATH + "rss.json"), + JSON_PATH, encoding="utf-8", sort_keys=True, indent=4, @@ -177,7 +169,7 @@ def delete_group(self, group: str) -> bool: # 删除整个订阅 def delete_rss(self): db = TinyDB( - str(FILE_PATH + "rss.json"), + JSON_PATH, encoding="utf-8", sort_keys=True, indent=4, @@ -188,7 +180,7 @@ def delete_rss(self): # 删除订阅json文件 def delete_file(self): - this_file_path = str(FILE_PATH + self.name + ".json") + this_file_path = str(FILE_PATH + (self.name + ".json")) if os.path.exists(this_file_path): os.remove(this_file_path) @@ -218,7 +210,7 @@ def set_cookies(self, cookies_str: str) -> bool: cookies[name] = value self.cookies = cookies db = TinyDB( - str(FILE_PATH + "rss.json"), + JSON_PATH, encoding="utf-8", sort_keys=True, indent=4, diff --git a/src/plugins/ELF_RSS2/RSS/rss_parsing.py b/src/plugins/ELF_RSS2/RSS/rss_parsing.py index de67a3f9..38fa33fc 100644 --- a/src/plugins/ELF_RSS2/RSS/rss_parsing.py +++ b/src/plugins/ELF_RSS2/RSS/rss_parsing.py @@ -3,16 +3,20 @@ import asyncio import feedparser import httpx +import os import re from nonebot.log import logger +from pathlib import Path from tenacity import retry, stop_after_attempt, stop_after_delay, RetryError, TryAgain +from tinydb import TinyDB from . import rss_class from .routes.Parsing import ParsingRss, get_proxy -from .routes.Parsing.read_or_write_rss_data import read_rss, write_rss from ..config import config +FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep) + STATUS_CODE = [200, 301, 302] # 去掉烦人的 returning true from eof_received() has no effect when using ssl httpx 警告 @@ -40,15 +44,23 @@ async def start(rss: rss_class.Rss) -> None: cookies_str = "及 cookies " if rss.cookies else "" logger.error(f"{rss.name}[{rss.get_url()}]抓取失败!已达最大重试次数!请检查订阅地址{cookies_str}!") return - old_rss = read_rss(rss.name) - old_rss_list = old_rss.get("entries") - if not old_rss: - write_rss(name=rss.name, new_rss=new_rss) + # 检查是否存在rss记录 + _file = FILE_PATH + (rss.name + ".json") + if not os.path.isfile(_file): + db = TinyDB( + _file, + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) + entries = new_rss.get("entries") + db.insert_multiple(entries) logger.info(f"{rss.name} 第一次抓取成功!") return pr = ParsingRss(rss=rss) - await pr.start(new_rss=new_rss, old_data=old_rss_list) + await pr.start(new_rss=new_rss) # 获取 RSS 并解析为 json ,失败重试 diff --git a/src/plugins/ELF_RSS2/add_dy.py b/src/plugins/ELF_RSS2/add_dy.py index 1406f7d0..2ea7ac13 100644 --- a/src/plugins/ELF_RSS2/add_dy.py +++ b/src/plugins/ELF_RSS2/add_dy.py @@ -48,11 +48,11 @@ async def handle_rss_add(bot: Bot, event: Event, state: dict): async def add_group_or_user(_group_id, _user_id): if _group_id: - rss.add_group(group=str(_group_id)) + rss.add_user_or_group(group=str(_group_id)) await tr.add_job(rss) await RSS_ADD.send("👏 订阅到当前群组成功!") else: - rss.add_user(user=_user_id) + rss.add_user_or_group(user=_user_id) await tr.add_job(rss) await RSS_ADD.send("👏 订阅到当前账号成功!") diff --git a/src/plugins/ELF_RSS2/config.py b/src/plugins/ELF_RSS2/config.py index db20f6f5..fabcd844 100644 --- a/src/plugins/ELF_RSS2/config.py +++ b/src/plugins/ELF_RSS2/config.py @@ -14,7 +14,6 @@ class Config: rsshub: AnyHttpUrl = "https://rsshub.app" rsshub_backup: List[AnyHttpUrl] = [] db_cache_expire = 30 - limit = 50 zip_size: int = 2 * 1024 diff --git a/src/plugins/ELF_RSS2/start.py b/src/plugins/ELF_RSS2/start.py index c2ea9686..83b59382 100644 --- a/src/plugins/ELF_RSS2/start.py +++ b/src/plugins/ELF_RSS2/start.py @@ -2,57 +2,59 @@ import json import nonebot import os -import re from nonebot import logger, on_metaevent from nonebot.adapters.cqhttp import Bot, Event, LifecycleMetaEvent from pathlib import Path from tinydb import TinyDB -from .config import config -from .RSS import rss_class from .RSS import my_trigger as rt +from .RSS import rss_class +from .RSS.routes.Parsing.check_update import dict_hash +from .config import config FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep) +JSON_PATH = FILE_PATH + "rss.json" -def hash_clear(): - +# 将 xxx.json (缓存) 改造为 tinydb 数据库 +def change_cache_json(): json_paths = list(Path(FILE_PATH).glob("*.json")) for j in [str(i) for i in json_paths if i != "rss.json"]: with codecs.open(j, "r", "utf-8") as f: - lines = f.readlines() + cache_json = json.load(f) + entries = cache_json.get("entries") + + if entries: + os.remove(j) + db = TinyDB( + j, + encoding="utf-8", + sort_keys=True, + indent=4, + ensure_ascii=False, + ) - with codecs.open(j, "w", "utf-8") as f: - for line in lines: - if not re.search(r'"hash": "[0-9a-zA-Z]{32}",', line): - f.write(line) + for i in entries: + i["hash"] = dict_hash(i) + db.insert(i) # 将 rss.json 改造为 tinydb 数据库 def change_rss_json(): - - db = TinyDB( - str(FILE_PATH + "rss.json"), - encoding="utf-8", - sort_keys=True, - indent=4, - ensure_ascii=False, - ) - - try: - db.all() - except TypeError: - - with codecs.open(str(FILE_PATH + "rss.json"), "r", "utf-8") as f: - rss_list_json = json.load(f) - - os.remove(str(FILE_PATH + "rss.json")) - + with codecs.open(JSON_PATH, "r", "utf-8") as f: + rss_list_json = json.load(f) + if isinstance(rss_list_json, list): + _default = None + else: + _default = rss_list_json.get("_default") + + if not _default: + os.remove(JSON_PATH) db = TinyDB( - str(FILE_PATH + "rss.json"), + JSON_PATH, encoding="utf-8", sort_keys=True, indent=4, @@ -60,9 +62,7 @@ def change_rss_json(): ) for rss_json in rss_list_json: - if not isinstance(rss_json, str): - rss_json = json.dumps(rss_json) - db.insert(json.loads(rss_json)) + db.insert(rss_json) async def start(): @@ -70,6 +70,7 @@ async def start(): if config.version == "v2.4.0": change_rss_json() + change_cache_json() try: rss = rss_class.Rss("", "", "-1", "-1")