Merge pull request #180 from NekoAria/2.0
V2.4.0
Quan authored Sep 12, 2021
2 parents 9584803 + a709b73 commit 98f5ff7
Showing 17 changed files with 268 additions and 186 deletions.
.env: 2 changes (1 addition & 1 deletion)
@@ -1,2 +1,2 @@
ENVIRONMENT=prod
VERSION='v2.3.1'
VERSION='v2.4.0'
.env.dev: 1 change (0 additions & 1 deletion)
@@ -10,7 +10,6 @@ RSS_PROXY="127.0.0.1:7890" # Proxy address
RSSHUB="https://rsshub.app" # RSSHub subscription URL
RSSHUB_BACKUP=[] # Backup RSSHub URLs, e.g. ["https://rsshub.app","https://rsshub.app"]; double quotes are required!
DB_CACHE_EXPIRE=30 # Number of days before records in the deduplication database are cleaned up
LIMIT=50 # Number of cached RSS entries

# Image compression
ZIP_SIZE=2048 # Maximum width/height of a non-GIF image after compression, in px
pyproject.toml: 3 changes (2 additions & 1 deletion)
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ELF_RSS"
version = "2.3.1"
version = "2.4.0"
description = "ELF_RSS"
authors = ["Quan666"]
license = "GPL v3"
@@ -31,6 +31,7 @@ pydantic = "~=1.8.1"
pyquery = "~=1.4.3"
python-qbittorrent = "~=0.4.2"
tenacity = "~=7.0.0"
tinydb = "~=4.5.1"
typing-extensions = "~=3.7.4.3"

[tool.poetry.dev-dependencies]
requirements.txt: 3 changes (2 additions & 1 deletion)
@@ -17,4 +17,5 @@ nonebot-adapter-cqhttp~=2.0.0a15
ImageHash~=4.2.0
pydantic~=1.8.1
tenacity~=7.0.0
bbcode~=1.1.0
bbcode~=1.1.0
tinydb~=4.5.1
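
The newly added tinydb dependency backs the per-feed JSON caches (data/<rss name>.json) that the parsing routes now read and write. A minimal sketch of the usage pattern, with a hypothetical cache file name:

    from tinydb import TinyDB, Query

    # Hypothetical cache file; the plugin itself uses data/<rss.name>.json
    db = TinyDB(
        "data/example.json",
        encoding="utf-8",
        sort_keys=True,
        indent=4,
        ensure_ascii=False,
    )

    # Cache a parsed entry, then look it up by its hash
    db.insert({"hash": "0123abcd", "title": "example entry"})
    print(db.search(Query().hash == "0123abcd"))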
setup.py: 2 changes (1 addition & 1 deletion)
@@ -7,7 +7,7 @@

setuptools.setup(
name="ELF_RSS",
version="2.3.1",
version="2.4.0",
author="Quan666",
author_email="i@oy.mk",
description="QQ机器人 RSS订阅 插件,订阅源建议选择 RSSHub",
src/plugins/ELF_RSS2/RSS/routes/Parsing/__init__.py: 45 changes (28 additions & 17 deletions)
@@ -11,7 +11,12 @@

from . import check_update, send_message
from .download_torrent import down_torrent
from .duplicate_filter import cache_db_manage, duplicate_exists, insert_into_cache_db
from .duplicate_filter import (
cache_db_manage,
cache_json_manage,
duplicate_exists,
insert_into_cache_db,
)
from .handle_html_tag import handle_bbcode
from .handle_html_tag import handle_html_tag
from .handle_images import handle_img
@@ -193,15 +198,13 @@ def __init__(self, rss: Rss):
]

# Start parsing
async def start(self, new_rss: dict, old_data: list):
# new_data is the complete parsed RSS dict; old_data is a list
async def start(self, new_rss: dict):
# new_data is the complete parsed RSS dict
# Preprocessing
self.state.update(
{
"rss_title": new_rss.get("feed").get("title"),
"new_rss": new_rss,
"new_data": new_rss.get("entries"),
"old_data": old_data,
"change_data": [], # 更新的消息列表
"conn": None, # 数据库连接
}
@@ -251,46 +254,44 @@ async def start(self, new_rss: dict, old_data: list):
# Check for updates
@ParsingBase.append_before_handler(priority=10)
async def handle_check_update(rss: Rss, state: dict):
change_data = await check_update.check_update(
state.get("new_data"), state.get("old_data")
)
_file = FILE_PATH + (rss.name + ".json")
change_data = await check_update.check_update(_file, state.get("new_data"))
return {"change_data": change_data}


# Check whether the push conditions are met
@ParsingBase.append_before_handler(priority=11)
async def handle_check_update(rss: Rss, state: dict):
change_data = state.get("change_data")
new_rss = state.get("new_rss")
for item in change_data.copy():
summary = get_summary(item)
# Check for blocked words
if config.black_word and re.findall("|".join(config.black_word), summary):
logger.info("内含屏蔽词,已经取消推送该消息")
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
change_data.remove(item)
continue
# Check for keyword matches using the down_torrent_keyword field; the name is a historical leftover, it actually acts as whitelist keywords
if rss.down_torrent_keyword and not re.search(
rss.down_torrent_keyword, summary
):
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
change_data.remove(item)
continue
# Check for blacklist keyword matches using the black_keyword field
if rss.black_keyword and (
re.search(rss.black_keyword, item["title"])
or re.search(rss.black_keyword, summary)
):
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
change_data.remove(item)
continue
# Check whether to push only messages that contain images
if (rss.only_pic or rss.only_has_pic) and not re.search(
r"<img.+?>|\[img]", summary
):
logger.info(f"{rss.name} 已开启仅图片/仅含有图片,该消息没有图片,将跳过")
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
change_data.remove(item)

return {"change_data": change_data}
@@ -300,7 +301,6 @@ async def handle_check_update(rss: Rss, state: dict):
@ParsingBase.append_before_handler(priority=12)
async def handle_check_update(rss: Rss, state: dict):
change_data = state.get("change_data")
new_rss = state.get("new_rss")
conn = state.get("conn")

# Check whether deduplication is enabled, using the duplicate_filter_mode field
@@ -312,6 +312,7 @@ async def handle_check_update(rss: Rss, state: dict):
conn.set_trace_callback(logger.debug)

await cache_db_manage(conn)
await cache_json_manage(FILE_PATH + (rss.name + ".json"))

delete = []
for index, item in enumerate(change_data):
@@ -324,7 +325,7 @@ async def handle_check_update(rss: Rss, state: dict):
summary=summary,
)
if is_duplicate:
write_item(rss=rss, new_rss=new_rss, new_item=item)
write_item(name=rss.name, new_item=item)
delete.append(index)
else:
change_data[index]["image_hash"] = str(image_hash)
@@ -480,7 +481,7 @@ async def handle_torrent(
async def handle_date(
rss: Rss, state: dict, item: dict, item_msg: str, tmp: str, tmp_state: dict
) -> str:
date = (
date = tuple(
item.get("updated_parsed")
if item.get("updated_parsed")
else item.get("published_parsed")
@@ -503,7 +504,10 @@ async def handle_message(
) -> str:
# Send the message and write it to the file
if await send_message.send_msg(rss=rss, msg=item_msg, item=item):
write_item(rss=rss, new_rss=state.get("new_rss"), new_item=item)
if item.get("to_send"):
item.pop("to_send")
item.pop("count")
write_item(name=rss.name, new_item=item)

if rss.duplicate_filter_mode:
image_hash = item["image_hash"]
Expand All @@ -512,6 +516,13 @@ async def handle_message(
)

state["item_count"] += 1
else:
item["to_send"] = True
if not item.get("count"):
item["count"] = 1
else:
item["count"] += 1
write_item(name=rss.name, new_item=item)

return ""

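The new else branch above records failed sends so that later runs can retry them: the item is flagged with to_send, its count is incremented, and check_update re-queues flagged items until count exceeds 3. A small stand-alone sketch of that bookkeeping (mark_failed_send is a hypothetical stand-in for the write_item call, which is defined elsewhere in this module):

    def mark_failed_send(item: dict) -> dict:
        # Flag the entry for retry and bump its attempt counter; check_update
        # only re-queues entries with to_send set and count <= 3.
        item["to_send"] = True
        item["count"] = item.get("count", 0) + 1
        return item

    entry = {"title": "example entry", "hash": "0123abcd"}
    for _ in range(4):
        entry = mark_failed_send(entry)
    print(entry["count"])  # 4 -> dropped by check_update's count <= 3 filter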
src/plugins/ELF_RSS2/RSS/routes/Parsing/check_update.py: 48 changes (24 additions & 24 deletions)
@@ -1,13 +1,15 @@
import hashlib
import json
import time

from tinydb import TinyDB, Query
from typing import Dict, Any


# Handle dates
async def handle_date(date=None) -> str:
if date:
if not isinstance(date, tuple):
date = tuple(date)
rss_time = time.mktime(date)
# Time zone offset handling; needs improvement
if rss_time + 28800.0 < time.time():
@@ -18,46 +20,44 @@ async def handle_date(date=None) -> str:
return "日期:" + time.strftime("%m月%d日 %H:%M:%S", time.localtime())


# Convert the dict object to a JSON string, then compute its hash for later comparison
# Compute a hash of the dict object for later comparison
def dict_hash(dictionary: Dict[str, Any]) -> str:
keys = ["id", "link", "published", "updated", "title"]
dictionary_temp = {k: dictionary[k] for k in keys if k in dictionary}
d_hash = hashlib.md5()
encoded = json.dumps(dictionary_temp, sort_keys=True).encode()
d_hash.update(encoded)
return d_hash.hexdigest()
string = "|".join([dictionary[k] for k in keys if k in dictionary])
result = hashlib.md5(string.encode())
return result.hexdigest()


# Check for updates
async def check_update(new: list, old: list) -> list:
# Some feeds may have no entries (e.g. the live status of a Bilibili live room); skip them directly
if not new:
async def check_update(_file: str, new: list) -> list:
db = TinyDB(
_file,
encoding="utf-8",
sort_keys=True,
indent=4,
ensure_ascii=False,
)
# Messages that have failed to send more than 3 times are no longer sent
to_send_list = db.search(
(Query().to_send.exists()) & (Query().count.test(lambda x: x <= 3))
)

if not new and not to_send_list:
return []

old_hash_list = [dict_hash(i) if not i.get("hash") else i.get("hash") for i in old]
# Compare the local message cache with the fetched messages; new entries get a stored hash, gradually replacing cached records without hashes as update checks repeat
temp = []
hash_list = []
old_hash_list = [r.get("hash") for r in db.all()]
for i in new:
hash_temp = dict_hash(i)
if hash_temp not in old_hash_list:
i["hash"] = hash_temp
temp.append(i)
hash_list.append(hash_temp)

# Deduplicate the results to avoid sending duplicate messages
result = [
value
for index, value in enumerate(temp)
if value["hash"] not in hash_list[index + 1 :]
]
to_send_list.append(i)

# Sort the results by publication time
result_with_date = [
(await handle_date(i.get("updated_parsed")), i)
if i.get("updated_parsed")
else (await handle_date(i.get("published_parsed")), i)
for i in result
for i in to_send_list
]
result_with_date.sort(key=lambda tup: tup[0])
result = [i for key, i in result_with_date]
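For clarity, the reworked dict_hash no longer hashes a JSON dump of the selected fields; it joins the raw field values with "|" and hashes that string (the values are assumed to be strings, as they are in parsed feed entries). A runnable excerpt of the new version:

    import hashlib
    from typing import Any, Dict

    def dict_hash(dictionary: Dict[str, Any]) -> str:
        # Same logic as the new check_update.dict_hash shown above
        keys = ["id", "link", "published", "updated", "title"]
        string = "|".join([dictionary[k] for k in keys if k in dictionary])
        return hashlib.md5(string.encode()).hexdigest()

    entry = {
        "id": "https://example.com/post/1",
        "link": "https://example.com/post/1",
        "title": "example entry",
    }
    print(dict_hash(entry))  # stable digest used to detect already-seen entries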
src/plugins/ELF_RSS2/RSS/routes/Parsing/duplicate_filter.py: 37 changes (36 additions & 1 deletion)
@@ -1,15 +1,22 @@
import sqlite3
import datetime
import imagehash
import os
import sqlite3
import time

from PIL import Image, UnidentifiedImageError
from io import BytesIO
from nonebot.log import logger
from pathlib import Path
from pyquery import PyQuery as Pq
from tinydb import TinyDB, Query

from .handle_images import download_image
from ... import rss_class
from ....config import config

FILE_PATH = str(str(Path.cwd()) + os.sep + "data" + os.sep)


# Manage the deduplication database
async def cache_db_manage(conn: sqlite3.connect) -> None:
@@ -38,6 +45,34 @@ async def cache_db_manage(conn: sqlite3.connect) -> None:
conn.commit()


# Manage the JSON cache
async def cache_json_manage(_file: str) -> None:
db = TinyDB(
_file,
encoding="utf-8",
sort_keys=True,
indent=4,
ensure_ascii=False,
)
expired_date = datetime.datetime.utcnow() - datetime.timedelta(
days=config.db_cache_expire
)
expired_timestamp = datetime.datetime.timestamp(expired_date)
# Remove records older than config.db_cache_expire days
db.remove(
(
Query().published_parsed.test(
lambda x: time.mktime(tuple(x)) <= expired_timestamp
)
)
| (
Query().updated_parsed.test(
lambda x: time.mktime(tuple(x)) <= expired_timestamp
)
)
)


# Duplicate check
async def duplicate_exists(
rss: rss_class.Rss, conn: sqlite3.connect, link: str, title: str, summary: str
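One detail worth noting in cache_json_manage: TinyDB stores the feedparser time tuples as JSON lists, which is why they are wrapped in tuple() before being passed to time.mktime (the same conversion was added to handle_date in check_update.py). A small worked example of the expiry arithmetic, assuming DB_CACHE_EXPIRE=30:

    import datetime
    import time

    DB_CACHE_EXPIRE = 30  # days, mirroring the .env setting

    expired_date = datetime.datetime.utcnow() - datetime.timedelta(days=DB_CACHE_EXPIRE)
    expired_timestamp = datetime.datetime.timestamp(expired_date)

    # A struct_time-style value as feedparser's published_parsed round-trips
    # through the JSON cache (a list, hence the tuple() call)
    published_parsed = [2021, 8, 1, 12, 0, 0, 6, 213, 0]
    is_expired = time.mktime(tuple(published_parsed)) <= expired_timestamp
    print(is_expired)  # True when run after 2021-08-31, e.g. at the time of this commit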