From 3c4d246f59ea5423f49e0ef04e6ca60e33c1f1dd Mon Sep 17 00:00:00 2001
From: tangyoha <39958403+tangyoha@users.noreply.github.com>
Date: Mon, 13 Feb 2023 19:56:49 +0800
Subject: [PATCH] feat: add filter (#61)
---
.gitignore | 2 +
README.md | 31 ++--
README_CN.md | 9 +-
media_downloader.py | 19 +-
module/app.py | 35 +++-
module/cloud_drive.py | 6 +-
module/filter.py | 326 +++++++++++++++++++++++++++++++++
module/templates/index.html | 12 +-
module/web.py | 11 +-
requirements.txt | 4 +-
tests/test_common.py | 110 +++++++++++
tests/test_media_downloader.py | 106 ++---------
tests/utils/test_filter.py | 179 ++++++++++++++++++
tests/utils/test_format.py | 47 ++++-
utils/__init__.py | 2 +-
utils/format.py | 97 ++++++++++
utils/meta_data.py | 96 ++++++++++
17 files changed, 961 insertions(+), 131 deletions(-)
create mode 100644 module/filter.py
create mode 100644 tests/test_common.py
create mode 100644 tests/utils/test_filter.py
create mode 100644 utils/meta_data.py
diff --git a/.gitignore b/.gitignore
index 7e558020..1e9826ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,3 +59,5 @@ photo/
voice/
video/
video_note/
+parser.out
+parsetab.py
diff --git a/README.md b/README.md
index 96d3491c..df9ba8b4 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,8 @@
-
-
+
+
@@ -149,6 +149,8 @@ file_name_prefix_split: ' - '
max_concurrent_transmissions: 1
web_host: 127.0.0.1
web_port: 5000
+download_filter:
+ 'telegram_chat_id': message_date >= 2022-12-01 00:00:00 and message_date <= 2023-01-17 00:00:00
```
- **api_hash** - The api_hash you got from telegram apps
@@ -160,26 +162,27 @@ web_port: 5000
- **file_formats** - File types to download for supported media types which are `audio`, `document` and `video`. Default format is `all`, downloads all files.
- **save_path** - The root directory where you want to store downloaded files.
- **file_path_prefix** - Store file subfolders, the order of the list is not fixed, can be randomly combined.
- - `chat_title` - channel or group title, it will be chat id if not exist title.
- - `media_datetime` - media date, also see pyrogram.types.Message.date.strftime("%Y_%m").
- - `media_type` - media type, also see `media_types`.
+ - `chat_title` - Channel or group title; the chat id is used when the chat has no title.
+ - `media_datetime` - Media date, also see pyrogram.types.Message.date.strftime("%Y_%m").
+ - `media_type` - Media type, also see `media_types`.
- **disable_syslog** - You can choose which types of logs to disable,see `logging._nameToLevel`.
- **upload_drive** - You can upload file to cloud drive.
- `enable_upload_file` - Enable upload file, default `false`.
- `remote_dir` - Where you upload, like `drive_id/drive_name`.
- `upload_adapter` - Upload file adapter, which can be `rclone`, `aligo`. If it is `rclone`, it supports all `rclone` servers that support uploading. If it is `aligo`, it supports uploading `Ali cloud disk`.
- - `rclone_path` - RClone exe path, see wiki[how to use rclone](https://github.com/tangyoha/telegram_media_downloader/wiki#how-to-use-rclone)
+ - `rclone_path` - RClone exe path, see [How to use rclone](https://github.com/tangyoha/telegram_media_downloader/wiki/Rclone)
- `before_upload_file_zip` - Zip file before upload, default `false`.
- `after_upload_file_delete` - Delete file after upload success, default `false`.
-- **file_name_prefix** - custom file name, use the same as **file_path_prefix**
- - `message_id` - message id
- - `file_name` - file name (may be empty)
- - `caption` - the title of the message (may be empty)
-- **file_name_prefix_split** - custom file name prefix symbol, the default is `-`
+- **file_name_prefix** - Custom file name, configured the same way as **file_path_prefix**
+ - `message_id` - Message id
+ - `file_name` - File name (may be empty)
+ - `caption` - The title of the message (may be empty)
+- **file_name_prefix_split** - Custom file name prefix symbol, the default is `-`
- **max_concurrent_transmissions** - Set the maximum amount of concurrent transmissions (uploads & downloads). A value that is too high may result in network related issues. Defaults to 1.
-- **hide_file_name** - whether to hide the web interface file name, default `false`
-- **web_host** - web host
-- **web_port** - web port
+- **hide_file_name** - Whether to hide the web interface file name, default `false`
+- **web_host** - Web host.
+- **web_port** - Web port.
+- **download_filter** - Download filter, see [How to use Filter](https://github.com/tangyoha/telegram_media_downloader/wiki/How-to-use-Filter)
## Execution
diff --git a/README_CN.md b/README_CN.md
index 0202cb2c..31ba7224 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -6,9 +6,9 @@
-
+
-
+
@@ -141,6 +141,8 @@ file_name_prefix_split: ' - '
max_concurrent_transmissions: 1
web_host: 127.0.0.1
web_port: 5000
+download_filter:
+ 'telegram_chat_id': message_date >= 2022-12-01 00:00:00 and message_date <= 2023-01-17 00:00:00
```
- **api_hash** - 你从电报应用程序获得的 api_hash
@@ -160,7 +162,7 @@ web_port: 5000
- `enable_upload_file` - [必填]启用上传文件,默认为`false`
- `remote_dir` - [必填]你上传的地方
- `upload_adapter` - [必填]上传文件适配器,可以为`rclone`,`aligo`。如果为`rclone`,则支持rclone所有支持上传的服务器,如果为aligo,则支持上传阿里云盘
- - `rclone_path`,如果配置`upload_adapter`为`rclone`则为必填,`rclone`的可执行目录,见wiki[如何使用rclone](https://github.com/tangyoha/telegram_media_downloader/wiki#how-to-use-rclone)
+ - `rclone_path`,如果配置`upload_adapter`为`rclone`则为必填,`rclone`的可执行目录,查阅 [如何使用rclone](https://github.com/tangyoha/telegram_media_downloader/wiki/Rclone)
- `before_upload_file_zip` - 上传前压缩文件,默认为`false`
- `after_upload_file_delete` - 上传成功后删除文件,默认为`false`
- **file_name_prefix** - 自定义文件名称,使用和 **file_path_prefix** 一样
@@ -172,6 +174,7 @@ web_port: 5000
- **hide_file_name** - 是否隐藏web界面文件名称,默认`false`
- **web_host** - web界面地址
- **web_port** - web界面端口
+- **download_filter** - 下载过滤器, 查阅 [How to use Filter](https://github.com/tangyoha/telegram_media_downloader/wiki/How-to-use-Filter)
## 执行
diff --git a/media_downloader.py b/media_downloader.py
index ee5f3c85..b6ead6bf 100644
--- a/media_downloader.py
+++ b/media_downloader.py
@@ -16,6 +16,7 @@
from module.web import get_flask_app, update_download_status
from utils.log import LogFilter
from utils.meta import print_meta
+from utils.meta_data import MetaData
from utils.updates import check_for_updates
logging.basicConfig(
@@ -269,7 +270,6 @@ async def download_media(
continue
file_name, file_format = await _get_media_meta(message, _media, _type)
media_size = getattr(_media, "file_size", 0)
-
if _can_download(_type, file_formats, file_format):
if _is_exist(file_name):
# TODO: check if the file download complete
@@ -325,7 +325,6 @@ async def download_media(
_check_download_finish(media_size, download_path, message.id)
await app.upload_file(file_name)
- app.downloaded_ids.append(message.id)
break
except pyrogram.errors.exceptions.bad_request_400.BadRequest:
logger.warning(
@@ -434,8 +433,11 @@ async def begin_import(pagination_limit: int):
api_id=app.api_id,
api_hash=app.api_hash,
proxy=app.proxy,
- max_concurrent_transmissions=app.max_concurrent_transmissions,
)
+
+ if getattr(client, "max_concurrent_transmissions", None):
+ client.max_concurrent_transmissions = app.max_concurrent_transmissions
+
await client.start()
print("Successfully started (Press Ctrl+C to stop)")
@@ -467,11 +469,12 @@ async def begin_import(pagination_limit: int):
app.last_read_message_id = last_read_message_id
async for message in messages_iter: # type: ignore
- if pagination_count != pagination_limit and not app.need_skip_message(
- message.id
- ):
- pagination_count += 1
- messages_list.append(message)
+ meta_data = MetaData()
+ meta_data.get_meta_data(message)
+ if pagination_count != pagination_limit:
+ if not app.need_skip_message(str(app.chat_id), message.id, meta_data):
+ pagination_count += 1
+ messages_list.append(message)
else:
last_read_message_id = await process_messages(
client,
diff --git a/module/app.py b/module/app.py
index 33696cf5..0080389f 100644
--- a/module/app.py
+++ b/module/app.py
@@ -6,6 +6,9 @@
from loguru import logger
from module.cloud_drive import CloudDrive, CloudDriveConfig
+from module.filter import Filter
+from utils.format import replace_date_time
+from utils.meta_data import MetaData
# pylint: disable = R0902
@@ -37,6 +40,7 @@ def __init__(
self.config_file: str = config_file
self.app_data_file: str = app_data_file
self.application_name: str = application_name
+ self.download_filter = Filter()
self.reset()
@@ -84,6 +88,7 @@ def reset(self):
self.max_concurrent_transmissions: int = 1
self.web_host: str = "localhost"
self.web_port: int = 5000
+ self.download_filter_dict: dict = {}
def load_config(self, _config: dict) -> bool:
"""load config from str.
@@ -165,6 +170,15 @@ def load_config(self, _config: dict) -> bool:
self.web_host = _config.get("web_host", self.web_host)
self.web_port = _config.get("web_port", self.web_port)
+ self.download_filter_dict = _config.get(
+ "download_filter", self.download_filter_dict
+ )
+
+ for key, value in self.download_filter_dict.items():
+ self.download_filter_dict[key] = replace_date_time(value)
+
+ # TODO: add check if expression exist syntax error
+
self.max_concurrent_transmissions = _config.get(
"max_concurrent_transmissions", self.max_concurrent_transmissions
)
@@ -273,19 +287,34 @@ def get_file_name(
res = f"{message_id}"
return res
- def need_skip_message(self, message_id: int) -> bool:
+ def need_skip_message(
+ self, chat_id: str, message_id: int, meta_data: MetaData
+ ) -> bool:
"""if need skip download message.
Parameters
----------
+ chat_id: str
+ Config.yaml defined
+
message_id: int
- readily to download message id
+ Readily to download message id
+
+ meta_data: MetaData
+ Ready to match filter
Returns
-------
bool
"""
- return self.ids_to_retry_dict.get(message_id) is not None
+ if message_id in self.ids_to_retry_dict:
+ return True
+
+ if chat_id in self.download_filter_dict:
+ self.download_filter.set_meta_data(meta_data)
+ return not self.download_filter.exec(self.download_filter_dict[chat_id])
+
+ return False
def update_config(self, immediate: bool = True):
"""update config
diff --git a/module/cloud_drive.py b/module/cloud_drive.py
index 1f1c0805..20f94ecb 100644
--- a/module/cloud_drive.py
+++ b/module/cloud_drive.py
@@ -72,7 +72,7 @@ def zip_file(local_file_path: str) -> str:
Zip local file
"""
- zip_file_name = os.path.basename(local_file_path).split(".")[0] + ".zip"
+ zip_file_name = local_file_path.split(".")[0] + ".zip"
with ZipFile(zip_file_name, "w") as zip_writer:
zip_writer.write(local_file_path)
@@ -104,8 +104,8 @@ async def rclone_upload_file(
file_path = local_file_path
cmd = (
- f'"{drive_config.rclone_path}" copy "{file_path}"'
- "{remote_dir}/ --create-empty-src-dirs --ignore-existing --progress"
+ f'"{drive_config.rclone_path}" copy "{file_path}" '
+ f"{remote_dir}/ --create-empty-src-dirs --ignore-existing --progress"
)
proc = await asyncio.create_subprocess_shell(
cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
diff --git a/module/filter.py b/module/filter.py
new file mode 100644
index 00000000..e98b41fa
--- /dev/null
+++ b/module/filter.py
@@ -0,0 +1,326 @@
+"""Filter for download"""
+
+import re
+from datetime import datetime
+from typing import Any
+
+from ply import lex, yacc
+
+from utils.meta_data import MetaData, NoneObj, ReString
+
+
+class Parser:
+    """
+    Base class for a lexer/parser that has the rules defined as methods.
+
+    Subclasses supply the PLY token (``t_*``) and grammar (``p_*``) rules;
+    this class builds the lexer and parser from them and runs expressions.
+    """
+
+    def __init__(self, debug: bool = False):
+        """
+        Parameters
+        ----------
+        debug: bool
+            Passed to the PLY parser as its ``debug`` flag on every ``exec``
+
+        """
+        self.names: dict = {}
+        self.debug = debug
+        # Keep the objects PLY returns instead of relying on the module-level
+        # ``yacc.parse``/``lex.lexer`` globals, which always point at the most
+        # recently built parser and lexer.
+        self._lexer = lex.lex(module=self)
+        self._parser = yacc.yacc(module=self)
+
+    def reset(self):
+        """Remove every entry from the symbol table ``self.names``."""
+        self.names.clear()
+
+    def exec(self, filter_str: str) -> Any:
+        """Parse and evaluate ``filter_str`` and return the parser's result."""
+        return self._parser.parse(filter_str, lexer=self._lexer, debug=self.debug)
+
+
+# pylint: disable = R0904
+class BaseFilter(Parser):
+    """PLY lexer/parser that evaluates a download filter expression.
+
+    Names used in an expression are looked up in ``self.names``; an unknown
+    name evaluates to ``NoneObj``. Every comparison that has a ``NoneObj`` or
+    ``None`` operand evaluates to ``True``.
+
+    NOTE: the docstrings of the ``t_*`` and ``p_*`` methods are the token
+    regexes and grammar rules that PLY reads, and PLY takes the definition
+    order of those methods into account. Do not reword or reorder them.
+    """
+
+    def __init__(self, debug: bool = False):
+        """
+        Parameters
+        ----------
+        debug: bool
+            If True, print debug info while evaluating
+
+        """
+        super().__init__(debug=debug)
+
+    def _output(self, output_str: str):
+        """Print ``output_str`` when debug is enabled."""
+        if self.debug:
+            print(output_str)
+
+    @staticmethod
+    def _has_missing_operand(p) -> bool:
+        """Return True if operand 1 or 3 of a binary rule is NoneObj or None."""
+        return any(isinstance(p[i], NoneObj) or p[i] is None for i in (1, 3))
+
+    @staticmethod
+    def _unescape(raw: str) -> str:
+        """Resolve backslash escapes in ``raw`` without mangling non-ASCII text.
+
+        Encoding as latin-1 with ``backslashreplace`` turns every character
+        outside latin-1 into a ``\\uXXXX``-style escape, which ``unicode_escape``
+        then decodes back to the original character. Encoding as UTF-8 instead
+        would make ``unicode_escape`` read each UTF-8 byte as its own character.
+        """
+        return raw.encode("latin-1", "backslashreplace").decode("unicode_escape")
+
+    # Words lexed by ``t_NAME`` that are keywords rather than identifiers.
+    reserved = {
+        "and": "AND",
+        "or": "OR",
+    }
+
+    tokens = (
+        "NAME",
+        "NUMBER",
+        "GE",
+        "LE",
+        "LOR",
+        "LAND",
+        "STRING",
+        "RESTRING",
+        "EQ",
+        "NE",
+        "TIME",
+        "AND",
+        "OR",
+    )
+
+    literals = ["=", "+", "-", "*", "/", "(", ")", ">", "<"]
+
+    # t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
+    t_GE = r">="
+    t_LE = r"<="
+    t_LOR = r"\|\|"
+    t_LAND = r"&&"
+    t_EQ = r"=="
+    t_NE = r"!="
+
+    # Token value becomes a ``datetime`` parsed with "%Y-%m-%d %H:%M:%S".
+    def t_TIME(self, t):
+        r"\d{4}-\d{1,2}-\d{1,2}[ ]{1,}\d{1,2}:\d{1,2}:\d{1,2}"
+        t.value = datetime.strptime(t.value, "%Y-%m-%d %H:%M:%S")
+        return t
+
+    # Single-quoted string: strip the quotes and resolve backslash escapes.
+    def t_STRING(self, t):
+        r"'([^\\']+|\\'|\\\\)*'"
+        t.value = self._unescape(t.value[1:-1])
+        return t
+
+    # Regex string ``r'...'``: strip the ``r'`` prefix and the closing quote.
+    def t_RESTRING(self, t):
+        r"r'([^\\']+|\\'|\\\\)*'"
+        t.value = self._unescape(t.value[2:-1])
+        return t
+
+    # Identifier, unless the text is one of the ``reserved`` keywords.
+    def t_NAME(self, t):
+        r"[a-zA-Z_][a-zA-Z0-9_]*"
+        t.type = BaseFilter.reserved.get(t.value, "NAME")
+        return t
+
+    # Token value becomes an ``int``.
+    def t_NUMBER(self, t):
+        r"\d+"
+        t.value = int(t.value)
+        return t
+
+    t_ignore = " \t"
+
+    # Newlines produce no token; they only advance the line counter.
+    def t_newline(self, t):
+        r"\n+"
+        t.lexer.lineno += t.value.count("\n")
+
+    def t_error(self, t):
+        """Report an illegal character and skip over it."""
+        print(f"Illegal character '{t.value[0]}'")
+        t.lexer.skip(1)
+
+    precedence = (
+        ("left", "LOR", "OR"),
+        ("left", "LAND", "AND"),
+        ("left", "EQ", "NE"),
+        ("nonassoc", ">", "<", "GE", "LE"),
+        ("left", "+", "-"),
+        ("left", "*", "/"),
+        ("right", "UMINUS"),
+    )
+
+    # Assignment stores the value in ``self.names`` and yields no result.
+    def p_statement_assign(self, p):
+        'statement : NAME "=" expression'
+        self.names[p[1]] = p[3]
+
+    def p_statement_expr(self, p):
+        "statement : expression"
+        self._output(p[1])
+        p[0] = p[1]
+
+    # Arithmetic; a NoneObj operand is treated as 0.
+    def p_expression_binop(self, p):
+        """expression : expression '+' expression
+                      | expression '-' expression
+                      | expression '*' expression
+                      | expression '/' expression"""
+        if isinstance(p[1], NoneObj):
+            p[1] = 0
+        if isinstance(p[3], NoneObj):
+            p[3] = 0
+
+        if p[2] == "+":
+            p[0] = p[1] + p[3]
+        elif p[2] == "-":
+            p[0] = p[1] - p[3]
+        elif p[2] == "*":
+            p[0] = p[1] * p[3]
+        elif p[2] == "/":
+            p[0] = p[1] / p[3]
+
+        self._output(f"binop {p[1]} {p[2]} {p[3]} = {p[0]}")
+
+    def p_expression_comp(self, p):
+        """expression : expression '>' expression
+                      | expression '<' expression"""
+        if self._has_missing_operand(p):
+            p[0] = True
+            return
+
+        if p[2] == ">":
+            p[0] = p[1] > p[3]
+        elif p[2] == "<":
+            p[0] = p[1] < p[3]
+
+    def p_expression_uminus(self, p):
+        "expression : '-' expression %prec UMINUS"
+        p[0] = -p[2]
+
+    def p_expression_ge(self, p):
+        "expression : expression GE expression"
+        if self._has_missing_operand(p):
+            p[0] = True
+            return
+
+        p[0] = p[1] >= p[3]
+        self._output(f"{p[1]} {p[2]} {p[3]} {p[0]}")
+
+    def p_expression_le(self, p):
+        "expression : expression LE expression"
+        if self._has_missing_operand(p):
+            p[0] = True
+            return
+
+        p[0] = p[1] <= p[3]
+        self._output(f"{p[1]} {p[2]} {p[3]} = {p[0]}")
+
+    # ``==``: when one side is a regex string the other side must be a str
+    # and is matched with ``re.fullmatch``; a non-str other side yields 0.
+    def p_expression_eq(self, p):
+        "expression : expression EQ expression"
+        if self._has_missing_operand(p):
+            p[0] = True
+            return
+
+        if isinstance(p[3], ReString):
+            if not isinstance(p[1], str):
+                p[0] = 0
+                return
+            p[0] = re.fullmatch(p[3].re_string, p[1]) is not None
+            self._output(f"{p[1]} {p[2]} {p[3].re_string} {p[0]}")
+        elif isinstance(p[1], ReString):
+            if not isinstance(p[3], str):
+                p[0] = 0
+                return
+            p[0] = re.fullmatch(p[1].re_string, p[3]) is not None
+            # The pattern is on p[1]; p[3] is a plain str with no ``re_string``.
+            self._output(f"{p[1].re_string} {p[2]} {p[3]} {p[0]}")
+        else:
+            p[0] = p[1] == p[3]
+            self._output(f"{p[1]} {p[2]} {p[3]} {p[0]}")
+
+    # ``!=``: mirror of ``==``; true when the regex does NOT fully match.
+    def p_expression_ne(self, p):
+        "expression : expression NE expression"
+        if self._has_missing_operand(p):
+            p[0] = True
+            return
+
+        if isinstance(p[3], ReString):
+            if not isinstance(p[1], str):
+                p[0] = 0
+                return
+            p[0] = re.fullmatch(p[3].re_string, p[1]) is None
+            self._output(f"{p[1]} {p[2]} {p[3].re_string} {p[0]}")
+        elif isinstance(p[1], ReString):
+            if not isinstance(p[3], str):
+                p[0] = 0
+                return
+            p[0] = re.fullmatch(p[1].re_string, p[3]) is None
+            # The pattern is on p[1]; p[3] is a plain str with no ``re_string``.
+            self._output(f"{p[1].re_string} {p[2]} {p[3]} {p[0]}")
+        else:
+            p[0] = p[1] != p[3]
+            self._output(f"{p[1]} {p[2]} {p[3]} = {p[0]}")
+
+    def p_expression_group(self, p):
+        "expression : '(' expression ')'"
+        p[0] = p[2]
+
+    def p_expression_number(self, p):
+        "expression : NUMBER"
+        p[0] = p[1]
+
+    def p_expression_time(self, p):
+        "expression : TIME"
+        p[0] = p[1]
+
+    # A name resolves through ``self.names``; an unknown name becomes NoneObj.
+    def p_expression_name(self, p):
+        "expression : NAME"
+        try:
+            p[0] = self.names[p[1]]
+        except LookupError:
+            self._output(f"Undefined name '{p[1]}'")
+            p[0] = NoneObj()
+
+    def p_expression_lor(self, p):
+        "expression : expression LOR expression"
+        p[0] = p[1] or p[3]
+
+    def p_expression_land(self, p):
+        "expression : expression LAND expression"
+        p[0] = p[1] and p[3]
+
+    def p_expression_or(self, p):
+        "expression : expression OR expression"
+        p[0] = p[1] or p[3]
+
+    def p_expression_and(self, p):
+        "expression : expression AND expression"
+        p[0] = p[1] and p[3]
+
+    def p_expression_string(self, p):
+        "expression : STRING"
+        p[0] = p[1]
+
+    def p_expression_restring(self, p):
+        "expression : RESTRING"
+        p[0] = ReString(p[1])
+        self._output("RESTRING : " + p[0].re_string)
+
+    # pylint: disable = C0116
+    def p_error(self, p):
+        # ``p`` is the offending token, or falsy when input ended early.
+        if p:
+            print(f"Syntax error at '{p.value}'")
+        else:
+            print("Syntax error at EOF")
+
+
+class Filter:
+    """Evaluates download filter expressions against a message's meta data."""
+
+    def __init__(self):
+        self.filter = BaseFilter()
+
+    def set_meta_data(self, meta_data: MetaData):
+        """Reset the parser's symbols, then use ``meta_data.data()`` as its names."""
+        parser = self.filter
+        parser.reset()
+        parser.names = meta_data.data()
+
+    def exec(self, filter_str: str) -> Any:
+        """Run ``filter_str`` through the parser and return its result.
+
+        Raises
+        ------
+        ValueError
+            If no names are currently set on the parser.
+        """
+        if not self.filter.names:
+            raise ValueError("meta data cannot be empty!")
+        return self.filter.exec(filter_str)
diff --git a/module/templates/index.html b/module/templates/index.html
index 0a3c6bff..97feade5 100644
--- a/module/templates/index.html
+++ b/module/templates/index.html
@@ -38,7 +38,7 @@