diff --git a/.gitignore b/.gitignore
index 7e558020..1e9826ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,3 +59,5 @@ photo/
 voice/
 video/
 video_note/
+parser.out
+parsetab.py
diff --git a/README.md b/README.md
index 96d3491c..df9ba8b4 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,8 @@
 Coverage Status
 License: MIT
 Code style: black
-Code style: black
-Code style: black
+
+Code style: black

@@ -149,6 +149,8 @@ file_name_prefix_split: ' - '
 max_concurrent_transmissions: 1
 web_host: 127.0.0.1
 web_port: 5000
+download_filter:
+  'telegram_chat_id': message_date >= 2022-12-01 00:00:00 and message_date <= 2023-01-17 00:00:00
 ```

 - **api_hash** - The api_hash you got from telegram apps
@@ -160,26 +162,27 @@ web_port: 5000
 - **file_formats** - File types to download for supported media types which are `audio`, `document` and `video`. Default format is `all`, downloads all files.
 - **save_path** - The root directory where you want to store downloaded files.
 - **file_path_prefix** - Store file subfolders, the order of the list is not fixed, can be randomly combined.
-  - `chat_title` - channel or group title, it will be chat id if not exist title.
-  - `media_datetime` - media date, also see pyrogram.types.Message.date.strftime("%Y_%m").
-  - `media_type` - media type, also see `media_types`.
+  - `chat_title` - Channel or group title; falls back to the chat id if the title does not exist.
+  - `media_datetime` - Media date, also see pyrogram.types.Message.date.strftime("%Y_%m").
+  - `media_type` - Media type, also see `media_types`.
 - **disable_syslog** - You can choose which types of logs to disable,see `logging._nameToLevel`.
 - **upload_drive** - You can upload file to cloud drive.
   - `enable_upload_file` - Enable upload file, default `false`.
   - `remote_dir` - Where you upload, like `drive_id/drive_name`.
   - `upload_adapter` - Upload file adapter, which can be `rclone`, `aligo`. If it is `rclone`, it supports all `rclone` servers that support uploading. If it is `aligo`, it supports uploading `Ali cloud disk`.
-  - `rclone_path` - RClone exe path, see wiki[how to use rclone](https://github.com/tangyoha/telegram_media_downloader/wiki#how-to-use-rclone)
+  - `rclone_path` - Rclone executable path, see [How to use rclone](https://github.com/tangyoha/telegram_media_downloader/wiki/Rclone)
   - `before_upload_file_zip` - Zip file before upload, default `false`.
   - `after_upload_file_delete` - Delete file after upload success, default `false`.
-- **file_name_prefix** - custom file name, use the same as **file_path_prefix**
-  - `message_id` - message id
-  - `file_name` - file name (may be empty)
-  - `caption` - the title of the message (may be empty)
-- **file_name_prefix_split** - custom file name prefix symbol, the default is `-`
+- **file_name_prefix** - Custom file name, takes the same options as **file_path_prefix**
+  - `message_id` - Message id
+  - `file_name` - File name (may be empty)
+  - `caption` - The message caption (may be empty)
+- **file_name_prefix_split** - Custom file name prefix separator, the default is `-`
 - **max_concurrent_transmissions** - Set the maximum amount of concurrent transmissions (uploads & downloads). A value that is too high may result in network related issues. Defaults to 1.
-- **hide_file_name** - whether to hide the web interface file name, default `false`
-- **web_host** - web host
-- **web_port** - web port
+- **hide_file_name** - Whether to hide the web interface file name, default `false`
+- **web_host** - Web host.
+- **web_port** - Web port.
+- **download_filter** - Download filter, see [How to use Filter](https://github.com/tangyoha/telegram_media_downloader/wiki/How-to-use-Filter)

 ## Execution
diff --git a/README_CN.md b/README_CN.md
index 0202cb2c..31ba7224 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -6,9 +6,9 @@
 Coverage Status
 License: MIT
 Code style: black
-Code style: black
+
 Code style: black
-
+
@@ -141,6 +141,8 @@ file_name_prefix_split: ' - '
 max_concurrent_transmissions: 1
 web_host: 127.0.0.1
 web_port: 5000
+download_filter:
+  'telegram_chat_id': message_date >= 2022-12-01 00:00:00 and message_date <= 2023-01-17 00:00:00
 ```

 - **api_hash** - 你从电报应用程序获得的 api_hash
@@ -160,7 +162,7 @@ web_port: 5000
   - `enable_upload_file` - [必填]启用上传文件,默认为`false`
   - `remote_dir` - [必填]你上传的地方
   - `upload_adapter` - [必填]上传文件适配器,可以为`rclone`,`aligo`。如果为`rclone`,则支持rclone所有支持上传的服务器,如果为aligo,则支持上传阿里云盘
-  - `rclone_path`,如果配置`upload_adapter`为`rclone`则为必填,`rclone`的可执行目录,见wiki[如何使用rclone](https://github.com/tangyoha/telegram_media_downloader/wiki#how-to-use-rclone)
+  - `rclone_path`,如果配置`upload_adapter`为`rclone`则为必填,`rclone`的可执行目录,查阅 [如何使用rclone](https://github.com/tangyoha/telegram_media_downloader/wiki/Rclone)
   - `before_upload_file_zip` - 上传前压缩文件,默认为`false`
   - `after_upload_file_delete` - 上传成功后删除文件,默认为`false`
 - **file_name_prefix** - 自定义文件名称,使用和 **file_path_prefix** 一样
@@ -172,6 +174,7 @@ web_port: 5000
 - **hide_file_name** - 是否隐藏web界面文件名称,默认`false`
 - **web_host** - web界面地址
 - **web_port** - web界面端口
+- **download_filter** - 下载过滤器, 查阅 [How to use Filter](https://github.com/tangyoha/telegram_media_downloader/wiki/How-to-use-Filter)

 ## 执行
diff --git a/media_downloader.py b/media_downloader.py
index ee5f3c85..b6ead6bf 100644
--- a/media_downloader.py
+++ b/media_downloader.py
@@ -16,6 +16,7 @@ from module.web import get_flask_app, update_download_status

 from utils.log import LogFilter
 from utils.meta import print_meta
+from utils.meta_data import MetaData
 from utils.updates import check_for_updates

 logging.basicConfig(
@@ -269,7 +270,6 @@ async def download_media(
                 continue
             file_name, file_format = await _get_media_meta(message, _media, _type)
             media_size = getattr(_media, "file_size", 0)
-
             if _can_download(_type, file_formats, file_format):
                 if _is_exist(file_name):
                     # TODO: check if the file download complete
@@ -325,7 +325,6 @@ async def download_media(
                 _check_download_finish(media_size, download_path, message.id)
                 await app.upload_file(file_name)
-
                 app.downloaded_ids.append(message.id)
                 break
         except pyrogram.errors.exceptions.bad_request_400.BadRequest:
             logger.warning(
@@ -434,8 +433,11 @@ async def begin_import(pagination_limit: int):
         api_id=app.api_id,
         api_hash=app.api_hash,
         proxy=app.proxy,
-        max_concurrent_transmissions=app.max_concurrent_transmissions,
     )
+
+    if getattr(client, "max_concurrent_transmissions", None):
+        client.max_concurrent_transmissions = app.max_concurrent_transmissions
+
     await client.start()
     print("Successfully started (Press Ctrl+C to stop)")
@@ -467,11 +469,12 @@ async def begin_import(pagination_limit: int):
     app.last_read_message_id = last_read_message_id

     async for message in messages_iter:  # type: ignore
-        if pagination_count != pagination_limit and not app.need_skip_message(
-            message.id
-        ):
-            pagination_count += 1
-            messages_list.append(message)
+        meta_data = MetaData()
+        meta_data.get_meta_data(message)
+        if pagination_count != pagination_limit:
+            if not app.need_skip_message(str(app.chat_id), message.id, meta_data):
+                pagination_count += 1
+                messages_list.append(message)
         else:
             last_read_message_id = await process_messages(
                 client,
diff --git a/module/app.py b/module/app.py
index 33696cf5..0080389f 100644
--- a/module/app.py
+++ b/module/app.py
@@ -6,6 +6,9 @@ from loguru import logger

 from module.cloud_drive import CloudDrive, CloudDriveConfig
+from module.filter import Filter
+from utils.format import replace_date_time
+from utils.meta_data import MetaData


 # pylint: disable = R0902
@@ -37,6 +40,7 @@ def __init__(
         self.config_file: str = config_file
         self.app_data_file: str = app_data_file
         self.application_name: str = application_name
+        self.download_filter = Filter()

         self.reset()
@@ -84,6 +88,7 @@ def reset(self):
         self.max_concurrent_transmissions: int = 1
         self.web_host: str = "localhost"
         self.web_port: int = 5000
+        self.download_filter_dict: dict = {}

     def load_config(self, _config: dict) -> bool:
         """load config from str.
@@ -165,6 +170,15 @@ def load_config(self, _config: dict) -> bool:
         self.web_host = _config.get("web_host", self.web_host)
         self.web_port = _config.get("web_port", self.web_port)

+        self.download_filter_dict = _config.get(
+            "download_filter", self.download_filter_dict
+        )
+
+        for key, value in self.download_filter_dict.items():
+            self.download_filter_dict[key] = replace_date_time(value)
+
+        # TODO: check the filter expression for syntax errors
+
         self.max_concurrent_transmissions = _config.get(
             "max_concurrent_transmissions", self.max_concurrent_transmissions
         )
@@ -273,19 +287,34 @@ def get_file_name(
             res = f"{message_id}"
         return res

-    def need_skip_message(self, message_id: int) -> bool:
+    def need_skip_message(
+        self, chat_id: str, message_id: int, meta_data: MetaData
+    ) -> bool:
         """if need skip download message.

         Parameters
         ----------
+        chat_id: str
+            Chat id as defined in config.yaml
+
         message_id: int
-            readily to download message id
+            Id of the message to be downloaded
+
+        meta_data: MetaData
+            Message meta data used to match the download filter

         Returns
         -------
         bool
         """
-        return self.ids_to_retry_dict.get(message_id) is not None
+        if message_id in self.ids_to_retry_dict:
+            return True
+
+        if chat_id in self.download_filter_dict:
+            self.download_filter.set_meta_data(meta_data)
+            return not self.download_filter.exec(self.download_filter_dict[chat_id])
+
+        return False

     def update_config(self, immediate: bool = True):
         """update config
diff --git a/module/cloud_drive.py b/module/cloud_drive.py
index 1f1c0805..20f94ecb 100644
--- a/module/cloud_drive.py
+++ b/module/cloud_drive.py
@@ -72,7 +72,7 @@ def zip_file(local_file_path: str) -> str:
             Zip local file
         """
-        zip_file_name = os.path.basename(local_file_path).split(".")[0] + ".zip"
+        zip_file_name = local_file_path.split(".")[0] + ".zip"
         with ZipFile(zip_file_name, "w") as zip_writer:
             zip_writer.write(local_file_path)
@@ -104,8 +104,8 @@ async def rclone_upload_file(
             file_path = local_file_path
         cmd = (
-            f'"{drive_config.rclone_path}" copy "{file_path}"'
-            "{remote_dir}/ --create-empty-src-dirs --ignore-existing --progress"
+            f'"{drive_config.rclone_path}" copy "{file_path}" '
+            f"{remote_dir}/ --create-empty-src-dirs --ignore-existing --progress"
         )
         proc = await asyncio.create_subprocess_shell(
             cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
         )
diff --git a/module/filter.py b/module/filter.py
new file mode 100644
index 00000000..e98b41fa
--- /dev/null
+++ b/module/filter.py
@@ -0,0 +1,326 @@
+"""Filter for download"""
+
+import re
+from datetime import datetime
+from typing import Any
+
+from ply import lex, yacc
+
+from utils.meta_data import MetaData, NoneObj, ReString
+
+
+class Parser:
+    """
+    Base class for a lexer/parser that has the rules defined as methods
+    """
+
+    def __init__(self, debug: bool = False):
+        self.names: dict = {}
+        self.debug = debug
+        # Build the lexer and parser
+        lex.lex(module=self)
+        yacc.yacc(module=self)
+
+    def reset(self):
+        """Reset all symbols"""
+        self.names.clear()
+
+    def exec(self, filter_str: str) -> Any:
+        """Exec filter str"""
+        return yacc.parse(filter_str, debug=self.debug)
+
+
+# pylint: disable = R0904
+class BaseFilter(Parser):
+    """for normal filter"""
+
+    def __init__(self, debug: bool = False):
+        """
+        Parameters
+        ----------
+        debug: bool
+            If output debug info
+
+        """
+        super().__init__(debug=debug)
+
+    def _output(self, output_str: str):
+        """For print debug info"""
+        if self.debug:
+            print(output_str)
+
+    reserved = {
+        "and": "AND",
+        "or": "OR",
+    }
+
+    tokens = (
+        "NAME",
+        "NUMBER",
+        "GE",
+        "LE",
+        "LOR",
+        "LAND",
+        "STRING",
+        "RESTRING",
+        "EQ",
+        "NE",
+        "TIME",
+        "AND",
+        "OR",
+    )
+
+    literals = ["=", "+", "-", "*", "/", "(", ")", ">", "<"]
+
+    # t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
+    t_GE = r">="
+    t_LE = r"<="
+    t_LOR = r"\|\|"
+    t_LAND = r"&&"
+    t_EQ = r"=="
+    t_NE = r"!="
+
+    def t_TIME(self, t):
+        r"\d{4}-\d{1,2}-\d{1,2}[ ]{1,}\d{1,2}:\d{1,2}:\d{1,2}"
+        t.value = datetime.strptime(t.value, "%Y-%m-%d %H:%M:%S")
+        return t
+
+    def t_STRING(self, t):
+        r"'([^\\']+|\\'|\\\\)*'"
+        t.value = t.value[1:-1].encode().decode("unicode_escape")
+        return t
+
+    def t_RESTRING(self, t):
+        r"r'([^\\']+|\\'|\\\\)*'"
+        t.value = t.value[2:-1].encode().decode("unicode_escape")
+        return t
+
+    def t_NAME(self, t):
+        r"[a-zA-Z_][a-zA-Z0-9_]*"
+        t.type = BaseFilter.reserved.get(t.value, "NAME")
+        return t
+
+    def t_NUMBER(self, t):
+        r"\d+"
+        t.value = int(t.value)
+        return t
+
+    t_ignore = " \t"
+
+    def t_newline(self, t):
+        r"\n+"
+        t.lexer.lineno += t.value.count("\n")
+
+    def t_error(self, t):
+        """print error"""
+        print(f"Illegal character '{t.value[0]}'")
+        t.lexer.skip(1)
+
+    precedence = (
+        ("left", "LOR", "OR"),
+        ("left", "LAND", "AND"),
+        ("left", "EQ", "NE"),
+        ("nonassoc", ">", "<", "GE", "LE"),
+        ("left", "+", "-"),
+        ("left", "*", "/"),
+        ("right", "UMINUS"),
+    )
+
+    def p_statement_assign(self, p):
+        'statement : NAME "=" expression'
+        self.names[p[1]] = p[3]
+
+    def p_statement_expr(self, p):
+        "statement : expression"
+        self._output(p[1])
+        p[0] = p[1]
+
+    def p_expression_binop(self, p):
+        """expression : expression '+' expression
+        | expression '-' expression
+        | expression '*' expression
+        | expression '/' expression"""
+        if isinstance(p[1], NoneObj):
+            p[1] = 0
+        if isinstance(p[3], NoneObj):
+            p[3] = 0
+
+        if p[2] == "+":
+            p[0] = p[1] + p[3]
+        elif p[2] == "-":
+            p[0] = p[1] - p[3]
+        elif p[2] == "*":
+            p[0] = p[1] * p[3]
+        elif p[2] == "/":
+            p[0] = p[1] / p[3]
+
+        self._output(f"binop {p[1]} {p[2]} {p[3]} = {p[0]}")
+
+    def p_expression_comp(self, p):
+        """expression : expression '>' expression
+        | expression '<' expression"""
+
+        if isinstance(p[1], NoneObj) or isinstance(p[3], NoneObj):
+            p[0] = True
+            return
+
+        if p[1] is None or p[3] is None:
+            p[0] = True
+            return
+        if p[2] == ">":
+            p[0] = p[1] > p[3]
+        elif p[2] == "<":
+            p[0] = p[1] < p[3]
+
+    def p_expression_uminus(self, p):
+        "expression : '-' expression %prec UMINUS"
+        p[0] = -p[2]
+
+    def p_expression_ge(self, p):
+        "expression : expression GE expression"
+        if isinstance(p[1], NoneObj) or isinstance(p[3], NoneObj):
+            p[0] = True
+            return
+
+        if p[1] is None or p[3] is None:
+            p[0] = True
+            return
+
+        p[0] = p[1] >= p[3]
+        self._output(f"{p[1]} {p[2]} {p[3]} {p[0]}")
+
+    def p_expression_le(self, p):
+        "expression : expression LE expression"
+        if isinstance(p[1], NoneObj) or isinstance(p[3], NoneObj):
+            p[0] = True
+            return
+
+        if p[1] is None or p[3] is None:
+            p[0] = True
+            return
+
+        p[0] = p[1] <= p[3]
+        self._output(f"{p[1]} {p[2]} {p[3]} = {p[0]}")
+
+    def p_expression_eq(self, p):
+        "expression : expression EQ expression"
+        if isinstance(p[1], NoneObj) or isinstance(p[3], NoneObj):
+            p[0] = True
+            return
+
+        if p[1] is None or p[3] is None:
+            p[0] = True
+            return
+
+        if isinstance(p[3], ReString):
+            if not isinstance(p[1], str):
+                p[0] = 0
+                return
+            p[0] = re.fullmatch(p[3].re_string, p[1]) is not None
+            self._output(f"{p[1]} {p[2]} {p[3].re_string} {p[0]}")
+        elif isinstance(p[1], ReString):
+            if not isinstance(p[3], str):
+                p[0] = 0
+                return
+            p[0] = re.fullmatch(p[1].re_string, p[3]) is not None
+            self._output(f"{p[1]} {p[2]} {p[3].re_string} {p[0]}")
+        else:
+            p[0] = p[1] == p[3]
+            self._output(f"{p[1]} {p[2]} {p[3]} {p[0]}")
+
+    def p_expression_ne(self, p):
+        "expression : expression NE expression"
+        if isinstance(p[1], NoneObj) or isinstance(p[3], NoneObj):
+            p[0] = True
+            return
+
+        if p[1] is None or p[3] is None:
+            p[0] = True
+            return
+        if isinstance(p[3], ReString):
+            if not isinstance(p[1], str):
+                p[0] = 0
+                return
+            p[0] = re.fullmatch(p[3].re_string, p[1]) is None
+            self._output(f"{p[1]} {p[2]} {p[3].re_string} {p[0]}")
+        elif isinstance(p[1], ReString):
+            if not isinstance(p[3], str):
+                p[0] = 0
+                return
+            p[0] = re.fullmatch(p[1].re_string, p[3]) is None
+            self._output(f"{p[1]} {p[2]} {p[3].re_string} {p[0]}")
+        else:
+            p[0] = p[1] != p[3]
+            self._output(f"{p[1]} {p[2]} {p[3]} = {p[0]}")
+
+    def p_expression_group(self, p):
+        "expression : '(' expression ')'"
+        p[0] = p[2]
+
+    def p_expression_number(self, p):
+        "expression : NUMBER"
+        p[0] = p[1]
+
+    def p_expression_time(self, p):
+        "expression : TIME"
+        p[0] = p[1]
+
+    def p_expression_name(self, p):
+        "expression : NAME"
+        try:
+            p[0] = self.names[p[1]]
+        except LookupError:
+            self._output(f"Undefined name '{p[1]}'")
+            p[0] = NoneObj()
+
+    def p_expression_lor(self, p):
+        "expression : expression LOR expression"
+        p[0] = p[1] or p[3]
+
+    def p_expression_land(self, p):
+        "expression : expression LAND expression"
+        p[0] = p[1] and p[3]
+
+    def p_expression_or(self, p):
+        "expression : expression OR expression"
+        p[0] = p[1] or p[3]
+
+    def p_expression_and(self, p):
+        "expression : expression AND expression"
+        p[0] = p[1] and p[3]
+
+    def p_expression_string(self, p):
+        "expression : STRING"
+        p[0] = p[1]
+
+    def p_expression_restring(self, p):
+        "expression : RESTRING"
+        p[0] = ReString(p[1])
+        self._output("RESTRING : " + p[0].re_string)
+
+    # pylint: disable = C0116
+    def p_error(self, p):
+        if p:
+            print(f"Syntax error at '{p.value}'")
+        else:
+            print("Syntax error at EOF")
+
+
+class Filter:
+    """filter for telegram download"""
+
+    def __init__(self):
+        self.filter = BaseFilter()
+
+    def set_meta_data(self, meta_data: MetaData):
+        """Set meta data for filter"""
+        self.filter.reset()
+        self.filter.names = meta_data.data()
+
+    def exec(self, filter_str: str) -> Any:
+        """Exec filter str"""
+
+        if self.filter.names:
+            return self.filter.exec(filter_str)
+        raise ValueError("meta data cannot be empty!")
diff --git a/module/templates/index.html b/module/templates/index.html
index 0a3c6bff..97feade5 100644
--- a/module/templates/index.html
+++ b/module/templates/index.html
@@ -38,7 +38,7 @@
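
As a quick illustration (not part of the patch), here is a minimal sketch of how the new `BaseFilter` from `module/filter.py` could be driven directly on a filter expression like the `download_filter` example in the README. It assumes `ply` is installed and the script runs from the project root; the symbol values below are hypothetical stand-ins for what `MetaData.data()` supplies at runtime. Building the parser is also what emits the `parser.out` and `parsetab.py` files now listed in `.gitignore`.

```python
# Hypothetical sketch: evaluating download_filter expressions with BaseFilter.
# Symbol values are made up; in the real flow Filter.set_meta_data() fills
# `names` from MetaData.data() for each message.
from datetime import datetime

from module.filter import BaseFilter

flt = BaseFilter()  # builds the ply lexer/parser (writes parser.out, parsetab.py)
flt.names = {
    "message_date": datetime(2022, 12, 25, 12, 0, 0),
    "file_size": 1024,
}

# Date-range filter from the README example: both comparisons hold -> True
print(flt.exec(
    "message_date >= 2022-12-01 00:00:00 and message_date <= 2023-01-17 00:00:00"
))

# Size filter: 1024 > 2048 is False
print(flt.exec("file_size > 2048"))
```

`need_skip_message` in `module/app.py` negates this result: a message is skipped whenever the configured expression for its chat id does not evaluate to a truthy value.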