Skip to content

Commit 79d34d9

Browse files
committed
add command to add post ids
1 parent 0134286 commit 79d34d9

File tree

3 files changed

+110
-18
lines changed

3 files changed

+110
-18
lines changed

assign_ids.py

Lines changed: 0 additions & 18 deletions
This file was deleted.

sdk/commands/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from ._add_ids import AddIDsCommand
12
from ._check_all import CheckAllCommand
23
from ._command import Command
34
from ._html import HTMLCommand
@@ -10,6 +11,7 @@
1011
__all__ = ['Command', 'COMMANDS']
1112

1213
COMMANDS = (
14+
AddIDsCommand,
1315
HTMLCommand,
1416
RunCodeCommand,
1517
ScheduleCommand,

sdk/commands/_add_ids.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
2+
from __future__ import annotations
3+
4+
import asyncio
5+
import os
6+
from functools import cached_property
7+
from pathlib import Path
8+
9+
from telethon import TelegramClient
10+
11+
from ..post import Post
12+
from ._command import Command
13+
import re
14+
15+
# Telegram channel whose messages are iterated to discover post IDs.
CHANNEL = 'pythonetc'
16+
# URL-like token: "http" or "https", a colon, then a run of ASCII letters,
# digits, "-", "/", "." or "(". Anything else (e.g. "_", "?", ")") ends the match.
REX_URL = re.compile(r'https?\:[a-zA-Z0-9\-\/\.\(]+')
17+
# Python-version-like token: "3." followed by one or two digits (e.g. "3.9").
REX_PYTHON = re.compile(r'3\.[0-9]{1,2}')
18+
# Qualified-name-like token: two or more lowercase letters, a dot, then two or
# more ASCII alphanumerics (e.g. "asyncio.run").
REX_QNAME = re.compile(r'[a-z]{2,}\.[a-zA-Z0-9]{2,}')
19+
# Keyword fingerprint of a text: the ordered tuple of all regex matches.
# Hashable, so it is used as a dict key to pair local posts with messages.
URLs = tuple[str, ...]
20+
21+
22+
class AddIDsCommand(Command):
    """Add Telegram post IDs to all posts that don't have one.

    Markdown posts in the ``posts`` directory that have no ID yet are paired
    with messages of the Telegram channel by comparing a keyword fingerprint
    (see ``_get_keywords``) extracted from both texts. For every pair found,
    an ``id: <message id>`` line is written into the post file.
    """
    name = 'add-ids'

    def run(self) -> int:
        """Run the command.

        Returns:
            1 if the Telegram credentials are missing from the environment,
            0 after the posts have been processed.
        """
        # Both variables are read when the client is built, so check both
        # up front instead of failing later with a bare KeyError.
        if 'API_ID' not in os.environ or 'API_HASH' not in os.environ:
            self.warn(
                'API_ID and API_HASH env vars required, '
                'you can get them at https://my.telegram.org/apps',
            )
            return 1
        asyncio.run(self._run())
        return 0

    async def _run(self) -> None:
        """Collect posts without ID, fetch message IDs, and update the files."""
        self.print('reading posts...')
        paths = self._get_paths()
        self.print('fetching IDs...')
        async with self._client:
            ids = await self._get_ids()
        self.print('setting IDs...')
        for keywords, post_id in ids.items():
            path = paths.get(keywords)
            if path is None:
                # No local post without ID has this fingerprint.
                continue
            content = path.read_text(encoding='utf-8')
            new_content = self._insert_id(content, post_id)
            assert new_content != content
            path.write_text(new_content, encoding='utf-8')
            self.print(f'added ID for {path.name}')

    @staticmethod
    def _insert_id(content: str, post_id: int) -> str:
        """Return `content` with an ``id: <post_id>`` line inserted.

        The line is placed at index 2, i.e. it becomes the third line.
        NOTE(review): presumably this lands inside the post's front matter;
        confirm against the post file format.

        The result always ends with exactly one trailing newline.
        """
        lines = content.splitlines()
        lines.insert(2, f'id: {post_id}')
        return '\n'.join(lines).rstrip() + '\n'

    def _get_paths(self) -> dict[URLs, Path]:
        """Map keyword fingerprints to paths of posts that have no ID yet.

        Only ``*.md`` files directly inside ``posts`` are considered. Posts
        that already have an ID, or whose text yields no keywords, are skipped.

        Raises:
            RuntimeError: if two posts without ID share the same fingerprint,
                since they could not be told apart when matching messages.
        """
        paths: dict[URLs, Path] = {}
        for path in sorted(Path('posts').iterdir()):
            if path.suffix != '.md':
                continue
            post = Post.from_path(path)
            if post.id is not None:
                continue
            keywords = self._get_keywords(post.md_content)
            if not keywords:
                continue
            if keywords in paths:
                name1 = path.name
                name2 = paths[keywords].name
                msg = f'duplicate set of keywords: {name1} and {name2}'
                raise RuntimeError(msg)
            paths[keywords] = path
        return paths

    async def _get_ids(self) -> dict[URLs, int]:
        """Map keyword fingerprints to IDs of the channel messages.

        Messages without text or without any keywords are ignored. When
        several messages share a fingerprint, the first one yielded by
        the client wins.
        """
        ids: dict[URLs, int] = {}
        async for message in self._client.iter_messages(CHANNEL):
            if message.text is None:
                continue
            keywords = self._get_keywords(message.text)
            if not keywords or keywords in ids:
                continue
            ids[keywords] = message.id
        return ids

    def _get_keywords(self, text: str) -> URLs:
        """
        Get some key components from the text (URLs, qualnames, Python versions)
        that can be used to uniquely identify a text.

        It allows us to match the same text ignoring changes in formatting
        or corrected typos.

        The result lists all URL matches first, then Python versions, then
        qualified names, each group in order of appearance. It is empty when
        nothing matches.
        """
        result = REX_URL.findall(text)
        result.extend(REX_PYTHON.findall(text))
        result.extend(REX_QNAME.findall(text))
        return tuple(result)

    @cached_property
    def _client(self) -> TelegramClient:
        """Telegram client, created on first access and then reused.

        Credentials come from the ``API_ID`` and ``API_HASH`` env vars;
        ``'bot'`` is passed as the client's first (session) argument.
        """
        return TelegramClient(
            'bot',
            api_id=os.environ['API_ID'],
            api_hash=os.environ['API_HASH'],
        )

0 commit comments

Comments
 (0)