From cedc1b1f8349880f9afcdf73c7812725c4ef9196 Mon Sep 17 00:00:00 2001 From: ChuckNorrison <2964146+ChuckNorrison@users.noreply.github.com> Date: Mon, 19 Jun 2023 20:27:10 +0200 Subject: [PATCH] set photo filename as content for photo messages, the content should not be empty. Write the filename instead. pylint rating to 9.95/10 --- telegram-export-converter.py | 184 ++++++++++++++++++++--------------- 1 file changed, 103 insertions(+), 81 deletions(-) diff --git a/telegram-export-converter.py b/telegram-export-converter.py index f1c14b7..e3ec8e2 100644 --- a/telegram-export-converter.py +++ b/telegram-export-converter.py @@ -1,11 +1,17 @@ +#!/usr/bin/env python + +"""Convert telegram export html data to csv file""" + from html import unescape from time import time -from sys import argv +#from sys import argv import csv import os +import sys import re class Message: + """create message object""" def __init__(self): self.message_id = None self.timestamp = None @@ -14,13 +20,20 @@ def __init__(self): self.reply = None self.content = None - def toTuple(self): - if self.message_id: self.message_id = self.message_id.replace('message', '') - if self.timestamp: self.timestamp = ' '.join(self.timestamp.split()[:2]) - if self.sender: self.sender = unescape(self.sender.strip()) - if self.fwd: self.fwd = unescape(self.fwd.strip()) - if self.reply: self.reply = self.reply.replace('message', '') - if self.content: self.content = unescape(self.content.strip()) + def to_tuple(self): + """create tuple""" + if self.message_id: + self.message_id = self.message_id.replace('message', '') + if self.timestamp: + self.timestamp = ' '.join(self.timestamp.split()[:2]) + if self.sender: + self.sender = unescape(self.sender.strip()) + if self.fwd: + self.fwd = unescape(self.fwd.strip()) + if self.reply: + self.reply = self.reply.replace('message', '') + if self.content: + self.content = unescape(self.content.strip()) return (self.message_id, self.timestamp, self.sender, self.fwd, self.reply, self.content) @@ -30,12 +43,12 @@ def toTuple(self): message_id_joined_pattern = re.compile('
') +fwd_pattern = re.compile(r'
') fwd_reply_pattern = re.compile('
') fwd_sender_pattern = re.compile('([^<]+)') same_fwd_text_pattern = re.compile('
') -reply_pattern = re.compile('In reply to ') video_pattern = re.compile('
') @@ -43,8 +56,10 @@ def toTuple(self): audio_pattern = re.compile('
') file_pattern = re.compile('
') contact_pattern = re.compile('
') -contact_link_pattern = re.compile('') poll_pattern = re.compile('
') game_pattern = re.compile('') @@ -58,17 +73,18 @@ def toTuple(self): print("Starting...") -# Scans current directory for message.html Telegram chat export files +# Scans current directory for message.html Telegram chat export files message_files = [] -n = 1 +MSG_NUMBER = 1 for file in os.listdir(): if file.startswith('messages') and file.endswith('.html'): - message_files.append('messages' + (str(n) if n > 1 else '') + '.html') - n += 1 + message_files.append('messages' + (str(MSG_NUMBER) if MSG_NUMBER > 1 else '') + '.html') + MSG_NUMBER += 1 if not message_files: - print('No message.html files found. Are you sure the script is in the right directory? Exiting...') - exit() + print('No message.html files found. Are you sure the script ' + 'is in the right directory? Exiting...') + sys.exit(1) print(f'Loading all {len(message_files)} message files...') @@ -84,64 +100,64 @@ def toTuple(self): # Sets output filename as the chat's name chat_name = lines[15] -output_file = 'Telegram-' + ''.join(c if c.isalnum() else '_' for c in chat_name) + '.csv' +OUTPUT_FILE = 'Telegram-' + ''.join(c if c.isalnum() else '_' for c in chat_name) + '.csv' ################################################################################ print(f'Processing \'{chat_name}\'...') messages = [] -cur = 0 -last_sender = None -last_fwd_sender = None +CUR = 0 +LAST_SENDER = None +LAST_FWD_SENDER = None -while cur < len(lines): +while CUR < len(lines): # Skip lines that aren't the start of a message - if not lines[cur].startswith('
': - cur += 8 + if lines[CUR+4] == '
': + CUR += 8 else: - cur += 9 + CUR += 9 - timestamp = re.findall(timestamp_pattern, lines[cur]) + timestamp = re.findall(timestamp_pattern, lines[CUR]) m.timestamp = timestamp[0] - cur += 4 - m.sender = lines[cur] - last_sender = m.sender + CUR += 4 + m.sender = lines[CUR] + LAST_SENDER = m.sender - cur += 3 - m.content = lines[cur] + CUR += 3 + m.content = lines[CUR] else: # Same sender as the message before - cur += 2 - timestamp = re.findall(timestamp_pattern, lines[cur]) + CUR += 2 + timestamp = re.findall(timestamp_pattern, lines[CUR]) m.timestamp = timestamp[0] - m.sender = last_sender + m.sender = LAST_SENDER - cur += 4 - m.content = lines[cur] + CUR += 4 + m.content = lines[CUR] is_fwd = re.match(fwd_pattern, m.content) is_same_fwd_text = re.match(same_fwd_text_pattern, m.content) @@ -152,43 +168,43 @@ def toTuple(self): if is_fwd: # If it's from a Deleted Account, no initial is # shown as avatar, so there's a line less to skip - if lines[cur+2] == '
': - cur += 7 + if lines[CUR+2] == '
': + CUR += 7 else: - cur += 8 - - fwd_sender = re.findall(fwd_sender_pattern, lines[cur]) + CUR += 8 + + fwd_sender = re.findall(fwd_sender_pattern, lines[CUR]) m.fwd = fwd_sender[0] - last_fwd_sender = m.fwd + LAST_FWD_SENDER = m.fwd - cur += 2 - is_fwd_reply = re.findall(fwd_reply_pattern, lines[cur]) + CUR += 2 + is_fwd_reply = re.findall(fwd_reply_pattern, lines[CUR]) if is_fwd_reply: - cur += 4 + CUR += 4 else: - cur += 1 + CUR += 1 - m.content = lines[cur] + m.content = lines[CUR] elif is_fwd_reply_same_fwd_text: - m.fwd = last_fwd_sender + m.fwd = LAST_FWD_SENDER - cur += 4 - m.content = lines[cur] + CUR += 4 + m.content = lines[CUR] elif is_same_fwd_text: - m.fwd = last_fwd_sender + m.fwd = LAST_FWD_SENDER - cur += 1 - m.content = lines[cur] + CUR += 1 + m.content = lines[CUR] elif is_same_fwd_media: - m.fwd = last_fwd_sender + m.fwd = LAST_FWD_SENDER - cur += 6 - m.content = f'[{lines[cur]}]' + CUR += 6 + m.content = f'[{lines[CUR]}]' elif is_reply: m.reply = is_reply[0] - cur += 3 - m.content = lines[cur] + CUR += 3 + m.content = lines[CUR] if m.content.startswith('<'): is_photo = re.match(photo_pattern, m.content) @@ -205,21 +221,21 @@ def toTuple(self): # Write type of media as content if any([is_photo, is_video, is_voice, is_audio, is_file]): - cur += 5 - m.content = f'[{lines[cur]}]' + CUR += 5 + m.content = f'[{lines[CUR]}]' elif is_contact or is_contact_link: - cur += 5 - m.content = f'[Contact - {lines[cur]} - {lines[cur+3]}]' + CUR += 5 + m.content = f'[Contact - {lines[CUR]} - {lines[CUR+3]}]' elif is_location_link: - cur += 5 - m.content = f'[{lines[cur]} - {lines[cur+3]}]' + CUR += 5 + m.content = f'[{lines[CUR]} - {lines[CUR+3]}]' elif is_call: - cur += 8 - m.content = f'[Call - {lines[cur]}]' + CUR += 8 + m.content = f'[Call - {lines[CUR]}]' elif is_poll: - m.content = f'[{lines[cur+5]} - {lines[cur+2]}]' + m.content = f'[{lines[CUR+5]} - {lines[CUR+2]}]' elif is_game: - m.content = f'[Game - {lines[cur+5]} - {lines[cur+11]}]' + m.content = f'[Game - {lines[CUR+5]} - {lines[CUR+11]}]' # Replace HTML line breaks if '
' in m.content: @@ -241,13 +257,19 @@ def toTuple(self): if m.content == '
': m.content = '[Animated emoji]' + # write file name as content + if m.content == "": + new_content = lines[CUR].replace('',"") + m.content = new_content + messages.append(m) - cur += 1 + CUR += 1 # Write CSV -with open(output_file, 'w+', encoding='UTF-8', newline='') as f: +with open(OUTPUT_FILE, 'w+', encoding='UTF-8', newline='') as f: writer = csv.writer(f) writer.writerow(list(messages[0].__dict__.keys())) - writer.writerows([m.toTuple() for m in messages]) + writer.writerows([m.to_tuple() for m in messages]) -print(f'Written to \'{output_file}\' in {(time()-t0):.2f}s.') \ No newline at end of file +print(f'Written to \'{OUTPUT_FILE}\' in {(time()-t0):.2f}s.')