From cedc1b1f8349880f9afcdf73c7812725c4ef9196 Mon Sep 17 00:00:00 2001
From: ChuckNorrison <2964146+ChuckNorrison@users.noreply.github.com>
Date: Mon, 19 Jun 2023 20:27:10 +0200
Subject: [PATCH] set photo filename as content

For photo messages the exported content field should not be left empty; write the photo's filename instead.

Also clean up the script for pylint (docstrings, naming, raw regex strings, sys.exit), raising its rating to 9.95/10.
---
 telegram-export-converter.py | 184 ++++++++++++++++++++---------------
 1 file changed, 103 insertions(+), 81 deletions(-)
diff --git a/telegram-export-converter.py b/telegram-export-converter.py
index f1c14b7..e3ec8e2 100644
--- a/telegram-export-converter.py
+++ b/telegram-export-converter.py
@@ -1,11 +1,17 @@
+#!/usr/bin/env python
+
+"""Convert telegram export html data to csv file"""
+
 from html import unescape
 from time import time
-from sys import argv
+#from sys import argv
 import csv
 import os
+import sys
 import re
 
 class Message:
+    """create message object"""
     def __init__(self):
         self.message_id = None
         self.timestamp = None
@@ -14,13 +20,20 @@ def __init__(self):
         self.reply = None
         self.content = None
 
-    def toTuple(self):
-        if self.message_id: self.message_id = self.message_id.replace('message', '')
-        if self.timestamp: self.timestamp = ' '.join(self.timestamp.split()[:2])
-        if self.sender: self.sender = unescape(self.sender.strip())
-        if self.fwd: self.fwd = unescape(self.fwd.strip())
-        if self.reply: self.reply = self.reply.replace('message', '')
-        if self.content: self.content = unescape(self.content.strip())
+    def to_tuple(self):
+        """create tuple"""
+        if self.message_id:
+            self.message_id = self.message_id.replace('message', '')
+        if self.timestamp:
+            self.timestamp = ' '.join(self.timestamp.split()[:2])
+        if self.sender:
+            self.sender = unescape(self.sender.strip())
+        if self.fwd:
+            self.fwd = unescape(self.fwd.strip())
+        if self.reply:
+            self.reply = self.reply.replace('message', '')
+        if self.content:
+            self.content = unescape(self.content.strip())
 
         return (self.message_id, self.timestamp, self.sender, self.fwd, self.reply, self.content)
 
@@ -30,12 +43,12 @@ def toTuple(self):
 message_id_joined_pattern = re.compile('<div class="message default clearfix joined" id="([^"]+)')
 timestamp_pattern = re.compile('<div class="pull_right date details" title="([^"]+)')
 
-fwd_pattern = re.compile('<div class="userpic userpic\d+" style="width: 42px; height: 42px">')
+fwd_pattern = re.compile(r'<div class="userpic userpic\d+" style="width: 42px; height: 42px">')
 fwd_reply_pattern = re.compile('<div class="reply_to details">')
 fwd_sender_pattern = re.compile('([^<]+)<span class="date details')
 same_fwd_media_pattern = re.compile('<div class="media_wrap clearfix">')
 same_fwd_text_pattern = re.compile('<div class="text">')
-reply_pattern = re.compile('In reply to <a href="(?:messages\d*.html)?#go_to_([^"]+)"')
+reply_pattern = re.compile(r'In reply to <a href="(?:messages\d*.html)?#go_to_([^"]+)"')
 
 photo_pattern = re.compile('<div class="media clearfix pull_left media_photo">')
 video_pattern = re.compile('<div class="media clearfix pull_left media_video">')
@@ -43,8 +56,10 @@ def toTuple(self):
 audio_pattern = re.compile('<div class="media clearfix pull_left media_audio_file">')
 file_pattern = re.compile('<div class="media clearfix pull_left media_file">')
 contact_pattern = re.compile('<div class="media clearfix pull_left media_contact">')
-contact_link_pattern = re.compile('<a class="media clearfix pull_left block_link media_contact" href="[^"]+"')
-location_link_pattern = re.compile('<a class="media clearfix pull_left block_link media_location" href="[^"]+"')
+contact_link_pattern = re.compile(
+        '<a class="media clearfix pull_left block_link media_contact" href="[^"]+"')
+location_link_pattern = re.compile(
+        '<a class="media clearfix pull_left block_link media_location" href="[^"]+"')
 call_pattern = re.compile('<div class="media clearfix pull_left media_call( success)?">')
 poll_pattern = re.compile('<div class="media_poll">')
 game_pattern = re.compile('<a class="media clearfix pull_left block_link media_game" href="[^"]+">')
@@ -58,17 +73,18 @@ def toTuple(self):
 
 print("Starting...")
 
-# Scans current directory for message<n>.html Telegram chat export files
+# Scans current directory for messages<MSG_NUMBER>.html Telegram chat export files
 message_files = []
-n = 1
+MSG_NUMBER = 1
 for file in os.listdir():
     if file.startswith('messages') and file.endswith('.html'):
-        message_files.append('messages' + (str(n) if n > 1 else '') + '.html')
-        n += 1
+        message_files.append('messages' + (str(MSG_NUMBER) if MSG_NUMBER > 1 else '') + '.html')
+        MSG_NUMBER += 1
 
 if not message_files:
-    print('No message.html files found. Are you sure the script is in the right directory? Exiting...')
-    exit()
+    print('No message.html files found. Are you sure the script '
+        'is in the right directory? Exiting...')
+    sys.exit(1)
 
 print(f'Loading all {len(message_files)} message files...')
 
@@ -84,64 +100,64 @@ def toTuple(self):
 
 # Sets output filename as the chat's name
 chat_name = lines[15]
-output_file = 'Telegram-' + ''.join(c if c.isalnum() else '_' for c in chat_name) + '.csv'
+OUTPUT_FILE = 'Telegram-' + ''.join(c if c.isalnum() else '_' for c in chat_name) + '.csv'
 
 ################################################################################
 
 print(f'Processing \'{chat_name}\'...')
 
 messages = []
-cur = 0
-last_sender = None
-last_fwd_sender = None
+CUR = 0
+LAST_SENDER = None
+LAST_FWD_SENDER = None
 
-while cur < len(lines):
+while CUR < len(lines):
     # Skip lines that aren't the start of a message
-    if not lines[cur].startswith('<div class='):
-        cur += 1
+    if not lines[CUR].startswith('<div class='):
+        CUR += 1
         continue
 
     # Check if it's a new sender's message
-    new = True
-    message_id = re.findall(message_id_new_pattern, lines[cur])
+    NEW = True
+    message_id = re.findall(message_id_new_pattern, lines[CUR])
     if not message_id:
-        new = False
-        message_id = re.findall(message_id_joined_pattern, lines[cur])
+        NEW = False
+        message_id = re.findall(message_id_joined_pattern, lines[CUR])
 
     # Skip lines that aren't the start of a message
     if not message_id:
-        cur += 1
+        CUR += 1
         continue
 
     m = Message()
     m.message_id = message_id[0]
 
-    if new: # New sender
+    if NEW: # New sender
         # If it's from a Deleted Account, no initial is
         # shown as avatar, so there's a line less to skip
-        if lines[cur+4] == '</div>':
-            cur += 8
+        if lines[CUR+4] == '</div>':
+            CUR += 8
         else:
-            cur += 9
+            CUR += 9
 
-        timestamp = re.findall(timestamp_pattern, lines[cur])
+        timestamp = re.findall(timestamp_pattern, lines[CUR])
         m.timestamp = timestamp[0]
 
-        cur += 4
-        m.sender = lines[cur]
-        last_sender = m.sender
+        CUR += 4
+        m.sender = lines[CUR]
+        LAST_SENDER = m.sender
 
-        cur += 3
-        m.content = lines[cur]
+        CUR += 3
+        m.content = lines[CUR]
     else: # Same sender as the message before
-        cur += 2
-        timestamp = re.findall(timestamp_pattern, lines[cur])
+        CUR += 2
+        timestamp = re.findall(timestamp_pattern, lines[CUR])
         m.timestamp = timestamp[0]
 
-        m.sender = last_sender
+        m.sender = LAST_SENDER
 
-        cur += 4
-        m.content = lines[cur]
+        CUR += 4
+        m.content = lines[CUR]
 
     is_fwd = re.match(fwd_pattern, m.content)
     is_same_fwd_text = re.match(same_fwd_text_pattern, m.content)
@@ -152,43 +168,43 @@ def toTuple(self):
     if is_fwd:
         # If it's from a Deleted Account, no initial is
         # shown as avatar, so there's a line less to skip
-        if lines[cur+2] == '</div>':
-            cur += 7
+        if lines[CUR+2] == '</div>':
+            CUR += 7
         else:
-            cur += 8
-        
-        fwd_sender = re.findall(fwd_sender_pattern, lines[cur])
+            CUR += 8
+
+        fwd_sender = re.findall(fwd_sender_pattern, lines[CUR])
         m.fwd = fwd_sender[0]
-        last_fwd_sender = m.fwd
+        LAST_FWD_SENDER = m.fwd
 
-        cur += 2
-        is_fwd_reply = re.findall(fwd_reply_pattern, lines[cur])
+        CUR += 2
+        is_fwd_reply = re.findall(fwd_reply_pattern, lines[CUR])
         if is_fwd_reply:
-            cur += 4
+            CUR += 4
         else:
-            cur += 1
+            CUR += 1
 
-        m.content = lines[cur]
+        m.content = lines[CUR]
     elif is_fwd_reply_same_fwd_text:
-        m.fwd = last_fwd_sender
+        m.fwd = LAST_FWD_SENDER
 
-        cur += 4
-        m.content = lines[cur]
+        CUR += 4
+        m.content = lines[CUR]
     elif is_same_fwd_text:
-        m.fwd = last_fwd_sender
+        m.fwd = LAST_FWD_SENDER
 
-        cur += 1
-        m.content = lines[cur]
+        CUR += 1
+        m.content = lines[CUR]
     elif is_same_fwd_media:
-        m.fwd = last_fwd_sender
+        m.fwd = LAST_FWD_SENDER
 
-        cur += 6
-        m.content = f'[{lines[cur]}]'
+        CUR += 6
+        m.content = f'[{lines[CUR]}]'
     elif is_reply:
         m.reply = is_reply[0]
 
-        cur += 3
-        m.content = lines[cur]
+        CUR += 3
+        m.content = lines[CUR]
 
     if m.content.startswith('<'):
         is_photo = re.match(photo_pattern, m.content)
@@ -205,21 +221,21 @@ def toTuple(self):
 
         # Write type of media as content
         if any([is_photo, is_video, is_voice, is_audio, is_file]):
-            cur += 5
-            m.content = f'[{lines[cur]}]'
+            CUR += 5
+            m.content = f'[{lines[CUR]}]'
         elif is_contact or is_contact_link:
-            cur += 5
-            m.content = f'[Contact - {lines[cur]} - {lines[cur+3]}]'
+            CUR += 5
+            m.content = f'[Contact - {lines[CUR]} - {lines[CUR+3]}]'
         elif is_location_link:
-            cur += 5
-            m.content = f'[{lines[cur]} - {lines[cur+3]}]'
+            CUR += 5
+            m.content = f'[{lines[CUR]} - {lines[CUR+3]}]'
         elif is_call:
-            cur += 8
-            m.content = f'[Call - {lines[cur]}]'
+            CUR += 8
+            m.content = f'[Call - {lines[CUR]}]'
         elif is_poll:
-            m.content = f'[{lines[cur+5]} - {lines[cur+2]}]'
+            m.content = f'[{lines[CUR+5]} - {lines[CUR+2]}]'
         elif is_game:
-            m.content = f'[Game - {lines[cur+5]} - {lines[cur+11]}]'
+            m.content = f'[Game - {lines[CUR+5]} - {lines[CUR+11]}]'
 
     # Replace HTML line breaks
     if '<br>' in m.content:
@@ -241,13 +257,19 @@ def toTuple(self):
     if m.content == '</div>':
         m.content = '[Animated emoji]'
 
+    # Empty content (photo message): use the file name from the photo link's href instead
+    if m.content == "":
+        new_content = lines[CUR].replace('<a class="photo_wrap clearfix pull_left" href="',"")
+        new_content = new_content.replace('">',"")
+        m.content = new_content
+
     messages.append(m)
-    cur += 1
+    CUR += 1
 
 # Write CSV
-with open(output_file, 'w+', encoding='UTF-8', newline='') as f:
+with open(OUTPUT_FILE, 'w+', encoding='UTF-8', newline='') as f:
     writer = csv.writer(f)
     writer.writerow(list(messages[0].__dict__.keys()))
-    writer.writerows([m.toTuple() for m in messages])
+    writer.writerows([m.to_tuple() for m in messages])
 
-print(f'Written to \'{output_file}\' in {(time()-t0):.2f}s.')
\ No newline at end of file
+print(f'Written to \'{OUTPUT_FILE}\' in {(time()-t0):.2f}s.')