Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

set photo filename as content #6

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 103 additions & 81 deletions telegram-export-converter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
#!/usr/bin/env python

"""Convert telegram export html data to csv file"""

from html import unescape
from time import time
from sys import argv
#from sys import argv
import csv
import os
import sys
import re

class Message:
"""create message object"""
def __init__(self):
self.message_id = None
self.timestamp = None
Expand All @@ -14,13 +20,20 @@ def __init__(self):
self.reply = None
self.content = None

def toTuple(self):
if self.message_id: self.message_id = self.message_id.replace('message', '')
if self.timestamp: self.timestamp = ' '.join(self.timestamp.split()[:2])
if self.sender: self.sender = unescape(self.sender.strip())
if self.fwd: self.fwd = unescape(self.fwd.strip())
if self.reply: self.reply = self.reply.replace('message', '')
if self.content: self.content = unescape(self.content.strip())
def to_tuple(self):
"""create tuple"""
if self.message_id:
self.message_id = self.message_id.replace('message', '')
if self.timestamp:
self.timestamp = ' '.join(self.timestamp.split()[:2])
if self.sender:
self.sender = unescape(self.sender.strip())
if self.fwd:
self.fwd = unescape(self.fwd.strip())
if self.reply:
self.reply = self.reply.replace('message', '')
if self.content:
self.content = unescape(self.content.strip())

return (self.message_id, self.timestamp, self.sender, self.fwd, self.reply, self.content)

Expand All @@ -30,21 +43,23 @@ def toTuple(self):
message_id_joined_pattern = re.compile('<div class="message default clearfix joined" id="([^"]+)')
timestamp_pattern = re.compile('<div class="pull_right date details" title="([^"]+)')

fwd_pattern = re.compile('<div class="userpic userpic\d+" style="width: 42px; height: 42px">')
fwd_pattern = re.compile(r'<div class="userpic userpic\d+" style="width: 42px; height: 42px">')
fwd_reply_pattern = re.compile('<div class="reply_to details">')
fwd_sender_pattern = re.compile('([^<]+)<span class="date details')
same_fwd_media_pattern = re.compile('<div class="media_wrap clearfix">')
same_fwd_text_pattern = re.compile('<div class="text">')
reply_pattern = re.compile('In reply to <a href="(?:messages\d*.html)?#go_to_([^"]+)"')
reply_pattern = re.compile(r'In reply to <a href="(?:messages\d*.html)?#go_to_([^"]+)"')

photo_pattern = re.compile('<div class="media clearfix pull_left media_photo">')
video_pattern = re.compile('<div class="media clearfix pull_left media_video">')
voice_pattern = re.compile('<div class="media clearfix pull_left media_voice_message">')
audio_pattern = re.compile('<div class="media clearfix pull_left media_audio_file">')
file_pattern = re.compile('<div class="media clearfix pull_left media_file">')
contact_pattern = re.compile('<div class="media clearfix pull_left media_contact">')
contact_link_pattern = re.compile('<a class="media clearfix pull_left block_link media_contact" href="[^"]+"')
location_link_pattern = re.compile('<a class="media clearfix pull_left block_link media_location" href="[^"]+"')
contact_link_pattern = re.compile(
'<a class="media clearfix pull_left block_link media_contact" href="[^"]+"')
location_link_pattern = re.compile(
'<a class="media clearfix pull_left block_link media_location" href="[^"]+"')
call_pattern = re.compile('<div class="media clearfix pull_left media_call( success)?">')
poll_pattern = re.compile('<div class="media_poll">')
game_pattern = re.compile('<a class="media clearfix pull_left block_link media_game" href="[^"]+">')
Expand All @@ -58,17 +73,18 @@ def toTuple(self):

print("Starting...")

# Scans current directory for message<n>.html Telegram chat export files
# Scans current directory for message<MSG_NUMBER>.html Telegram chat export files
message_files = []
n = 1
MSG_NUMBER = 1
for file in os.listdir():
if file.startswith('messages') and file.endswith('.html'):
message_files.append('messages' + (str(n) if n > 1 else '') + '.html')
n += 1
message_files.append('messages' + (str(MSG_NUMBER) if MSG_NUMBER > 1 else '') + '.html')
MSG_NUMBER += 1

if not message_files:
print('No message.html files found. Are you sure the script is in the right directory? Exiting...')
exit()
print('No message.html files found. Are you sure the script '
'is in the right directory? Exiting...')
sys.exit(1)

print(f'Loading all {len(message_files)} message files...')

Expand All @@ -84,64 +100,64 @@ def toTuple(self):

# Sets output filename as the chat's name
chat_name = lines[15]
output_file = 'Telegram-' + ''.join(c if c.isalnum() else '_' for c in chat_name) + '.csv'
OUTPUT_FILE = 'Telegram-' + ''.join(c if c.isalnum() else '_' for c in chat_name) + '.csv'

################################################################################

print(f'Processing \'{chat_name}\'...')

messages = []
cur = 0
last_sender = None
last_fwd_sender = None
CUR = 0
LAST_SENDER = None
LAST_FWD_SENDER = None

while cur < len(lines):
while CUR < len(lines):
# Skip lines that aren't the start of a message
if not lines[cur].startswith('<div class='):
cur += 1
if not lines[CUR].startswith('<div class='):
CUR += 1
continue

# Check if it's a new sender's message
new = True
message_id = re.findall(message_id_new_pattern, lines[cur])
NEW = True
message_id = re.findall(message_id_new_pattern, lines[CUR])
if not message_id:
new = False
message_id = re.findall(message_id_joined_pattern, lines[cur])
NEW = False
message_id = re.findall(message_id_joined_pattern, lines[CUR])

# Skip lines that aren't the start of a message
if not message_id:
cur += 1
CUR += 1
continue

m = Message()
m.message_id = message_id[0]

if new: # New sender
if NEW: # New sender
# If it's from a Deleted Account, no initial is
# shown as avatar, so there's a line less to skip
if lines[cur+4] == '</div>':
cur += 8
if lines[CUR+4] == '</div>':
CUR += 8
else:
cur += 9
CUR += 9

timestamp = re.findall(timestamp_pattern, lines[cur])
timestamp = re.findall(timestamp_pattern, lines[CUR])
m.timestamp = timestamp[0]

cur += 4
m.sender = lines[cur]
last_sender = m.sender
CUR += 4
m.sender = lines[CUR]
LAST_SENDER = m.sender

cur += 3
m.content = lines[cur]
CUR += 3
m.content = lines[CUR]
else: # Same sender as the message before
cur += 2
timestamp = re.findall(timestamp_pattern, lines[cur])
CUR += 2
timestamp = re.findall(timestamp_pattern, lines[CUR])
m.timestamp = timestamp[0]

m.sender = last_sender
m.sender = LAST_SENDER

cur += 4
m.content = lines[cur]
CUR += 4
m.content = lines[CUR]

is_fwd = re.match(fwd_pattern, m.content)
is_same_fwd_text = re.match(same_fwd_text_pattern, m.content)
Expand All @@ -152,43 +168,43 @@ def toTuple(self):
if is_fwd:
# If it's from a Deleted Account, no initial is
# shown as avatar, so there's a line less to skip
if lines[cur+2] == '</div>':
cur += 7
if lines[CUR+2] == '</div>':
CUR += 7
else:
cur += 8
fwd_sender = re.findall(fwd_sender_pattern, lines[cur])
CUR += 8

fwd_sender = re.findall(fwd_sender_pattern, lines[CUR])
m.fwd = fwd_sender[0]
last_fwd_sender = m.fwd
LAST_FWD_SENDER = m.fwd

cur += 2
is_fwd_reply = re.findall(fwd_reply_pattern, lines[cur])
CUR += 2
is_fwd_reply = re.findall(fwd_reply_pattern, lines[CUR])
if is_fwd_reply:
cur += 4
CUR += 4
else:
cur += 1
CUR += 1

m.content = lines[cur]
m.content = lines[CUR]
elif is_fwd_reply_same_fwd_text:
m.fwd = last_fwd_sender
m.fwd = LAST_FWD_SENDER

cur += 4
m.content = lines[cur]
CUR += 4
m.content = lines[CUR]
elif is_same_fwd_text:
m.fwd = last_fwd_sender
m.fwd = LAST_FWD_SENDER

cur += 1
m.content = lines[cur]
CUR += 1
m.content = lines[CUR]
elif is_same_fwd_media:
m.fwd = last_fwd_sender
m.fwd = LAST_FWD_SENDER

cur += 6
m.content = f'[{lines[cur]}]'
CUR += 6
m.content = f'[{lines[CUR]}]'
elif is_reply:
m.reply = is_reply[0]

cur += 3
m.content = lines[cur]
CUR += 3
m.content = lines[CUR]

if m.content.startswith('<'):
is_photo = re.match(photo_pattern, m.content)
Expand All @@ -205,21 +221,21 @@ def toTuple(self):

# Write type of media as content
if any([is_photo, is_video, is_voice, is_audio, is_file]):
cur += 5
m.content = f'[{lines[cur]}]'
CUR += 5
m.content = f'[{lines[CUR]}]'
elif is_contact or is_contact_link:
cur += 5
m.content = f'[Contact - {lines[cur]} - {lines[cur+3]}]'
CUR += 5
m.content = f'[Contact - {lines[CUR]} - {lines[CUR+3]}]'
elif is_location_link:
cur += 5
m.content = f'[{lines[cur]} - {lines[cur+3]}]'
CUR += 5
m.content = f'[{lines[CUR]} - {lines[CUR+3]}]'
elif is_call:
cur += 8
m.content = f'[Call - {lines[cur]}]'
CUR += 8
m.content = f'[Call - {lines[CUR]}]'
elif is_poll:
m.content = f'[{lines[cur+5]} - {lines[cur+2]}]'
m.content = f'[{lines[CUR+5]} - {lines[CUR+2]}]'
elif is_game:
m.content = f'[Game - {lines[cur+5]} - {lines[cur+11]}]'
m.content = f'[Game - {lines[CUR+5]} - {lines[CUR+11]}]'

# Replace HTML line breaks
if '<br>' in m.content:
Expand All @@ -241,13 +257,19 @@ def toTuple(self):
if m.content == '</div>':
m.content = '[Animated emoji]'

# write file name as content
if m.content == "":
new_content = lines[CUR].replace('<a class="photo_wrap clearfix pull_left" href="',"")
new_content = new_content.replace('">',"")
m.content = new_content

messages.append(m)
cur += 1
CUR += 1

# Write CSV
with open(output_file, 'w+', encoding='UTF-8', newline='') as f:
with open(OUTPUT_FILE, 'w+', encoding='UTF-8', newline='') as f:
writer = csv.writer(f)
writer.writerow(list(messages[0].__dict__.keys()))
writer.writerows([m.toTuple() for m in messages])
writer.writerows([m.to_tuple() for m in messages])

print(f'Written to \'{output_file}\' in {(time()-t0):.2f}s.')
print(f'Written to \'{OUTPUT_FILE}\' in {(time()-t0):.2f}s.')