This repository has been archived by the owner on Jul 26, 2023. It is now read-only.

Doesn't work anymore? #3

Open
rohanp opened this issue Jun 26, 2015 · 7 comments

@rohanp

rohanp commented Jun 26, 2015

I remember this used to work, but now when I try it on a normal-sized conversation (a few thousand msgs) it endlessly scrapes the first 2000 messages.

@dufferzafar

Yeah, I'm having the exact same problem. I've spent hours trying to debug it but can't really think of a reason why this is happening.

@dufferzafar

I found the solution while checking out Pull #4: FB has added a timestamp field now.
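
In short: besides the offset and limit fields, the request form data now needs a timestamp field, and it has to move backwards in time after every chunk. A minimal sketch of the idea (the helper name and structure here are mine, not code from this repo):

def page_params(thread_id, offset, limit, timestamp):
    # Build the pagination fields of the POST form data.
    # timestamp is 0 on the first request; after each response, set it to
    # the oldest fetched message's timestamp minus 1, so the next request
    # pages further back instead of re-fetching the newest chunk.
    key = "messages[user_ids][%s]" % str(thread_id)
    return {
        key + "[offset]": str(offset),
        key + "[limit]": str(limit),
        key + "[timestamp]": str(timestamp),  # the newly required field
    }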

@szmarci

szmarci commented Aug 13, 2015

Hi @dufferzafar, do you know how I can make it work with dumper.py? The pull you referenced only affects group_dumper.py. I'm trying to scrape a single conversation, but I keep getting only the last 2000 messages.

@dufferzafar

@szmarci the same trick is required to get it working with dumper.py.

I have a similar script here, in case you need it: https://github.com/dufferzafar/Python-Scripts/blob/master/Facebook/Conversations/messages.py

@tomer8007

@szmarci I've modified the original script to work with Facebook's new timestamp field.
Here it is:

import urllib2
import urllib
import gzip
import os
import json
import sys
import time
import StringIO

__author__ = "Raghav Sood"
__copyright__ = "Copyright 2014"
__credits__ = ["Raghav Sood"]
__license__ = "CC"
__version__ = "1.0"
__maintainer__ = "Raghav Sood"
__email__ = "raghavsood@appaholics.in"
__status__ = "Production"

if len(sys.argv) <= 1:
    print "Usage:\n     python dumper.py [conversation ID] [chunk_size (recommended: 2000)] [{optional} offset location (default: 0)]"
    print "Example conversation with Raghav Sood"
    print " python dumper.py 1075686392 2000 0"
    sys.exit()

error_timeout = 30 # Change this to alter error timeout (seconds)
general_timeout = 7 # Change this to alter waiting time after every request (seconds)
messages = []
talk = sys.argv[1]
offset = int(sys.argv[3]) if len(sys.argv) >= 4 else 0
timestamp = 0  # 0 on the first request; afterwards, the oldest fetched timestamp - 1
messages_data = "lolno"  # placeholder so the loop body runs at least once
end_mark = "\"payload\":{\"end_of_history\""  # FB includes this once the start of the conversation is reached
limit = int(sys.argv[2])
headers = {"origin": "https://www.facebook.com", 
"accept-encoding": "gzip,deflate", 
"accept-language": "en-US,en;q=0.8", 
"cookie": "your_cookie_value" # fill cookie value
"pragma": "no-cache", 
"user-agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36", 
"content-type": "application/x-www-form-urlencoded", 
"accept": "*/*", 
"cache-control": "no-cache", 
"referer": "https://www.facebook.com/messages/zuck"}

base_directory = "Messages/"
directory = base_directory + str(talk) + "/"
pretty_directory = base_directory + str(talk) + "/Pretty/"

try:
    os.makedirs(directory)
except OSError:
    pass # already exists

try:
    os.makedirs(pretty_directory)
except OSError:
    pass # already exists

while end_mark not in messages_data:

    data_text = {"messages[user_ids][" + str(talk) + "][offset]": str(offset), 
    "messages[user_ids][" + str(talk) + "][limit]": str(limit),
        "messages[user_ids]["+ str(talk) + "][timestamp]": str(timestamp),
    "client": "web_messenger", 
    "__user": "your_user_id",  # fill POST form values
    "__a": "your __a", 
    "__dyn": "your __dyn", 
    "__req": "your __req", 
    "fb_dtsg": "your_fb_dtsg", 
    "ttstamp": "your_ttstamp", 
    "__rev": "your __rev"}
    data = urllib.urlencode(data_text)
    url = "https://www.facebook.com/ajax/mercury/thread_info.php"

    print "Retrieving messages " + str(offset) + "-" + str(limit+offset) + " for conversation ID " + str(talk)
    req = urllib2.Request(url, data, headers)
    response = urllib2.urlopen(req)
    compressed = StringIO.StringIO(response.read())
    decompressedFile = gzip.GzipFile(fileobj=compressed)  # response body is gzip-compressed

    outfile = open(directory + str(offset) + "-" + str(limit+offset) + ".json", 'w')
    messages_data = decompressedFile.read()
    messages_data = messages_data[9:]  # strip FB's "for (;;);" anti-JSON-hijacking prefix
    json_data = json.loads(messages_data)
    if json_data is not None and json_data['payload'] is not None:
        try:
            messages = json_data['payload']['actions'] + messages
            timestamp = int(json_data['payload']['actions'][0]['timestamp']) - 1  # page back past the oldest message fetched so far
        except KeyError:
            pass # no more messages
    else:
        print "Error in retrieval. Retrying after " + str(error_timeout) + "s"
        print "Data Dump:"
        print json_data
        time.sleep(error_timeout)
        continue
    outfile.write(messages_data)
    outfile.close() 
    command = "python -mjson.tool " + directory + str(offset) + "-" + str(limit+offset) + ".json > " + pretty_directory + str(offset) + "-" + str(limit+offset) + ".pretty.json"
    os.system(command)
    offset = offset + limit
    time.sleep(general_timeout) 

finalfile = open(directory + "complete.json", 'wb')
finalfile.write(json.dumps(messages))
finalfile.close()
command = "python -mjson.tool " + directory + "complete.json > " + pretty_directory + "complete.pretty.json"
os.system(command)

@j6k4m8

j6k4m8 commented Jan 1, 2016

This is great — seems to be working for me!

@j6k4m8

j6k4m8 commented Jul 19, 2016

Included these changes in my fork, which also includes a stdin header parser to make your life easier, if you run this frequently like I do!
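
The rough idea, if you want to roll your own (a sketch under my own assumptions, not the fork's exact code): paste the raw "Name: value" request headers copied from your browser's dev tools into stdin, and get back the headers dict the script expects.

import sys

def parse_headers(raw):
    # Turn raw "Name: value" header lines into a dict the script can use.
    headers = {}
    for line in raw.splitlines():
        if ":" not in line:
            continue  # skip the request line and blank lines
        name, _, value = line.partition(":")
        headers[name.strip().lower()] = value.strip()
    return headers

if __name__ == "__main__":
    print parse_headers(sys.stdin.read())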
