From 97b6078622f5fc9ee9907b08ea5ce51db2f62c55 Mon Sep 17 00:00:00 2001 From: Chris Peterson Date: Sun, 29 Dec 2013 00:41:25 -0500 Subject: [PATCH] fixed some stuff ok, so nate's code wasn't working for me, so here's a bunch of stuff i did to make it work for @latourliturgies * merged in some changes that jacob harris made since nate forked his project (htmlentities, blank tweet protection) * there was some stuff that ebook.rb needs that wasn't specified in twitter_init.rb so i added it * commented some stuff in ebook.rb that had been confusing to me * added some debugging code to the tweet generation in ebook.rb so it's clear what is being scraped from which account i'm pretty sure these changes are helpful, but i'm not sure if i'm contributing them in a helpful way, since this is my first time trying to contribute to someone else's code on github. apologies in advance! --- Gemfile | 1 + ebook.rb | 51 ++++++++++++++++++++++-------------------- ebook.worker | 1 + twitter_init.rb.sample | 3 ++- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/Gemfile b/Gemfile index 8c89053..b9658f6 100644 --- a/Gemfile +++ b/Gemfile @@ -6,3 +6,4 @@ gem "twitter" gem "typhoeus" gem "iron_worker_ng" gem 'punkt-segmenter' +gem 'htmlentities' diff --git a/ebook.rb b/ebook.rb index 8b99be1..bd779a1 100644 --- a/ebook.rb +++ b/ebook.rb @@ -5,9 +5,10 @@ require 'punkt-segmenter' require 'twitter_init' require 'markov' +require 'htmlentities' -source_tweets = [] -#source_tweets_two = [] +source_tweets = ['@First_Source'] #the @username of the first source in quotes +source_tweets_two = ['@Second_Source'] #the @username of the second source in quotes $rand_limit ||= 10 $markov_index ||= 2 @@ -27,9 +28,10 @@ def random_closing_punctuation end def filtered_tweets(tweets) + html_decoder = HTMLEntities.new include_urls = $include_urls || params["include_urls"] include_replies = $include_replies || params["include_replies"] - source_tweets = tweets.map {|t| t.text.gsub(/\b(RT|MT) .+/, '') } + source_tweets = tweets.map {|t| html_decoder.decode(t.text).gsub(/\b(RT|MT) .+/, '') } if !include_urls source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ } @@ -54,16 +56,14 @@ def filtered_tweets(tweets) else # Fetch a thousand tweets begin - user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :exclude_replies => false, :include_rts => false) + user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :include_rts => false) max_id = user_tweets.last.id source_tweets += filtered_tweets(user_tweets) - - # Twitter only returns up to 3200 of a user timeline, includes retweets. - 17.times do - user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => false, :include_rts => false) - puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + 12.times do + user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :include_rts => false) + puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + "(First Source)" break if user_tweets.last.nil? max_id = user_tweets.last.id source_tweets += filtered_tweets(user_tweets) @@ -75,13 +75,14 @@ def filtered_tweets(tweets) source_tweets += filtered_tweets(user_tweets) 12.times do user_tweets = Twitter.user_timeline($source_account_two, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => false, :include_rts => false) - puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + "(Second Source)" break if user_tweets.last.nil? max_id = user_tweets.last.id source_tweets += filtered_tweets(user_tweets) end - rescue + rescue => ex + puts ex.message end puts "#{source_tweets.length} tweets found" @@ -91,12 +92,13 @@ def filtered_tweets(tweets) end markov = MarkovChainer.new($markov_index) - #markov_two = MarkovChainer.new($markov_index) + markov_two = MarkovChainer.new($markov_index) tokenizer = Punkt::SentenceTokenizer.new(source_tweets.join(" ")) # init with corpus of all sentences - #tokenizer_two = Punkt::SentenceTokenizer.new(source_tweets_two.join(" ")) # init with corpus of all sentences + tokenizer_two = Punkt::SentenceTokenizer.new(source_tweets_two.join(" ")) # init with corpus of all sentences source_tweets.each do |twt| + next if twt.nil? || twt == '' sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text) # sentences = text.split(/[.:;?!]/) @@ -114,13 +116,14 @@ def filtered_tweets(tweets) end end - #source_tweets_two.each do |twt| - # sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text) - # sentences.each do |sentence| - # next if sentence =~ /@/ - # markov_two.add_sentence(sentence) - # end - #end + source_tweets_two.each do |twt| + next if twt.nil? || twt == '' + sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text) + sentences.each do |sentence| + next if sentence =~ /@/ + markov_two.add_sentence(sentence) + end + end tweet = nil @@ -130,10 +133,10 @@ def filtered_tweets(tweets) tweet_letters = tweet.gsub(/\P{Word}/, '') next if source_tweets.any? {|t| t.gsub(/\P{Word}/, '') =~ /#{tweet_letters}/ } - #if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\p{Space}\w+$/ - # puts "Losing last word randomly" - # tweet.gsub(/\p{Space}\p{Word}+.$/, '') # randomly losing the last word sometimes like horse_ebooks - #end + # if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\p{Space}\w+$/ + # puts "Losing last word randomly" + # tweet.gsub(/\p{Space}\p{Word}+.$/, '') # randomly losing the last word sometimes like horse_ebooks + # end if tweet.length < 40 && rand(10) == 0 puts "Short tweet. Adding another sentence randomly" diff --git a/ebook.worker b/ebook.worker index e567c15..f0209eb 100644 --- a/ebook.worker +++ b/ebook.worker @@ -8,3 +8,4 @@ file "markov.rb" gem 'twitter' gem 'punkt-segmenter' +gem 'htmlentities' diff --git a/twitter_init.rb.sample b/twitter_init.rb.sample index be3e22d..eb7d31c 100644 --- a/twitter_init.rb.sample +++ b/twitter_init.rb.sample @@ -5,5 +5,6 @@ Twitter.configure do |config| config.oauth_token_secret = 'KEY' end -$source_account = 'your_regular_account' +$source_account = '@First_Source' #the @username of the first source in quotes +$source_account_two = '@Second_Source' #the @username of the second source in quotes $rand_limit = 10 # run 1 out of every $rand_limit times roughly \ No newline at end of file