diff --git a/Gemfile b/Gemfile index 8c89053..b9658f6 100644 --- a/Gemfile +++ b/Gemfile @@ -6,3 +6,4 @@ gem "twitter" gem "typhoeus" gem "iron_worker_ng" gem 'punkt-segmenter' +gem 'htmlentities' diff --git a/ebook.rb b/ebook.rb index 8b99be1..bd779a1 100644 --- a/ebook.rb +++ b/ebook.rb @@ -5,9 +5,10 @@ require 'punkt-segmenter' require 'twitter_init' require 'markov' +require 'htmlentities' -source_tweets = [] -#source_tweets_two = [] +source_tweets = ['@First_Source'] #the @username of the first source in quotes +source_tweets_two = ['@Second_Source'] #the @username of the second source in quotes $rand_limit ||= 10 $markov_index ||= 2 @@ -27,9 +28,10 @@ def random_closing_punctuation end def filtered_tweets(tweets) + html_decoder = HTMLEntities.new include_urls = $include_urls || params["include_urls"] include_replies = $include_replies || params["include_replies"] - source_tweets = tweets.map {|t| t.text.gsub(/\b(RT|MT) .+/, '') } + source_tweets = tweets.map {|t| html_decoder.decode(t.text).gsub(/\b(RT|MT) .+/, '') } if !include_urls source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ } @@ -54,16 +56,14 @@ def filtered_tweets(tweets) else # Fetch a thousand tweets begin - user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :exclude_replies => false, :include_rts => false) + user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :include_rts => false) max_id = user_tweets.last.id source_tweets += filtered_tweets(user_tweets) - - # Twitter only returns up to 3200 of a user timeline, includes retweets. - 17.times do - user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => false, :include_rts => false) - puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + 12.times do + user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :include_rts => false) + puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + "(First Source)" break if user_tweets.last.nil? max_id = user_tweets.last.id source_tweets += filtered_tweets(user_tweets) @@ -75,13 +75,14 @@ def filtered_tweets(tweets) source_tweets += filtered_tweets(user_tweets) 12.times do user_tweets = Twitter.user_timeline($source_account_two, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => false, :include_rts => false) - puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + "(Second Source)" break if user_tweets.last.nil? max_id = user_tweets.last.id source_tweets += filtered_tweets(user_tweets) end - rescue + rescue => ex + puts ex.message end puts "#{source_tweets.length} tweets found" @@ -91,12 +92,13 @@ def filtered_tweets(tweets) end markov = MarkovChainer.new($markov_index) - #markov_two = MarkovChainer.new($markov_index) + markov_two = MarkovChainer.new($markov_index) tokenizer = Punkt::SentenceTokenizer.new(source_tweets.join(" ")) # init with corpus of all sentences - #tokenizer_two = Punkt::SentenceTokenizer.new(source_tweets_two.join(" ")) # init with corpus of all sentences + tokenizer_two = Punkt::SentenceTokenizer.new(source_tweets_two.join(" ")) # init with corpus of all sentences source_tweets.each do |twt| + next if twt.nil? || twt == '' sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text) # sentences = text.split(/[.:;?!]/) @@ -114,13 +116,14 @@ def filtered_tweets(tweets) end end - #source_tweets_two.each do |twt| - # sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text) - # sentences.each do |sentence| - # next if sentence =~ /@/ - # markov_two.add_sentence(sentence) - # end - #end + source_tweets_two.each do |twt| + next if twt.nil? || twt == '' + sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text) + sentences.each do |sentence| + next if sentence =~ /@/ + markov_two.add_sentence(sentence) + end + end tweet = nil @@ -130,10 +133,10 @@ def filtered_tweets(tweets) tweet_letters = tweet.gsub(/\P{Word}/, '') next if source_tweets.any? {|t| t.gsub(/\P{Word}/, '') =~ /#{tweet_letters}/ } - #if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\p{Space}\w+$/ - # puts "Losing last word randomly" - # tweet.gsub(/\p{Space}\p{Word}+.$/, '') # randomly losing the last word sometimes like horse_ebooks - #end + # if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\p{Space}\w+$/ + # puts "Losing last word randomly" + # tweet.gsub(/\p{Space}\p{Word}+.$/, '') # randomly losing the last word sometimes like horse_ebooks + # end if tweet.length < 40 && rand(10) == 0 puts "Short tweet. Adding another sentence randomly" diff --git a/ebook.worker b/ebook.worker index e567c15..f0209eb 100644 --- a/ebook.worker +++ b/ebook.worker @@ -8,3 +8,4 @@ file "markov.rb" gem 'twitter' gem 'punkt-segmenter' +gem 'htmlentities' diff --git a/twitter_init.rb.sample b/twitter_init.rb.sample index be3e22d..eb7d31c 100644 --- a/twitter_init.rb.sample +++ b/twitter_init.rb.sample @@ -5,5 +5,6 @@ Twitter.configure do |config| config.oauth_token_secret = 'KEY' end -$source_account = 'your_regular_account' +$source_account = '@First_Source' #the @username of the first source in quotes +$source_account_two = '@Second_Source' #the @username of the second source in quotes $rand_limit = 10 # run 1 out of every $rand_limit times roughly \ No newline at end of file