Skip to content

Commit

Permalink
Fixed setup issues and added second-source support
Browse files Browse the repository at this point in the history
OK, so Nate's code wasn't working for me, so here's a bunch of changes I
made to get it working for @latourliturgies:

* Merged in some changes that Jacob Harris made since Nate forked his
project (HTML entity decoding, blank-tweet protection).

* Added some configuration that ebook.rb needs but that wasn't specified
in twitter_init.rb.

* Added comments to parts of ebook.rb that had been confusing to me.

* Added some debugging output to the tweet generation in ebook.rb so it's
clear what is being scraped from which account.

I'm pretty sure these changes are helpful, but I'm not sure whether I'm
contributing them in a helpful way, since this is my first time trying
to contribute to someone else's code on GitHub. Apologies in advance!
  • Loading branch information
peteyreplies committed Dec 29, 2013
1 parent 45a2dd3 commit 97b6078
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 25 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ gem "twitter"
gem "typhoeus"
gem "iron_worker_ng"
gem 'punkt-segmenter'
gem 'htmlentities'
51 changes: 27 additions & 24 deletions ebook.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
require 'punkt-segmenter'
require 'twitter_init'
require 'markov'
require 'htmlentities'

source_tweets = []
#source_tweets_two = []
source_tweets = ['@First_Source'] #the @username of the first source in quotes
source_tweets_two = ['@Second_Source'] #the @username of the second source in quotes

$rand_limit ||= 10
$markov_index ||= 2
Expand All @@ -27,9 +28,10 @@ def random_closing_punctuation
end

def filtered_tweets(tweets)
html_decoder = HTMLEntities.new
include_urls = $include_urls || params["include_urls"]
include_replies = $include_replies || params["include_replies"]
source_tweets = tweets.map {|t| t.text.gsub(/\b(RT|MT) .+/, '') }
source_tweets = tweets.map {|t| html_decoder.decode(t.text).gsub(/\b(RT|MT) .+/, '') }

if !include_urls
source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ }
Expand All @@ -54,16 +56,14 @@ def filtered_tweets(tweets)
else
# Fetch a thousand tweets
begin
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :exclude_replies => false, :include_rts => false)
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :include_rts => false)
max_id = user_tweets.last.id
source_tweets += filtered_tweets(user_tweets)



# Twitter only returns up to 3200 of a user timeline, includes retweets.
17.times do
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => false, :include_rts => false)
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}"
12.times do
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :include_rts => false)
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + "(First Source)"
break if user_tweets.last.nil?
max_id = user_tweets.last.id
source_tweets += filtered_tweets(user_tweets)
Expand All @@ -75,13 +75,14 @@ def filtered_tweets(tweets)
source_tweets += filtered_tweets(user_tweets)
12.times do
user_tweets = Twitter.user_timeline($source_account_two, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => false, :include_rts => false)
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}"
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + "(Second Source)"
break if user_tweets.last.nil?
max_id = user_tweets.last.id
source_tweets += filtered_tweets(user_tweets)
end

rescue
rescue => ex
puts ex.message
end

puts "#{source_tweets.length} tweets found"
Expand All @@ -91,12 +92,13 @@ def filtered_tweets(tweets)
end

markov = MarkovChainer.new($markov_index)
#markov_two = MarkovChainer.new($markov_index)
markov_two = MarkovChainer.new($markov_index)

tokenizer = Punkt::SentenceTokenizer.new(source_tweets.join(" ")) # init with corpus of all sentences
#tokenizer_two = Punkt::SentenceTokenizer.new(source_tweets_two.join(" ")) # init with corpus of all sentences
tokenizer_two = Punkt::SentenceTokenizer.new(source_tweets_two.join(" ")) # init with corpus of all sentences

source_tweets.each do |twt|
next if twt.nil? || twt == ''
sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text)

# sentences = text.split(/[.:;?!]/)
Expand All @@ -114,13 +116,14 @@ def filtered_tweets(tweets)
end
end

#source_tweets_two.each do |twt|
# sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text)
# sentences.each do |sentence|
# next if sentence =~ /@/
# markov_two.add_sentence(sentence)
# end
#end
source_tweets_two.each do |twt|
next if twt.nil? || twt == ''
sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text)
sentences.each do |sentence|
next if sentence =~ /@/
markov_two.add_sentence(sentence)
end
end

tweet = nil

Expand All @@ -130,10 +133,10 @@ def filtered_tweets(tweets)
tweet_letters = tweet.gsub(/\P{Word}/, '')
next if source_tweets.any? {|t| t.gsub(/\P{Word}/, '') =~ /#{tweet_letters}/ }

#if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\p{Space}\w+$/
# puts "Losing last word randomly"
# tweet.gsub(/\p{Space}\p{Word}+.$/, '') # randomly losing the last word sometimes like horse_ebooks
#end
# if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\p{Space}\w+$/
# puts "Losing last word randomly"
# tweet.gsub(/\p{Space}\p{Word}+.$/, '') # randomly losing the last word sometimes like horse_ebooks
# end

if tweet.length < 40 && rand(10) == 0
puts "Short tweet. Adding another sentence randomly"
Expand Down
1 change: 1 addition & 0 deletions ebook.worker
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ file "markov.rb"

gem 'twitter'
gem 'punkt-segmenter'
gem 'htmlentities'
3 changes: 2 additions & 1 deletion twitter_init.rb.sample
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ Twitter.configure do |config|
config.oauth_token_secret = 'KEY'
end

$source_account = 'your_regular_account'
$source_account = '@First_Source' #the @username of the first source in quotes
$source_account_two = '@Second_Source' #the @username of the second source in quotes
$rand_limit = 10 # run 1 out of every $rand_limit times roughly

0 comments on commit 97b6078

Please sign in to comment.