-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdbgen.rb
executable file
·48 lines (37 loc) · 1.35 KB
/
dbgen.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env ruby
require 'rubygems'
require 'mechanize'
require 'logger'
require 'open-uri'
require 'yaml'
# dbgen for google news
# most used:
# sports: http://news.google.com/?ned=us&topic=s
# world: http://news.google.com/?ned=us&topic=w
# entertainment: http://news.google.com/?ned=us&topic=e
# sci/tech: http://news.google.com/?ned=us&topic=t
# health: http://news.google.com/?ned=us&topic=m
# business: http://news.google.com/?ned=us&topic=b
# code from: http://zenmachine.wordpress.com/practical-text-classification-with-ruby/
# Category tag => Google News topic page. Each pair below is a tag and the
# letter that page uses as its `topic` query value.
topic_letters = [
  %w[sports s],
  %w[world w],
  %w[entertainment e],
  %w[scitech t],
  %w[health m],
  %w[business b]
]
tagSrc = topic_letters.each_with_object({}) do |(tag, letter), sources|
  sources[tag] = "http://news.google.com/?ned=us&topic=#{letter}"
end
# NOTE(review): nothing in the visible script appears to read this value.
limit = 10
# Fetch every topic page listed in tagSrc, reduce each snippet <div> to plain
# text, and append the resulting array as YAML to "<tag>.yaml".
#
# One agent serves all requests and logs to dbgen.log. `Mechanize` is the
# gem's top-level class; the legacy `WWW::Mechanize` name is no longer
# provided by current mechanize releases.
agent = Mechanize.new { |obj| obj.log = Logger.new('dbgen.log') }

# Matches an HTML tag or ONE character entity (&amp;, &#39;, &#x27;, ...).
# The entity branch ends at the first ';' so that ordinary text sitting
# between two entities is kept instead of being swallowed by a greedy match.
markup = /<\/?[^>]*>|&#?\w+;/

tagSrc.each do |tag, url|
  puts("Start processing: #{url} for tag: #{tag}")
  page = agent.get(url)
  # One entry per element matched by the snippet selector, markup removed.
  news = page.search("div[@class='snippet']").map { |n| n.to_s.gsub(markup, '') }
  # Opened in append mode, so earlier output in the same file is preserved.
  File.open("#{tag}.yaml", 'a+') { |f| f.puts news.to_yaml }
  puts("Done processing: #{url}")
end