forked from opendatakosovo/open-arbk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix_activities.rb
61 lines (50 loc) · 1.91 KB
/
fix_activities.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
require 'mongo'
'''
When processing the raw scraped data, the activity codes were converted
from String to Integers.
An inappropriate method was used for this conversion in which activity
codes with leading 0s (e.g. "0123") were interpreted as octal numbers
rather than decimal. In other words, we did Integer("0123") instead of
"0123".to_i which, in this case, gave us 83 instead of 123.
For all businesses with activity codes that have leading zeros in them,
this script will reset the activity codes list in the formatted subdocument
to the corrected value.
'''
# Connect to the MongoDB server on 127.0.0.1:27017 and select the 'arbk' database.
client = Mongo::Client.new(['127.0.0.1:27017'], database: 'arbk')
# Raise the driver's log level to FATAL.
Mongo::Logger.logger.level = ::Logger::FATAL
# Global handle on the businesses collection; read by fix and
# fix_formatted_activities.
$collection_businesses = client[:businesses]
# Walks every business document that has no +fixed+ field yet and repairs the
# ones whose raw activity keys contain at least one leading zero.
#
# For each affected document the whole raw activities list is handed to
# fix_formatted_activities, which rewrites +formatted.activities+ and flags the
# document as fixed. Documents without any zero-prefixed key are left alone
# (and stay unflagged). Documents lacking +raw+ / +raw.activities+, or holding
# an activity without a +key+, are treated as having no zero-prefixed key
# instead of aborting the run.
#
# Prints the number of repaired documents when done.
#
# @return [nil]
def fix
  fix_count = 0
  $collection_businesses.find({ 'fixed' => { '$exists' => false } }).each do |business|
    activities = (business['raw'] || {})['activities'] || []
    # One zero-prefixed key is enough to mark the document as affected;
    # any? stops scanning at the first hit.
    next unless activities.any? { |activity| activity['key'].to_s.start_with?('0') }

    regnum = (business['formatted'] || {})['registrationNum'].to_s
    fix_formatted_activities(business['_id'], regnum, activities)
    fix_count += 1
  end
  puts "Fixed #{fix_count} documents."
end
# Rebuilds the list of activity codes for one business from its raw activity
# keys and stores it in the document's +formatted.activities+ field, also
# setting +fixed+ to true.
#
# Keys are converted with String#to_i, which always reads base 10, so a key
# such as "0123" becomes 123. Activities whose key is empty or missing are
# skipped and reported with a warning on stdout.
#
# @param id [Object] value of the document's +_id+ field
# @param regnum [#to_s] registration number, used only in log output
# @param activities [Array<Hash>] raw activities, each read through its 'key' entry
# @return [Object] whatever the collection's update_one call returns
def fix_formatted_activities(id, regnum, activities)
  activity_codes = []
  activities.each do |activity|
    # to_s lets a missing (nil) key take the same warning path as an empty one.
    key = activity['key'].to_s
    if key.empty?
      puts "WARNING: Something's up with #{regnum}"
    else
      activity_codes << key.to_i
    end
  end
  puts "Fixing: #{regnum}"
  # Update/fix document
  $collection_businesses.update_one(
    { '_id' => id },
    { '$set' => { 'formatted.activities' => activity_codes, 'fixed' => true } }
  )
end
# Kick off the repair pass when the script is executed.
fix