Skip to content

Commit

Permalink
Initial import
Browse files Browse the repository at this point in the history
  • Loading branch information
petewarden committed Oct 4, 2010
0 parents commit fee57b3
Show file tree
Hide file tree
Showing 13 changed files with 2,701,530 additions and 0 deletions.
106 changes: 106 additions & 0 deletions cliargs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Geodict
# Copyright (C) 2010 Pete Warden <pete@petewarden.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import sys, string

def print_usage_and_exit(cliargs):
    """Print a usage summary for every argument in `cliargs`, then exit.

    `cliargs` maps each long argument name to a dict that must contain the
    keys 'short' (the short name), 'type' and 'description'.  One line is
    printed per argument, in the form

        -s/--long <value> : description (required)

    The "<value> " placeholder is only shown when the type is 'required' or
    'optional', and the " (required)" suffix only when it is 'required'.

    This function never returns: it ends by raising SystemExit through
    sys.exit() with no argument, i.e. the same exit status as before.
    """
    print("Usage:")

    for longname, arginfo in cliargs.items():
        argtype = arginfo['type']
        required = (argtype == 'required')
        takes_value = required or (argtype == 'optional')

        output = '-' + arginfo['short'] + '/--' + longname + ' '
        if takes_value:
            output += '<value> '
        output += ': ' + arginfo['description']
        if required:
            output += ' (required)'

        # A single parenthesised argument prints identically on Python 2
        # and Python 3.
        print(output)

    # Use sys.exit() instead of the bare exit() helper: exit() is installed
    # by the `site` module for interactive sessions and does not exist when
    # the interpreter is started with -S.
    sys.exit()

def get_options(cliargs):
    """Parse sys.argv[1:] according to the `cliargs` specification.

    `cliargs` maps each long option name to a dict with the keys 'short'
    (short name), 'type' ('required', 'optional' or 'switch'),
    'description' and, for 'optional' entries, 'default'.

    Accepted forms are `--long value`, `--long=value`, `-s value` and
    `-s=value`.  Option names are matched case-insensitively (the name is
    lower-cased before lookup); values and unnamed arguments are kept
    exactly as typed.  A 'switch' never consumes a value and is stored as
    True when present.

    Returns a dict holding one entry per option in `cliargs` (absent
    'optional' entries get their default, absent switches get False) plus
    the key 'unnamed', a list of the arguments that did not start with '-'.

    For an unknown option, a missing value or a missing required option, a
    message is printed and print_usage_and_exit() terminates the program.

    Raises ValueError when `cliargs` itself is malformed: an 'optional'
    entry without a 'default', or an entry with an unrecognised 'type'.
    """
    options = {'unnamed': []}

    # An explicit iterator lets an option pull its value from the next
    # command-line argument without index bookkeeping.
    pending = iter(sys.argv[1:])
    for rawarg in pending:
        # Split on the first '=' only, so values may themselves contain '='.
        rawname, has_inline_value, inline_value = rawarg.partition('=')
        # Only the option name is case-folded; lower-casing the whole
        # argument would corrupt values such as file paths.
        namepart = rawname.lower()

        if namepart.startswith('--'):
            longname = namepart[2:]
        elif namepart.startswith('-'):
            shortname = namepart[1:]
            # Falls through unchanged (and is then reported as unknown)
            # when no entry declares this short name.
            longname = shortname
            for name, info in cliargs.items():
                if shortname == info['short']:
                    longname = name
                    break
        else:
            longname = 'unnamed'

        if longname == 'unnamed':
            # Keep the argument exactly as the user typed it.
            options['unnamed'].append(rawarg)
            continue

        if longname not in cliargs:
            print("Unknown argument '" + longname + "'")
            print_usage_and_exit(cliargs)

        argtype = cliargs[longname]['type']
        if argtype == 'switch':
            value = True
        elif has_inline_value:
            value = inline_value
        else:
            try:
                # Consumes the following argument as this option's value.
                value = next(pending)
            except StopIteration:
                print("Missing value after '" + longname + "'")
                print_usage_and_exit(cliargs)

        options[longname] = value

    # Fill in everything the command line did not mention.
    for longname, arginfo in cliargs.items():
        if longname in options:
            continue

        argtype = arginfo['type']
        if argtype == 'required':
            print("Missing required value for '" + longname + "'")
            print_usage_and_exit(cliargs)
        elif argtype == 'optional':
            if 'default' not in arginfo:
                raise ValueError('Missing default value for ' + longname)
            options[longname] = arginfo['default']
        elif argtype == 'switch':
            options[longname] = False
        else:
            raise ValueError(
                'Unknown type "' + argtype + '" for ' + longname)

    return options
96 changes: 96 additions & 0 deletions geodict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python

# Geodict
# Copyright (C) 2010 Pete Warden <pete@petewarden.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import csv, os, os.path, MySQLdb, sys, json, csv
import geodict_lib, cliargs

# Command-line specification consumed by cliargs.get_options(): each key is
# a long option name; 'short' is its one-letter alias, 'type' says whether
# it takes a value, and 'default' is used when the option is absent.
args = {
    'input': {
        'short': 'i',
        'type': 'optional',
        'description': 'The name of the input file to scan for locations. If none is set, will read from STDIN',
        'default': '-'
    },
    'output': {
        'short': 'o',
        'type': 'optional',
        'description': 'The name of the file to write the location data to. If none is set, will write to STDOUT',
        'default': '-'
    },
    'format': {
        'short': 'f',
        'type': 'optional',
        'description': 'The format to use to output information about any locations found. By default it will write out location names separated by newlines, but specifying "json" will give more detailed information',
        'default': 'text'
    }
}

options = cliargs.get_options(args)

input = options['input']
output = options['output']
format = options['format']


def _fail(message):
    """Write `message` to stderr and stop with a non-zero exit status."""
    sys.stderr.write(message + "\n")
    sys.exit(1)


def _matched_text(source_text, location):
    """Return the slice of `source_text` covered by `location`'s tokens.

    The span runs from the first token's 'start_index' to the last token's
    'end_index', inclusive.
    """
    found_tokens = location['found_tokens']
    start_index = found_tokens[0]['start_index']
    end_index = found_tokens[-1]['end_index']
    return source_text[start_index:(end_index + 1)]


# '-' selects the standard stream.  Compare by value: `is` tests object
# identity, which is not guaranteed for strings that came from argv.
if input == '-':
    input_handle = sys.stdin
else:
    try:
        input_handle = open(input, 'rb')
    except IOError:
        _fail("Couldn't open file '" + input + "'")

try:
    text = input_handle.read()
finally:
    # Never close the interpreter's own stdin.
    if input_handle is not sys.stdin:
        input_handle.close()

if output == '-':
    output_handle = sys.stdout
else:
    try:
        output_handle = open(output, 'wb')
    except IOError:
        _fail("Couldn't write to file '" + output + "'")

try:
    locations = geodict_lib.find_locations_in_text(text)

    format_name = format.lower()
    if format_name == 'json':
        output_handle.write(json.dumps(locations))
    elif format_name == 'text':
        # One matched location name per line.
        output_handle.write(
            ''.join(_matched_text(text, location) + "\n"
                    for location in locations))
    elif format_name == 'csv':
        writer = csv.writer(output_handle)
        writer.writerow(['location', 'type', 'lat', 'lon'])
        for location in locations:
            # Type and coordinates are taken from the first token only.
            first_token = location['found_tokens'][0]
            writer.writerow([
                _matched_text(text, location),
                first_token['type'].lower(),
                first_token['lat'],
                first_token['lon'],
            ])
    else:
        _fail("Unknown output format '" + format + "'")
finally:
    # Flush and release a real output file; leave stdout open.
    if output_handle is not sys.stdout:
        output_handle.close()

38 changes: 38 additions & 0 deletions geodict_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Geodict
# Copyright (C) 2010 Pete Warden <pete@petewarden.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Folder holding the source data that gets loaded into your database
source_folder = './source_data/'

# MySQL login: user name and password
user, password = 'root', ''

# Database server address and port number
# NOTE(review): port 0 presumably means "use the default port" - confirm
# against the code that opens the connection.
host, port = 'localhost', 0

# Name of the database to create
database = 'geodict'

# Upper limit on the number of words in any name
word_max = 3

# Words that provide evidence that what follows them is a location
location_words = dict.fromkeys(('at', 'in'), True)
Loading

0 comments on commit fee57b3

Please sign in to comment.