-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgroup_publisher.py
109 lines (99 loc) · 4.13 KB
/
group_publisher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import sys
from collections import namedtuple, defaultdict
import re
from optparse import OptionParser
# One publisher record as read from the input file: (prefix, year, name).
Publisher = namedtuple(typename='Publisher', field_names='prefix year name')
def main():
    """Parse the command line, load the publisher tuples and print them grouped.

    Exactly one positional argument is expected: the path of the publisher
    tuples file.  ``-f/--format`` selects ``rdf`` or ``text`` output (default
    ``text``); ``-n/--sample-size`` is forwarded to ``read_publishers_set``.
    """
    parser = OptionParser(usage="usage: %prog [options] publisher_tuples")
    parser.add_option("-f", "--format",
                      action="store", type="choice",
                      choices=["rdf", "text"],
                      default="text", dest="format")
    parser.add_option("-n", "--sample-size",
                      action="store", type=int,
                      default=None, dest="sample_size")

    options, positional = parser.parse_args()
    if len(positional) != 1:
        # parser.error() prints the usage text and exits the process.
        parser.error("Must specify publisher_tuples file")

    with open(positional[0]) as tuples_file:
        publishers = read_publishers_set(tuples_file, options.sample_size)
    grouped = get_publishers_by_name(publishers)

    # optparse already restricts --format to these two choices.
    writer = {'rdf': output_rdf, 'text': output_text}.get(options.format)
    if writer is not None:
        writer(grouped)
def get_publishers_by_name(publishers):
    """Group publishers by their normalized sort name.

    Returns a ``defaultdict`` that maps each ``as_sortname`` key to the set
    of publishers sharing that key.
    """
    grouped = defaultdict(set)  # sets, because the input is full of duplicates
    for entry in publishers:
        grouped[as_sortname(entry)].add(entry)
    return grouped
# Lower-case tokens that are dropped from a publisher name before grouping.
ignore_words = ["publishing", "co", "pub", "inc", "ltd", "company"]
# Matches a single non-word character; names are tokenized by splitting on it.
NON_CHAR = re.compile(r'\W')


def as_sortname(publisher):
    """Return the grouping key for ``publisher``.

    The publisher's name is lower-cased, split on every non-word character,
    stripped of the tokens listed in ``ignore_words`` and glued back together
    without separators, so ``"D.C. Heath & Co"`` becomes ``"dcheath"``.
    """
    words = NON_CHAR.split(publisher.name.lower())
    kept = [word for word in words if word not in ignore_words]
    return "".join(kept)
def read_publishers_set(publishers_file, sample_size=None):
    """Return the set of ``Publisher`` tuples read from ``publishers_file``.

    The file holds one Python tuple literal per line, in the form:

        (u'0220', 1996, u'Penguin')
        (u'0669', 1990, u'D.C. Heath & Co')

    Blank lines are skipped.  Progress is written to stderr every 10,000
    tuples.  When ``sample_size`` is given (and non-zero), reading stops at
    the first 10,000-tuple boundary at or beyond ``sample_size``; the whole
    file is read if it ends before that boundary.

    Raises ``ValueError`` or ``SyntaxError`` if a line is not a literal, and
    ``TypeError`` if a tuple does not have exactly three fields.
    """
    # Function-scope import, like the ones in output_rdf.
    import ast

    publishers = set()
    # Counted explicitly (starting at 0) so that an empty file reports
    # "Processed: 0" instead of failing on an unbound loop variable, and so
    # that the figure is the number of tuples read rather than the last index.
    processed = 0
    for line in publishers_file:
        text = line.strip()
        if not text:
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        # NOTE: parsed with ast.literal_eval instead of eval(): the file is
        # documented to contain plain tuple literals, and literal_eval cannot
        # execute arbitrary code embedded in the data file.
        publishers.add(Publisher(*ast.literal_eval(text)))
        processed += 1
        if processed % 10000 == 0:
            sys.stderr.write("Processing: %d\n" % processed)
            # The sample limit is only checked at 10,000-tuple boundaries,
            # which is what "to nearest 10,000" in the contract refers to.
            if sample_size and processed >= sample_size:
                break
    sys.stderr.write("Processed: %d Unique Publishers: %d\n"
                     % (processed, len(publishers)))
    return publishers
def output_rdf(publishers_by_name):
    """Print every publisher group as RDF in N3 notation, in the form:

    pub:oreillymedia a foaf:Organization;
        foaf:name "O' Reilly Media Inc."@en,
            "O'Reilly Media"@en,
            "O'Reilly Media Inc"@en,
            "O'Reilly Media Inc."@en,
            "O'Reilly Media, Inc."@en,
            "O'ReillyMedia, Inc."@en,
            "O'reilly Media"@en .

    Each key of ``publishers_by_name`` becomes one ``foaf:Organization``
    resource whose ``foaf:name`` is set to the distinct names found in that
    group.  ``pymantic`` and ``rdflib`` are imported inside the function, so
    they are only needed when RDF output is requested.
    """
    import pymantic.RDF
    import rdflib

    # Each namespace URI is spelled out once and reused below.
    publishers_ns = 'http://gavin.carothers.name/work/archive/publishers/'
    foaf_ns = 'http://xmlns.com/foaf/0.1/'

    graph = rdflib.ConjunctiveGraph()

    @pymantic.RDF.register_class('foaf:Organization')
    class Organization(pymantic.RDF.Resource):
        namespaces = {'foaf': foaf_ns}

    for key, pubs in publishers_by_name.items():
        organization = Organization.new(graph, "%s%s" % (publishers_ns, key))
        organization['foaf:name'] = set(pub.name for pub in pubs)

    graph.bind('pub', rdflib.Namespace(publishers_ns))
    graph.bind('foaf', rdflib.Namespace(foaf_ns))
    # A parenthesised single-argument print behaves the same on Python 2
    # (print statement) and Python 3 (print function).
    print(graph.serialize(format='n3'))
def output_text(publishers_by_name):
    """Print each group as plain text, in the form:

    mcgrawhill 3
        Publisher(prefix=u'007', year=1989, name=u'McGraw-Hill')
        Publisher(prefix=u'000', year=1991, name=u'McGraw Hill')
        Publisher(prefix=u'000', year=1986, name=u'McGraw Hill')

    Groups are printed in sorted key order: one header line with the key and
    the number of publishers in the group, then one tab-indented line per
    publisher (the ``u`` string prefixes appear under Python 2 only).
    Publishers within a group are printed in the group's own iteration order.
    """
    for key in sorted(publishers_by_name):
        pubs = publishers_by_name[key]
        # Single pre-formatted argument: valid as both the Python 2 print
        # statement and the Python 3 print function, with identical output.
        print("%s %d" % (key, len(pubs)))
        for pub in pubs:
            # (pub,) keeps the namedtuple from being unpacked as the format
            # arguments.  No space follows the tab, matching what the
            # Python 2 statement `print "\t", pub` emits.
            print("\t%s" % (pub,))
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())