Skip to content

Commit

Permalink
first stab
Browse files Browse the repository at this point in the history
  • Loading branch information
bluej100 committed Nov 21, 2016
1 parent 9186add commit 44214fc
Show file tree
Hide file tree
Showing 9 changed files with 283 additions and 0 deletions.
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1 +1,20 @@
# unsong-prince

## Usage

```
python unsong-spider.py
```

### PDF

```
python unsong-html-binder.py
prince output/unsong.html -o output/unsong.pdf
```

### EPUB

```
python unsong-epub-binder.py
```
2 changes: 2 additions & 0 deletions chapters/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
2 changes: 2 additions & 0 deletions output/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
105 changes: 105 additions & 0 deletions unsong-epub-binder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/python
import os, zipfile, glob
from bs4 import BeautifulSoup

# Create the EPUB container (an ordinary zip archive); the rest of the
# script adds entries to it.
epub = zipfile.ZipFile('output/unsong.epub', mode='w')

# The "mimetype" entry is written first, before any other file.
epub.writestr("mimetype", "application/epub+zip")

# META-INF/container.xml tells the reader where the package (index) file
# lives -- here OEBPS/Content.opf, which in turn lists every other file.
container_xml = '''<container version="1.0"
xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/Content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>'''
epub.writestr("META-INF/container.xml", container_xml)

# The index (OPF package document) is another XML file. It is written to
# OEBPS/Content.opf, the path that META-INF/container.xml points at.
# The %(manifest)s and %(spine)s placeholders are filled in with the
# per-chapter <item> / <itemref> elements before the file is written.
index_tpl = '''<package version="2.0"
unique-identifier="bookid"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns="http://www.idpf.org/2007/opf">
<metadata>
<dc:title>Unsong</dc:title>
<dc:creator>Scott Alexander</dc:creator>
<dc:publisher>Scott Alexander</dc:publisher>
<dc:date>2016</dc:date>
<dc:language>en</dc:language>
<dc:identifier id="bookid">http://unsongbook.com/</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
%(manifest)s
</manifest>
<spine toc="ncx">
%(spine)s
</spine>
</package>'''

# OEBPS/toc.ncx: the navigation (table of contents) document.
# %(navmap)s is replaced with one <navPoint> per chapter.
#
# dtb:uid has to carry the same value as the package's unique identifier
# (the <dc:identifier id="bookid"> in Content.opf); EPUB validators flag a
# mismatch between the two, so it is set to the book URL used there.
toc_tpl = '''<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid"
content="http://unsongbook.com/"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>Unsong</text>
</docTitle>
<navMap>
%(navmap)s
</navMap>
</ncx>
'''

from xml.sax.saxutils import escape

# Per-chapter XML fragments, collected in lists and joined once at the end.
manifest_items = []
spine_items = []
nav_points = []

for i, chapter in enumerate(sorted(glob.glob('chapters/*.html'))):
    basename = os.path.basename(chapter)
    number = i + 1

    # Read the chapter as bytes so that from_encoding is what decides the
    # decoding, and close the handle as soon as it has been parsed.
    with open(chapter, 'rb') as chapter_file:
        soup = BeautifulSoup(chapter_file, "lxml", from_encoding="UTF-8")
    soup.html['xmlns'] = 'http://www.w3.org/1999/xhtml'

    h2 = soup.h2
    if h2 is None:
        # No heading in this chapter file: label it by its file name
        # rather than crashing.
        chapter_title = basename
    else:
        del h2['id']
        # get_text() also works when the heading contains nested markup,
        # where .string would be None.
        chapter_title = h2.get_text()

    # Give the document a <head><title> carrying the chapter title.
    head = soup.new_tag("head")
    title = soup.new_tag("title")
    title.append(chapter_title)
    head.append(title)
    soup.html.insert(0, head)

    # Drop <font> wrappers but keep their contents.
    for font in soup.find_all('font'):
        font.unwrap()
    epub.writestr('OEBPS/' + basename, soup.prettify().encode("UTF-8"))

    manifest_items.append(
        '<item id="file_%s" href="%s" media-type="application/xhtml+xml"/>' % (
            number, basename))
    spine_items.append('<itemref idref="file_%s" />' % number)
    # The title is plain text at this point, so it must be XML-escaped
    # before being placed inside the NCX document.
    nav_points.append('''<navPoint id="navpoint-%s" playOrder="%s">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>
</navPoint>
''' % (number, number, escape(chapter_title), basename))

manifest = "".join(manifest_items)
spine = "".join(spine_items)
navmap = "".join(nav_points)

# Write the toc (encoded once, at the output boundary)
epub.writestr('OEBPS/toc.ncx', (toc_tpl % {
    'navmap': navmap,
}).encode("UTF-8"))

# Finally, write the index
epub.writestr('OEBPS/Content.opf', (index_tpl % {
    'manifest': manifest,
    'spine': spine,
}).encode("UTF-8"))

# The archive's central directory is only written on close(); without it
# the resulting .epub may be incomplete.
epub.close()
2 changes: 2 additions & 0 deletions unsong-footer.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
</body>
</html>
110 changes: 110 additions & 0 deletions unsong-header.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
<!doctype html>
<html>
<head>
<meta charset="UTF-8" />
<title>Unsong</title>
<style>
/* Base typography for the whole book: justified 12pt serif body text. */
body {
font-size: 12pt;
font-family: "Lora", "Garamond Premier Pro", "Adobe Garamond Pro", "Garamond";
text-align: justify;
}
/* Headings: centered, uppercase, sans-serif. */
h1, h2, h3 {
text-align: center;
font-family: "Open Sans", Lumos;
text-transform: uppercase;
}
p {
margin: 0.6em 0;
}
/* Author line under the title image. */
p.credits {
text-align: center;
font-family: "Open Sans", Lumos;
}
a {
text-decoration-style: dashed;
color: #000;
}
/* Section separators: a short (10em), centered, thin grey rule. */
hr, div.sep {
border-top: 1px solid #888888;
line-height: 0;
margin: 1em auto;
width: 10em;
}
/* Table of contents: the <ol id="toc"> is created at load time by the
   script in this file; shown as an unnumbered, centered 25em column. */
#toc {
list-style-type: none;
padding: 0;
width: 25em;
margin: 0 auto;
}
#toc a {
text-decoration: none;
}

/* Paged-media rules, applied only when printing / rendering to PDF. */
@media print {
body {
display: block;
counter-reset: page 1
}
/* 6in x 9in pages. */
@page {
size: 6in 9in;
}
/* Mirrored margins (top right bottom left): the wider 0.75in margin is on
   the right of left-hand pages and on the left of right-hand pages. */
@page:left {margin: 0.875in 0.75in 0.875in 0.625in;}
@page:right {margin: 0.875in 0.625in 0.875in 0.75in;}

/* Extra top margin on the very first page. */
@page:first {
margin-top: 2in;
}
/* Named page used for chapter content (see "article" below): the top
   margin shows the current chapter title, the bottom margin the page
   number. */
@page chapter {
@top {
content: string(chapter-title);
font-family: "Open Sans", "Lumos";
text-transform: uppercase;
}
@bottom {
content: counter(page);
font-family: "Open Sans", "Lumos";
}
}
/* NOTE(review): this repeats the @top content already set for all chapter
   pages above -- presumably a placeholder for a distinct right-hand
   header; confirm whether it is still needed. */
@page chapter:right {
@top {
content: string(chapter-title);
}
}
/* Each chapter is wrapped in <article>; lay it out on "chapter" pages. */
article {
page: chapter;
}
/* Capture the chapter heading text into the "chapter-title" string that
   the @top margin box above displays. */
article h2 {
string-set: chapter-title content();
}
/* Every h2 starts on a new page. */
h2 {
page-break-before: always;
}
/* After each TOC link: a dotted leader followed by the page number of the
   element the link's href points to. */
#toc a::after { content: leader(".") target-counter(attr(href), page); }
}
</style>
<script>
// Build the table of contents once the document has loaded: an
// <ol id="toc"> with one link per chapter heading (<h2>), preceded by a
// "Contents" heading, inserted before the first chapter.
window.onload = function() {
  // Live collection of every <h2> currently in the document.
  var chapterTitles = document.getElementsByTagName('h2');
  if (chapterTitles.length === 0) {
    // No chapters: nothing to list, and nowhere to anchor the TOC.
    return;
  }

  // insertBefore() needs a direct child of <body>. Normally that is the
  // <article> wrapping the first heading, but walk up so that a heading
  // sitting directly in <body> (or nested deeper) also works.
  var anchor = chapterTitles[0];
  if (anchor.parentNode !== document.body) {
    anchor = anchor.parentNode;
    while (anchor.parentNode && anchor.parentNode !== document.body) {
      anchor = anchor.parentNode;
    }
  }

  var ol = document.createElement('ol');
  ol.id = 'toc';
  document.body.insertBefore(ol, anchor);

  for (var i = 0; i < chapterTitles.length; i++) {
    var title = chapterTitles[i];

    // Prefer the heading's full text so headings containing inline markup
    // are labelled correctly; fall back to the first child's value when
    // textContent is not available.
    var label = title.textContent;
    if (label === undefined || label === null) {
      label = title.firstChild ? title.firstChild.nodeValue : '';
    }

    var li = document.createElement('li');
    var a = document.createElement('a');
    a.setAttribute('href', '#' + title.id);
    a.appendChild(document.createTextNode(label));
    li.appendChild(a);
    ol.appendChild(li);
  }

  // Added only after the loop: chapterTitles is live, so creating this
  // <h2> earlier would make "Contents" show up as a TOC entry itself.
  var header = document.createElement('h2');
  header.appendChild(document.createTextNode('Contents'));
  document.body.insertBefore(header, ol);
};
</script>
</head>
<body>
<h1><img src="http://i.imgur.com/d9LvKMc.png" alt="Unsong" /></h1>
<p class="credits">by Scott Alexander</p>
12 changes: 12 additions & 0 deletions unsong-html-binder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/python
import sys, os, glob

# Assemble output/unsong.html: the shared header, then every chapter file
# (in sorted filename order) wrapped in <article>...</article>, then the
# shared footer. Every file is opened in a with-block so that it is closed
# -- and the output flushed to disk -- even if a read or write fails.
with open('output/unsong.html', 'w') as f:
    with open('unsong-header.html', 'r') as header:
        f.write(header.read())

    for chapter in sorted(glob.glob('chapters/*.html')):
        with open(chapter) as chapter_file:
            f.write('<article>')
            f.write(chapter_file.read())
            f.write('</article>')

    with open('unsong-footer.html', 'r') as footer:
        f.write(footer.read())
Binary file added unsong-logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
31 changes: 31 additions & 0 deletions unsong-spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/python
import sys, time, os, random, re, urllib2

# Patterns used to scrape each chapter page:
#   titlere   - contents of the first <h1> element (the chapter title)
#   contentre - everything between the post-content <div> and the
#               following "sharedaddy" <div>
#   nextre    - href of the rel="next" link to the following chapter
titlere = re.compile('<h1.*?>(.*?)</h1>', re.DOTALL)
contentre = re.compile('<div class="pjgm-postcontent">(.*?)<div class="sharedaddy', re.DOTALL)
nextre = re.compile('<div class="pjgm-navnex"><a href="(.*?)" rel="next"', re.DOTALL)
url = "http://unsongbook.com/prologue-2/"
i = 1

# Follow the "next" links from the first page, saving each chapter as
# chapters/NNN.html, until a page has no next link.
while url:
    print(url)
    request = urllib2.Request(url, headers={'User-Agent' : "Magic Browser"})
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    titlematch = titlere.search(html)
    contentmatch = contentre.search(html)
    if titlematch is None or contentmatch is None:
        # Fail with the offending URL instead of an opaque AttributeError
        # on None when the page layout is not what the patterns expect.
        raise RuntimeError("could not find chapter title/content in %s" % url)
    title = titlematch.group(1)
    content = contentmatch.group(1)

    path = "chapters/%03d.html" % i
    with open(path, 'w') as f:
        # The numeric id is what the generated table of contents links to.
        f.write('<h2 id="' + str(i) + '">' + title + '</h2>')
        f.write(content)
    print(path)

    nextsearch = nextre.search(html)
    url = nextsearch.group(1) if nextsearch else None
    i += 1

    if url:
        # Pause 1-4 seconds between requests; there is nothing to wait for
        # after the last page.
        time.sleep(1 + 3 * random.random())

print('done')

0 comments on commit 44214fc

Please sign in to comment.