-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
283 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,20 @@ | ||
# unsong-prince | ||
|
||
## Usage | ||
|
||
``` | ||
python unsong-spider.py | ||
``` | ||
|
||
|
||
``` | ||
python unsong-html-binder.py | ||
prince output/unsong.html -o output/unsong.pdf | ||
``` | ||
|
||
### EPUB | ||
|
||
``` | ||
python unsong-epub-binder.py | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
* | ||
!.gitignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
* | ||
!.gitignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
#!/usr/bin/python | ||
import glob
import os
import zipfile
from xml.sax.saxutils import escape, quoteattr

from bs4 import BeautifulSoup
|
||
# META-INF/container.xml tells the reading system where the package
# (OPF) file lives inside the archive.
CONTAINER_XML = '''<container version="1.0"
xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/Content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>'''

# The package file, written to OEBPS/Content.opf. It lists every file in
# the book (manifest) and the reading order (spine).
index_tpl = '''<package version="2.0"
unique-identifier="bookid"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns="http://www.idpf.org/2007/opf">
<metadata>
<dc:title>Unsong</dc:title>
<dc:creator>Scott Alexander</dc:creator>
<dc:publisher>Scott Alexander</dc:publisher>
<dc:date>2016</dc:date>
<dc:language>en</dc:language>
<dc:identifier id="bookid">http://unsongbook.com/</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
%(manifest)s
</manifest>
<spine toc="ncx">
%(spine)s
</spine>
</package>'''

# The navigation file, written to OEBPS/toc.ncx.
# dtb:uid repeats the dc:identifier of the package file above; validators
# report a mismatch between the two as an error.
toc_tpl = '''<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid"
content="http://unsongbook.com/"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>Unsong</text>
</docTitle>
<navMap>
%(navmap)s
</navMap>
</ncx>
'''


def manifest_item(number, href):
    """Return the OPF manifest ``<item>`` for chapter *number* stored at *href*."""
    return '<item id="file_%s" href=%s media-type="application/xhtml+xml"/>' % (
        number, quoteattr(href))


def spine_item(number):
    """Return the OPF spine ``<itemref>`` pointing at chapter *number*."""
    return '<itemref idref="file_%s" />' % number


def nav_point(number, title, href):
    """Return the NCX ``<navPoint>`` for one chapter.

    *title* is plain text; it is XML-escaped here so that characters such
    as ``&`` or ``<`` in a chapter title cannot corrupt toc.ncx.
    """
    return '''<navPoint id="navpoint-%s" playOrder="%s">
<navLabel>
<text>%s</text>
</navLabel>
<content src=%s/>
</navPoint>
''' % (number, number, escape(title), quoteattr(href))


def chapter_to_xhtml(path):
    """Convert one scraped chapter file into an XHTML document.

    Returns a ``(title, document)`` pair: the text of the chapter's first
    ``<h2>`` and the UTF-8 encoded, prettified markup.
    """
    # Binary mode: BeautifulSoup does the decoding itself (from_encoding).
    with open(path, "rb") as source:
        soup = BeautifulSoup(source, "lxml", from_encoding="UTF-8")
    soup.html['xmlns'] = 'http://www.w3.org/1999/xhtml'
    h2 = soup.h2
    # NOTE(review): presumably dropped because the numeric ids the spider
    # writes are not valid XML ids -- confirm.
    del h2['id']
    # get_text() also copes with headings that contain nested markup,
    # where .string would be None.
    chapter_title = h2.get_text()
    head = soup.new_tag("head")
    title = soup.new_tag("title")
    title.append(chapter_title)
    head.append(title)
    soup.html.insert(0, head)
    for font in soup.find_all('font'):
        font.unwrap()
    return chapter_title, soup.prettify().encode("UTF-8")


def build_epub(chapters_glob='chapters/*.html', output='output/unsong.epub'):
    """Bundle every chapter matching *chapters_glob* into an EPUB at *output*.

    Chapters are added in sorted filename order and numbered from 1.
    """
    manifest = []
    spine = []
    navmap = []

    # The context manager guarantees the archive's central directory is
    # written even if a chapter fails to convert.
    with zipfile.ZipFile(output, 'w', zipfile.ZIP_DEFLATED) as epub:
        # The first file must be named "mimetype" and must be stored
        # uncompressed.
        epub.writestr("mimetype", "application/epub+zip", zipfile.ZIP_STORED)
        epub.writestr("META-INF/container.xml", CONTAINER_XML)

        for i, chapter in enumerate(sorted(glob.glob(chapters_glob))):
            basename = os.path.basename(chapter)
            chapter_title, document = chapter_to_xhtml(chapter)
            epub.writestr('OEBPS/' + basename, document)

            manifest.append(manifest_item(i + 1, basename))
            spine.append(spine_item(i + 1))
            navmap.append(nav_point(i + 1, chapter_title, basename))

        # Write the toc
        epub.writestr('OEBPS/toc.ncx', (toc_tpl % {
            'navmap': "".join(navmap),
        }).encode("UTF-8"))

        # Finally, write the index
        epub.writestr('OEBPS/Content.opf', (index_tpl % {
            'manifest': "".join(manifest),
            'spine': "".join(spine),
        }).encode("UTF-8"))


if __name__ == '__main__':
    build_epub()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
<!doctype html> | ||
<html> | ||
<head> | ||
<meta charset="UTF-8" /> | ||
<title>Unsong</title> | ||
<style> | ||
body { | ||
font-size: 12pt; | ||
font-family: "Lora", "Garamond Premier Pro", "Adobe Garamond Pro", "Garamond"; | ||
text-align: justify; | ||
} | ||
h1, h2, h3 { | ||
text-align: center; | ||
font-family: "Open Sans", Lumos; | ||
text-transform: uppercase; | ||
} | ||
p { | ||
margin: 0.6em 0; | ||
} | ||
p.credits { | ||
text-align: center; | ||
font-family: "Open Sans", Lumos; | ||
} | ||
a { | ||
text-decoration-style: dashed; | ||
color: #000; | ||
} | ||
hr, div.sep { | ||
border-top: 1px solid #888888; | ||
line-height: 0; | ||
margin: 1em auto; | ||
width: 10em; | ||
} | ||
#toc { | ||
list-style-type: none; | ||
padding: 0; | ||
width: 25em; | ||
margin: 0 auto; | ||
} | ||
#toc a { | ||
text-decoration: none; | ||
} | ||
|
||
@media print { | ||
body { | ||
display: block; | ||
counter-reset: page 1 | ||
} | ||
@page { | ||
size: 6in 9in; | ||
} | ||
@page:left {margin: 0.875in 0.75in 0.875in 0.625in;} | ||
@page:right {margin: 0.875in 0.625in 0.875in 0.75in;} | ||
|
||
@page:first { | ||
margin-top: 2in; | ||
} | ||
@page chapter { | ||
@top { | ||
content: string(chapter-title); | ||
font-family: "Open Sans", "Lumos"; | ||
text-transform: uppercase; | ||
} | ||
@bottom { | ||
content: counter(page); | ||
font-family: "Open Sans", "Lumos"; | ||
} | ||
} | ||
@page chapter:right { | ||
@top { | ||
content: string(chapter-title); | ||
} | ||
} | ||
article { | ||
page: chapter; | ||
} | ||
article h2 { | ||
string-set: chapter-title content(); | ||
} | ||
h2 { | ||
page-break-before: always; | ||
} | ||
#toc a::after { content: leader(".") target-counter(attr(href), page); } | ||
} | ||
</style> | ||
<script> | ||
// Build a table of contents from every chapter heading (<h2>) once the
// document has loaded, and place it, under a "Contents" heading, in front
// of the first chapter.
window.onload = function() {
  var chapterTitles = document.getElementsByTagName('h2');
  // No chapters: nothing to index, and nothing to anchor the list to.
  if (chapterTitles.length === 0) {
    return;
  }

  var ol = document.createElement('ol');
  ol.id = 'toc';
  for (var i = 0; i < chapterTitles.length; i++) {
    var title = chapterTitles[i];
    var li = document.createElement('li');
    var a = document.createElement('a');
    a.setAttribute('href', '#' + title.id);
    // textContent covers headings whose first child is an element, where
    // firstChild.nodeValue would be null.
    a.appendChild(document.createTextNode(title.textContent));
    li.appendChild(a);
    ol.appendChild(li);
  }

  // Insert relative to the first chapter's container rather than assuming
  // it is a direct child of <body>. The "Contents" heading is created only
  // after the loop so it does not show up in the list itself.
  var firstChapter = chapterTitles[0].parentNode;
  var header = document.createElement('h2');
  header.appendChild(document.createTextNode('Contents'));
  firstChapter.parentNode.insertBefore(header, firstChapter);
  firstChapter.parentNode.insertBefore(ol, firstChapter);
};
</script> | ||
</head> | ||
<body> | ||
<h1><img src="http://i.imgur.com/d9LvKMc.png" alt="Unsong" /></h1> | ||
<p class="credits">by Scott Alexander</p> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/usr/bin/python | ||
import sys, os, glob | ||
|
||
def bind_html(header='unsong-header.html', chapters='chapters/*.html',
              footer='unsong-footer.html', output='output/unsong.html'):
    """Concatenate header, chapters and footer into one HTML file.

    Every file matching the glob pattern *chapters* is wrapped in an
    ``<article>`` element and appended in sorted filename order between
    the contents of *header* and *footer*. The result is written to
    *output*.

    Files are copied as raw bytes, so the result does not depend on the
    locale's default text encoding, and every handle is closed before the
    function returns.
    """
    with open(output, 'wb') as out:
        with open(header, 'rb') as part:
            out.write(part.read())

        for chapter in sorted(glob.glob(chapters)):
            out.write(b'<article>')
            with open(chapter, 'rb') as part:
                out.write(part.read())
            out.write(b'</article>')

        with open(footer, 'rb') as part:
            out.write(part.read())


if __name__ == '__main__':
    bind_html()
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/usr/bin/python | ||
import sys, time, os, random, re, urllib2 | ||
|
||
# Patterns that pick the pieces of a chapter out of a page. DOTALL lets
# each capture span several lines.
titlere = re.compile(r'<h1.*?>(.*?)</h1>', re.DOTALL)
contentre = re.compile(r'<div class="pjgm-postcontent">(.*?)<div class="sharedaddy', re.DOTALL)
nextre = re.compile(r'<div class="pjgm-navnex"><a href="(.*?)" rel="next"', re.DOTALL)
START_URL = "http://unsongbook.com/prologue-2/"


def parse_chapter(html):
    """Extract ``(title, content, next_url)`` from one chapter page.

    *next_url* is ``None`` when the page has no "next" link, i.e. on the
    last chapter.

    Raises:
        ValueError: if the page has no recognisable title or content
            (for example an error page), instead of failing later with an
            attribute error on ``None``.
    """
    title = titlere.search(html)
    content = contentre.search(html)
    if title is None or content is None:
        raise ValueError("page does not look like a chapter: "
                         "title or content not found")
    nextsearch = nextre.search(html)
    next_url = nextsearch.group(1) if nextsearch else None
    return title.group(1), content.group(1), next_url


def crawl(url=START_URL):
    """Download every chapter, following "next" links starting at *url*.

    Chapter number ``i`` (counting from 1) is written to
    ``chapters/NNN.html`` as an ``<h2 id="i">`` heading followed by the
    post content. The URL and the output path of each chapter are printed
    as progress.
    """
    i = 1
    while url:
        print(url)
        request = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            response.close()

        title, content, url = parse_chapter(html)
        path = "chapters/%03d.html" % i
        with open(path, 'w') as f:
            f.write('<h2 id="' + str(i) + '">' + title + '</h2>')
            f.write(content)
        print(path)

        i += 1
        # Random pause between requests; pointless after the last page.
        if url:
            time.sleep(1 + 3 * random.random())

    print('done')


if __name__ == '__main__':
    crawl()