first stab

bluej100 · Nov 21, 2016 · 44214fc · 44214fc
1 parent 9186add
commit 44214fc
Show file tree

Hide file tree

Showing 9 changed files with 283 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -1 +1,20 @@
 # unsong-prince
+
+## Usage
+
+```
+python unsong-spider.py
+```
+
+### PDF
+
+```
+python unsong-html-binder.py
+prince output/unsong.html -o output/unsong.pdf
+```
+
+### EPUB
+
+```
+python unsong-epub-binder.py
+```
diff --git a/chapters/.gitignore b/chapters/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/output/.gitignore b/output/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/unsong-epub-binder.py b/unsong-epub-binder.py
@@ -0,0 +1,105 @@
+#!/usr/bin/python
+import os, zipfile, glob
+from bs4 import BeautifulSoup
+
+epub = zipfile.ZipFile('output/unsong.epub', 'w')
+
+# The first file must be named "mimetype"
+epub.writestr("mimetype", "application/epub+zip")
+
+# We need an index file, that lists all other HTML files
+# This index file itself is referenced in the META_INF/container.xml
+# file
+epub.writestr("META-INF/container.xml", '''<container version="1.0"
+           xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+  <rootfiles>
+    <rootfile full-path="OEBPS/Content.opf" media-type="application/oebps-package+xml"/>
+  </rootfiles>
+</container>''');
+
+# The index file is another XML file, living per convention
+# in OEBPS/Content.xml
+index_tpl = '''<package version="2.0"
+  unique-identifier="bookid"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns="http://www.idpf.org/2007/opf">
+  <metadata>
+    <dc:title>Unsong</dc:title>
+    <dc:creator>Scott Alexander</dc:creator>
+    <dc:publisher>Scott Alexander</dc:publisher>
+    <dc:date>2016</dc:date>
+    <dc:language>en</dc:language>
+    <dc:identifier id="bookid">http://unsongbook.com/</dc:identifier>
+  </metadata>
+  <manifest>
+    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
+    %(manifest)s
+  </manifest>
+  <spine toc="ncx">
+    %(spine)s
+  </spine>
+</package>'''
+
+# OEBPS/toc.ncx
+toc_tpl = '''<?xml version='1.0' encoding='utf-8'?>
+<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
+                 "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
+<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
+  <head>
+    <meta name="dtb:uid"
+content="urn:uuid:77a19404-c4cc-43d9-9652-284184825e9e"/>
+    <meta name="dtb:depth" content="1"/>
+    <meta name="dtb:totalPageCount" content="0"/>
+    <meta name="dtb:maxPageNumber" content="0"/>
+  </head>
+  <docTitle>
+    <text>Unsong</text>
+  </docTitle>
+  <navMap>
+    %(navmap)s
+  </navMap>
+</ncx>
+'''
+
+manifest = ""
+spine = ""
+navmap = ""
+
+for i, chapter in enumerate(sorted(glob.glob('chapters/*.html'))):
+    basename = os.path.basename(chapter)
+
+    soup = BeautifulSoup(open(chapter), "lxml", from_encoding="UTF-8")
+    soup.html['xmlns'] = 'http://www.w3.org/1999/xhtml'
+    h2 = soup.h2
+    del h2['id']
+    chapter_title = h2.string.encode("UTF-8")
+    head = soup.new_tag("head")
+    title = soup.new_tag("title")
+    title.append(chapter_title)
+    head.append(title)
+    soup.html.insert(0, head)
+    for font in soup.find_all('font'):
+        font.unwrap()
+    epub.writestr('OEBPS/'+basename, soup.prettify().encode("UTF-8"))
+
+    manifest += '<item id="file_%s" href="%s" media-type="application/xhtml+xml"/>' % (
+                  i+1, basename)
+    spine += '<itemref idref="file_%s" />' % (i+1)
+    navmap += '''<navPoint id="navpoint-%s" playOrder="%s">
+      <navLabel>
+        <text>%s</text>
+      </navLabel>
+      <content src="%s"/>
+    </navPoint>
+''' % (i+1, i+1, chapter_title, basename)
+
+# Write the toc
+epub.writestr('OEBPS/toc.ncx', toc_tpl % {
+  'navmap': navmap,
+})
+
+# Finally, write the index
+epub.writestr('OEBPS/Content.opf', index_tpl % {
+  'manifest': manifest,
+  'spine': spine,
+})
diff --git a/unsong-footer.html b/unsong-footer.html
@@ -0,0 +1,2 @@
+</body>
+</html>
diff --git a/unsong-header.html b/unsong-header.html
@@ -0,0 +1,110 @@
+<!doctype html>
+<html>
+<head>
+<meta charset="UTF-8" />
+<title>Unsong</title>
+<style>
+body {
+	font-size: 12pt;
+	font-family: "Lora", "Garamond Premier Pro", "Adobe Garamond Pro", "Garamond";
+	text-align: justify;
+}
+h1, h2, h3 {
+	text-align: center;
+	font-family: "Open Sans", Lumos;
+	text-transform: uppercase;
+}
+p {
+  margin: 0.6em 0;
+}
+p.credits {
+	text-align: center;
+	font-family: "Open Sans", Lumos;
+}
+a {
+  text-decoration-style: dashed;
+	color: #000;
+}
+hr, div.sep {
+	border-top: 1px solid #888888;
+	line-height: 0;
+	margin: 1em auto;
+	width: 10em;
+}
+#toc {
+	list-style-type: none;
+	padding: 0;
+	width: 25em;
+	margin: 0 auto;
+}
+#toc a {
+	text-decoration: none;
+}
+
+@media print {
+	body {
+		display: block;
+		counter-reset: page 1
+	}
+	@page {
+		size: 6in 9in;
+	}
+	@page:left {margin: 0.875in 0.75in 0.875in 0.625in;}
+	@page:right {margin: 0.875in 0.625in 0.875in 0.75in;}
+
+	@page:first {
+		margin-top: 2in;
+	}
+	@page chapter {
+		@top {
+			content: string(chapter-title);
+			font-family: "Open Sans", "Lumos";
+			text-transform: uppercase;
+		}
+		@bottom {
+			content: counter(page);
+			font-family: "Open Sans", "Lumos";
+		}
+	}
+	@page chapter:right {
+		@top {
+			content: string(chapter-title);
+		}
+	}
+	article {
+		page: chapter;
+	}
+	article h2 {
+		string-set: chapter-title content();
+	}
+	h2 {
+		page-break-before: always;
+	}
+	#toc a::after { content: leader(".") target-counter(attr(href), page); }
+}
+</style>
+<script>
+// Build a table of contents once the document has loaded: an <ol id="toc">
+// with one link per <h2>, placed before the first chapter and preceded by
+// a "Contents" heading.
+window.onload = function() {
+	var chapterTitles = document.getElementsByTagName('h2');
+	var ol = document.createElement('ol');
+	ol.id = 'toc';
+	// Insert the list into <body> right before the parent element of the first <h2>.
+	document.body.insertBefore(ol, document.getElementsByTagName('h2')[0].parentNode);
+	for (var i = 0; i < chapterTitles.length; i++) {
+		var title = chapterTitles[i];
+		var li = document.createElement('li');
+		var a = document.createElement('a');
+		// The heading's id becomes the link's fragment target.
+		var chapter = title.id;
+		a.setAttribute('href', '#'+chapter);
+		// Link text is the value of the heading's first child node only.
+		a.appendChild(document.createTextNode(title.firstChild.nodeValue));
+		li.appendChild(a);
+		ol.appendChild(li);
+	}
+	// The "Contents" heading is created only after the loop has finished,
+	// so the loop above never produces a TOC entry for it.
+	var header = document.createElement('h2');
+	header.appendChild(document.createTextNode('Contents'));
+	document.body.insertBefore(header, ol);
+}
+</script>
+</head>
+<body>
+<h1><img src="http://i.imgur.com/d9LvKMc.png" alt="Unsong" /></h1>
+<p class="credits">by Scott Alexander</p>
diff --git a/unsong-html-binder.py b/unsong-html-binder.py
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+import sys, os, glob
+
+f = open('output/unsong.html', 'w')
+f.write(open('unsong-header.html', 'r').read())
+
+for chapter in sorted(glob.glob('chapters/*.html')):
+    f.write('<article>')
+    f.write(open(chapter).read())
+    f.write('</article>')
+
+f.write(open('unsong-footer.html', 'r').read())
diff --git a/unsong-logo.png b/unsong-logo.png
diff --git a/unsong-spider.py b/unsong-spider.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+import sys, time, os, random, re, urllib2
+
+titlere = re.compile('<h1.*?>(.*?)</h1>', re.DOTALL);
+contentre = re.compile('<div class="pjgm-postcontent">(.*?)<div class="sharedaddy', re.DOTALL);
+nextre = re.compile('<div class="pjgm-navnex"><a href="(.*?)" rel="next"', re.DOTALL);
+url = "http://unsongbook.com/prologue-2/"
+i = 1
+
+while url:
+    print url
+    request = urllib2.Request(url, headers={'User-Agent' : "Magic Browser"})
+    response = urllib2.urlopen(request)
+    html = response.read()
+
+    title = titlere.search(html).group(1)
+    content = contentre.search(html).group(1)
+    path = "chapters/%03d.html" % i
+    f = open(path, 'w')
+    f.write('<h2 id="'+str(i)+'">'+title+'</h2>')
+    f.write(content)
+    f.close()
+    print path
+
+    nextsearch = nextre.search(html)
+    url = nextsearch and nextsearch.group(1)
+    i += 1
+
+    time.sleep(1+3*random.random())
+
+print 'done'