Showing 20 changed files with 735 additions and 125 deletions.
**.gitignore**

```
@@ -1,7 +1,9 @@
*.log
*.pot
*.pyc
log*.txt
venv
tmp
local_settings.py
app/lib
tags
```
**.gitmodules** (new file)

```
@@ -0,0 +1,6 @@
[submodule "lib/pyphen"]
	path = lib/pyphen
	url = https://github.com/the-happy-hippo/pyphen.git
[submodule "lib/readability"]
	path = lib/readability
	url = https://github.com/the-happy-hippo/python-readability.git
```
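These submodules vendor forks of pyphen and python-readability under `lib/`, and the Python module below imports a `fixpath` helper before importing them. That helper is not part of this diff; the following is a hypothetical sketch of what it might do (only the name `fixpath` and the `lib/` layout come from the commit):

```python
# fixpath.py -- hypothetical sketch, not the actual file from this commit.
# Puts the vendored submodules under lib/ onto sys.path so that
# `import pyphen` and `from readability import readability` resolve.
import os
import sys

_LIB_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lib')

for _name in ('pyphen', 'readability'):
    _path = os.path.join(_LIB_DIR, _name)
    if _path not in sys.path:
        sys.path.insert(0, _path)
```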
Environment settings file:

```
@@ -1,3 +1,2 @@
APP_DEBUG=0
ALLOW_STREAMING=1
READ_API_TOKEN=112358
CURRENT_VERSION_ID=v0.2-beta
```
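The extractor module below consumes values like these through a `settings` object (`settings.max_word_len`, `settings.parsers['Readability']['token']`), but the actual `settings` module is not included in this diff. A minimal sketch of how such env-style values might be surfaced; every name not visible in the commit is an assumption:

```python
# Hypothetical settings shim; the real settings module is not in this diff.
import os

class _Settings(object):
    def __init__(self):
        # APP_DEBUG and CURRENT_VERSION_ID appear in the config file above;
        # max_word_len and the parser table are assumed from their use below.
        self.debug = os.environ.get('APP_DEBUG', '0') == '1'
        self.version_id = os.environ.get('CURRENT_VERSION_ID', 'dev')
        self.max_word_len = 20
        self.parsers = {
            'Readability': {
                'uri': 'https://www.readability.com/api/content/v1/parser',  # illustrative
                'token': os.environ.get('READ_API_TOKEN', ''),
            },
        }

settings = _Settings()
```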
A new Python module (`@@ -0,0 +1,235 @@`) with the text extraction pipeline: it fetches parsed article content from the Readability API, falls back to a local readability parse, and flattens the HTML to hyphen-wrapped plain text.

```python
# -*- coding: utf-8 -*-

""" Text extraction methods.
"""
import re
import json
import urllib
import urllib2
import logging

import lazygen
from settings import settings

import fixpath

from lxml import html, etree
from readability import readability

import pyphen
from guess_language import guessLanguage

#------------------------------------------------------------------------------

# App logger
log = logging.getLogger(__name__)

#------------------------------------------------------------------------------

_regex = {
    'paragraphs': re.compile(ur'\n\s*\n\s*|\n\s\s+', flags=re.UNICODE),
    'spaces': re.compile(ur'\s+', flags=re.UNICODE),
    'longdash': re.compile(ur'\-{2,}', flags=re.UNICODE),
}

XPATH_ALL_NODES = etree.XPath('//*')

#------------------------------------------------------------------------------

class CleanDocument:
    """ Readable document fetched from `source_url`.
    """

    def __init__(self, source_url, title=None, content=None, author=None):
        self.url = source_url
        self.title = title
        self.content = content
        self.author = author
        self._pyphen = None

    @property
    def source_url(self):
        return self.url

    def is_empty(self):
        return (not self.content)

    @classmethod
    def from_json(cls, json_object):
        """Create a document instance from JSON dictionary.
        """
        doc = cls(json_object['url'])

        for key, value in json_object.iteritems():
            setattr(doc, key, value)

        return doc

    def json_generator(self):
        """Return generator that produces JSON strings.
        """
        json_object = dict((k, v) for (k, v) in self.__dict__.iteritems()
                           if not k.startswith('_'))

        return lazygen.json_generator(json_object)

    def textify(self):
        """ Transform html content to plain text.
        """
        if not self.content:
            return

        assert isinstance(self.content, unicode)

        doc = html.fromstring(self.content)

        # Add padding so that text in adjacent tags wouldn't stick
        # together. E.g., "<p>Hello<br/>World!</p>" should look as
        # "Hello World!" and not as "HelloWorld!".
        for node in XPATH_ALL_NODES(doc):
            if node.tail:
                node.tail = node.tail + ' '
            else:
                node.tail = ' '

        txt = html.tostring(doc, method='text', encoding='unicode')

        # Little cleanup surgery
        paragraphs = _regex['paragraphs'].split(txt)
        pcleaned = []

        for parag in paragraphs:
            words = _regex['spaces'].split(parag)
            wclean = []

            for word in words:
                if word:  # it must have been stripped by split()
                    wclean.extend(self._clean_word(word, parag))

            pcleaned.append(' '.join(wclean))

        self.content = '\n'.join(pcleaned)

    def _clean_word(self, word, wordcorpus):
        outlist = []

        dash_separated = _regex['longdash'].split(word)

        if len(dash_separated) >= 2:
            for subword in dash_separated:
                outlist.extend(self._clean_word(subword, wordcorpus))
                outlist.append(u'\u2014')  # mdash
            return outlist[:-1]

        if len(word) <= settings.max_word_len:
            return [word]
        else:
            if not self._pyphen:
                lang = guessLanguage(wordcorpus)

                log.info('Language guessed: %s', lang)

                if lang == 'UNKNOWN': lang = 'en'  # fallback

                self._pyphen = pyphen.Pyphen(lang=lang)

            return [self._pyphen.multiwrap(word, settings.max_word_len)]

class Extractor:

    def __init__(self):

        # Params for working with Readability APIs
        rdd_parser = settings.parsers['Readability']

        self._rdd_api_url = rdd_parser['uri']
        self._rdd_api_key = rdd_parser['token']

    def extract(self, url):

        # Try getting Readability content first
        doc = self._get_from_rdd(url)

        # Parse locally if Readability content is empty
        if doc.is_empty():
            log.warn('Readability content is empty, running local parser.')
            self._update_content(doc)
        else:
            log.info('Returning Readability content.')

        return doc

    def _get_from_rdd(self, url):
        """Use Readability online API.
        """
        rdd_args = urllib.urlencode(dict(url=url, token=self._rdd_api_key))
        rdd_req = self._rdd_api_url + '?' + rdd_args

        log.info('Getting Readability content from %s', rdd_req)

        rdd_json = Extractor._get_raw_content(rdd_req, 'application/json')
        rdd_doc = CleanDocument.from_json(json.load(rdd_json))

        # convert html to text
        rdd_doc.textify()

        return rdd_doc

    def _update_content(self, doc):
        """Get readable content using local parser.
        """
        rawhtml = Extractor._get_raw_content(doc.source_url).read()

        rddoc = readability.Document(rawhtml)

        title, html_content = rddoc.short_title(), rddoc.summary()

        doc.title = doc.title or title
        doc.word_count = float('NaN')
        doc.content = html_content

        # convert html to text
        doc.textify()

    @staticmethod
    def _get_raw_content(url, mime=None, allowgzip=True):
        """ Get data from given url.
        Return file-like object so it can be fed to json.load().
        """
        req = urllib2.Request(url)

        if mime:
            req.add_header('Accept', mime)

        if allowgzip:
            req.add_header('Accept-Encoding', 'gzip,deflate')

        resp = urllib2.urlopen(req)

        meta = resp.info()

        mime_type = meta.gettype()

        log.debug('Opening mime type "%s"', mime_type)

        content_type = meta.getheader('content-type', '')
        content_encoding = meta.getheader('content-encoding', '')

        log.debug('Content type: "%s"', content_type)
        log.debug('Content encoding: "%s"', content_encoding)

        # we'll gunzip even if not allowgzip :)
        if content_encoding.lower() in ['gzip', 'deflate']:
            log.debug('Decompressing gzip/deflate response.')

            gunzip_gen = lazygen.gunzip_generator(resp)

            return lazygen.StringGenStream(gunzip_gen)

        return resp

# Global extractor instance
extractor = Extractor()
```
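A minimal usage sketch, assuming the module above is importable as `extract` (its filename is not visible in this view) and that its dependencies (lazygen, settings, fixpath, and the vendored submodules) resolve:

```python
# Python 2, matching the module above. `extract` is an assumed module name.
from extract import extractor

doc = extractor.extract('http://example.com/article.html')

if not doc.is_empty():
    print doc.title
    print doc.content   # plain text, already run through textify()
```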