From 934ddfc6248df77796b523ef47cbe43b0fc82b24 Mon Sep 17 00:00:00 2001
From: kdavis-mozilla <kdavis@mozilla.com>
Date: Sat, 29 Dec 2018 06:35:23 +0100
Subject: [PATCH] Fixed #38 (common.py should remove control codes) and Fixed
 #39 (common.py should remove byte order marks)

---
 src/corporacreator/preprocessors/common.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/corporacreator/preprocessors/common.py b/src/corporacreator/preprocessors/common.py
index 3a25241..1f29a6d 100644
--- a/src/corporacreator/preprocessors/common.py
+++ b/src/corporacreator/preprocessors/common.py
@@ -1,3 +1,5 @@
+import unicodedata
+
 from urllib.parse import unquote
 from html.parser import HTMLParser
 
@@ -39,6 +41,24 @@ def _strip_tags(html):
     return s.get_data()
 
 
+def _strip_string(sentence):
+    """Drop every character whose Unicode general category is not whitelisted.
+
+    Letters (L*), numbers (N*), marks (M*), punctuation (P*), symbols (S*) and
+    space separators (Zs) are kept; everything else, e.g. control codes (Cc)
+    and format characters such as the byte order mark (Cf), is removed.
+    Category list: http://www.fileformat.info/info/unicode/category/index.htm
+    """
+    # 'LC' is carried over from the original whitelist unchanged.
+    keep = frozenset((
+        'LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Nd', 'Nl', 'No', 'Mc', 'Me', 'Mn',
+        'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zs',
+    ))
+    # Filter character by character, then glue the survivors back together.
+    kept_chars = filter(lambda ch: unicodedata.category(ch) in keep, sentence)
+    return u''.join(kept_chars)
+
+
 def common(sentence):
     """Cleans up the passed sentence in a language independent manner, removing or reformatting invalid data.
 
@@ -53,5 +73,7 @@ def common(sentence):
     sentence = unquote(sentence)
     # Remove any HTML tags
     sentence = _strip_tags(sentence)
+    # Remove non-printable characters
+    sentence = _strip_string(sentence)
     # TODO: Clean up data in a language independent manner
     return sentence