atlanhq · dimitern · Jun 30, 2019 · Jul 1, 2019
diff --git a/camelot/utils.py b/camelot/utils.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import division
 
+import re
 import os
 import sys
 import random
@@ -385,6 +386,33 @@ def merge_close_lines(ar, line_tol=2):
     return ret
 
 
+def text_strip(text, strip=""):
+    """Remove every occurrence of the characters in `strip` from `text`.
+
+    Parameters
+    ----------
+    text : str
+        Text to process and strip.
+    strip : str, optional (default: '')
+        Characters to remove from anywhere in `text`, not only its ends.
+
+    Returns
+    -------
+    stripped : str
+        `text` without those characters (unchanged if `strip` is empty).
+    """
+    if not strip:
+        return text
+
+    # `flags` must be passed by keyword: the fourth positional argument of
+    # re.sub is `count`, so re.UNICODE (== 32) would cap removals at 32.
+    stripped = re.sub(
+        r"[{}]".format("".join(map(re.escape, strip))),
+        "", text, flags=re.UNICODE,
+    )
+    return stripped
+
+
 # TODO: combine the following functions into a TextProcessor class which
 # applies corresponding transformations sequentially
 # (inspired from sklearn.pipeline.Pipeline)
@@ -428,10 +456,10 @@ def flag_font_size(textline, direction, strip_text=''):
                 fchars = [t[0] for t in chars]
                 if ''.join(fchars).strip():
                     flist.append(''.join(fchars))
-        fstring = ''.join(flist).strip(strip_text)
+        fstring = ''.join(flist)
     else:
-        fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
-    return fstring
+        fstring = ''.join([t.get_text() for t in textline])
+    return text_strip(fstring, strip_text)
 
 
 def split_textline(table, textline, direction, flag_size=False, strip_text=''):
@@ -515,7 +543,7 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
                 flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
         else:
             gchars = [t[2].get_text() for t in chars]
-            grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
+            grouped_chars.append((key[0], key[1], text_strip(''.join(gchars), strip_text)))
     return grouped_chars
 
 
@@ -599,7 +627,15 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
         if flag_size:
             return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
         else:
-            return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
+
+            return (
+                [(
+                    r_idx,
+                    c_idx,
+                    text_strip(t.get_text(), strip_text),
+                )],
+                error,
+            )
 
 
 def compute_accuracy(error_weights):

diff --git a/tests/data.py b/tests/data.py
@@ -313,30 +313,30 @@
 ]
 
 data_stream_strip_text = [
-    ["V i n s   a u   Ve r r e", ""],
-    ["Les Blancs", "12.5CL"],
-    ["A.O.P Côtes du Rhône", ""],
-    ["Domaine de la Guicharde «  Autour de la chapelle » 2016", "8 €"],
-    ["A.O.P Vacqueyras", ""],
-    ["Domaine de Montvac  « Melodine » 2016", "10 €"],
-    ["A.O.P Châteauneuf du Pape", ""],
-    ["Domaine de Beaurenard 2017", "13 €"],
-    ["A.O.P Côteaux du Languedoc", ""],
-    ["Villa Tempora « Un temps pour elle » 2014", "9 €"],
-    ["A.O.P Côtes de Provence", ""],
-    ["Château Grand Boise 2017", "9 €"],
-    ["Les Rosés", "12,5 CL"],
-    ["A.O.P Côtes du Rhône", ""],
-    ["Domaine de la Florane « A fleur de Pampre » 2016", "8 €"],
-    ["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"],
-    ["A.O.P Vacqueyras", ""],
-    ["Domaine de Montvac 2017", "9 €"],
-    ["A.O.P Languedoc", ""],
-    ["Domaine de Joncas « Nébla » 2015", "8 €"],
-    ["Villa Tempora « L’arroseur arrosé » 2015", "9 €"],
-    ["A.O.P Côtes de Provence", ""],
-    ["Château Grand Boise « Sainte Victoire » 2017", "9 €"],
-    ["Château Léoube 2016", "10 €"]
+    ["VinsauVerre", ""],
+    ["LesBlancs", "12.5CL"],
+    ["A.O.PCôtesduRhône", ""],
+    ["DomainedelaGuicharde«Autourdelachapelle»2016", "8€"],
+    ["A.O.PVacqueyras", ""],
+    ["DomainedeMontvac«Melodine»2016", "10€"],
+    ["A.O.PChâteauneufduPape", ""],
+    ["DomainedeBeaurenard2017", "13€"],
+    ["A.O.PCôteauxduLanguedoc", ""],
+    ["VillaTempora«Untempspourelle»2014", "9€"],
+    ["A.O.PCôtesdeProvence", ""],
+    ["ChâteauGrandBoise2017", "9€"],
+    ["LesRosés", "125CL"],
+    ["A.O.PCôtesduRhône", ""],
+    ["DomainedelaFlorane«AfleurdePampre»2016", "8€"],
+    ["FamilleCoulon(DomaineBeaurenard)Biotifulfox2017", "8€"],
+    ["A.O.PVacqueyras", ""],
+    ["DomainedeMontvac2017", "9€"],
+    ["A.O.PLanguedoc", ""],
+    ["DomainedeJoncas«Nébla»2015", "8€"],
+    ["VillaTempora«L’arroseurarrosé»2015", "9€"],
+    ["A.O.PCôtesdeProvence", ""],
+    ["ChâteauGrandBoise«SainteVictoire»2017", "9€"],
+    ["ChâteauLéoube2016", "10€"]
 ]
 
 data_stream_edge_tol = [

diff --git a/tests/test_common.py b/tests/test_common.py
@@ -115,7 +115,7 @@ def test_stream_strip_text():
     df = pd.DataFrame(data_stream_strip_text)
 
     filename = os.path.join(testdir, "detect_vertical_false.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n")
+    tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
     assert df.equals(tables[0].df)