Skip to content
This repository has been archived by the owner on Jan 6, 2025. It is now read-only.

[MRG] Fixed #350: make sure strip_text= argument works #351

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 41 additions & 5 deletions camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import division

import re
import os
import sys
import random
Expand Down Expand Up @@ -385,6 +386,33 @@ def merge_close_lines(ar, line_tol=2):
return ret


def text_strip(text, strip=""):
    """Remove every occurrence of the characters in `strip` from `text`.

    Unlike ``str.strip``, which only trims leading and trailing
    characters, this removes the given characters wherever they occur
    in `text`.

    Parameters
    ----------
    text : str
        Text to process and strip.
    strip : str, optional (default: '')
        Characters that should be stripped from `text`. Each character
        is treated literally; regex metacharacters are escaped.

    Returns
    -------
    stripped : str
        `text` with all characters listed in `strip` removed. `text` is
        returned unchanged when `strip` is empty.

    """
    if not strip:
        return text

    # Build a character class out of the (escaped) characters to remove.
    pattern = r"[{}]".format("".join(map(re.escape, strip)))

    # NOTE: `flags` must be passed by keyword. The fourth positional
    # argument of re.sub is `count`, so passing re.UNICODE positionally
    # would cap the number of replacements at int(re.UNICODE) == 32.
    return re.sub(pattern, "", text, flags=re.UNICODE)


# TODO: combine the following functions into a TextProcessor class which
# applies corresponding transformations sequentially
# (inspired from sklearn.pipeline.Pipeline)
Expand Down Expand Up @@ -428,10 +456,10 @@ def flag_font_size(textline, direction, strip_text=''):
fchars = [t[0] for t in chars]
if ''.join(fchars).strip():
flist.append(''.join(fchars))
fstring = ''.join(flist).strip(strip_text)
fstring = ''.join(flist)
else:
fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
return fstring
fstring = ''.join([t.get_text() for t in textline])
return text_strip(fstring, strip_text)


def split_textline(table, textline, direction, flag_size=False, strip_text=''):
Expand Down Expand Up @@ -515,7 +543,7 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=''):
flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
else:
gchars = [t[2].get_text() for t in chars]
grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
grouped_chars.append((key[0], key[1], text_strip(''.join(gchars), strip_text)))
return grouped_chars


Expand Down Expand Up @@ -599,7 +627,15 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False, stri
if flag_size:
return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
else:
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error

return (
[(
r_idx,
c_idx,
text_strip(t.get_text(), strip_text),
)],
error,
)


def compute_accuracy(error_weights):
Expand Down
48 changes: 24 additions & 24 deletions tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,30 +313,30 @@
]

data_stream_strip_text = [
["V i n s a u Ve r r e", ""],
["Les Blancs", "12.5CL"],
["A.O.P Côtes du Rhône", ""],
["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
["A.O.P Vacqueyras", ""],
["Domaine de Montvac « Melodine » 2016", "10 €"],
["A.O.P Châteauneuf du Pape", ""],
["Domaine de Beaurenard 2017", "13 €"],
["A.O.P Côteaux du Languedoc", ""],
["Villa Tempora « Un temps pour elle » 2014", "9 €"],
["A.O.P Côtes de Provence", ""],
["Château Grand Boise 2017", "9 €"],
["Les Rosés", "12,5 CL"],
["A.O.P Côtes du Rhône", ""],
["Domaine de la Florane « A fleur de Pampre » 2016", "8 €"],
["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"],
["A.O.P Vacqueyras", ""],
["Domaine de Montvac 2017", "9 €"],
["A.O.P Languedoc", ""],
["Domaine de Joncas « Nébla » 2015", "8 €"],
["Villa Tempora « L’arroseur arrosé » 2015", "9 €"],
["A.O.P Côtes de Provence", ""],
["Château Grand Boise « Sainte Victoire » 2017", "9 €"],
["Château Léoube 2016", "10 €"]
["VinsauVerre", ""],
["LesBlancs", "12.5CL"],
["A.O.PCôtesduRhône", ""],
["DomainedelaGuicharde«Autourdelachapelle»2016", "8€"],
["A.O.PVacqueyras", ""],
["DomainedeMontvac«Melodine»2016", "10€"],
["A.O.PChâteauneufduPape", ""],
["DomainedeBeaurenard2017", "13€"],
["A.O.PCôteauxduLanguedoc", ""],
["VillaTempora«Untempspourelle»2014", "9€"],
["A.O.PCôtesdeProvence", ""],
["ChâteauGrandBoise2017", "9€"],
["LesRosés", "125CL"],
["A.O.PCôtesduRhône", ""],
["DomainedelaFlorane«AfleurdePampre»2016", "8€"],
["FamilleCoulon(DomaineBeaurenard)Biotifulfox2017", "8€"],
["A.O.PVacqueyras", ""],
["DomainedeMontvac2017", "9€"],
["A.O.PLanguedoc", ""],
["DomainedeJoncas«Nébla»2015", "8€"],
["VillaTempora«L’arroseurarrosé»2015", "9€"],
["A.O.PCôtesdeProvence", ""],
["ChâteauGrandBoise«SainteVictoire»2017", "9€"],
["ChâteauLéoube2016", "10€"]
]

data_stream_edge_tol = [
Expand Down
2 changes: 1 addition & 1 deletion tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def test_stream_strip_text():
df = pd.DataFrame(data_stream_strip_text)

filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n")
tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
assert df.equals(tables[0].df)


Expand Down