You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Hi,
page.extract_words() fails with TypeError: '<' not supported between instances of 'str' and 'NoneType'.
The issue occurs on page 51 of the attached file, gpfra2012_protocol.pdf.
I use v0.6.0.
<ipython-input> in find_rects(page)
1 def find_rects(page):
2 rects = []
----> 3 words = page.extract_words()
4
5 markers = [
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pdfplumber\utils.py in make_cluster_dict(values, tolerance)
32 def make_cluster_dict(values, tolerance):
33 tolerance = decimalize(tolerance)
---> 34 clusters = cluster_list(set(values), tolerance)
35
36 nested_tuples = [ [ (val, i) for val in value_cluster ]
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pdfplumber\utils.py in cluster_list(xs, tolerance)
14 def cluster_list(xs, tolerance=0):
15 tolerance = decimalize(tolerance)
---> 16 if tolerance == 0: return [ [x] for x in sorted(xs) ]
17 if len(xs) < 2: return [ [x] for x in sorted(xs) ]
18 groups = []
TypeError: '<' not supported between instances of 'str' and 'NoneType'
The text was updated successfully, but these errors were encountered:
Thanks for identifying this bug, @VasiliyLukin! It was caused by some character objects not explicitly having a font name. It should now be fixed both in master and in the v0.6.0 branch. Feel free to re-open this issue, though, if the problem persists.
Hi,
page.extract_words() fails with TypeError: '<' not supported between instances of 'str' and 'NoneType'.
The issue occurs on page 51 of the attached file,
gpfra2012_protocol.pdf.
I use v0.6.0.
<ipython-input> in find_rects(page)
1 def find_rects(page):
2 rects = []
----> 3 words = page.extract_words()
4
5 markers = [
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pdfplumber\page.py in extract_words(self, **kwargs)
173 def extract_words(self, **kwargs):
174
--> 175 return utils.extract_words(self.chars, **kwargs)
176
177 def find_text_edges(self, *args, **kwargs):
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pdfplumber\utils.py in extract_words(chars, x_tolerance, y_tolerance, fontsize_tolerance, keep_blank_chars, match_fontsize, match_fontname)
226 doctop_clusters = list(itertools.chain.from_iterable(
227 cluster_objects(chars, "fontname", 0)
--> 228 for chars in doctop_clusters))
229
230 nested = [ get_line_words(line_chars, tolerance=x_tolerance)
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pdfplumber\utils.py in <genexpr>(.0)
226 doctop_clusters = list(itertools.chain.from_iterable(
227 cluster_objects(chars, "fontname", 0)
--> 228 for chars in doctop_clusters))
229
230 nested = [ get_line_words(line_chars, tolerance=x_tolerance)
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pdfplumber\utils.py in cluster_objects(objs, attr, tolerance)
47 objs = to_list(objs)
48 values = map(attr_getter, objs)
---> 49 cluster_dict = make_cluster_dict(values, tolerance)
50
51 get_0, get_1 = itemgetter(0), itemgetter(1)
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pdfplumber\utils.py in make_cluster_dict(values, tolerance)
32 def make_cluster_dict(values, tolerance):
33 tolerance = decimalize(tolerance)
---> 34 clusters = cluster_list(set(values), tolerance)
35
36 nested_tuples = [ [ (val, i) for val in value_cluster ]
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pdfplumber\utils.py in cluster_list(xs, tolerance)
14 def cluster_list(xs, tolerance=0):
15 tolerance = decimalize(tolerance)
---> 16 if tolerance == 0: return [ [x] for x in sorted(xs) ]
17 if len(xs) < 2: return [ [x] for x in sorted(xs) ]
18 groups = []
TypeError: '<' not supported between instances of 'str' and 'NoneType'
The text was updated successfully, but these errors were encountered: