skip single pixel images in PDF (#546)

* skip pixels hopefully
* add comments and reorder
* add constant
ciromattia · Aug 5, 2023 · 9339abb · 9339abb
1 parent 154707a
commit 9339abb
Showing 1 changed file with 11 additions and 1 deletion.
diff --git a/kindlecomicconverter/pdfjpgextract.py b/kindlecomicconverter/pdfjpgextract.py
@@ -25,6 +25,11 @@
 from random import choice
 from string import ascii_uppercase, digits
 
+# skip stray images a few pixels in size in some PDFs
+# typical images are many thousands in length
+# https://github.com/ciromattia/kcc/pull/546
+STRAY_IMAGE_LENGTH_THRESHOLD = 300
+
 
 class PdfJpgExtract:
  def __init__(self, fname):
@@ -60,10 +65,15 @@ def extract(self):
  raise Exception("Didn't find end of JPG!")
  istart += startfix
  iend += endfix
+ i = iend
+
+ if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
+ continue
+
  jpg = pdf[istart:iend]
  jpgfile = open(self.path + "/jpg%d.jpg" % njpg, "wb")
  jpgfile.write(jpg)
  jpgfile.close()
  njpg += 1
- i = iend
+
  return self.path, njpg