py-pdf · MartinThoma · Jul 13, 2022 · Jun 10, 2022 · Jul 9, 2022 · Jul 9, 2022
diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -35,10 +35,17 @@ def build_char_map(
  for x in int_entry:
  if x <= 255:
  encoding[x] = chr(x)
- if font_name in _default_fonts_space_width:
+ try:
  # override space_width with new params
- space_width = _default_fonts_space_width[font_name]
- sp_width = compute_space_width(ft, space_code, space_width)
+ space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
+ except Exception:
+ pass
+ # Assume the space_code fits in a single byte
+ if isinstance(space_code, str):
+ sp = space_code.encode("charmap")[0]
+ else:
+ sp = space_code
+ sp_width = compute_space_width(ft, sp, space_width)
 
  return (
  font_type,
@@ -193,7 +200,7 @@ def parse_to_unicode(
  )
 
  for l in cm.split(b"\n"):
- if l in (b"", b" "):
+ if l in (b"", b" ") or l[0] == 37: # 37 = %
  continue
  if b"beginbfrange" in l:
  process_rg = True
@@ -224,7 +231,7 @@ def parse_to_unicode(
  a += 1
  else:
  c = int(lst[2], 16)
- fmt2 = b"%%0%dX" % len(lst[2])
+ fmt2 = b"%%0%dX" % max(4, len(lst[2]))
  while a <= b:
  map_dict[
  unhexlify(fmt % a).decode(
@@ -259,30 +266,40 @@ def compute_space_width(
 ) -> float:
  sp_width: float = space_width * 2 # default value
  w = []
+ w1 = {}
  st: int = 0
- if "/W" in ft:
- if "/DW" in ft:
- sp_width = cast(float, ft["/DW"])
- w = list(ft["/W"]) # type: ignore
+ if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
+ ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
+ try:
+ w1[-1] = cast(float, ft1["/DW"])
+ except Exception:
+ w1[-1] = 1000.0
+ w = list(ft1["/W"]) # type: ignore
  while len(w) > 0:
  st = w[0]
  second = w[1]
- if isinstance(int, second):
- if st <= space_code and space_code <= second:
- sp_width = w[2]
- break
+ if isinstance(second, int):
+ for x in range(st, second):
+ w1[x] = w[2]
  w = w[3:]
- if isinstance(list, second):
- if st <= space_code and space_code <= st + len(second) - 1:
- sp_width = second[space_code - st]
+ elif isinstance(second, list):
+ for y in second:
+ w1[st] = y
+ st += 1
  w = w[2:]
  else:
  warnings.warn(
- "unknown widths : \n" + (ft["/W"]).__repr__(),
+ "unknown widths : \n" + (ft1["/W"]).__repr__(),
  PdfReadWarning,
  )
  break
- if "/Widths" in ft:
+ try:
+ sp_width = w1[space_code]
+ except Exception:
+ sp_width = (
+ w1[-1] / 2.0
+ ) # when falling back to the default width, assume a space is half that size
+ elif "/Widths" in ft:
  w = list(ft["/Widths"]) # type: ignore
  try:
  st = cast(int, ft["/FirstChar"])

diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -1143,22 +1143,53 @@ def _extract_text(
  # are strings where the byte->string encoding was unknown, so adding
  # them to the text here would be gibberish.
 
+ cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+ cm_stack = []
  tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
- tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+ tm_prev: List[float] = [
+ 1.0,
+ 0.0,
+ 0.0,
+ 1.0,
+ 0.0,
+ 0.0,
+ ] # will store mult(tm_matrix, cm_matrix)
  char_scale = 1.0
  space_scale = 1.0
  _space_width: float = 500.0 # will be set correctly at first Tf
  TL = 0.0
  font_size = 12.0 # init just in case of
 
- # tm_matrix: Tuple = tm_matrix, output: str = output, text: str = text,
- # char_scale: float = char_scale,space_scale : float = space_scale, _space_width: float = _space_width,
- # TL: float = TL, font_size: float = font_size, cmap = cmap
+ def sign(x: float) -> float:
+ return 1 if x >= 0 else -1
+
+ def mult(m: List[float], n: List[float]) -> List[float]:
+ return [
+ m[0] * n[0] + m[1] * n[2],
+ m[0] * n[1] + m[1] * n[3],
+ m[2] * n[0] + m[3] * n[2],
+ m[2] * n[1] + m[3] * n[3],
+ m[4] * n[0] + m[5] * n[2] + n[4],
+ m[4] * n[1] + m[5] * n[3] + n[5],
+ ]
+
+ def orient(m: List[float]) -> int:
+ if m[3] > 1e-6:
+ return 0
+ elif m[3] < -1e-6:
+ return 180
+ elif m[1] > 0:
+ return 90
+ else:
+ return 270
+
+ def current_spacewidth() -> float:
+ # return space_scale * _space_width * char_scale
+ return _space_width / 1000.0
 
  def process_operation(operator: bytes, operands: List) -> None:
- nonlocal tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap
- if tm_matrix[4] != 0 and tm_matrix[5] != 0: # o reuse of the
- tm_prev = list(tm_matrix)
+ nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap
+ check_crlf_space: bool = False
  # Table 5.4 page 405
  if operator == b"BT":
  tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
@@ -1172,6 +1203,29 @@ def process_operation(operator: bytes, operands: List) -> None:
  elif operator == b"ET":
  output += text
  text = ""
+ # table 4.7, page 219
+ # cm_matrix calculation is reserved for the moment
+ elif operator == b"q":
+ cm_stack.append(cm_matrix)
+ elif operator == b"Q":
+ try:
+ cm_matrix = cm_stack.pop()
+ except Exception:
+ cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+ elif operator == b"cm":
+ output += text
+ text = ""
+ cm_matrix = mult(
+ [
+ float(operands[0]),
+ float(operands[1]),
+ float(operands[2]),
+ float(operands[3]),
+ float(operands[4]),
+ float(operands[5]),
+ ],
+ cm_matrix,
+ )
  # Table 5.2 page 398
  elif operator == b"Tz":
  char_scale = float(operands[0]) / 100.0
@@ -1203,9 +1257,11 @@ def process_operation(operator: bytes, operands: List) -> None:
  pass # keep previous size
  # Table 5.5 page 406
  elif operator == b"Td":
- tm_matrix[5] += float(operands[1])
+ check_crlf_space = True
  tm_matrix[4] += float(operands[0])
+ tm_matrix[5] += float(operands[1])
  elif operator == b"Tm":
+ check_crlf_space = True
  tm_matrix = [
  float(operands[0]),
  float(operands[1]),
@@ -1215,56 +1271,101 @@ def process_operation(operator: bytes, operands: List) -> None:
  float(operands[5]),
  ]
  elif operator == b"T*":
+ check_crlf_space = True
  tm_matrix[5] -= TL
+
  elif operator == b"Tj":
- t: str = ""
- tt: bytes = (
- encode_pdfdocencoding(operands[0])
- if isinstance(operands[0], str)
- else operands[0]
- )
- if isinstance(cmap[0], str):
- try:
- t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
- except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
- t = tt.decode(
- "utf-16-be" if cmap[0] == "charmap" else "charmap",
- "surrogatepass",
- ) # apply str encoding
- else: # apply dict encoding
- t = "".join(
- [
- cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
- for x in tt
- ]
+ check_crlf_space = True
+ if isinstance(operands[0], str):
+ text += operands[0]
+ else:
+ t: str = ""
+ tt: bytes = (
+ encode_pdfdocencoding(operands[0])
+ if isinstance(operands[0], str)
+ else operands[0]
  )
-
- text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
+ if isinstance(cmap[0], str):
+ try:
+ t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
+ except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
+ t = tt.decode(
+ "utf-16-be" if cmap[0] == "charmap" else "charmap",
+ "surrogatepass",
+ ) # apply str encoding
+ else: # apply dict encoding
+ t = "".join(
+ [
+ cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
+ for x in tt
+ ]
+ )
+
+ text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
  else:
  return None
- # process text changes due to positionchange: " "
- if tm_matrix[5] <= (
- tm_prev[5]
- - font_size # remove scaling * sqrt(tm_matrix[2] ** 2 + tm_matrix[3] ** 2)
- ): # it means that we are moving down by one line
- output += text + "\n" # .translate(cmap) + "\n"
- text = ""
- elif tm_matrix[4] >= (
- tm_prev[4] + space_scale * _space_width * char_scale
- ): # it means that we are moving down by one line
- text += " "
- return None
- # for clarity Operator in (b"g",b"G") : nothing to do
- # end of process_operation ######
+ if check_crlf_space:
+ m = mult(tm_matrix, cm_matrix)
+ o = orient(m)
+ deltaX = m[4] - tm_prev[4]
+ deltaY = m[5] - tm_prev[5]
+ k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
+ f = font_size * k
+ tm_prev = m
+ try:
+ if o == 0:
+ if deltaY < -0.8 * f:
+ if (output + text)[-1] != "\n":
+ text += "\n"
+ elif (
+ abs(deltaY) < f * 0.3
+ and abs(deltaX) > current_spacewidth() * f * 10
+ ):
+ if (output + text)[-1] != " ":
+ text += " "
+ elif o == 180:
+ if deltaY > 0.8 * f:
+ if (output + text)[-1] != "\n":
+ text += "\n"
+ elif (
+ abs(deltaY) < f * 0.3
+ and abs(deltaX) > current_spacewidth() * f * 10
+ ):
+ if (output + text)[-1] != " ":
+ text += " "
+ elif o == 90:
+ if deltaX > 0.8 * f:
+ if (output + text)[-1] != "\n":
+ text += "\n"
+ elif (
+ abs(deltaX) < f * 0.3
+ and abs(deltaY) > current_spacewidth() * f * 10
+ ):
+ if (output + text)[-1] != " ":
+ text += " "
+ elif o == 270:
+ if deltaX < -0.8 * f:
+ if (output + text)[-1] != "\n":
+ text += "\n"
+ elif (
+ abs(deltaX) < f * 0.3
+ and abs(deltaY) > current_spacewidth() * f * 10
+ ):
+ if (output + text)[-1] != " ":
+ text += " "
+ except Exception:
+ pass
 
  for operands, operator in content.operations:
  # multiple operators are defined in here ####
  if operator == b"'":
  process_operation(b"T*", [])
  process_operation(b"Tj", operands)
  elif operator == b'"':
+ process_operation(b"Tw", [operands[0]])
+ process_operation(b"Tc", [operands[1]])
  process_operation(b"T*", [])
- process_operation(b"TJ", operands)
+ process_operation(b"Tj", operands[2:])
  elif operator == b"TD":
  process_operation(b"TL", [-operands[1]])
  process_operation(b"Td", operands)
@@ -1273,15 +1374,23 @@ def process_operation(operator: bytes, operands: List) -> None:
  if isinstance(op, (str, bytes)):
  process_operation(b"Tj", [op])
  if isinstance(op, (int, float, NumberObject, FloatObject)):
- process_operation(b"Td", [-op, 0.0])
+ if (
+ (abs(float(op)) >= _space_width)
+ and (abs(float(op)) <= 8 * _space_width)
+ and (text[-1] != " ")
+ ):
+ process_operation(b"Tj", [" "])
  elif operator == b"Do":
  output += text
- if output != "":
- output += "\n"
+ try:
+ if output[-1] != "\n":
+ output += "\n"
+ except IndexError:
+ pass
  try:
  xobj = resources_dict["/XObject"] # type: ignore
  if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore
- output += text
+ # output += text
  text = self.extract_xform_text(xobj[operands[0]], space_width) # type: ignore
  output += text
  except Exception:

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -35,6 +35,7 @@ def get_all_sample_files():
  [m for m in all_files_meta["data"] if not m["encrypted"]],
  ids=[m["path"] for m in all_files_meta["data"] if not m["encrypted"]],
 )
+@pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning")
 def test_read(meta):
  pdf_path = os.path.join(EXTERNAL_ROOT, meta["path"])
  reader = PdfReader(pdf_path)