Merge branch 'selfmade-textract'

tfeldmann · Feb 17, 2024 · 1b146cc · 1b146cc
2 parents ba628a1 + 19dc073
commit 1b146cc
Show file tree

Hide file tree

Showing 9 changed files with 193 additions and 525 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -39,9 +39,9 @@ jobs:
  PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
  run: |
  python3 -m pip install -U pip setuptools
- python3 -m pip install poetry==1.7.1 lxml
+ python3 -m pip install poetry==1.7.1
  poetry config virtualenvs.create false
- poetry install --with=dev --extras=textract
+ poetry install --with=dev
 
  - name: Version info
  run: |

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## [Unreleased]
 
+- Integrated `pdftotext`, `pdfminer` and `docx2txt` interfaces into `filecontent` filter.
+- Removed `textract` and ~50 MB of dependencies as they are no longer needed.
+- Python 3.12 support
+
 ## v3.1.2 (2024-02-16)
 
 - Fixes a validation error where correctly defined actions were not accepted in Python 3.12.2.

diff --git a/Dockerfile b/Dockerfile
@@ -12,7 +12,7 @@ FROM base as pydeps
 RUN pip install "poetry==1.7.1" && \
  python -m venv ${VIRTUAL_ENV}
 COPY pyproject.toml poetry.lock ./
-RUN poetry install --only=main --extras=textract --no-interaction
+RUN poetry install --only=main --no-interaction
 
 
 FROM base as final

diff --git a/README.md b/README.md
@@ -77,13 +77,6 @@ Installation is done via pip. Note that the package name is `organize-tool`:
 pip install -U organize-tool
 ```
 
-If you want the text extraction capabilities, install with `textract` like this (the
-qoutes are important):
-
-```bash
-pip install "organize-tool[texttract]"
-```
-
 This command can also be used to update to the newest version. Now you can run `organize --help` to check if the installation was successful.
 
 ### Create your first rule

diff --git a/organize/cli.py b/organize/cli.py
@@ -2,13 +2,13 @@
 organize - The file management automation tool.
 
 Usage:
- organize run [options] [<config>]
- organize sim [options] [<config>]
- organize new [<config>]
- organize edit [<config>]
- organize check [<config>]
- organize debug [<config>]
- organize show [--path|--reveal] [<config>]
+ organize run  [options] [<config>]
+ organize sim  [options] [<config>]
+ organize new  [<config>]
+ organize edit  [<config>]
+ organize check  [<config>]
+ organize debug  [<config>]
+ organize show  [--path|--reveal] [<config>]
  organize list
  organize docs
  organize --version

diff --git a/organize/filters/exif.py b/organize/filters/exif.py
@@ -40,7 +40,7 @@ def exiftool_available() -> bool:
  )
  return True
  except subprocess.CalledProcessError:
- logging.warning("exiddftool not available. Falling back to exifread library.")
+ logging.warning("exiftool not available. Falling back to exifread library.")
  return False
 
 

diff --git a/organize/filters/filecontent.py b/organize/filters/filecontent.py
@@ -1,7 +1,9 @@
 import logging
 import re
+import subprocess
+from functools import lru_cache
 from pathlib import Path
-from typing import Any, ClassVar
+from typing import Any, Callable, ClassVar, Dict
 
 from pydantic.config import ConfigDict
 from pydantic.dataclasses import dataclass
@@ -11,9 +13,95 @@
 from organize.resource import Resource
 
 
+def _compress_chars(inp: str) -> str:
+ # Compress lines consisting only of separated chars ("H e l l o W o r l d")
+ result = []
+ for line in inp.splitlines():
+ if re.match(r"^(\S +)+\S$", line):
+ result.append(re.sub(r"(\S) ", repl=r"\g<1>", string=line))
+ else:
+ result.append(line)
+ return "\n".join(result)
+
+
+def _remove_nls(inp: str) -> str:
+ # remove superfluous newlines
+ return re.sub(pattern=r"\n{3,}", repl="\n\n", string=inp, flags=re.MULTILINE)
+
+
def clean(inp: str) -> str:
    """Normalize extracted text.

    First collapses letter-spaced lines via ``_compress_chars``, then reduces
    superfluous newlines via ``_remove_nls``.
    """
    compressed = _compress_chars(inp)
    return _remove_nls(compressed)
+
+
def extract_txt(path: Path) -> str:
    """Return the content of a plain-text file as a string.

    The file is decoded as UTF-8 and bytes that cannot be decoded are dropped.
    A strict decode with the platform's default encoding would raise
    ``UnicodeDecodeError`` on a single stray byte, which is undesirable when
    the text is only used for pattern matching.

    Args:
        path: Location of the text file.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    return path.read_text(encoding="utf-8", errors="ignore")
+
+
+@lru_cache(maxsize=1)
+def _pdftotext_available() -> bool:
+ # check whether the given path is executable
+ try:
+ subprocess.check_call(
+ ["pdftotext", "-v"],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.STDOUT,
+ )
+ return True
+ except subprocess.CalledProcessError:
+ logging.warning("pdftotext not available. Falling back to pdfminer library.")
+ return False
+
+
def _extract_with_pdftotext(path: Path, keep_layout: bool) -> str:
    """Extract the text of a PDF by running the ``pdftotext`` command.

    The output encoding is pinned to UTF-8 on the command line and the captured
    output is decoded as UTF-8 with undecodable bytes dropped, so the result
    does not depend on the platform's default encoding.

    Args:
        path: Location of the PDF file.
        keep_layout: If true, ``-layout`` is passed to ``pdftotext``.

    Returns:
        The extracted text after being passed through ``clean``.

    Raises:
        subprocess.CalledProcessError: If ``pdftotext`` exits with a non-zero
            status.
        OSError: If the ``pdftotext`` executable cannot be started.
    """
    command = ["pdftotext", "-enc", "UTF-8"]
    if keep_layout:
        command.append("-layout")
    # "-" as the output file makes pdftotext write the text to stdout.
    command += [str(path), "-"]
    result = subprocess.check_output(command, encoding="utf-8", errors="ignore")
    return clean(result)
+
+
def _extract_with_pdfminer(path: Path) -> str:
    """Extract the text of a PDF with the ``pdfminer`` library.

    Returns the result of ``pdfminer.high_level.extract_text`` after it has
    been passed through ``clean``.
    """
    # Imported inside the function, so pdfminer is loaded on first use rather
    # than when this module is imported.
    from pdfminer.high_level import extract_text

    raw_text = extract_text(path)
    return clean(raw_text)
+
+
def extract_pdf(path: Path, keep_layout: bool = True) -> str:
    """Extract the text of a PDF file.

    Uses the ``pdftotext`` command when ``_pdftotext_available`` reports it as
    usable, otherwise the ``pdfminer`` based extractor.

    Args:
        path: Location of the PDF file.
        keep_layout: Forwarded to the ``pdftotext`` extractor only; the
            pdfminer extractor does not receive it.
    """
    if not _pdftotext_available():
        return _extract_with_pdfminer(path=path)
    return _extract_with_pdftotext(path=path, keep_layout=keep_layout)
+
+
def extract_docx(path: Path) -> str:
    """Extract the text of a .docx file with ``docx2txt`` and ``clean`` it."""
    # Imported inside the function, so docx2txt is loaded on first use rather
    # than when this module is imported.
    from docx2txt import process  # type: ignore

    return clean(process(path))
+
+
# Maps a lowercase file suffix (including the leading dot) to the function
# that extracts the text of such a file.
EXTRACTORS: Dict[str, Callable[[Path], str]] = {
    **{suffix: extract_txt for suffix in (".md", ".txt", ".log")},
    ".pdf": extract_pdf,
    ".docx": extract_docx,
}
+
+
def textract(path: Path) -> str:
    """Extract the text of ``path`` using the extractor registered for its suffix.

    The suffix is lowercased before the lookup in ``EXTRACTORS``.

    Raises:
        KeyError: If no extractor is registered for the file's suffix.
    """
    suffix = path.suffix.lower()
    return EXTRACTORS[suffix](path)
+
+
 @dataclass(config=ConfigDict(coerce_numbers_to_str=True, extra="forbid"))
 class FileContent:
- """Matches file content with the given regular expression
+ """Matches file content with the given regular expression.
+
+ Supports .md, .txt, .log, .pdf and .docx files.
+
+ For PDF content extraction poppler should be installed for the `pdftotext` command.
+ If this is not available `filecontent` will fall back to the `pdfminer` library.
 
  Attributes:
  expr (str): The regular expression to be matched.
@@ -39,19 +127,9 @@ def __post_init__(self):
  self._expr = re.compile(self.expr, re.MULTILINE | re.DOTALL)
 
  def matches(self, path: Path) -> Any:
- try:
- import textract
-
- content = textract.process(str(path), errors="ignore")
- match = self._expr.search(content.decode("utf-8", errors="ignore"))
- return match
- except ImportError as e:
- raise ImportError(
- "textract is not installed. "
- "Install with pip install organize-tool[textract]"
- ) from e
- except textract.exceptions.CommandLineError as e:
- logging.exception(e)
+ content = textract(path)
+ match = self._expr.search(content)
+ return match
 
  def pipeline(self, res: Resource, output: Output) -> bool:
  assert res.path is not None, "Does not support standalone mode"
@@ -60,3 +138,9 @@ def pipeline(self, res: Resource, output: Output) -> bool:
  if match:
  res.deep_merge(self.filter_config.name, match.groupdict())
  return bool(match)
+
+
if __name__ == "__main__":
    # Manual check: print the extracted text of the file given as the first
    # command line argument.
    import sys

    target = Path(sys.argv[1])
    print(textract(target))