Skip to content

Commit

Permalink
Merge branch 'selfmade-textract'
Browse files Browse the repository at this point in the history
  • Loading branch information
tfeldmann committed Feb 17, 2024
2 parents ba628a1 + 19dc073 commit 1b146cc
Show file tree
Hide file tree
Showing 9 changed files with 193 additions and 525 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ jobs:
PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
run: |
python3 -m pip install -U pip setuptools
python3 -m pip install poetry==1.7.1 lxml
python3 -m pip install poetry==1.7.1
poetry config virtualenvs.create false
poetry install --with=dev --extras=textract
poetry install --with=dev
- name: Version info
run: |
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## [Unreleased]

- Integrated `pdftotext`, `pdfminer` and `docx2txt` interfaces into `filecontent` filter.
- Removed `textract` and ~50 MB of dependencies as they are no longer needed.
- Python 3.12 support

## v3.1.2 (2024-02-16)

- Fixes a validation error where correctly defined actions were not accepted in Python 3.12.2.
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ FROM base as pydeps
RUN pip install "poetry==1.7.1" && \
python -m venv ${VIRTUAL_ENV}
COPY pyproject.toml poetry.lock ./
RUN poetry install --only=main --extras=textract --no-interaction
RUN poetry install --only=main --no-interaction


FROM base as final
Expand Down
7 changes: 0 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,6 @@ Installation is done via pip. Note that the package name is `organize-tool`:
pip install -U organize-tool
```

If you want the text extraction capabilities, install with `textract` like this (the
quotes are important):

```bash
pip install "organize-tool[texttract]"
```

This command can also be used to update to the newest version. Now you can run `organize --help` to check if the installation was successful.

### Create your first rule
Expand Down
14 changes: 7 additions & 7 deletions organize/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
organize - The file management automation tool.
Usage:
organize run [options] [<config>]
organize sim [options] [<config>]
organize new [<config>]
organize edit [<config>]
organize check [<config>]
organize debug [<config>]
organize show [--path|--reveal] [<config>]
organize run [options] [<config>]
organize sim [options] [<config>]
organize new [<config>]
organize edit [<config>]
organize check [<config>]
organize debug [<config>]
organize show [--path|--reveal] [<config>]
organize list
organize docs
organize --version
Expand Down
2 changes: 1 addition & 1 deletion organize/filters/exif.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def exiftool_available() -> bool:
)
return True
except subprocess.CalledProcessError:
logging.warning("exiddftool not available. Falling back to exifread library.")
logging.warning("exiftool not available. Falling back to exifread library.")
return False


Expand Down
114 changes: 99 additions & 15 deletions organize/filters/filecontent.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
import re
import subprocess
from functools import lru_cache
from pathlib import Path
from typing import Any, ClassVar
from typing import Any, Callable, ClassVar, Dict

from pydantic.config import ConfigDict
from pydantic.dataclasses import dataclass
Expand All @@ -11,9 +13,95 @@
from organize.resource import Resource


def _compress_chars(inp: str) -> str:
    """Collapse letter-spaced lines such as "H e l l o" into "Hello".

    A line qualifies when it consists solely of single non-space characters
    separated by one or more spaces. For such a line exactly one space is
    removed after every non-space character; every other line is passed
    through untouched. The lines are re-joined with a single newline, so a
    trailing newline of the input is not preserved.
    """

    def _squeeze(line: str) -> str:
        # Only rewrite lines that are spaced-out characters from start to end.
        if re.match(r"^(\S +)+\S$", line) is None:
            return line
        return re.sub(r"(\S) ", r"\g<1>", line)

    return "\n".join(_squeeze(line) for line in inp.splitlines())


def _remove_nls(inp: str) -> str:
    """Squash every run of three or more newlines down to exactly two."""
    blank_run = re.compile(r"\n{3,}", flags=re.MULTILINE)
    return blank_run.sub("\n\n", inp)


def clean(inp: str) -> str:
    """Normalize extracted text.

    First joins letter-spaced lines back into words, then collapses
    superfluous blank lines in the result.
    """
    compressed = _compress_chars(inp)
    return _remove_nls(compressed)


def extract_txt(path: Path) -> str:
    """Return the content of a plain text file as a string.

    The file is decoded as UTF-8 regardless of the platform's locale, and
    bytes that are not valid UTF-8 are dropped instead of raising
    ``UnicodeDecodeError``. The result is only used for regex matching, so a
    single stray byte (common in log files) must not abort the whole filter.

    Args:
        path: Location of the text file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    return path.read_text(encoding="utf-8", errors="ignore")


@lru_cache(maxsize=1)
def _pdftotext_available() -> bool:
    """Report whether the ``pdftotext`` command can be executed.

    Runs ``pdftotext -v`` once with its output discarded; the outcome is
    cached so the probe is not repeated for every file.

    Returns:
        True if the command ran successfully, False otherwise. A warning is
        logged when the command is not usable.
    """
    try:
        subprocess.check_call(
            ["pdftotext", "-v"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )
    except (subprocess.CalledProcessError, OSError):
        # A missing or non-executable binary surfaces as OSError
        # (FileNotFoundError / PermissionError), not CalledProcessError, so
        # both must be handled for the pdfminer fallback to be reachable.
        logging.warning("pdftotext not available. Falling back to pdfminer library.")
        return False
    return True


def _extract_with_pdftotext(path: Path, keep_layout: bool) -> str:
    """Extract the text of a PDF by running the ``pdftotext`` command.

    The output encoding is requested explicitly (``-enc UTF-8``) and decoded
    as UTF-8, so the result depends neither on the platform's locale encoding
    nor on the default encoding of the installed pdftotext variant.
    Undecodable bytes are dropped because the text is only used for matching.

    Args:
        path: Location of the PDF file.
        keep_layout: If True, pass ``-layout`` to pdftotext.

    Returns:
        The extracted text after being passed through ``clean``.

    Raises:
        subprocess.CalledProcessError: If pdftotext exits with a non-zero
            status.
        OSError: If the pdftotext executable cannot be started.
    """
    command = ["pdftotext", "-enc", "UTF-8"]
    if keep_layout:
        command.append("-layout")
    # "-" as the output file makes pdftotext write the text to stdout.
    command += [str(path), "-"]
    result = subprocess.check_output(command, encoding="utf-8", errors="ignore")
    return clean(result)


def _extract_with_pdfminer(path: Path) -> str:
    """Extract the text of a PDF with the ``pdfminer`` library.

    The library is imported inside the function, so it is only loaded when
    this extraction path is actually taken. The raw text is passed through
    ``clean`` before being returned.
    """
    from pdfminer.high_level import extract_text

    raw_text = extract_text(path)
    return clean(raw_text)


def extract_pdf(path: Path, keep_layout: bool = True) -> str:
    """Extract the text of a PDF file.

    Uses the ``pdftotext`` command when it is available and the ``pdfminer``
    based extractor otherwise. ``keep_layout`` is only forwarded to the
    pdftotext extractor.
    """
    if not _pdftotext_available():
        return _extract_with_pdfminer(path=path)
    return _extract_with_pdftotext(path=path, keep_layout=keep_layout)


def extract_docx(path: Path) -> str:
    """Extract the text of a .docx file via ``docx2txt`` and clean it up.

    ``docx2txt`` is imported inside the function, so it is only loaded when a
    .docx file is actually processed.
    """
    import docx2txt  # type: ignore

    return clean(docx2txt.process(path))


# Dispatch table mapping a file suffix (including the leading dot) to the
# function that turns a file of that type into plain text. `textract` looks
# the suffix up in lowercase, so the keys here are lowercase as well.
EXTRACTORS: Dict[str, Callable[[Path], str]] = {
    # Plain text formats are read as-is.
    ".md": extract_txt,
    ".txt": extract_txt,
    ".log": extract_txt,
    # Document formats need a dedicated extractor.
    ".pdf": extract_pdf,
    ".docx": extract_docx,
}


def textract(path: Path) -> str:
    """Return the text content of ``path`` using the extractor for its suffix.

    The suffix is lowercased before it is looked up in ``EXTRACTORS``.

    Raises:
        KeyError: If no extractor is registered for the file's suffix.
    """
    suffix = path.suffix.lower()
    return EXTRACTORS[suffix](path)


@dataclass(config=ConfigDict(coerce_numbers_to_str=True, extra="forbid"))
class FileContent:
"""Matches file content with the given regular expression
"""Matches file content with the given regular expression.
Supports .md, .txt, .log, .pdf and .docx files.
For PDF content extraction poppler should be installed for the `pdftotext` command.
If this is not available `filecontent` will fall back to the `pdfminer` library.
Attributes:
expr (str): The regular expression to be matched.
Expand All @@ -39,19 +127,9 @@ def __post_init__(self):
self._expr = re.compile(self.expr, re.MULTILINE | re.DOTALL)

def matches(self, path: Path) -> Any:
try:
import textract

content = textract.process(str(path), errors="ignore")
match = self._expr.search(content.decode("utf-8", errors="ignore"))
return match
except ImportError as e:
raise ImportError(
"textract is not installed. "
"Install with pip install organize-tool[textract]"
) from e
except textract.exceptions.CommandLineError as e:
logging.exception(e)
content = textract(path)
match = self._expr.search(content)
return match

def pipeline(self, res: Resource, output: Output) -> bool:
assert res.path is not None, "Does not support standalone mode"
Expand All @@ -60,3 +138,9 @@ def pipeline(self, res: Resource, output: Output) -> bool:
if match:
res.deep_merge(self.filter_config.name, match.groupdict())
return bool(match)


if __name__ == "__main__":
    # Ad-hoc command line use: print the extracted text of the file given as
    # the first argument.
    import sys

    target = Path(sys.argv[1])
    print(textract(target))
Loading

0 comments on commit 1b146cc

Please sign in to comment.