Skip to content

Commit

Permalink
New storage scheme for file storage (#210)
Browse files Browse the repository at this point in the history
  • Loading branch information
ciur authored Sep 27, 2023
1 parent 0e76b5f commit 24f0b05
Show file tree
Hide file tree
Showing 25 changed files with 567 additions and 411 deletions.
2 changes: 0 additions & 2 deletions docker/dev/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,3 @@
'url',
default=f'xapian:///{os.path.join(PROJ_ROOT, "index_db")}'
)

PAPERMERGE_CREATE_SPECIAL_FOLDERS = False
3 changes: 3 additions & 0 deletions papermerge/core/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
DEFAULT_THUMBNAIL_SIZE = 100 # 100 pixels wide
JPG = 'jpg'
PAGES = 'pages'
THUMBNAILS = 'thumbnails'
DOCVERS = 'docvers'
OCR = 'ocr'
DEFAULT_TAG_BG_COLOR = '#c41fff'
DEFAULT_TAG_FG_COLOR = '#ffffff'
INDEX_ADD_NODE = 'index_add_node'
Expand Down
15 changes: 3 additions & 12 deletions papermerge/core/lib/path.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import re
import os
import re

SUPPORTED_EXTENTIONS = re.compile(".*(jpeg|jpg|png|tiff|pdf)$", re.IGNORECASE)

Expand Down Expand Up @@ -37,14 +37,7 @@ def filter_by_extention(


class DocumentPath:
"""
Document path:
/<aux_dir>/<user_id>/<doc_id>/<version>/<file_name>
If version = 0, it is not included in DocumentPath.
Document's version is incremented everytime pdftk operation runs on it
(when pages are deleted, reordered, pasted)
"""
"""OBSOLETE. Do not use this class!"""

def __init__(
self,
Expand Down Expand Up @@ -157,9 +150,7 @@ def copy_from(doc_path, **kw):


class PagePath:
"""
<aux_dir>/<doc_id>/pages/<page_num>/page-<xyz>.jpg
"""
"""OBSOLETE. Do not use this class!"""

def __init__(
self,
Expand Down
123 changes: 43 additions & 80 deletions papermerge/core/lib/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
import shutil
from os import listdir
from os.path import isdir, join
from pathlib import Path

from .path import DocumentPath, PagePath, AUX_DIR_SIDECARS, AUX_DIR_DOCS
from .path import AUX_DIR_DOCS, AUX_DIR_SIDECARS, DocumentPath, PagePath
from .utils import safe_to_delete

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -130,61 +131,26 @@ def abspath(self, _path):
def path(self, _path):
return self.abspath(_path)

def delete_doc(self, doc_path: DocumentPath):
"""
Receives a DocumentPath instance
"""
# where original documents and their versions are stored
abs_dirname_docs = self.path(
doc_path.dirname_docs
)
# where OCRed information and generated thumbnails
# are stored
abs_dirname_sidecars = self.path(
doc_path.dir_sidecars
)
def delete_file(self, file_or_folder: Path):
# Before recursively deleting everything in folder
# double check that there are only
# .pdf, .txt, .hocr, .jpg files.
if safe_to_delete(
abs_dirname_docs
):
shutil.rmtree(abs_dirname_docs)
if os.path.exists(abs_dirname_docs):
os.rmdir(abs_dirname_docs)
if file_or_folder.is_dir() and safe_to_delete(file_or_folder):
shutil.rmtree(file_or_folder)
file_or_folder.rmdir()

if safe_to_delete(
abs_dirname_sidecars
):
shutil.rmtree(abs_dirname_sidecars)
if os.path.exists(abs_dirname_sidecars):
os.rmdir(abs_dirname_sidecars)
def copy_file(self, src: Path | io.BytesIO, dst: Path):
"""Copy source file to destination"""
logger.debug(f"copying {src} to {dst}")

def copy_doc(self, src: DocumentPath | io.BytesIO, dst: DocumentPath):
"""
copy given file src file path to destination
as absolute doc_path
"""
logger.debug(f"copy_doc {src} to {dst}")
dirname = os.path.dirname(
self.abspath(dst)
)
if not os.path.exists(
dirname
):
os.makedirs(
dirname, exist_ok=True
)
if isinstance(src, DocumentPath):
logger.debug(
f"copy_doc: {src} to {dst}"
)
shutil.copyfile(
self.abspath(src),
self.abspath(dst)
)
if not dst.parent.exists():
os.makedirs(dst.parent, exist_ok=True)

if isinstance(src, Path):
logger.debug(f"{src} is a Path instance")
shutil.copyfile(src, dst)
elif isinstance(src, io.BytesIO):
with open(self.abspath(dst), 'wb') as f:
with open(dst, 'wb') as f:
f.write(src.getvalue())
else:
raise ValueError(
Expand Down Expand Up @@ -248,42 +214,20 @@ def copy_page_preview(self, src: PagePath, dst: PagePath):

shutil.copy(src_preview, dst_preview)

def copy_page(self, src: PagePath, dst: PagePath):
def copy_page(self, src_folder: Path, dst_folder: Path):
"""
Copies page data from source folder/path to page destination folder/path
Copies page data from source folder to destination folder
Page data are files with 'txt', 'hocr', 'jpg', 'svg' extentions.
"""
for inst in [src, dst]:
if not isinstance(inst, PagePath):
raise ValueError("copy_page accepts only PagePath instances")

# copy .txt file
if self.exists(src.txt_url):
self.copy_page_txt(src=src, dst=dst)
else:
logger.debug(f"txt does not exits {src.txt_url}")
if not src_folder.is_dir():
raise ValueError(f"Source is not a folder {src_folder}")

# hocr
if self.exists(src.hocr_url):
self.copy_page_hocr(src=src, dst=dst)
else:
logger.debug(f"hocr does not exits {src.hocr_url}")

if self.exists(src.jpg_url):
self.copy_page_jpg(src=src, dst=dst)
else:
logger.debug(f"jpg does not exits {src.jpg_url}")
dst_folder.mkdir(parents=True, exist_ok=True)
if not dst_folder.is_dir():
raise ValueError(f"Destination is not a folder {dst_folder}")

if self.exists(src.svg_url):
self.copy_page_svg(src=src, dst=dst)
else:
logger.debug(f"svg does not exits {src.svg_url}")

if self.exists(src.preview_url):
self.copy_page_preview(src=src, dst=dst)
else:
logger.debug(f"preview does not exits {src.preview_url}")
shutil.copytree(src_folder, dst_folder, dirs_exist_ok=True)

def reorder_pages(self, doc_path, new_order):
"""
Expand Down Expand Up @@ -475,3 +419,22 @@ def paste_pages(

class FileSystemStorage(Storage):
pass


def copy_file(src: Path | io.BytesIO, dst: Path):
    """Copy a source file, or an in-memory buffer, to ``dst``.

    Missing parent folders of ``dst`` are created before copying.

    Args:
        src: path of the file to copy, or an ``io.BytesIO`` whose entire
            content is written to ``dst``.
        dst: path of the destination file.

    Raises:
        ValueError: if ``src`` is neither a ``Path`` nor an ``io.BytesIO``.
    """
    logger.debug(f"copying {src} to {dst}")

    # exist_ok=True already tolerates an existing folder, so a separate
    # (race-prone) existence check beforehand is unnecessary.
    dst.parent.mkdir(parents=True, exist_ok=True)

    if isinstance(src, Path):
        logger.debug(f"{src} is a Path instance")
        shutil.copyfile(src, dst)
    elif isinstance(src, io.BytesIO):
        # getvalue() returns the whole buffer regardless of the current
        # stream position.
        dst.write_bytes(src.getvalue())
    else:
        # The message names the types that are actually accepted.
        raise ValueError(
            f"src ({src}) is neither instance of Path nor io.BytesIO"
        )
12 changes: 6 additions & 6 deletions papermerge/core/lib/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import logging

import os
from pathlib import Path

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -31,15 +31,15 @@ def get_bool(key, default="NO"):
return False


def safe_to_delete(place):
if not os.path.exists(place):
def safe_to_delete(path: Path) -> True:
if not path.exists():
logging.warning(
f"Trying to delete not exising folder"
f" {place}"
f" {path}"
)
return False

for root, dirs, files in os.walk(place):
for root, dirs, files in os.walk(path):
for name in files:
base, ext = os.path.splitext(name)
if ext.lower() not in SAFE_EXTENSIONS:
Expand Down
38 changes: 26 additions & 12 deletions papermerge/core/management/commands/ocr-task.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import uuid

from django.core.management.base import BaseCommand

from papermerge.core.models import Document
from papermerge.core.tasks import ocr_document_task
from papermerge.core.ocr.document import ocr_document
from papermerge.core.tasks import _post_ocr_document


class Command(BaseCommand):
help = """
Triggers OCR task for given document UUID
Calls OCR document same way the `core.task.ocr_document_task`
Handy management command to quickly check if
OCRing works
"""

def add_arguments(self, parser):
Expand All @@ -16,14 +22,22 @@ def add_arguments(self, parser):
)

def handle(self, *args, **options):
uuid = options.get('UUID')
doc = Document.objects.get(id=uuid)

ocr_document_task.apply_async(
kwargs={
'document_id': str(doc.id),
'lang': doc.lang,
'namespace': None,
'user_id': str(doc.user.id)
}
doc_id = options.get('UUID')
doc = Document.objects.get(id=doc_id)
last_version = doc.versions.last()
target_docver_uuid = uuid.uuid4()
target_page_uuids = [
uuid.uuid4() for _ in range(last_version.pages.count())
]

ocr_document(
lang=doc.lang,
document_version=last_version,
target_docver_uuid=target_docver_uuid,
target_page_uuids=target_page_uuids
)
_post_ocr_document(
doc_id,
target_docver_uuid=target_docver_uuid,
target_page_uuids=target_page_uuids
)
14 changes: 12 additions & 2 deletions papermerge/core/management/commands/ocr.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import ocrmypdf

from django.core.management.base import BaseCommand


Expand Down Expand Up @@ -46,9 +45,18 @@ def add_arguments(self, parser):
action='store_true',
help="Keep temporary files"
)
parser.add_argument(
'-u',
'--uuids',
help="A list of uuids separated by comma. "
" Order of UUIDs matters. First UUID corresponds to first page ID, "
" second UUID corresponds to second page ID etc "
"Number of UUIDs should match number of pages in the document.",
)

def handle(self, *args, **options):
document = options['document']
uuids = options['uuids']
sidecar_dir = options['sidecar_dir']
sidecar_format = options['sidecar_format']
lang = options['lang']
Expand All @@ -66,6 +74,8 @@ def handle(self, *args, **options):
use_threads=True,
keep_temporary_files=keep,
sidecar_dir=sidecar_dir,
uuids=uuids,
sidecar_format=sidecar_format,
preview_width=preview_width
preview_width=preview_width,
force_ocr=True
)
21 changes: 21 additions & 0 deletions papermerge/core/management/commands/users.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand

from papermerge.core.models import User


class Command(BaseCommand):
    """Management command that prints the id and username of every user."""

    help = """
    List all users
    """

    def handle(self, *args, **options):
        """Write a tab-separated list of users to stdout.

        Writes "No users in DB" instead when there are no users.
        """
        # Evaluate the queryset once: truth-testing a Django QuerySet runs
        # the query and caches the rows, which the loop below reuses. This
        # replaces a COUNT query followed by a second SELECT.
        users = User.objects.all()
        if not users:
            self.stdout.write("No users in DB")
            return

        self.stdout.write("UUID\tusername\t")

        for user in users:
            self.stdout.write(f"{user.id}\t{user.username}")
Loading

0 comments on commit 24f0b05

Please sign in to comment.