New storage scheme for file storage (#210)

papermerge · Sep 27, 2023 · 24f0b05 · 24f0b05
1 parent 0e76b5f
commit 24f0b05
Show file tree

Hide file tree

Showing 25 changed files with 567 additions and 411 deletions.
diff --git a/docker/dev/config/settings.py b/docker/dev/config/settings.py
@@ -39,5 +39,3 @@
     'url',
     default=f'xapian:///{os.path.join(PROJ_ROOT, "index_db")}'
 )
-
-PAPERMERGE_CREATE_SPECIAL_FOLDERS = False
diff --git a/papermerge/core/constants.py b/papermerge/core/constants.py
@@ -1,6 +1,9 @@
 DEFAULT_THUMBNAIL_SIZE = 100  # 100 pixels wide
 JPG = 'jpg'
 PAGES = 'pages'
+THUMBNAILS = 'thumbnails'
+DOCVERS = 'docvers'
+OCR = 'ocr'
 DEFAULT_TAG_BG_COLOR = '#c41fff'
 DEFAULT_TAG_FG_COLOR = '#ffffff'
 INDEX_ADD_NODE = 'index_add_node'

diff --git a/papermerge/core/lib/path.py b/papermerge/core/lib/path.py
@@ -1,6 +1,6 @@
 import logging
-import re
 import os
+import re
 
 SUPPORTED_EXTENTIONS = re.compile(".*(jpeg|jpg|png|tiff|pdf)$", re.IGNORECASE)
 
@@ -37,14 +37,7 @@ def filter_by_extention(
 
 
 class DocumentPath:
-    """
-    Document path:
-    /<aux_dir>/<user_id>/<doc_id>/<version>/<file_name>
-
-    If version = 0, it is not included in DocumentPath.
-    Document's version is incremented everytime pdftk operation runs on it
-    (when pages are deleted, reordered, pasted)
-    """
+    """OBSOLETE. Do not use this class!"""
 
     def __init__(
         self,
@@ -157,9 +150,7 @@ def copy_from(doc_path, **kw):
 
 
 class PagePath:
-    """
-    <aux_dir>/<doc_id>/pages/<page_num>/page-<xyz>.jpg
-    """
+    """OBSOLETE. Do not use this class!"""
 
     def __init__(
         self,

diff --git a/papermerge/core/lib/storage.py b/papermerge/core/lib/storage.py
@@ -4,8 +4,9 @@
 import shutil
 from os import listdir
 from os.path import isdir, join
+from pathlib import Path
 
-from .path import DocumentPath, PagePath, AUX_DIR_SIDECARS, AUX_DIR_DOCS
+from .path import AUX_DIR_DOCS, AUX_DIR_SIDECARS, DocumentPath, PagePath
 from .utils import safe_to_delete
 
 logger = logging.getLogger(__name__)
@@ -130,61 +131,26 @@ def abspath(self, _path):
     def path(self, _path):
         return self.abspath(_path)
 
-    def delete_doc(self, doc_path: DocumentPath):
-        """
-        Receives a DocumentPath instance
-        """
-        # where original documents and their versions are stored
-        abs_dirname_docs = self.path(
-            doc_path.dirname_docs
-        )
-        # where OCRed information and generated thumbnails
-        # are stored
-        abs_dirname_sidecars = self.path(
-            doc_path.dir_sidecars
-        )
+    def delete_file(self, file_or_folder: Path):
+        """Recursively delete a folder once its content is confirmed safe.
+
+        Nothing happens when ``file_or_folder`` is not an existing
+        directory or when ``safe_to_delete`` rejects it.
+        """
         # Before recursively deleting everything in folder
         # double check that there are only
         # .pdf, .txt, .hocr, .jpg files.
-        if safe_to_delete(
-            abs_dirname_docs
-        ):
-            shutil.rmtree(abs_dirname_docs)
-            if os.path.exists(abs_dirname_docs):
-                os.rmdir(abs_dirname_docs)
+        if file_or_folder.is_dir() and safe_to_delete(file_or_folder):
+            # rmtree() removes the folder itself as well, so a follow-up
+            # rmdir() would always raise FileNotFoundError.
+            shutil.rmtree(file_or_folder)
 
-        if safe_to_delete(
-            abs_dirname_sidecars
-        ):
-            shutil.rmtree(abs_dirname_sidecars)
-            if os.path.exists(abs_dirname_sidecars):
-                os.rmdir(abs_dirname_sidecars)
+    def copy_file(self, src: Path | io.BytesIO, dst: Path):
+        """Copy source file to destination"""
+        logger.debug(f"copying {src} to {dst}")
 
-    def copy_doc(self, src: DocumentPath | io.BytesIO, dst: DocumentPath):
-        """
-        copy given file src file path to destination
-        as absolute doc_path
-        """
-        logger.debug(f"copy_doc {src} to {dst}")
-        dirname = os.path.dirname(
-            self.abspath(dst)
-        )
-        if not os.path.exists(
-            dirname
-        ):
-            os.makedirs(
-                dirname, exist_ok=True
-            )
-        if isinstance(src, DocumentPath):
-            logger.debug(
-                f"copy_doc: {src} to {dst}"
-            )
-            shutil.copyfile(
-                self.abspath(src),
-                self.abspath(dst)
-            )
+        if not dst.parent.exists():
+            os.makedirs(dst.parent, exist_ok=True)
+
+        if isinstance(src, Path):
+            logger.debug(f"{src} is a Path instance")
+            shutil.copyfile(src, dst)
         elif isinstance(src, io.BytesIO):
-            with open(self.abspath(dst), 'wb') as f:
+            with open(dst, 'wb') as f:
                 f.write(src.getvalue())
         else:
             raise ValueError(
@@ -248,42 +214,20 @@ def copy_page_preview(self, src: PagePath, dst: PagePath):
 
         shutil.copy(src_preview, dst_preview)
 
-    def copy_page(self, src: PagePath, dst: PagePath):
+    def copy_page(self, src_folder: Path, dst_folder: Path):
         """
-        Copies page data from source folder/path to page destination folder/path
+        Copies page data from source folder to destination folder
 
         Page data are files with 'txt', 'hocr', 'jpg', 'svg' extentions.
+
+        The destination folder (and any missing parent) is created when
+        absent, then the whole source tree is copied into it.
+
+        Raises ``ValueError`` if the source is not a folder or if the
+        destination exists but is not a folder.
         """
-        for inst in [src, dst]:
-            if not isinstance(inst, PagePath):
-                raise ValueError("copy_page accepts only PagePath instances")
-
-        # copy .txt file
-        if self.exists(src.txt_url):
-            self.copy_page_txt(src=src, dst=dst)
-        else:
-            logger.debug(f"txt does not exits {src.txt_url}")
+        if not src_folder.is_dir():
+            raise ValueError(f"Source is not a folder {src_folder}")
 
-        # hocr
-        if self.exists(src.hocr_url):
-            self.copy_page_hocr(src=src, dst=dst)
-        else:
-            logger.debug(f"hocr does not exits {src.hocr_url}")
-
-        if self.exists(src.jpg_url):
-            self.copy_page_jpg(src=src, dst=dst)
-        else:
-            logger.debug(f"jpg does not exits {src.jpg_url}")
+        # Validate before mkdir(): with exist_ok=True, mkdir() itself raises
+        # FileExistsError for an existing non-folder, which made this check
+        # unreachable when it came afterwards.
+        if dst_folder.exists() and not dst_folder.is_dir():
+            raise ValueError(f"Destination is not a folder {dst_folder}")
 
-        if self.exists(src.svg_url):
-            self.copy_page_svg(src=src, dst=dst)
-        else:
-            logger.debug(f"svg does not exits {src.svg_url}")
-
-        if self.exists(src.preview_url):
-            self.copy_page_preview(src=src, dst=dst)
-        else:
-            logger.debug(f"preview does not exits {src.preview_url}")
+        dst_folder.mkdir(parents=True, exist_ok=True)
+        shutil.copytree(src_folder, dst_folder, dirs_exist_ok=True)
 
     def reorder_pages(self, doc_path, new_order):
         """
@@ -475,3 +419,22 @@ def paste_pages(
 
 class FileSystemStorage(Storage):
     pass
+
+
+def copy_file(src: Path | io.BytesIO, dst: Path):
+    """Copy source file (or in-memory buffer) to destination.
+
+    Missing parent folders of ``dst`` are created first. When ``src`` is
+    a ``Path`` its content is copied to ``dst``; when it is an
+    ``io.BytesIO`` its whole buffer is written to ``dst``.
+
+    Raises ``ValueError`` if ``src`` is neither ``Path`` nor ``io.BytesIO``.
+    """
+    logger.debug(f"copying {src} to {dst}")
+
+    # exist_ok=True already covers the "folder is there" case
+    dst.parent.mkdir(parents=True, exist_ok=True)
+
+    if isinstance(src, Path):
+        logger.debug(f"{src} is a Path instance")
+        shutil.copyfile(src, dst)
+    elif isinstance(src, io.BytesIO):
+        with open(dst, 'wb') as f:
+            f.write(src.getvalue())
+    else:
+        # name the types that are actually accepted
+        raise ValueError(
+            f"src ({src}) is neither instance of Path nor io.BytesIO"
+        )
diff --git a/papermerge/core/lib/utils.py b/papermerge/core/lib/utils.py
@@ -1,6 +1,6 @@
-import os
 import logging
-
+import os
+from pathlib import Path
 
 logger = logging.getLogger(__name__)
 
@@ -31,15 +31,15 @@ def get_bool(key, default="NO"):
     return False
 
 
-def safe_to_delete(place):
-    if not os.path.exists(place):
+def safe_to_delete(path: Path) -> True:
+    if not path.exists():
         logging.warning(
             f"Trying to delete not exising folder"
-            f" {place}"
+            f" {path}"
         )
         return False
 
-    for root, dirs, files in os.walk(place):
+    for root, dirs, files in os.walk(path):
         for name in files:
             base, ext = os.path.splitext(name)
             if ext.lower() not in SAFE_EXTENSIONS:

diff --git a/papermerge/core/management/commands/ocr-task.py b/papermerge/core/management/commands/ocr-task.py
@@ -1,12 +1,18 @@
+import uuid
+
 from django.core.management.base import BaseCommand
 
 from papermerge.core.models import Document
-from papermerge.core.tasks import ocr_document_task
+from papermerge.core.ocr.document import ocr_document
+from papermerge.core.tasks import _post_ocr_document
 
 
 class Command(BaseCommand):
     help = """
-    Triggers OCR task for given document UUID
+    Calls OCR document same way the `core.task.ocr_document_task`
+
+    Handy management command to quickly check if
+    OCRing works
     """
 
     def add_arguments(self, parser):
@@ -16,14 +22,22 @@ def add_arguments(self, parser):
         )
 
     def handle(self, *args, **options):
-        uuid = options.get('UUID')
-        doc = Document.objects.get(id=uuid)
-
-        ocr_document_task.apply_async(
-            kwargs={
-                'document_id': str(doc.id),
-                'lang': doc.lang,
-                'namespace': None,
-                'user_id': str(doc.user.id)
-            }
+        """Call ``ocr_document`` directly on the document's last version.
+
+        Fresh UUIDs are generated for the target document version and
+        for each page of the last version; the same values are then
+        passed to ``_post_ocr_document``.
+        """
+        document_id = options.get('UUID')
+        document = Document.objects.get(id=document_id)
+        newest_version = document.versions.last()
+        page_total = newest_version.pages.count()
+        # the OCR call and its post-processing share these target UUIDs
+        targets = {
+            'target_docver_uuid': uuid.uuid4(),
+            'target_page_uuids': [uuid.uuid4() for _ in range(page_total)],
+        }
+
+        ocr_document(
+            lang=document.lang,
+            document_version=newest_version,
+            **targets
+        )
+        _post_ocr_document(
+            document_id,
+            **targets
         )
diff --git a/papermerge/core/management/commands/ocr.py b/papermerge/core/management/commands/ocr.py
@@ -1,5 +1,4 @@
 import ocrmypdf
-
 from django.core.management.base import BaseCommand
 
 
@@ -46,9 +45,18 @@ def add_arguments(self, parser):
             action='store_true',
             help="Keep temporary files"
         )
+        parser.add_argument(
+            '-u',
+            '--uuids',
+            help="A list of uuids separated by comma. "
+            " Order of UUIDs matters. First UUID corresponds to first page ID, "
+            " second UUID corresponds to second page ID etc "
+            "Number of UUIDs should match number of pages in the document.",
+        )
 
     def handle(self, *args, **options):
         document = options['document']
+        uuids = options['uuids']
         sidecar_dir = options['sidecar_dir']
         sidecar_format = options['sidecar_format']
         lang = options['lang']
@@ -66,6 +74,8 @@ def handle(self, *args, **options):
             use_threads=True,
             keep_temporary_files=keep,
             sidecar_dir=sidecar_dir,
+            uuids=uuids,
             sidecar_format=sidecar_format,
-            preview_width=preview_width
+            preview_width=preview_width,
+            force_ocr=True
         )
diff --git a/papermerge/core/management/commands/users.py b/papermerge/core/management/commands/users.py
@@ -0,0 +1,21 @@
+from django.core.management.base import BaseCommand
+
+from papermerge.core.models import User
+
+
+class Command(BaseCommand):
+    """Management command printing the id and username of every user."""
+
+    help = """
+    List all users
+    """
+
+    def handle(self, *args, **options):
+        """Write a tab-separated user listing, or a notice if empty."""
+        emit = self.stdout.write
+        user_total = User.objects.count()
+        if user_total == 0:
+            emit("No users in DB")
+            return
+
+        emit("UUID\tusername\t")
+
+        for account in User.objects.all():
+            emit(f"{account.id}\t{account.username}")
-Original file line number
+Diff line change
@@ Expand Up / @@ -39,5 +39,3 @@ @@
         'url',
         default=f'xapian:///{os.path.join(PROJ_ROOT, "index_db")}'
     )
-    PAPERMERGE_CREATE_SPECIAL_FOLDERS = False