[wip][feature] Add SquashFS support
 - Needs tests.
 - Optimize open and seek performance.
 - Optimize memory usage.
 - Factor out IndexedMountSource.
mxmlnkn committed Apr 16, 2024
1 parent 8a37af0 commit 5ad7eba
Showing 6 changed files with 360 additions and 7 deletions.
271 changes: 271 additions & 0 deletions core/ratarmountcore/SquashFSMountSource.py
@@ -0,0 +1,271 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import io
import json
import os
import re
import stat
import tarfile
import traceback
from timeit import default_timer as timer

from typing import Any, Dict, IO, Iterable, List, Optional, Tuple, Union

from .compressions import PySquashfsImage, findSquashFSOffset
from .MountSource import FileInfo, MountSource
from .SQLiteIndex import SQLiteIndex, SQLiteIndexedTarUserData
from .utils import InvalidIndexError, overrides


class SquashFSMountSource(MountSource):
def __init__(
self,
# fmt: off
fileOrPath : Union[str, IO[bytes]],
writeIndex : bool = False,
clearIndexCache : bool = False,
indexFilePath : Optional[str] = None,
indexFolders : Optional[List[str]] = None,
encoding : str = tarfile.ENCODING,
verifyModificationTime : bool = False,
printDebug : int = 0,
indexMinimumFileCount : int = 1000,
transform : Optional[Tuple[str, str]] = None,
**options
# fmt: on
) -> None:
self.rawFileObject = open(fileOrPath, 'rb') if isinstance(fileOrPath, str) else fileOrPath
self.rawFileObject.seek(0)
offset = findSquashFSOffset(self.rawFileObject)
if offset < 0:
raise ValueError("Not a valid SquashFS image!")

# fmt: off
self.fileObject = PySquashfsImage.SquashFsImage(self.rawFileObject, offset=offset)
self.archiveFilePath = fileOrPath if isinstance(fileOrPath, str) else None
self.encoding = encoding
self.verifyModificationTime = verifyModificationTime
self.printDebug = printDebug
self.options = options
self.transformPattern = transform
# fmt: on

self.transform = (
(lambda x: re.sub(self.transformPattern[0], self.transformPattern[1], x))
if isinstance(self.transformPattern, (tuple, list)) and len(self.transformPattern) == 2
else (lambda x: x)
)

# TODO This might cause memory issues for very large archives! I would either have to extend the index
# with additional data to recreate the inode object or reparse it ad hoc from the inode number;
# see the lookup sketch after __init__ below.
self._inodes = {info.inode.inode_number: info.inode for info in self.fileObject}

self.index = SQLiteIndex(
indexFilePath,
indexFolders=indexFolders,
archiveFilePath=self.archiveFilePath,
encoding=self.encoding,
checkMetadata=self._checkMetadata,
printDebug=self.printDebug,
indexMinimumFileCount=indexMinimumFileCount,
backendName='SquashFSMountSource',
)

if clearIndexCache:
self.index.clearIndexes()

isFileObject = not isinstance(fileOrPath, str)

self.index.openExisting()
if self.index.indexIsLoaded():
# self._loadOrStoreCompressionOffsets() # load
self.index.reloadIndexReadOnly()
else:
# Open a new database if we did not find an existing one.
# Simply open it in memory, without an error, even if writeIndex is True but no
# index file location has been given.
if writeIndex and (indexFilePath or not isFileObject):
self.index.openWritable()
else:
self.index.openInMemory()

self._createIndex()
# self._loadOrStoreCompressionOffsets() # store
if self.index.indexIsLoaded():
self._storeMetadata()
self.index.reloadIndexReadOnly()
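
The memory-usage TODO above could, for example, be addressed by looking inodes up lazily instead of caching all of them in self._inodes. The following is only a rough sketch of that idea, not part of this commit: _findInode is a hypothetical helper and assumes that the PySquashfsImage object can be iterated more than once.

def _findInode(self, inodeNumber: int):
    # Hypothetical lazy lookup: re-iterate the SquashFS image instead of keeping a dictionary
    # of all inode objects in memory. Slower per open() call, but memory usage becomes
    # independent of the number of files in the archive.
    for info in self.fileObject:
        if info.inode.inode_number == inodeNumber:
            return info.inode
    raise KeyError(f"Inode number {inodeNumber} not found in SquashFS image!")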

def _storeMetadata(self) -> None:
argumentsToSave = ['encoding', 'transformPattern']
argumentsMetadata = json.dumps({argument: getattr(self, argument) for argument in argumentsToSave})
self.index.storeMetadata(argumentsMetadata, self.archiveFilePath)

def _convertToRow(self, info: "PySquashfsImage.file.File") -> Tuple: # type: ignore
mode = 0o555 | (stat.S_IFDIR if info.is_dir else stat.S_IFREG)
mtime = info.time

linkname = ""
if info.is_symlink:
linkname = info.readlink()
mode = 0o555 | stat.S_IFLNK

path, name = SQLiteIndex.normpath(self.transform(info.path)).rsplit("/", 1)

# Currently unused. SquashFS files are stored in multiple blocks, so a single offset is insufficient.
dataOffset = 0

# SquashFS also returns non-zero sizes for directories, FIFOs, symbolic links, and device files
fileSize = info.size if info.is_file else 0
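# Reuse the header offset column to store the SquashFS inode number so that 'open' can look the inode up again.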
headerOffset = info.inode.inode_number
assert isinstance(headerOffset, int)

# fmt: off
fileInfo : Tuple = (
path , # 0 : path
name , # 1 : file name
headerOffset , # 2 : header offset
dataOffset , # 3 : data offset
fileSize , # 4 : file size
mtime , # 5 : modification time
mode , # 6 : file mode / permissions
0 , # 7 : TAR file type. Currently unused. Overlaps with mode
linkname , # 8 : linkname
0 , # 9 : user ID
0 , # 10 : group ID
False , # 11 : is TAR (unused?)
False , # 12 : is sparse
)
# fmt: on

return fileInfo

def _createIndex(self) -> None:
if self.printDebug >= 1:
print(f"Creating offset dictionary for {self.archiveFilePath} ...")
t0 = timer()

self.index.ensureIntermediaryTables()

# TODO Doing this in a chunked manner with generators would make it work better for large archives;
# see the sketch below.
fileInfos = []
for info in self.fileObject:
fileInfos.append(self._convertToRow(info))
self.index.setFileInfos(fileInfos)

# Resort by (path,name). This one-time resort is faster than resorting on each INSERT (cache spill)
if self.printDebug >= 2:
print("Resorting files by path ...")

self.index.finalize()

t1 = timer()
if self.printDebug >= 1:
print(f"Creating offset dictionary for {self.archiveFilePath} took {t1 - t0:.2f}s")

def __enter__(self):
return self

@overrides(MountSource)
def __exit__(self, exception_type, exception_value, exception_traceback):
self.index.close()
self.rawFileObject.close()
self.fileObject.close()

@overrides(MountSource)
def isImmutable(self) -> bool:
return True

@overrides(MountSource)
def getFileInfo(self, path: str, fileVersion: int = 0) -> Optional[FileInfo]:
return self.index.getFileInfo(path, fileVersion=fileVersion)

@overrides(MountSource)
def listDir(self, path: str) -> Optional[Union[Iterable[str], Dict[str, FileInfo]]]:
return self.index.listDir(path)

@overrides(MountSource)
def fileVersions(self, path: str) -> int:
fileVersions = self.index.fileVersions(path)
return len(fileVersions) if isinstance(fileVersions, dict) else 0

@overrides(MountSource)
def open(self, fileInfo: FileInfo) -> IO[bytes]:
assert fileInfo.userdata
extendedFileInfo = fileInfo.userdata[-1]
assert isinstance(extendedFileInfo, SQLiteIndexedTarUserData)
inode = self._inodes[extendedFileInfo.offsetheader]
# CPython's zipfile module does handle multiple file objects being opened and reading from the
# same underlying file object concurrently by using a _SharedFile class that even includes a lock.
# Very nice!
# https://github.com/python/cpython/blob/a87c46eab3c306b1c5b8a072b7b30ac2c50651c0/Lib/zipfile/__init__.py#L1569
# TODO We need something similar for SquashFS.
# TODO Reading the whole file into memory is obviously not efficient. A file object class wrapping
# the inode is needed; see the sketch below.
return io.BytesIO(self.fileObject.read_file(inode))
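
The open() TODO above could be tackled with a lazy file object roughly like the one below. This is only a sketch: readDataBlock is a hypothetical helper for decompressing a single data block of an inode, PySquashfsImage is not known to provide it under that name, and the block handling ignores fragments and tail ends.

class LazySquashFSFile(io.RawIOBase):
    # Sketch of a read-only, seekable file object that fetches SquashFS data blocks on demand
    # instead of reading the whole file into a BytesIO object.
    def __init__(self, image, inode, blockSize, fileSize):
        self.image = image
        self.inode = inode
        self.blockSize = blockSize
        self.fileSize = fileSize
        self.offset = 0

    def readable(self):
        return True

    def seekable(self):
        return True

    def tell(self):
        return self.offset

    def seek(self, offset, whence=io.SEEK_SET):
        if whence == io.SEEK_SET:
            self.offset = offset
        elif whence == io.SEEK_CUR:
            self.offset += offset
        elif whence == io.SEEK_END:
            self.offset = self.fileSize + offset
        return self.offset

    def read(self, size=-1):
        if size < 0 or self.offset + size > self.fileSize:
            size = max(0, self.fileSize - self.offset)
        result = bytearray()
        while len(result) < size:
            blockIndex, blockOffset = divmod(self.offset + len(result), self.blockSize)
            block = readDataBlock(self.image, self.inode, blockIndex)  # Hypothetical helper!
            chunk = block[blockOffset : blockOffset + size - len(result)]
            if not chunk:
                break  # Guard against short blocks in this sketch.
            result += chunk
        self.offset += len(result)
        return bytes(result)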

def _tryToOpenFirstFile(self):
# Get the first row that has the regular file bit set in its mode (stat.S_IFREG == 32768 == 1 << 15).
result = self.index.getConnection().execute(
f"""SELECT path,name {SQLiteIndex.FROM_REGULAR_FILES} ORDER BY "offsetheader" ASC LIMIT 1;"""
)
if not result:
return
firstFile = result.fetchone()
if not firstFile:
return

if self.printDebug >= 2:
print(
"[Info] The index contains no backend name. Therefore, will try to open the first file as "
"an integrity check."
)
try:
fileInfo = self.getFileInfo(firstFile[0] + '/' + firstFile[1])
if not fileInfo:
return

with self.open(fileInfo) as file:
file.read(1)
except Exception as exception:
if self.printDebug >= 2:
print("[Info] Trying to open the first file raised an exception:", exception)
if self.printDebug >= 3:
traceback.print_exc()
raise InvalidIndexError("Integrity check of opening the first file failed.") from exception

def _checkMetadata(self, metadata: Dict[str, Any]) -> None:
"""Raises an exception if the metadata mismatches so much that the index has to be treated as incompatible."""

if 'tarstats' in metadata:
if not self.archiveFilePath:
raise InvalidIndexError("Archive contains file stats but cannot stat real archive!")

storedStats = json.loads(metadata['tarstats'])
archiveStats = os.stat(self.archiveFilePath)

if hasattr(archiveStats, "st_size") and 'st_size' in storedStats:
if archiveStats.st_size < storedStats['st_size']:
raise InvalidIndexError(
f"Archive for this SQLite index has shrunk in size from "
f"{storedStats['st_size']} to {archiveStats.st_size}"
)

# Only happens very rarely, e.g., for more recent files with the same size.
if (
self.verifyModificationTime
and hasattr(archiveStats, "st_mtime")
and 'st_mtime' in storedStats
and archiveStats.st_mtime != storedStats['st_mtime']
):
raise InvalidIndexError(
f"The modification date for the archive file {storedStats['st_mtime']} "
f"to this SQLite index has changed ({str(archiveStats.st_mtime)})",
)

if 'arguments' in metadata:
SQLiteIndex.checkMetadataArguments(
json.loads(metadata['arguments']), self, argumentsToCheck=['encoding', 'transformPattern']
)

if 'backendName' not in metadata:
self._tryToOpenFirstFile()
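
Taken together, the new mount source could be used along the following lines. This is only a usage sketch: example.squashfs and the /README path are placeholders, and PySquashfsImage must be installed.

from ratarmountcore.SquashFSMountSource import SquashFSMountSource

# 'example.squashfs' stands for any SquashFS image on disk.
with SquashFSMountSource('example.squashfs') as mountSource:
    print(list(mountSource.listDir('/')))          # Names of files and folders in the image root.
    fileInfo = mountSource.getFileInfo('/README')  # Hypothetical file inside the image.
    if fileInfo:
        with mountSource.open(fileInfo) as file:
            print(file.read(80))
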
68 changes: 68 additions & 0 deletions core/ratarmountcore/compressions.py
@@ -66,6 +66,11 @@
except ImportError:
libarchive = None

try:
import PySquashfsImage
except ImportError:
PySquashfsImage = None


CompressionModuleInfo = collections.namedtuple('CompressionModuleInfo', ['name', 'open'])
# Defining lambdas does not yet check the names of entities used inside the lambda!
@@ -167,6 +172,69 @@ def isRarFile(fileObject) -> bool:
}


def isSquashFS(fileObject) -> bool:
offset = fileObject.tell()
try:
# https://dr-emann.github.io/squashfs/squashfs.html#_the_superblock
magicBytes = fileObject.read(4)
if magicBytes != b"hsqs":
return False

_inodeCount, _modificationTime, blockSize, _fragmentCount = struct.unpack('<IIII', fileObject.read(4 * 4))
compressor, blockSizeLog2, _flags, _idCount, major, minor = struct.unpack('<HHHHHH', fileObject.read(6 * 2))
# root_inode, bytes_used, id_table, xattr_table, inode_table, dir_table, frag_table, export_table =
# struct.unpack('<QQQQQQQQ', fileObject.read(8 * 8))

# The size of a data block in bytes. Must be a power of two between 4096 (4k) and 1048576 (1 MiB).
# log2 4096 = 12, log2 1024*1024 = 20
if blockSizeLog2 < 12 or blockSizeLog2 > 20 or 2**blockSizeLog2 != blockSize:
return False

if major != 4 or minor != 0:
return False

# Compressions: 0:None, 1:GZIP, 2:LZMA, 3:LZO, 4:XZ, 5:LZ4, 6:ZSTD
if compressor > 6:
return False

finally:
fileObject.seek(offset)

return True


def findSquashFSOffset(fileObject, maxSkip=1024 * 1024) -> int:
# https://dr-emann.github.io/squashfs/squashfs.html#_the_superblock
if isSquashFS(fileObject):
return 0

oldOffset = fileObject.tell()
try:
magic = b"hsqs"
data = fileObject.read(maxSkip + len(magic))
magicOffset = 0
while True:
magicOffset = data.find(magic, magicOffset + 1)
if magicOffset < 0 or magicOffset >= len(data):
break
fileObject.seek(magicOffset)
if isSquashFS(fileObject):
return magicOffset
finally:
fileObject.seek(oldOffset)

return -1
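
A rough way to exercise these two checks without a real archive is to pack a minimal superblock by hand. The values below only satisfy the sniffing logic and do not form a mountable image:

import io
import struct

# Minimal fake superblock: magic, then (inode count, modification time, block size, fragment count),
# then (compressor=GZIP, log2 of block size, flags, id count, major=4, minor=0).
superblock = (
    b"hsqs"
    + struct.pack('<IIII', 1, 0, 131072, 0)
    + struct.pack('<HHHHHH', 1, 17, 0, 1, 4, 0)
)
assert isSquashFS(io.BytesIO(superblock))

# AppImages, e.g., prepend an ELF runtime to the SquashFS image; findSquashFSOffset skips over such prefixes.
prefix = b"some ELF runtime stub"
assert findSquashFSOffset(io.BytesIO(prefix + superblock)) == len(prefix)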


if 'PySquashfsImage' in sys.modules and isinstance(PySquashfsImage, types.ModuleType):
ARCHIVE_FORMATS['squashfs'] = CompressionInfo(
['squashfs', 'AppImage', 'snap'],
[],
[CompressionModuleInfo('PySquashfsImage', lambda x: PySquashfsImage.SquashFsImage(x))],
lambda x: findSquashFSOffset(x) >= 0,
)


# libarchive support is split into filters (compressors or encoders working on a single file) and (archive) formats.
# For now, only list formats here that are not supported by other backends, because libarchive is slower anyway.
LIBARCHIVE_FILTER_FORMATS: Dict[str, CompressionInfo] = {}