[wip][feature] Add SquashFS support
 - Needs tests.
 - Optimize open and seek performance.
 - Optimize memory usage.
 - Factor out IndexedMountSource.
mxmlnkn committed Apr 16, 2024
1 parent 8a37af0 commit 5ad7eba
Showing 6 changed files with 360 additions and 7 deletions.
271 changes: 271 additions & 0 deletions core/ratarmountcore/SquashFSMountSource.py
@@ -0,0 +1,271 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import io
import json
import os
import re
import stat
import tarfile
import traceback
from timeit import default_timer as timer

from typing import Any, Dict, IO, Iterable, List, Optional, Tuple, Union

from .compressions import PySquashfsImage, findSquashFSOffset
from .MountSource import FileInfo, MountSource
from .SQLiteIndex import SQLiteIndex, SQLiteIndexedTarUserData
from .utils import InvalidIndexError, overrides


class SquashFSMountSource(MountSource):
def __init__(
self,
# fmt: off
fileOrPath : Union[str, IO[bytes]],
writeIndex : bool = False,
clearIndexCache : bool = False,
indexFilePath : Optional[str] = None,
indexFolders : Optional[List[str]] = None,
encoding : str = tarfile.ENCODING,
verifyModificationTime : bool = False,
printDebug : int = 0,
indexMinimumFileCount : int = 1000,
transform : Optional[Tuple[str, str]] = None,
**options
# fmt: on
) -> None:
self.rawFileObject = open(fileOrPath, 'rb') if isinstance(fileOrPath, str) else fileOrPath
self.rawFileObject.seek(0)
offset = findSquashFSOffset(self.rawFileObject)
if offset < 0:
raise ValueError("Not a valid SquashFS image!")

# fmt: off
self.fileObject = PySquashfsImage.SquashFsImage(self.rawFileObject, offset=offset)
self.archiveFilePath = fileOrPath if isinstance(fileOrPath, str) else None
self.encoding = encoding
self.verifyModificationTime = verifyModificationTime
self.printDebug = printDebug
self.options = options
self.transformPattern = transform
# fmt: on

self.transform = (
(lambda x: re.sub(self.transformPattern[0], self.transformPattern[1], x))
if isinstance(self.transformPattern, (tuple, list)) and len(self.transformPattern) == 2
else (lambda x: x)
)

# TODO This might cause memory issues for very large archives! I would either have to extend the index
# with additional data to recreate the inode object or reparse it ad hoc from the inode number;
# see the lookup sketch after __init__ below.
self._inodes = {info.inode.inode_number: info.inode for info in self.fileObject}

self.index = SQLiteIndex(
indexFilePath,
indexFolders=indexFolders,
archiveFilePath=self.archiveFilePath,
encoding=self.encoding,
checkMetadata=self._checkMetadata,
printDebug=self.printDebug,
indexMinimumFileCount=indexMinimumFileCount,
backendName='SquashFSMountSource',
)

if clearIndexCache:
self.index.clearIndexes()

isFileObject = not isinstance(fileOrPath, str)

self.index.openExisting()
if self.index.indexIsLoaded():
# self._loadOrStoreCompressionOffsets() # load
self.index.reloadIndexReadOnly()
else:
# Open a new database if we did not find an existing one.
# Simply open it in memory, without an error, even if writeIndex is True but no
# index file location has been given.
if writeIndex and (indexFilePath or not isFileObject):
self.index.openWritable()
else:
self.index.openInMemory()

self._createIndex()
# self._loadOrStoreCompressionOffsets() # store
if self.index.indexIsLoaded():
self._storeMetadata()
self.index.reloadIndexReadOnly()
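
The memory-usage TODO above could, for example, be addressed by looking inodes up lazily instead of caching all of them in self._inodes. The following is only a rough sketch of that idea, not part of this commit: _findInode is a hypothetical helper and assumes that the PySquashfsImage object can be iterated more than once.

def _findInode(self, inodeNumber: int):
    # Hypothetical lazy lookup: re-iterate the SquashFS image instead of keeping a dictionary
    # of all inode objects in memory. Slower per open() call, but memory usage becomes
    # independent of the number of files in the archive.
    for info in self.fileObject:
        if info.inode.inode_number == inodeNumber:
            return info.inode
    raise KeyError(f"Inode number {inodeNumber} not found in SquashFS image!")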

def _storeMetadata(self) -> None:
argumentsToSave = ['encoding', 'transformPattern']
argumentsMetadata = json.dumps({argument: getattr(self, argument) for argument in argumentsToSave})
self.index.storeMetadata(argumentsMetadata, self.archiveFilePath)

def _convertToRow(self, info: "PySquashfsImage.file.File") -> Tuple: # type: ignore
mode = 0o555 | (stat.S_IFDIR if info.is_dir else stat.S_IFREG)
mtime = info.time

linkname = ""
if info.is_symlink:
linkname = info.readlink()
mode = 0o555 | stat.S_IFLNK

path, name = SQLiteIndex.normpath(self.transform(info.path)).rsplit("/", 1)

# Currently unused. SquashFS files are stored in multiple blocks, so a single offset is insufficient.
dataOffset = 0

# SquashFS also returns non-zero sizes for directories, FIFOs, symbolic links, and device files
fileSize = info.size if info.is_file else 0
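# Reuse the header offset column to store the SquashFS inode number so that 'open' can look the inode up again.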
headerOffset = info.inode.inode_number
assert isinstance(headerOffset, int)

# fmt: off
fileInfo : Tuple = (
path , # 0 : path
name , # 1 : file name
headerOffset , # 2 : header offset
dataOffset , # 3 : data offset
fileSize , # 4 : file size
mtime , # 5 : modification time
mode , # 6 : file mode / permissions
0 , # 7 : TAR file type. Currently unused. Overlaps with mode
linkname , # 8 : linkname
0 , # 9 : user ID
0 , # 10 : group ID
False , # 11 : is TAR (unused?)
False , # 12 : is sparse
)
# fmt: on

return fileInfo

def _createIndex(self) -> None:
if self.printDebug >= 1:
print(f"Creating offset dictionary for {self.archiveFilePath} ...")
t0 = timer()

self.index.ensureIntermediaryTables()

# TODO Doing this in a chunked manner with generators would make it work better for large archives;
# see the sketch below.
fileInfos = []
for info in self.fileObject:
fileInfos.append(self._convertToRow(info))
self.index.setFileInfos(fileInfos)

# Resort by (path,name). This one-time resort is faster than resorting on each INSERT (cache spill)
if self.printDebug >= 2:
print("Resorting files by path ...")

self.index.finalize()

t1 = timer()
if self.printDebug >= 1:
print(f"Creating offset dictionary for {self.archiveFilePath} took {t1 - t0:.2f}s")

def __enter__(self):
return self

@overrides(MountSource)
def __exit__(self, exception_type, exception_value, exception_traceback):
self.index.close()
self.rawFileObject.close()
self.fileObject.close()

@overrides(MountSource)
def isImmutable(self) -> bool:
return True

@overrides(MountSource)
def getFileInfo(self, path: str, fileVersion: int = 0) -> Optional[FileInfo]:
return self.index.getFileInfo(path, fileVersion=fileVersion)

@overrides(MountSource)
def listDir(self, path: str) -> Optional[Union[Iterable[str], Dict[str, FileInfo]]]:
return self.index.listDir(path)

@overrides(MountSource)
def fileVersions(self, path: str) -> int:
fileVersions = self.index.fileVersions(path)
return len(fileVersions) if isinstance(fileVersions, dict) else 0

@overrides(MountSource)
def open(self, fileInfo: FileInfo) -> IO[bytes]:
assert fileInfo.userdata
extendedFileInfo = fileInfo.userdata[-1]
assert isinstance(extendedFileInfo, SQLiteIndexedTarUserData)
inode = self._inodes[extendedFileInfo.offsetheader]
# CPython's zipfile module does handle multiple file objects being opened and reading from the
# same underlying file object concurrently by using a _SharedFile class that even includes a lock.
# Very nice!
# https://github.com/python/cpython/blob/a87c46eab3c306b1c5b8a072b7b30ac2c50651c0/Lib/zipfile/__init__.py#L1569
# TODO We need something similar for SquashFS.
# TODO Reading the whole file into memory is obviously not efficient. A file object class wrapping
# the inode is needed; see the sketch below.
return io.BytesIO(self.fileObject.read_file(inode))
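
The open() TODO above could be tackled with a lazy file object roughly like the one below. This is only a sketch: readDataBlock is a hypothetical helper for decompressing a single data block of an inode, PySquashfsImage is not known to provide it under that name, and the block handling ignores fragments and tail ends.

class LazySquashFSFile(io.RawIOBase):
    # Sketch of a read-only, seekable file object that fetches SquashFS data blocks on demand
    # instead of reading the whole file into a BytesIO object.
    def __init__(self, image, inode, blockSize, fileSize):
        self.image = image
        self.inode = inode
        self.blockSize = blockSize
        self.fileSize = fileSize
        self.offset = 0

    def readable(self):
        return True

    def seekable(self):
        return True

    def tell(self):
        return self.offset

    def seek(self, offset, whence=io.SEEK_SET):
        if whence == io.SEEK_SET:
            self.offset = offset
        elif whence == io.SEEK_CUR:
            self.offset += offset
        elif whence == io.SEEK_END:
            self.offset = self.fileSize + offset
        return self.offset

    def read(self, size=-1):
        if size < 0 or self.offset + size > self.fileSize:
            size = max(0, self.fileSize - self.offset)
        result = bytearray()
        while len(result) < size:
            blockIndex, blockOffset = divmod(self.offset + len(result), self.blockSize)
            block = readDataBlock(self.image, self.inode, blockIndex)  # Hypothetical helper!
            chunk = block[blockOffset : blockOffset + size - len(result)]
            if not chunk:
                break  # Guard against short blocks in this sketch.
            result += chunk
        self.offset += len(result)
        return bytes(result)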

def _tryToOpenFirstFile(self):
# Get the first row that has the regular file bit set in its mode (stat.S_IFREG == 32768 == 1 << 15).
result = self.index.getConnection().execute(
f"""SELECT path,name {SQLiteIndex.FROM_REGULAR_FILES} ORDER BY "offsetheader" ASC LIMIT 1;"""
)
if not result:
return
firstFile = result.fetchone()
if not firstFile:
return

if self.printDebug >= 2:
print(
"[Info] The index contains no backend name. Therefore, will try to open the first file as "
"an integrity check."
)
try:
fileInfo = self.getFileInfo(firstFile[0] + '/' + firstFile[1])
if not fileInfo:
return

with self.open(fileInfo) as file:
file.read(1)
except Exception as exception:
if self.printDebug >= 2:
print("[Info] Trying to open the first file raised an exception:", exception)
if self.printDebug >= 3:
traceback.print_exc()
raise InvalidIndexError("Integrity check of opening the first file failed.") from exception

def _checkMetadata(self, metadata: Dict[str, Any]) -> None:
"""Raises an exception if the metadata mismatches so much that the index has to be treated as incompatible."""

if 'tarstats' in metadata:
if not self.archiveFilePath:
raise InvalidIndexError("Archive contains file stats but cannot stat real archive!")

storedStats = json.loads(metadata['tarstats'])
archiveStats = os.stat(self.archiveFilePath)

if hasattr(archiveStats, "st_size") and 'st_size' in storedStats:
if archiveStats.st_size < storedStats['st_size']:
raise InvalidIndexError(
f"Archive for this SQLite index has shrunk in size from "
f"{storedStats['st_size']} to {archiveStats.st_size}"
)

# Only happens very rarely, e.g., for more recent files with the same size.
if (
self.verifyModificationTime
and hasattr(archiveStats, "st_mtime")
and 'st_mtime' in storedStats
and archiveStats.st_mtime != storedStats['st_mtime']
):
raise InvalidIndexError(
f"The modification date for the archive file {storedStats['st_mtime']} "
f"to this SQLite index has changed ({str(archiveStats.st_mtime)})",
)

if 'arguments' in metadata:
SQLiteIndex.checkMetadataArguments(
json.loads(metadata['arguments']), self, argumentsToCheck=['encoding', 'transformPattern']
)

if 'backendName' not in metadata:
self._tryToOpenFirstFile()
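
Taken together, the new mount source could be used along the following lines. This is only a usage sketch: example.squashfs and the /README path are placeholders, and PySquashfsImage must be installed.

from ratarmountcore.SquashFSMountSource import SquashFSMountSource

# 'example.squashfs' stands for any SquashFS image on disk.
with SquashFSMountSource('example.squashfs') as mountSource:
    print(list(mountSource.listDir('/')))          # Names of files and folders in the image root.
    fileInfo = mountSource.getFileInfo('/README')  # Hypothetical file inside the image.
    if fileInfo:
        with mountSource.open(fileInfo) as file:
            print(file.read(80))
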
68 changes: 68 additions & 0 deletions core/ratarmountcore/compressions.py
@@ -66,6 +66,11 @@
except ImportError:
libarchive = None

try:
import PySquashfsImage
except ImportError:
PySquashfsImage = None


CompressionModuleInfo = collections.namedtuple('CompressionModuleInfo', ['name', 'open'])
# Defining lambdas does not yet check the names of entities used inside the lambda!
@@ -167,6 +172,69 @@ def isRarFile(fileObject) -> bool:
}


def isSquashFS(fileObject) -> bool:
offset = fileObject.tell()
try:
# https://dr-emann.github.io/squashfs/squashfs.html#_the_superblock
magicBytes = fileObject.read(4)
if magicBytes != b"hsqs":
return False

_inodeCount, _modificationTime, blockSize, _fragmentCount = struct.unpack('<IIII', fileObject.read(4 * 4))
compressor, blockSizeLog2, _flags, _idCount, major, minor = struct.unpack('<HHHHHH', fileObject.read(6 * 2))
# root_inode, bytes_used, id_table, xattr_table, inode_table, dir_table, frag_table, export_table =
# struct.unpack('<QQQQQQQQ', fileObject.read(8 * 8))

# The size of a data block in bytes. Must be a power of two between 4096 (4k) and 1048576 (1 MiB).
# log2 4096 = 12, log2 1024*1024 = 20
if blockSizeLog2 < 12 or blockSizeLog2 > 20 or 2**blockSizeLog2 != blockSize:
return False

if major != 4 or minor != 0:
return False

# Compressions: 0:None, 1:GZIP, 2:LZMA, 3:LZO, 4:XZ, 5:LZ4, 6:ZSTD
if compressor > 6:
return False

finally:
fileObject.seek(offset)

return True


def findSquashFSOffset(fileObject, maxSkip=1024 * 1024) -> int:
# https://dr-emann.github.io/squashfs/squashfs.html#_the_superblock
if isSquashFS(fileObject):
return 0

oldOffset = fileObject.tell()
try:
magic = b"hsqs"
data = fileObject.read(maxSkip + len(magic))
magicOffset = 0
while True:
magicOffset = data.find(magic, magicOffset + 1)
if magicOffset < 0 or magicOffset >= len(data):
break
fileObject.seek(magicOffset)
if isSquashFS(fileObject):
return magicOffset
finally:
fileObject.seek(oldOffset)

return -1
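
A rough way to exercise these two checks without a real archive is to pack a minimal superblock by hand. The values below only satisfy the sniffing logic and do not form a mountable image:

import io
import struct

# Minimal fake superblock: magic, then (inode count, modification time, block size, fragment count),
# then (compressor=GZIP, log2 of block size, flags, id count, major=4, minor=0).
superblock = (
    b"hsqs"
    + struct.pack('<IIII', 1, 0, 131072, 0)
    + struct.pack('<HHHHHH', 1, 17, 0, 1, 4, 0)
)
assert isSquashFS(io.BytesIO(superblock))

# AppImages, e.g., prepend an ELF runtime to the SquashFS image; findSquashFSOffset skips over such prefixes.
prefix = b"some ELF runtime stub"
assert findSquashFSOffset(io.BytesIO(prefix + superblock)) == len(prefix)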


if 'PySquashfsImage' in sys.modules and isinstance(PySquashfsImage, types.ModuleType):
ARCHIVE_FORMATS['squashfs'] = CompressionInfo(
['squashfs', 'AppImage', 'snap'],
[],
[CompressionModuleInfo('PySquashfsImage', lambda x: PySquashfsImage.SquashFsImage(x))],
lambda x: findSquashFSOffset(x) >= 0,
)


# libarchive support is split into filters (compressors or encoders working on a single file) and (archive) formats.
# For now, only list formats here that are not supported by other backends, because libarchive is slower anyway.
LIBARCHIVE_FILTER_FORMATS: Dict[str, CompressionInfo] = {}