Commit
- Needs tests.
- Optimize open and seek performance.
- Optimize memory usage.
- Factor out IndexedMountSource.
Showing 6 changed files with 360 additions and 7 deletions.
@@ -0,0 +1,271 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import io
import json
import os
import re
import stat
import tarfile
import traceback
from timeit import default_timer as timer

from typing import Any, Dict, IO, Iterable, List, Optional, Tuple, Union

from .compressions import PySquashfsImage, findSquashFSOffset
from .MountSource import FileInfo, MountSource
from .SQLiteIndex import SQLiteIndex, SQLiteIndexedTarUserData
from .utils import InvalidIndexError, overrides


class SquashFSMountSource(MountSource):
    def __init__(
        self,
        # fmt: off
        fileOrPath             : Union[str, IO[bytes]],
        writeIndex             : bool = False,
        clearIndexCache        : bool = False,
        indexFilePath          : Optional[str] = None,
        indexFolders           : Optional[List[str]] = None,
        encoding               : str = tarfile.ENCODING,
        verifyModificationTime : bool = False,
        printDebug             : int = 0,
        indexMinimumFileCount  : int = 1000,
        transform              : Optional[Tuple[str, str]] = None,
        **options
        # fmt: on
    ) -> None:
        self.rawFileObject = open(fileOrPath, 'rb') if isinstance(fileOrPath, str) else fileOrPath
        self.rawFileObject.seek(0)
        offset = findSquashFSOffset(self.rawFileObject)
        if offset < 0:
            raise ValueError("Not a valid SquashFS image!")

        # fmt: off
        self.fileObject             = PySquashfsImage.SquashFsImage(self.rawFileObject, offset=offset)
        self.archiveFilePath        = fileOrPath if isinstance(fileOrPath, str) else None
        self.encoding               = encoding
        self.verifyModificationTime = verifyModificationTime
        self.printDebug             = printDebug
        self.options                = options
        self.transformPattern       = transform
        # fmt: on

        self.transform = (
            (lambda x: re.sub(self.transformPattern[0], self.transformPattern[1], x))
            if isinstance(self.transformPattern, (tuple, list)) and len(self.transformPattern) == 2
            else (lambda x: x)
        )
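        # For illustration (example values, not part of the original commit): with
        # transform=("^/data", ""), the re.sub above maps "/data/folder/file.txt" to
        # "/folder/file.txt" before the path is normalized and split for the index.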

        # TODO This might cause memory issues for very large archives! I guess I would have to extend the index
        # to add additional data to recreate the inode object or I'd need to reparse it ad-hoc from the number.
        self._inodes = {info.inode.inode_number: info.inode for info in self.fileObject}
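        # A possible memory optimization (a sketch, not part of this commit): skip this cache and
        # re-walk the image on demand, looking an inode up by its number only when a file is opened:
        #
        #     def _findInode(self, inodeNumber):
        #         for info in self.fileObject:
        #             if info.inode.inode_number == inodeNumber:
        #                 return info.inode
        #         raise KeyError(inodeNumber)
        #
        # 'open' below would then call self._findInode(extendedFileInfo.offsetheader) instead of
        # indexing self._inodes, trading CPU time per open for a much smaller resident dictionary.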

        self.index = SQLiteIndex(
            indexFilePath,
            indexFolders=indexFolders,
            archiveFilePath=self.archiveFilePath,
            encoding=self.encoding,
            checkMetadata=self._checkMetadata,
            printDebug=self.printDebug,
            indexMinimumFileCount=indexMinimumFileCount,
            backendName='SquashFSMountSource',
        )

        if clearIndexCache:
            self.index.clearIndexes()

        isFileObject = not isinstance(fileOrPath, str)

        self.index.openExisting()
        if self.index.indexIsLoaded():
            # self._loadOrStoreCompressionOffsets()  # load
            self.index.reloadIndexReadOnly()
        else:
            # Open a new database when we didn't find an existing one.
            # Simply open it in memory, without an error, even if writeIndex is True but no
            # index file location has been given.
            if writeIndex and (indexFilePath or not isFileObject):
                self.index.openWritable()
            else:
                self.index.openInMemory()

            self._createIndex()
            # self._loadOrStoreCompressionOffsets()  # store
            if self.index.indexIsLoaded():
                self._storeMetadata()
                self.index.reloadIndexReadOnly()

    def _storeMetadata(self) -> None:
        argumentsToSave = ['encoding', 'transformPattern']
        argumentsMetadata = json.dumps({argument: getattr(self, argument) for argument in argumentsToSave})
        self.index.storeMetadata(argumentsMetadata, self.archiveFilePath)

    def _convertToRow(self, info: "PySquashfsImage.file.File") -> Tuple:  # type: ignore
        mode = 0o555 | (stat.S_IFDIR if info.is_dir else stat.S_IFREG)
        mtime = info.time

        linkname = ""
        if info.is_symlink:
            linkname = info.readlink()
            mode = 0o555 | stat.S_IFLNK

        path, name = SQLiteIndex.normpath(self.transform(info.path)).rsplit("/", 1)

        # Currently unused. SquashFS files are stored in multiple blocks, so a single offset is insufficient.
        dataOffset = 0

        # SquashFS also returns non-zero sizes for directories, FIFOs, symbolic links, and device files.
        fileSize = info.size if info.is_file else 0
        headerOffset = info.inode.inode_number
        assert isinstance(headerOffset, int)

        # fmt: off
        fileInfo : Tuple = (
            path         , # 0  : path
            name         , # 1  : file name
            headerOffset , # 2  : header offset
            dataOffset   , # 3  : data offset
            fileSize     , # 4  : file size
            mtime        , # 5  : modification time
            mode         , # 6  : file mode / permissions
            0            , # 7  : TAR file type. Currently unused. Overlaps with mode
            linkname     , # 8  : linkname
            0            , # 9  : user ID
            0            , # 10 : group ID
            False        , # 11 : is TAR (unused?)
            False        , # 12 : is sparse
        )
        # fmt: on

        return fileInfo

    def _createIndex(self) -> None:
        if self.printDebug >= 1:
            print(f"Creating offset dictionary for {self.archiveFilePath} ...")
        t0 = timer()

        self.index.ensureIntermediaryTables()

        # TODO Doing this in a chunked manner with generators would make it work better for large archives.
        fileInfos = []
        for info in self.fileObject:
            fileInfos.append(self._convertToRow(info))
        self.index.setFileInfos(fileInfos)
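        # A chunked variant (sketch; it requires 'import itertools' and assumes setFileInfos may be
        # called repeatedly with partial lists, which is not verified here):
        #
        #     rows = (self._convertToRow(info) for info in self.fileObject)
        #     while True:
        #         chunk = list(itertools.islice(rows, 10000))
        #         if not chunk:
        #             break
        #         self.index.setFileInfos(chunk)
        #
        # This would bound peak memory usage to one chunk of rows instead of the full archive listing.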

        # Resort by (path,name). This one-time resort is faster than resorting on each INSERT (cache spill).
        if self.printDebug >= 2:
            print("Resorting files by path ...")

        self.index.finalize()

        t1 = timer()
        if self.printDebug >= 1:
            print(f"Creating offset dictionary for {self.archiveFilePath} took {t1 - t0:.2f}s")

    def __enter__(self):
        return self

    @overrides(MountSource)
    def __exit__(self, exception_type, exception_value, exception_traceback):
        self.index.close()
        self.rawFileObject.close()
        self.fileObject.close()

    @overrides(MountSource)
    def isImmutable(self) -> bool:
        return True

    @overrides(MountSource)
    def getFileInfo(self, path: str, fileVersion: int = 0) -> Optional[FileInfo]:
        return self.index.getFileInfo(path, fileVersion=fileVersion)

    @overrides(MountSource)
    def listDir(self, path: str) -> Optional[Union[Iterable[str], Dict[str, FileInfo]]]:
        return self.index.listDir(path)

    @overrides(MountSource)
    def fileVersions(self, path: str) -> int:
        fileVersions = self.index.fileVersions(path)
        return len(fileVersions) if isinstance(fileVersions, dict) else 0

    @overrides(MountSource)
    def open(self, fileInfo: FileInfo) -> IO[bytes]:
        assert fileInfo.userdata
        extendedFileInfo = fileInfo.userdata[-1]
        assert isinstance(extendedFileInfo, SQLiteIndexedTarUserData)
        inode = self._inodes[extendedFileInfo.offsetheader]
        # CPython's zipfile module does handle multiple file objects being opened and reading from the
        # same underlying file object concurrently by using a _SharedFile class that even includes a lock.
        # Very nice!
        # https://github.com/python/cpython/blob/a87c46eab3c306b1c5b8a072b7b30ac2c50651c0/Lib/zipfile/__init__.py#L1569
        # TODO need something similar for SquashFS
        # TODO Obviously not efficient to read the whole file. Need to add a file object class around the inode.
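        # One lazier shape for this (a sketch, not part of this commit; it only relies on read_file,
        # which is already used below): defer the full read until the first access so that merely
        # opening a file stays cheap.
        #
        #     class _LazySquashFSFile(io.RawIOBase):
        #         def __init__(self, image, inode):
        #             self._image = image
        #             self._inode = inode
        #             self._buffer = None
        #
        #         def readable(self):
        #             return True
        #
        #         def seekable(self):
        #             return True
        #
        #         def _materialize(self):
        #             if self._buffer is None:
        #                 self._buffer = io.BytesIO(self._image.read_file(self._inode))
        #             return self._buffer
        #
        #         def read(self, size=-1):
        #             return self._materialize().read(size)
        #
        #         def seek(self, offset, whence=io.SEEK_SET):
        #             return self._materialize().seek(offset, whence)
        #
        # True random access without decompressing the whole file would additionally require reading
        # the SquashFS data blocks individually, which this sketch does not attempt.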
        return io.BytesIO(self.fileObject.read_file(inode))

    def _tryToOpenFirstFile(self):
        # Get the first row that has the regular file bit set in mode (stat.S_IFREG == 32768 == 1 << 15).
        result = self.index.getConnection().execute(
            f"""SELECT path,name {SQLiteIndex.FROM_REGULAR_FILES} ORDER BY "offsetheader" ASC LIMIT 1;"""
        )
        if not result:
            return
        firstFile = result.fetchone()
        if not firstFile:
            return

        if self.printDebug >= 2:
            print(
                "[Info] The index contains no backend name. Therefore, will try to open the first file as "
                "an integrity check."
            )
        try:
            fileInfo = self.getFileInfo(firstFile[0] + '/' + firstFile[1])
            if not fileInfo:
                return

            with self.open(fileInfo) as file:
                file.read(1)
        except Exception as exception:
            if self.printDebug >= 2:
                print("[Info] Trying to open the first file raised an exception:", exception)
            if self.printDebug >= 3:
                traceback.print_exc()
            raise InvalidIndexError("Integrity check of opening the first file failed.") from exception

    def _checkMetadata(self, metadata: Dict[str, Any]) -> None:
        """Raises an exception if the metadata mismatches so much that the index has to be treated as incompatible."""

        if 'tarstats' in metadata:
            if not self.archiveFilePath:
                raise InvalidIndexError("Archive contains file stats but cannot stat real archive!")

            storedStats = json.loads(metadata['tarstats'])
            archiveStats = os.stat(self.archiveFilePath)

            if hasattr(archiveStats, "st_size") and 'st_size' in storedStats:
                if archiveStats.st_size < storedStats['st_size']:
                    raise InvalidIndexError(
                        f"Archive for this SQLite index has shrunk in size from "
                        f"{storedStats['st_size']} to {archiveStats.st_size}"
                    )

            # Only happens very rarely, e.g., for more recent files with the same size.
            if (
                self.verifyModificationTime
                and hasattr(archiveStats, "st_mtime")
                and 'st_mtime' in storedStats
                and archiveStats.st_mtime != storedStats['st_mtime']
            ):
                raise InvalidIndexError(
                    f"The modification time of the archive file has changed from {storedStats['st_mtime']} "
                    f"(as stored in this SQLite index) to {archiveStats.st_mtime}",
                )

        if 'arguments' in metadata:
            SQLiteIndex.checkMetadataArguments(
                json.loads(metadata['arguments']), self, argumentsToCheck=['encoding', 'transformPattern']
            )

        if 'backendName' not in metadata:
            self._tryToOpenFirstFile()
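For context, a minimal usage sketch of the new backend (the import path and file names below are
assumptions for illustration, not part of this commit):

    from ratarmountcore.SquashFSMountSource import SquashFSMountSource  # assumed module location

    with SquashFSMountSource("example.squashfs", writeIndex=True) as mountSource:
        print(list(mountSource.listDir("/") or []))
        fileInfo = mountSource.getFileInfo("/some/file.txt")
        if fileInfo:
            with mountSource.open(fileInfo) as file:
                print(file.read(64))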