Skip to content

Commit

Permalink
Merge pull request #468 from emnoor-reef/ls-filters
Browse files Browse the repository at this point in the history
Add filters to `Bucket.ls`
  • Loading branch information
mjurbanski-reef authored Jan 31, 2024
2 parents e84b408 + 3c472bb commit c319ed4
Show file tree
Hide file tree
Showing 7 changed files with 372 additions and 0 deletions.
3 changes: 3 additions & 0 deletions b2sdk/_v3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,6 @@
)
from b2sdk.session import B2Session
from b2sdk.utils.thread_pool import ThreadPoolMixin

# filter
from b2sdk.filter import FilterType, Filter
9 changes: 9 additions & 0 deletions b2sdk/bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import logging
import pathlib
from contextlib import suppress
from typing import Sequence

from .encryption.setting import EncryptionSetting, EncryptionSettingFactory
from .encryption.types import EncryptionMode
Expand All @@ -33,6 +34,7 @@
LegalHold,
)
from .file_version import DownloadVersion, FileVersion
from .filter import Filter, FilterMatcher
from .http_constants import LIST_FILE_NAMES_MAX_LIMIT
from .progress import AbstractProgressListener, DoNothingProgressListener
from .raw_api import LifecycleRule
Expand Down Expand Up @@ -369,6 +371,7 @@ def ls(
recursive: bool = False,
fetch_count: int | None = LIST_FILE_NAMES_MAX_LIMIT,
with_wildcard: bool = False,
filters: Sequence[Filter] = (),
):
"""
Pretend that folders exist and yields the information about the files in a folder.
Expand All @@ -390,6 +393,7 @@ def ls(
:param with_wildcard: Accepts "*", "?", "[]" and "[!]" in folder_to_list, similarly to what shell does.
As of 1.19.0 it can only be enabled when recursive is also enabled.
Also, in this mode, folder_to_list is considered to be a filename or a pattern.
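:param filters: sequence of filters applied, in order, to the names of the files returned by the server; the last filter whose pattern matches a name decides whether that file is listed.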
:rtype: generator[tuple[b2sdk.v2.FileVersion, str]]
:returns: generator of (file_version, folder_name) tuples
Expand Down Expand Up @@ -445,6 +449,7 @@ def ls(
# "folder". If the first search doesn't produce enough results,
# then we keep calling list_file_names until we get all of the
# names in this "folder".
filter_matcher = FilterMatcher(filters)
current_dir = None
start_file_name = prefix
start_file_id = None
Expand All @@ -466,6 +471,10 @@ def ls(
):
# File doesn't match our wildcard rules
continue

if not filter_matcher.match(file_version.file_name):
continue

after_prefix = file_version.file_name[len(prefix):]
# In case of wildcards, we don't care about folders at all, and it's recursive by default.
if '/' not in after_prefix or recursive:
Expand Down
67 changes: 67 additions & 0 deletions b2sdk/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
######################################################################
#
# File: b2sdk/filter.py
#
# Copyright 2024 Backblaze Inc. All Rights Reserved.
#
# License https://www.backblaze.com/using_b2_code.html
#
######################################################################
from __future__ import annotations

import fnmatch
from dataclasses import dataclass
from enum import Enum
from typing import Sequence


class FilterType(Enum):
    """Whether a filter keeps or drops the names matched by its pattern."""

    # A matching name is kept in the result.
    INCLUDE = 'include'
    # A matching name is dropped from the result.
    EXCLUDE = 'exclude'


@dataclass
class Filter:
    """
    A single include/exclude rule: a filter type paired with a wildcard pattern.

    Use :meth:`include` or :meth:`exclude` to build instances without
    spelling out the filter type.
    """

    # Says whether names matching ``pattern`` are included or excluded.
    type: FilterType
    # Wildcard pattern the candidate string is compared against.
    pattern: str

    @classmethod
    def include(cls, pattern: str) -> Filter:
        """Return a filter of type ``FilterType.INCLUDE`` for ``pattern``."""
        return cls(FilterType.INCLUDE, pattern)

    @classmethod
    def exclude(cls, pattern: str) -> Filter:
        """Return a filter of type ``FilterType.EXCLUDE`` for ``pattern``."""
        return cls(FilterType.EXCLUDE, pattern)


class FilterMatcher:
    """
    Holds a list of filters and matches a string (i.e. file name) against them.

    The order of filters matters. The *last* matching filter decides whether
    the string is included or excluded. If no filter matches, the string is
    included by default.

    If the given list of filters contains only INCLUDE filters, then it is
    assumed that all files are excluded by default. In this case, an additional
    EXCLUDE filter with the pattern ``*`` is prepended to the list.

    Patterns are compared with :func:`fnmatch.fnmatchcase`.

    :param filters: filters to apply, in order. The matcher keeps its own copy,
                    so changing the passed object afterwards does not affect it.
    """

    def __init__(self, filters: Sequence[Filter]):
        # Materialise first: this decouples the matcher from the caller's
        # object and keeps one-shot iterables from being exhausted by the
        # ``all()`` check below before they are stored.
        own_filters = list(filters)

        if own_filters and all(filter_.type == FilterType.INCLUDE for filter_ in own_filters):
            # Only INCLUDE filters given: everything else is excluded by default.
            own_filters.insert(0, Filter(type=FilterType.EXCLUDE, pattern="*"))

        self.filters = own_filters

    def match(self, s: str) -> bool:
        """
        Tell whether ``s`` passes the filters.

        :param s: string (i.e. file name) to check
        :return: ``True`` if ``s`` is included, ``False`` if it is excluded
        """
        # The last matching filter wins, so scan from the end and stop at the
        # first filter that matches.
        for filter_ in reversed(self.filters):
            if not fnmatch.fnmatchcase(s, filter_.pattern):
                continue
            if filter_.type == FilterType.INCLUDE:
                return True
            if filter_.type == FilterType.EXCLUDE:
                return False

        # No filter matched: included by default.
        return True
1 change: 1 addition & 0 deletions changelog.d/+bucket_ls_filters.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for filters to `Bucket.ls()`.
241 changes: 241 additions & 0 deletions test/unit/bucket/test_bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
FakeResponse,
FileRetentionSetting,
FileSimulator,
Filter,
InMemoryCache,
LargeFileUploadState,
LegalHold,
Expand Down Expand Up @@ -788,6 +789,246 @@ def test_matching_exact_filename(self):
]
self.assertEqual(expected, actual)

def test_filters_wildcard_matching(self):
    """A lone ``*.txt`` include filter keeps only the ``.txt`` files listed under ``b/``."""
    payload = b'hello world'
    uploaded = (
        'a',
        'b/1/test-1.txt',
        'b/2/test-2.csv',
        'b/2/test-3.txt',
        'b/3/test-4.jpg',
        'b/3/test-4.txt',
        'b/3/test-5.txt',
    )
    for name in uploaded:
        self.bucket.upload_bytes(payload, name)

    kept = ('b/1/test-1.txt', 'b/2/test-3.txt', 'b/3/test-4.txt', 'b/3/test-5.txt')
    expected = [(name, len(payload), 'upload', None) for name in kept]

    listing = self.bucket_ls('b/', recursive=True, filters=[Filter.include("*.txt")])
    actual = [(info.file_name, info.size, info.action, folder) for (info, folder) in listing]
    self.assertEqual(expected, actual)

def test_filters_wildcard_matching_including_root(self):
    """``*.txt`` include/exclude filters also apply to files at the bucket root."""
    payload = b'hello world'
    for name in (
        'b/1/test.csv',
        'b/1/test.txt',
        'b/2/test.tsv',
        'b/2/test.txt',
        'b/3/test.txt',
        'test.txt',
        'test.csv',
    ):
        self.bucket.upload_bytes(payload, name)

    # Each case: the filter to apply and the names expected to survive it.
    cases = (
        (
            Filter.include('*.txt'),
            ('b/1/test.txt', 'b/2/test.txt', 'b/3/test.txt', 'test.txt'),
        ),
        (
            Filter.exclude('*.txt'),
            ('b/1/test.csv', 'b/2/test.tsv', 'test.csv'),
        ),
    )
    for rule, kept in cases:
        expected = [(name, len(payload), 'upload', None) for name in kept]
        listing = self.bucket_ls(recursive=True, filters=[rule])
        actual = [
            (info.file_name, info.size, info.action, folder) for (info, folder) in listing
        ]
        self.assertEqual(expected, actual)

def test_filters_single_character_matching(self):
    """``?`` in a filter pattern stands for exactly one character."""
    payload = b'hello world'
    for name in ('a', 'b/2/test.csv', 'b/2/test.txt', 'b/2/test.tsv'):
        self.bucket.upload_bytes(payload, name)

    def listed(rule):
        # Recursive listing of the whole bucket with a single filter applied.
        return [
            (info.file_name, info.size, info.action, folder)
            for (info, folder) in self.bucket_ls(recursive=True, filters=[rule])
        ]

    def rows(*names):
        # Expected tuples for freshly uploaded files with the given names.
        return [(name, len(payload), 'upload', None) for name in names]

    self.assertEqual(
        rows('b/2/test.csv', 'b/2/test.tsv'),
        listed(Filter.include('b/2/test.?sv')),
    )
    self.assertEqual(
        rows('a', 'b/2/test.txt'),
        listed(Filter.exclude('b/2/test.?sv')),
    )

def test_filters_sequence_matching(self):
    """``[tc]`` in a filter pattern matches one character out of the listed set."""
    payload = b'hello world'
    for name in ('a', 'b/2/test.csv', 'b/2/test.ksv', 'b/2/test.tsv'):
        self.bucket.upload_bytes(payload, name)

    size = len(payload)

    # Include: only the names matching the pattern are listed.
    included = []
    for info, folder in self.bucket_ls(
        recursive=True,
        filters=[Filter.include('b/2/test.[tc]sv')],
    ):
        included.append((info.file_name, info.size, info.action, folder))
    self.assertEqual(
        [
            ('b/2/test.csv', size, 'upload', None),
            ('b/2/test.tsv', size, 'upload', None),
        ],
        included,
    )

    # Exclude: everything except the names matching the pattern is listed.
    remaining = []
    for info, folder in self.bucket_ls(
        recursive=True,
        filters=[Filter.exclude('b/2/test.[tc]sv')],
    ):
        remaining.append((info.file_name, info.size, info.action, folder))
    self.assertEqual(
        [
            ('a', size, 'upload', None),
            ('b/2/test.ksv', size, 'upload', None),
        ],
        remaining,
    )

def test_filters_negative_sequence_matching(self):
    """``[!ck]`` in a filter pattern matches one character that is not in the listed set."""
    payload = b'hello world'
    for name in ('a', 'b/2/test.csv', 'b/2/test.ksv', 'b/2/test.tsv'):
        self.bucket.upload_bytes(payload, name)

    # Each case: the filter to apply and the names expected to survive it.
    cases = (
        (Filter.include('b/2/test.[!ck]sv'), ('b/2/test.tsv',)),
        (Filter.exclude('b/2/test.[!ck]sv'), ('a', 'b/2/test.csv', 'b/2/test.ksv')),
    )
    for rule, kept in cases:
        expected = [(name, len(payload), 'upload', None) for name in kept]
        actual = [
            (info.file_name, info.size, info.action, folder)
            for (info, folder) in self.bucket_ls(recursive=True, filters=[rule])
        ]
        self.assertEqual(expected, actual)

def test_filters_matching_exact_filename(self):
    """A pattern without wildcards selects exactly the file of that name."""
    payload = b'hello world'
    first, second = 'b/a.txt', 'b/b.txt'
    self.bucket.upload_bytes(payload, first)
    self.bucket.upload_bytes(payload, second)

    def listed(rule):
        # Recursive listing of the whole bucket with a single filter applied.
        listing = self.bucket_ls(recursive=True, filters=[rule])
        return [(info.file_name, info.size, info.action, folder) for (info, folder) in listing]

    # Including the exact name lists only that file ...
    self.assertEqual(
        [(first, len(payload), 'upload', None)],
        listed(Filter.include('b/a.txt')),
    )
    # ... and excluding it lists only the other one.
    self.assertEqual(
        [(second, len(payload), 'upload', None)],
        listed(Filter.exclude('b/a.txt')),
    )

def test_filters_mixed_with_wildcards(self):
    """Filters are applied on top of a wildcard ``folder_to_list`` pattern."""
    payload = b'hello world'
    for name in (
        'a.csv',
        'a.txt',
        'b/a-1.csv',
        'b/a-1.txt',
        'b/a-2.csv',
        'b/a-2.txt',
        'b/a-a.csv',
        'b/a-a.txt',
        'b/a.csv',
        'b/a.txt',
    ):
        self.bucket.upload_bytes(payload, name)

    # Each case: wildcard passed to ls, exclude pattern, names expected to remain.
    cases = (
        ('*.txt', '*-2.txt', ('a.txt', 'b/a-1.txt', 'b/a-a.txt', 'b/a.txt')),
        ('b/?-[1234567890].*', '*-2.*', ('b/a-1.csv', 'b/a-1.txt')),
    )
    for wildcard, excluded, kept in cases:
        expected = [(name, len(payload), 'upload', None) for name in kept]
        listing = self.bucket_ls(
            wildcard,
            recursive=True,
            with_wildcard=True,
            filters=[Filter.exclude(excluded)],
        )
        actual = [
            (info.file_name, info.size, info.action, folder) for (info, folder) in listing
        ]
        self.assertEqual(expected, actual)

def test_filters_combination(self):
    """With mixed filters, the last one matching a name decides its fate."""
    payload = b'hello world'
    for name in ('a.txt', 'b/a-1.csv', 'b/a-1.txt'):
        self.bucket.upload_bytes(payload, name)

    rules = [
        Filter.include('b/*'),
        Filter.exclude('*.txt'),
        Filter.include('a.txt'),
    ]
    listing = self.bucket_ls(recursive=True, filters=rules)
    actual = [(info.file_name, info.size, info.action, folder) for (info, folder) in listing]

    # 'b/a-1.txt' is dropped by the '*.txt' exclude; 'a.txt' is brought back
    # by the later 'a.txt' include.
    expected = [(name, len(payload), 'upload', None) for name in ('a.txt', 'b/a-1.csv')]
    self.assertEqual(expected, actual)


class TestGetFreshState(TestCaseWithBucket):
def test_ok(self):
Expand Down
10 changes: 10 additions & 0 deletions test/unit/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
######################################################################
#
# File: test/unit/filter/__init__.py
#
# Copyright 2024 Backblaze Inc. All Rights Reserved.
#
# License https://www.backblaze.com/using_b2_code.html
#
######################################################################
from __future__ import annotations
Loading

0 comments on commit c319ed4

Please sign in to comment.