From a661224f2bf7866cba5e1bff5aa91c8fd27e65a3 Mon Sep 17 00:00:00 2001 From: ZeyadTarekkk Date: Wed, 6 Nov 2024 14:13:41 +0200 Subject: [PATCH 1/9] Implement filecontent class --- .../content_type/content_base.py | 8 ++--- .../content_type/file_content.py | 30 +++++++++++++++++++ .../tests/test_file_content_type.py | 20 +++++++++++++ 3 files changed, 54 insertions(+), 4 deletions(-) create mode 100644 python-threatexchange/threatexchange/content_type/file_content.py create mode 100644 python-threatexchange/threatexchange/tests/test_file_content_type.py diff --git a/python-threatexchange/threatexchange/content_type/content_base.py b/python-threatexchange/threatexchange/content_type/content_base.py index 81e600877..c7a1ae00b 100644 --- a/python-threatexchange/threatexchange/content_type/content_base.py +++ b/python-threatexchange/threatexchange/content_type/content_base.py @@ -10,7 +10,7 @@ from enum import Enum, auto import typing as t -from threatexchange import common +import common class ContentType: @@ -21,8 +21,8 @@ def get_name(cls) -> str: @classmethod def extract_additional_content( - cls, content_arg: str - ) -> t.List[t.Tuple[t.Type["ContentType"], str]]: + cls, content_in_file: Path, available_content: t.Sequence[t.Type["ContentType"]] + ) -> t.Dict[t.Type["ContentType"], t.List[Path]]: """ Post-process/download content to find additional components @@ -32,7 +32,7 @@ def extract_additional_content( * Photo => run OCR and extract text * Video => break out photo thumbnail, close caption text, audio """ - return [] + return {} class RotationType(Enum): diff --git a/python-threatexchange/threatexchange/content_type/file_content.py b/python-threatexchange/threatexchange/content_type/file_content.py new file mode 100644 index 000000000..060002ba6 --- /dev/null +++ b/python-threatexchange/threatexchange/content_type/file_content.py @@ -0,0 +1,30 @@ +import typing as t +from content_base import ContentType +from photo import PhotoContent +from video import VideoContent + +class FileContent(ContentType): + """ + Content representing a file. Determines if a file is a photo or video based on file extension. + """ + + VALID_PHOTO_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif"} + VALID_VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov", ".mkv"} + + def __init__(self, file_name: str): + self.file_name = file_name + + @classmethod + def get_name(cls) -> str: + return "File" + + def get_content_type_from_filename(self) -> t.Type[ContentType]: + """ + Determines content type based on file extension. + """ + if any(self.file_name.endswith(ext) for ext in self.VALID_PHOTO_EXTENSIONS): + return PhotoContent + elif any(self.file_name.endswith(ext) for ext in self.VALID_VIDEO_EXTENSIONS): + return VideoContent + else: + raise ValueError(f"Unknown content type for file: {self.file_name}") \ No newline at end of file diff --git a/python-threatexchange/threatexchange/tests/test_file_content_type.py b/python-threatexchange/threatexchange/tests/test_file_content_type.py new file mode 100644 index 000000000..18290a421 --- /dev/null +++ b/python-threatexchange/threatexchange/tests/test_file_content_type.py @@ -0,0 +1,20 @@ +import unittest +from threatexchange.content_type.photo import PhotoContent +from threatexchange.content_type.video import VideoContent +from threatexchange.content_type.file_content import FileContent + +class TestFileContentType(unittest.TestCase): + def test_photo_detection(self): + file_content = FileContent("file.jpg") + content_type = file_content.get_content_type_from_filename() + self.assertEqual(content_type, PhotoContent) + + def test_video_detection(self): + file_content = FileContent("file.mp4") + content_type = file_content.get_content_type_from_filename() + self.assertEqual(content_type, VideoContent) + + def test_unknown_file_type(self): + file_content = FileContent("file.txt") + with self.assertRaises(ValueError): + file_content.get_content_type_from_filename() \ No newline at end of file From b31647e4f39cbd17c60335fc8af3a9f4912670c8 Mon Sep 17 00:00:00 2001 From: ZeyadTarekk Date: Fri, 8 Nov 2024 14:16:28 +0200 Subject: [PATCH 2/9] discard extract_additional_content changes --- .../threatexchange/content_type/content_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-threatexchange/threatexchange/content_type/content_base.py b/python-threatexchange/threatexchange/content_type/content_base.py index c7a1ae00b..e2a2de5de 100644 --- a/python-threatexchange/threatexchange/content_type/content_base.py +++ b/python-threatexchange/threatexchange/content_type/content_base.py @@ -21,8 +21,8 @@ def get_name(cls) -> str: @classmethod def extract_additional_content( - cls, content_in_file: Path, available_content: t.Sequence[t.Type["ContentType"]] - ) -> t.Dict[t.Type["ContentType"], t.List[Path]]: + cls, content_arg: str + ) -> t.List[t.Tuple[t.Type["ContentType"], str]]: """ Post-process/download content to find additional components From 8aff837d930fd96a3d587ef40e25517f97f6fa18 Mon Sep 17 00:00:00 2001 From: ZeyadTarekk Date: Fri, 8 Nov 2024 14:35:52 +0200 Subject: [PATCH 3/9] Change the valid photo types --- .../content_type/file_content.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python-threatexchange/threatexchange/content_type/file_content.py b/python-threatexchange/threatexchange/content_type/file_content.py index 060002ba6..a529a7713 100644 --- a/python-threatexchange/threatexchange/content_type/file_content.py +++ b/python-threatexchange/threatexchange/content_type/file_content.py @@ -2,29 +2,29 @@ from content_base import ContentType from photo import PhotoContent from video import VideoContent +from PIL import Image class FileContent(ContentType): """ - Content representing a file. Determines if a file is a photo or video based on file extension. + ContentType representing a generic file. + + Determines if a file is a photo or video based on file extension. """ - VALID_PHOTO_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif"} + VALID_PHOTO_EXTENSIONS = {ext.lower() for ext in Image.registered_extensions()} VALID_VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov", ".mkv"} - def __init__(self, file_name: str): - self.file_name = file_name - @classmethod - def get_name(cls) -> str: - return "File" - - def get_content_type_from_filename(self) -> t.Type[ContentType]: + def get_content_type_from_filename(cls, file_name: str) -> t.Type[ContentType]: """ Determines content type based on file extension. """ - if any(self.file_name.endswith(ext) for ext in self.VALID_PHOTO_EXTENSIONS): + file_extension = file_name.lower().rsplit('.', 1)[-1] + file_extension = f".{file_extension}" + + if file_extension in cls.VALID_PHOTO_EXTENSIONS: return PhotoContent - elif any(self.file_name.endswith(ext) for ext in self.VALID_VIDEO_EXTENSIONS): + elif file_extension in cls.VALID_VIDEO_EXTENSIONS: return VideoContent else: - raise ValueError(f"Unknown content type for file: {self.file_name}") \ No newline at end of file + return None \ No newline at end of file From d18a8e977445dc95e786da97b408bf8004c1984c Mon Sep 17 00:00:00 2001 From: ZeyadTarekk Date: Fri, 8 Nov 2024 14:36:09 +0200 Subject: [PATCH 4/9] Change the unit tests --- .../tests/test_file_content_type.py | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/python-threatexchange/threatexchange/tests/test_file_content_type.py b/python-threatexchange/threatexchange/tests/test_file_content_type.py index 18290a421..ba27094cc 100644 --- a/python-threatexchange/threatexchange/tests/test_file_content_type.py +++ b/python-threatexchange/threatexchange/tests/test_file_content_type.py @@ -4,17 +4,30 @@ from threatexchange.content_type.file_content import FileContent class TestFileContentType(unittest.TestCase): - def test_photo_detection(self): - file_content = FileContent("file.jpg") - content_type = file_content.get_content_type_from_filename() - self.assertEqual(content_type, PhotoContent) + def test_photo_detection_jpg(self): + file_content = FileContent.get_content_type_from_filename("file.jpg") + self.assertEqual(file_content, PhotoContent) - def test_video_detection(self): - file_content = FileContent("file.mp4") - content_type = file_content.get_content_type_from_filename() - self.assertEqual(content_type, VideoContent) + def test_photo_detection_uppercase_extension(self): + file_content = FileContent.get_content_type_from_filename("file.JPG") + self.assertEqual(file_content, PhotoContent) + + def test_video_detection_mp4(self): + file_content = FileContent.get_content_type_from_filename("file.mp4") + self.assertEqual(file_content, VideoContent) + + def test_video_detection_uppercase_extension(self): + file_content = FileContent.get_content_type_from_filename("file.MP4") + self.assertEqual(file_content, VideoContent) def test_unknown_file_type(self): - file_content = FileContent("file.txt") - with self.assertRaises(ValueError): - file_content.get_content_type_from_filename() \ No newline at end of file + file_content = FileContent.get_content_type_from_filename("file.txt") + self.assertIsNone(file_content) + + def test_photo_with_multiple_dots(self): + file_content = FileContent.get_content_type_from_filename("archive.photo.png") + self.assertEqual(file_content, PhotoContent) + + def test_video_with_multiple_dots(self): + file_content = FileContent.get_content_type_from_filename("movie.backup.mp4") + self.assertEqual(file_content, VideoContent) \ No newline at end of file From 95282f035eecb17b85ca8e6ab0f9c57b33891667 Mon Sep 17 00:00:00 2001 From: ZeyadTarekkk Date: Fri, 8 Nov 2024 14:46:33 +0000 Subject: [PATCH 5/9] use pytest in unit tests --- .../tests/test_file_content_type.py | 27 +++++++++++++++ .../tests/test_file_content_type.py | 33 ------------------- 2 files changed, 27 insertions(+), 33 deletions(-) create mode 100644 python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py delete mode 100644 python-threatexchange/threatexchange/tests/test_file_content_type.py diff --git a/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py b/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py new file mode 100644 index 000000000..077052353 --- /dev/null +++ b/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py @@ -0,0 +1,27 @@ +import pytest +from threatexchange.content_type.photo import PhotoContent +from threatexchange.content_type.video import VideoContent +from threatexchange.content_type.file_content import FileContent + +@pytest.mark.parametrize("file_name,expected_content_type", [ + ("file.jpg", PhotoContent), + ("file.JPG", PhotoContent), + ("file.mp4", VideoContent), + ("file.MP4", VideoContent), + ("archive.photo.png", PhotoContent), + ("movie.backup.mp4", VideoContent), +]) +def test_file_content_detection(file_name, expected_content_type): + """ + Tests that FileContent correctly identifies the content type + as either PhotoContent or VideoContent based on file extension. + """ + content_type = FileContent.get_content_type_from_filename(file_name) + assert content_type == expected_content_type, f"Failed for {file_name}" + +def test_unknown_file_type(): + """ + Tests that an unknown file type returns None. + """ + file_content = FileContent.get_content_type_from_filename("file.txt") + assert file_content is None diff --git a/python-threatexchange/threatexchange/tests/test_file_content_type.py b/python-threatexchange/threatexchange/tests/test_file_content_type.py deleted file mode 100644 index ba27094cc..000000000 --- a/python-threatexchange/threatexchange/tests/test_file_content_type.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest -from threatexchange.content_type.photo import PhotoContent -from threatexchange.content_type.video import VideoContent -from threatexchange.content_type.file_content import FileContent - -class TestFileContentType(unittest.TestCase): - def test_photo_detection_jpg(self): - file_content = FileContent.get_content_type_from_filename("file.jpg") - self.assertEqual(file_content, PhotoContent) - - def test_photo_detection_uppercase_extension(self): - file_content = FileContent.get_content_type_from_filename("file.JPG") - self.assertEqual(file_content, PhotoContent) - - def test_video_detection_mp4(self): - file_content = FileContent.get_content_type_from_filename("file.mp4") - self.assertEqual(file_content, VideoContent) - - def test_video_detection_uppercase_extension(self): - file_content = FileContent.get_content_type_from_filename("file.MP4") - self.assertEqual(file_content, VideoContent) - - def test_unknown_file_type(self): - file_content = FileContent.get_content_type_from_filename("file.txt") - self.assertIsNone(file_content) - - def test_photo_with_multiple_dots(self): - file_content = FileContent.get_content_type_from_filename("archive.photo.png") - self.assertEqual(file_content, PhotoContent) - - def test_video_with_multiple_dots(self): - file_content = FileContent.get_content_type_from_filename("movie.backup.mp4") - self.assertEqual(file_content, VideoContent) \ No newline at end of file From 69d9abc96c36a180a971e170dcd9fd46585ddd79 Mon Sep 17 00:00:00 2001 From: ZeyadTarekk Date: Fri, 8 Nov 2024 17:06:17 +0200 Subject: [PATCH 6/9] modify the tests --- .../content_type/tests/test_file_content_type.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py b/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py index 077052353..9dd18234e 100644 --- a/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py +++ b/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py @@ -10,6 +10,7 @@ ("file.MP4", VideoContent), ("archive.photo.png", PhotoContent), ("movie.backup.mp4", VideoContent), + ("file.txt", None), ]) def test_file_content_detection(file_name, expected_content_type): """ @@ -17,11 +18,4 @@ def test_file_content_detection(file_name, expected_content_type): as either PhotoContent or VideoContent based on file extension. """ content_type = FileContent.get_content_type_from_filename(file_name) - assert content_type == expected_content_type, f"Failed for {file_name}" - -def test_unknown_file_type(): - """ - Tests that an unknown file type returns None. - """ - file_content = FileContent.get_content_type_from_filename("file.txt") - assert file_content is None + assert content_type == expected_content_type From bb816aa6baedf0988828dc54f31f8a30dd8e8b09 Mon Sep 17 00:00:00 2001 From: ZeyadTarekk Date: Fri, 8 Nov 2024 17:21:26 +0200 Subject: [PATCH 7/9] Fix linting problems --- .../content_type/file_content.py | 9 ++++---- .../tests/test_file_content_type.py | 22 +++++++++++-------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/python-threatexchange/threatexchange/content_type/file_content.py b/python-threatexchange/threatexchange/content_type/file_content.py index a529a7713..29daec73a 100644 --- a/python-threatexchange/threatexchange/content_type/file_content.py +++ b/python-threatexchange/threatexchange/content_type/file_content.py @@ -4,10 +4,11 @@ from video import VideoContent from PIL import Image + class FileContent(ContentType): """ - ContentType representing a generic file. - + ContentType representing a generic file. + Determines if a file is a photo or video based on file extension. """ @@ -19,7 +20,7 @@ def get_content_type_from_filename(cls, file_name: str) -> t.Type[ContentType]: """ Determines content type based on file extension. """ - file_extension = file_name.lower().rsplit('.', 1)[-1] + file_extension = file_name.lower().rsplit(".", 1)[-1] file_extension = f".{file_extension}" if file_extension in cls.VALID_PHOTO_EXTENSIONS: @@ -27,4 +28,4 @@ def get_content_type_from_filename(cls, file_name: str) -> t.Type[ContentType]: elif file_extension in cls.VALID_VIDEO_EXTENSIONS: return VideoContent else: - return None \ No newline at end of file + return None diff --git a/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py b/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py index 9dd18234e..77ec248e4 100644 --- a/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py +++ b/python-threatexchange/threatexchange/content_type/tests/test_file_content_type.py @@ -3,15 +3,19 @@ from threatexchange.content_type.video import VideoContent from threatexchange.content_type.file_content import FileContent -@pytest.mark.parametrize("file_name,expected_content_type", [ - ("file.jpg", PhotoContent), - ("file.JPG", PhotoContent), - ("file.mp4", VideoContent), - ("file.MP4", VideoContent), - ("archive.photo.png", PhotoContent), - ("movie.backup.mp4", VideoContent), - ("file.txt", None), -]) + +@pytest.mark.parametrize( + "file_name,expected_content_type", + [ + ("file.jpg", PhotoContent), + ("file.JPG", PhotoContent), + ("file.mp4", VideoContent), + ("file.MP4", VideoContent), + ("archive.photo.png", PhotoContent), + ("movie.backup.mp4", VideoContent), + ("file.txt", None), + ], +) def test_file_content_detection(file_name, expected_content_type): """ Tests that FileContent correctly identifies the content type From a5a193016da71156f311df8270d143a7f0ba190b Mon Sep 17 00:00:00 2001 From: ZeyadTarekk Date: Fri, 8 Nov 2024 17:31:27 +0200 Subject: [PATCH 8/9] Fix error --- .../threatexchange/content_type/content_base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python-threatexchange/threatexchange/content_type/content_base.py b/python-threatexchange/threatexchange/content_type/content_base.py index e2a2de5de..1c01cfce0 100644 --- a/python-threatexchange/threatexchange/content_type/content_base.py +++ b/python-threatexchange/threatexchange/content_type/content_base.py @@ -10,8 +10,7 @@ from enum import Enum, auto import typing as t -import common - +from threatexchange import common class ContentType: @classmethod @@ -32,7 +31,7 @@ def extract_additional_content( * Photo => run OCR and extract text * Video => break out photo thumbnail, close caption text, audio """ - return {} + return [] class RotationType(Enum): From 2580a48f0ed5e76053894e69b195f09ca999ca8f Mon Sep 17 00:00:00 2001 From: ZeyadTarekk Date: Fri, 8 Nov 2024 17:32:58 +0200 Subject: [PATCH 9/9] fix formatting --- .../threatexchange/content_type/content_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python-threatexchange/threatexchange/content_type/content_base.py b/python-threatexchange/threatexchange/content_type/content_base.py index 1c01cfce0..81e600877 100644 --- a/python-threatexchange/threatexchange/content_type/content_base.py +++ b/python-threatexchange/threatexchange/content_type/content_base.py @@ -12,6 +12,7 @@ from threatexchange import common + class ContentType: @classmethod def get_name(cls) -> str: