forked from x4nth055/ethical-hacking-tools-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetadata.py
101 lines (87 loc) · 2.99 KB
/
metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import ffmpeg
from tinytag import TinyTag
import sys
from pprint import pprint # for printing Python dictionaries in a human-readable way
from PIL import Image
from PIL.ExifTags import TAGS
import sys
import pikepdf
from docx import Document
def get_media_metadata(media_file):
    """Gather metadata of an audio/video file into a single dictionary.

    The first stream reported by ``ffmpeg.probe`` (which runs ffprobe) is
    combined with the tag dictionary produced by TinyTag. When both sources
    provide the same key, the ffprobe value is the one that is kept.

    Args:
        media_file: Path of the media file to inspect.

    Returns:
        dict: TinyTag fields overlaid with the fields of the first stream.
    """
    # probe first, exactly like before, so failures surface in the same order
    first_stream = ffmpeg.probe(media_file)["streams"][0]
    tag_fields = TinyTag.get(media_file).as_dict()
    # start from the TinyTag fields and let the stream fields override them
    combined = dict(tag_fields)
    combined.update(first_stream)
    return combined
def get_image_metadata(image_file):
    """Collect basic properties and EXIF fields of an image.

    Args:
        image_file: Path of the image, as accepted by ``PIL.Image.open``.

    Returns:
        dict: Basic properties under descriptive keys ("Filename",
        "Image Size", ...) plus one entry per EXIF field. EXIF entries are
        keyed by the tag name from ``PIL.ExifTags.TAGS``, or by the numeric
        tag id when the id has no known name. ``bytes`` values are decoded
        to ``str``; undecodable bytes are replaced rather than raising.
    """
    # the context manager releases the file handle once everything is read
    with Image.open(image_file) as image:
        # basic, format-independent properties
        info_dict = {
            "Filename": image.filename,
            "Image Size": image.size,
            "Image Height": image.height,
            "Image Width": image.width,
            "Image Format": image.format,
            "Image Mode": image.mode,
            # single-frame formats do not define these attributes
            "Image is Animated": getattr(image, "is_animated", False),
            "Frames in Image": getattr(image, "n_frames", 1),
        }
        # extract EXIF data
        exifdata = image.getexif()
        for tag_id, data in exifdata.items():
            # get the tag name, instead of human unreadable tag id
            tag = TAGS.get(tag_id, tag_id)
            if isinstance(data, bytes):
                # EXIF blobs are frequently not valid UTF-8; a strict decode
                # would abort the whole extraction with UnicodeDecodeError
                data = data.decode(errors="replace")
            info_dict[tag] = data
    return info_dict
def get_pdf_metadata(pdf_file):
    """Return the document information entries of a PDF as a dictionary.

    Args:
        pdf_file: Path of the PDF file to open with pikepdf.

    Returns:
        dict: A plain-dict copy of the entries found in the PDF's ``docinfo``.
    """
    document = pikepdf.Pdf.open(pdf_file)
    # docinfo is where the PDF keeps its document-level metadata
    document_info = document.docinfo
    return dict(document_info)
def get_docx_metadata(docx_file):
    """
    Read the core properties of a DOCX document.
    Args:
        docx_file (str): Location of the .docx file to inspect.
    Returns:
        dict: Maps each core-property name to the value stored in the document.
    """
    # open the document and grab its core-properties object in one go
    core = Document(docx_file).core_properties
    # names are listed in the order they should appear in the result
    property_names = (
        "author",
        "category",
        "comments",
        "content_status",
        "created",
        "identifier",
        "keywords",
        "language",
        "last_modified_by",
        "last_printed",
        "modified",
        "revision",
        "subject",
        "title",
        "version",
    )
    return {name: getattr(core, name) for name in property_names}
if __name__ == "__main__":
    # Command-line entry point: print the metadata of the file given as the
    # first argument, choosing the extractor from the file extension.
    if len(sys.argv) < 2:
        # exit with a usage hint instead of an IndexError traceback
        sys.exit(f"Usage: python {sys.argv[0]} <file>")
    file = sys.argv[1]
    # compare extensions case-insensitively so "PHOTO.JPG" or "REPORT.PDF"
    # are not mistakenly handed to the media extractor
    lowered = file.lower()
    if lowered.endswith(".pdf"):
        print(get_pdf_metadata(file))
    elif lowered.endswith((".jpg", ".jpeg", ".png", ".tif", ".tiff")):
        pprint(get_image_metadata(file))
    elif lowered.endswith(".docx"):
        pprint(get_docx_metadata(file))
    else:
        # everything else is treated as an audio/video file
        pprint(get_media_metadata(file))