diff --git a/detect.py b/detect.py index 9abb9a0152d0..be1c3a5cee06 100644 --- a/detect.py +++ b/detect.py @@ -61,7 +61,7 @@ def detect(save_img=False): t0 = time.time() img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img _ = model(img.half() if half else img) if device.type != 'cpu' else None # run once - for path, img, im0s, vid_cap in dataset: + for path, img, im0s, vid_cap, rotation in dataset: img = torch.from_numpy(img).to(device) img = img.half() if half else img.float() # uint8 to fp16/32 img /= 255.0 # 0 - 255 to 0.0 - 1.0 @@ -133,8 +133,8 @@ def detect(save_img=False): fourcc = 'mp4v' # output video codec fps = vid_cap.get(cv2.CAP_PROP_FPS) - w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) if not rotation else int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) if not rotation else int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (w, h)) vid_writer.write(im0) diff --git a/requirements.txt b/requirements.txt index aaeedd9afd66..44ead7d8110a 100755 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,8 @@ tensorboard>=2.2 torch>=1.6.0 torchvision>=0.7.0 tqdm>=4.41.0 +scikit-video +ffmpeg # logging ------------------------------------- # wandb diff --git a/utils/datasets.py b/utils/datasets.py index 18ad02b6de26..3faa2a99b555 100755 --- a/utils/datasets.py +++ b/utils/datasets.py @@ -8,6 +8,7 @@ import cv2 import math +import skvideo.io import numpy as np import torch from PIL import Image, ExifTags @@ -122,9 +123,19 @@ def __init__(self, path, img_size=640): images = [x for x in files if os.path.splitext(x)[-1].lower() in img_formats] videos = [x for x in files if os.path.splitext(x)[-1].lower() in vid_formats] ni, nv = len(images), len(videos) + videos_rotation = [None for _ in videos] + for index in range(nv): + metadata = skvideo.io.ffprobe(videos[index]) + if 'video' in metadata and 'tag' in metadata['video']: + tags = metadata['video']['tag'] + + for tag in tags: + if tag['@key'] == 'rotate': + videos_rotation[index] = tag['@value'] self.img_size = img_size self.files = images + videos + self.rotation = [None for _ in images] + videos_rotation self.nf = ni + nv # number of files self.video_flag = [False] * ni + [True] * nv self.mode = 'images' @@ -143,6 +154,7 @@ def __next__(self): if self.count == self.nf: raise StopIteration path = self.files[self.count] + rotation = self.rotation[self.count] if self.video_flag[self.count]: # Read video @@ -168,6 +180,10 @@ def __next__(self): assert img0 is not None, 'Image Not Found ' + path print('image %g/%g %s: ' % (self.count, self.nf, path), end='') + # Rotation Valid + if rotation: + img0 = cv2.rotate(img0, 0) + # Padded resize img = letterbox(img0, new_shape=self.img_size)[0] @@ -176,7 +192,7 @@ def __next__(self): img = np.ascontiguousarray(img) # cv2.imwrite(path + '.letterbox.jpg', 255 * img.transpose((1, 2, 0))[:, :, ::-1]) # save letterbox image - return path, img, img0, self.cap + return path, img, img0, self.cap, rotation def new_video(self, path): self.frame = 0 @@ -246,7 +262,7 @@ def __next__(self): img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 img = np.ascontiguousarray(img) - return img_path, img, img0, None + return img_path, img, img0, None, None def __len__(self): return 0 @@ -319,7 +335,7 @@ def __next__(self): img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416 img = np.ascontiguousarray(img) - return self.sources, img, img0, None + return self.sources, img, img0, None, None def __len__(self): return 0 # 1E12 frames = 32 streams at 30 FPS for 30 years