# objectDetection.py

import cv2                                # state of the art computer vision algorithms library
import numpy as np                        # fundamental package for scientific computing
import matplotlib.pyplot as plt           # 2D plotting library producing publication quality figures
import pyrealsense2 as rs                 # Intel RealSense cross-platform open-source API
# Reached only when every import above succeeded.
print("Environment Ready")

def filtering(depth_frame):
    """Run a depth frame through the RealSense post-processing chain.

    The stages are applied in this order: decimation (filter_magnitude 4),
    depth-to-disparity transform, spatial filter (filter_magnitude 5,
    smooth_alpha 1, smooth_delta 50, holes_fill 3), temporal filter,
    disparity-to-depth transform, and finally hole filling.

    Every filter object is created anew on each call.
    NOTE(review): the temporal filter therefore presumably starts without
    any history from earlier calls -- confirm this is intended.

    :param depth_frame: frame handed to the first stage's ``process``.
    :return: the frame produced by the last stage (hole filling).
    """
    shrink = rs.decimation_filter()
    shrink.set_option(rs.option.filter_magnitude, 4)

    smooth = rs.spatial_filter()
    spatial_settings = (
        (rs.option.filter_magnitude, 5),
        (rs.option.filter_smooth_alpha, 1),
        (rs.option.filter_smooth_delta, 50),
        (rs.option.holes_fill, 3),
    )
    for option, value in spatial_settings:
        smooth.set_option(option, value)

    fill_holes = rs.hole_filling_filter()
    persist = rs.temporal_filter()

    to_disparity = rs.disparity_transform(True)
    to_depth = rs.disparity_transform(False)

    # The spatial and temporal stages run between the two disparity transforms.
    stages = (shrink, to_disparity, smooth, persist, to_depth, fill_holes)

    processed = depth_frame
    for stage in stages:
        processed = stage.process(processed)
    return processed

def colorSegementation(color, xmin, xmax, ymin, ymax, min_count=3000):
    """Classify a bounding box as "Estrella" when it contains enough red.

    The image is converted from RGB to HSV, masked with two red hue bands
    (0-5 and 175-180, both with saturation >= 50 and value >= 20) and the
    mask is applied to the input image. The box is then accepted when

    * at least one non-zero value of the masked image lies on a row inside
      [ymin, ymax], and
    * at least ``min_count`` non-zero values lie on a column inside
      [xmin, xmax].

    Values are counted per channel, so one red pixel can contribute up to
    three counts, and rows are not taken into account for the count.
    NOTE(review): this counting rule is kept exactly as the threshold of
    3000 was tuned against it; counting pixels inside the full box may be
    what was actually meant -- confirm before changing.

    :param color: 3-channel RGB image, as required by ``cv2.COLOR_RGB2HSV``.
    :param xmin: left edge of the box in pixels of ``color`` (cast to int).
    :param xmax: right edge of the box, inclusive (cast to int).
    :param ymin: top edge of the box (cast to int).
    :param ymax: bottom edge of the box, inclusive (cast to int).
    :param min_count: minimum number of red values required; hyperparameter.
    :return: ``"Estrella"`` if the box passes both tests, else ``"Unknown"``.
        An image without any red yields ``"Unknown"``.
    """
    hsv_frame = cv2.cvtColor(color, cv2.COLOR_RGB2HSV)

    # Red sits at both ends of the hue range used here, hence two bands.
    low_red = cv2.inRange(hsv_frame, (0, 50, 20), (5, 255, 255))
    high_red = cv2.inRange(hsv_frame, (175, 50, 20), (180, 255, 255))
    red_mask = cv2.bitwise_or(low_red, high_red)

    # Keep the original colours where the mask is set, zero elsewhere.
    red = cv2.bitwise_and(color, color, mask=red_mask)

    # Row and column index of every non-zero channel value.
    rows, cols = np.nonzero(red)[:2]

    x_lo, x_hi = int(xmin), int(xmax)
    y_lo, y_hi = int(ymin), int(ymax)
    rows_in_box = rows[(rows >= y_lo) & (rows <= y_hi)]
    cols_in_box = cols[(cols >= x_lo) & (cols <= x_hi)]

    # The mean of values taken from a closed interval always lies inside that
    # interval, so testing "average row/column is inside the box" is the same
    # as testing that the selection is not empty. An empty selection must be
    # rejected explicitly instead of being treated as an average of 0.
    if rows_in_box.size == 0 or cols_in_box.size == 0:
        return "Unknown"

    if cols_in_box.size >= min_count:
        return "Estrella"
    return "Unknown"

# Setup: open the camera with colour and depth at 424x240 @ 30 fps.
pipe = rs.pipeline()

config = rs.config()
config.enable_stream(rs.stream.color, 424, 240, rs.format.rgb8, 30)
config.enable_stream(rs.stream.depth, 424, 240, rs.format.z16, 30)
# NOTE(review): both infrared streams are enabled, but nothing in the code
# visible here reads them -- confirm whether they are still needed.
config.enable_stream(rs.stream.infrared, 1)
config.enable_stream(rs.stream.infrared, 2)
profile = pipe.start(config)

# Everything between start() and stop() can raise (a frame wait, a filter
# failure); the finally clause makes sure the pipeline is stopped regardless.
try:
    # Skip 15 first frames to give the Auto-Exposure time to adjust
    for _ in range(15):
        pipe.wait_for_frames()

    # Store next frameset for later processing:
    frameset = pipe.wait_for_frames()
    color_frame = frameset.get_color_frame()
    depth_frame = frameset.get_depth_frame()

    # From here on depth_frame is the post-processed frame; the unfiltered
    # depth is still available through frameset.
    depth_frame = filtering(depth_frame)
finally:
    # Cleanup:
    pipe.stop()
print("Frames Captured")

# Colour image as a NumPy array; used below for its shape and as the
# network input.
color = np.asanyarray(color_frame.get_data())
# Optional debug view of the colour image:
#plt.rcParams["axes.grid"] = False
#plt.rcParams['figure.figsize'] = [12, 6]
#plt.imshow(color)
#plt.show()
colorizer = rs.colorizer()
# Colorized view of the filtered depth frame. This value is replaced further
# down by the colorized *aligned* depth, so it only matters for the debug
# view on the next line.
colorized_depth = np.asanyarray(colorizer.colorize(depth_frame).get_data())
#plt.imshow(colorized_depth)

# Create alignment primitive with color as its target stream:
align = rs.align(rs.stream.color)
# Aligns the frameset captured earlier; `frameset` is rebound to the result.
frameset = align.process(frameset)

# Take the depth frame from the aligned frameset and colorize it again.
# `color` is not re-read here. The bounding boxes are drawn onto this
# colorized_depth image later in the script.
aligned_depth_frame = frameset.get_depth_frame()
colorized_depth = np.asanyarray(colorizer.colorize(aligned_depth_frame).get_data())

# Optional debug view showing the two frames side by side:
#images = np.hstack((color, colorized_depth))
#plt.imshow(images)

# Standard OpenCV boilerplate for running the net:
height, width = color.shape[:2]  # 240, 424 for the stream configured above

# Build a square expected x expected input: resize the image to a height of
# `expected` keeping its aspect ratio, then cut the centre square out of it.
expected = 300
aspect = width / height
resized_image = cv2.resize(color, (int(round(expected * aspect)), expected))
# Horizontal offset of the centre square inside the resized image. It is
# needed again later to map box coordinates back to the full frame.
crop_start = int(round(expected * (aspect - 1) / 2))
crop_img = resized_image[0:expected, crop_start:crop_start+expected]

# Model definition and weights are loaded from the current working directory.
arg1 = "MobileNetSSD_deploy.prototxt.txt"
arg2 = "MobileNetSSD_deploy.caffemodel"
net = cv2.dnn.readNetFromCaffe(arg1, arg2)
# Input normalisation passed to blobFromImage: a scale factor of about
# 1/127.5 and a mean of 127.53.
inScaleFactor = 0.007843
meanVal       = 127.53
# Class names, indexed by the class id found in each detection row.
classNames = ("background", "aeroplane", "bicycle", "bird", "boat",
              "bottle", "bus", "car", "cat", "chair",
              "cow", "diningtable", "dog", "horse",
              "motorbike", "person", "pottedplant",
              "sheep", "sofa", "train", "tvmonitor")

# NOTE(review): the last positional argument (False) is presumably swapRB,
# i.e. the RGB frame is fed to the network without channel swapping --
# confirm against the model's expected channel order.
blob = cv2.dnn.blobFromImage(crop_img, inScaleFactor, (expected, expected), meanVal, False)
net.setInput(blob, "data")
# The third axis of `detections` is iterated as one row per detection below.
detections = net.forward("detection_out")

# The aligned depth frame is the same for every detection, so its intrinsics,
# its size and the crop-to-frame scale are looked up once, not once per box.
depth_intrin = aligned_depth_frame.profile.as_video_stream_profile().intrinsics
depth_width = aligned_depth_frame.get_width()
depth_height = aligned_depth_frame.get_height()
scale = height / expected

results = []
for i in range(detections.shape[2]):
    # Fields 1..6 of a detection row: class id, confidence and the box
    # corners. The corners are multiplied by `expected` below, i.e. they are
    # treated as fractions of the square network input.
    label = detections[0, 0, i, 1]
    # NOTE(review): conf is read but never used to filter, so every row the
    # network returns ends up in `results` -- confirm whether a minimum
    # confidence should be applied.
    conf = detections[0, 0, i, 2]
    xmin = detections[0, 0, i, 3]
    ymin = detections[0, 0, i, 4]
    xmax = detections[0, 0, i, 5]
    ymax = detections[0, 0, i, 6]

    className = classNames[int(label)]

    # Map the box from the square crop back to full-frame pixels: undo the
    # horizontal crop offset, then undo the resize.
    xmin_depth = int((xmin * expected + crop_start) * scale)
    ymin_depth = int((ymin * expected) * scale)
    xmax_depth = int((xmax * expected + crop_start) * scale)
    ymax_depth = int((ymax * expected) * scale)
    cv2.rectangle(colorized_depth, (xmin_depth, ymin_depth),
                  (xmax_depth, ymax_depth), (255, 255, 255), 2)

    x_depth_center = 0.5 * (xmax_depth + xmin_depth)
    y_depth_center = 0.5 * (ymax_depth + ymin_depth)

    # A box that touches or exceeds the image border can put its centre
    # outside the frame; clamp so that the distance lookup stays in range.
    center_px = min(max(int(x_depth_center), 0), depth_width - 1)
    center_py = min(max(int(y_depth_center), 0), depth_height - 1)

    # Distance at the box centre, then the matching 3-D point.
    depth = aligned_depth_frame.get_distance(center_px, center_py)
    realx, realy, realz = rs.rs2_deproject_pixel_to_point(
        depth_intrin, [center_px, center_py], depth)

    # The colour check works on the square crop, so it receives the box in
    # crop pixels rather than full-frame pixels.
    objectType = colorSegementation(crop_img, int(xmin * expected), int(xmax * expected),
                                    int(ymin * expected), int(ymax * expected))
    results.append((realx, realy, realz, objectType, className))

# Render the annotated depth image once, after all boxes are drawn; calling
# imshow per detection only stacked identical images on the same axes.
if results:
    plt.imshow(colorized_depth)
    # plt.show()

# Report every detection. Each item is (x, y, z, objectType, className); the
# message names the class first, then the object type, then the coordinates.
# To report a single class only, filter on item[4] (the class name).
for item in results:
    print("Detected a {0} of type {4} at (x, y, z) : {1:.3}, {2:.3}, {3:.3}.".format(item[4], *item[:4]))


# The detections are stored in the list 'results' as tuples of the form
# (x, y, z, objectType, className), where objectType is "Estrella" or "Unknown".