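"""
tinyYOLOv3.py

Runs tiny YOLOv3 object detection with the OpenVINO Inference Engine on a
local camera stream or a video file, draws labelled bounding boxes on each
frame and writes the annotated frames to an output video.

Example invocation (model and label file names are illustrative):
    python tinyYOLOv3.py --m tiny_yolov3.xml --l coco.names --d CPU --t 0.5 --o 0.3
"""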
import argparse
import cv2
from math import exp
from time import time
from imutils.video import FPS
from inference import Network
from YoloParams import YoloParams
# Build the argument parser and parse the arguments
def get_args():
    """
    Obtains the application arguments from the command line.
    """
    ap = argparse.ArgumentParser(description="Run tiny YOLOv3 inference on an input camera stream or input video")
    # Add required and optional argument groups
    ap._action_groups.pop()
    required = ap.add_argument_group('required arguments')
    optional = ap.add_argument_group('optional arguments')
    # Set the individual arguments with respect to the relevant argument group
    required.add_argument("--m", "--model", help="Required: Path to .xml file with trained YOLOv3 model",
                          required=True, type=str)
    optional.add_argument("--i", "--input_file", help="Optional: Path to a video file. If not provided, \
                          camera input is used", default='CAM', type=str)
    optional.add_argument("--l", "--label_file", help="Optional: Path to a file of object class labels", type=str)
    optional.add_argument("--d", "--device_name", help="Optional: Name of the device to do inference on. CPU (default), \
                          GPU, FPGA, HDDL or MYRIAD", default='CPU', type=str)
    optional.add_argument("--t", "--threshold", help="Optional: Confidence threshold for filtering of object \
                          detections", default=0.5, type=float)
    optional.add_argument("--o", "--overlap", help="Optional: Maximum permissible bounding box overlap", default=0.3,
                          type=float)
    args = ap.parse_args()
    return args

def entry_index(side, coord, classes, location, entry):
    """
    Computes the flat index of a given entry (box coordinate, objectness
    score or class score) within the flattened YOLO output blob.
    """
    side_power_2 = side ** 2
    n = location // side_power_2
    loc = location % side_power_2
    return int(side_power_2 * (n * (coord + classes + 1) + entry) + loc)

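# Worked example (all values illustrative): for a 13x13 grid (side=13) with
# coords=4 and classes=80, each anchor box occupies 85 entries per grid cell
# (4 box coordinates + 1 objectness score + 80 class scores), so the
# objectness score of anchor n in cell i is found at
#     entry_index(13, 4, 80, n * 169 + i, 4) == 169 * (n * 85 + 4) + i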
def scale_bbox(x, y, h, w, class_id, confidence, h_scale, w_scale):
    """
    Takes (x, y) to be the centre point of the bounding box.
    Scales the box according to the width and height scaling parameters.
    """
    xmin = int((x - w / 2) * w_scale)
    ymin = int((y - h / 2) * h_scale)
    xmax = int(xmin + w * w_scale)
    ymax = int(ymin + h * h_scale)
    return dict(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax, class_id=class_id, confidence=confidence)

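# Worked example (values illustrative): a relative box centred at (0.5, 0.5)
# with w = h = 0.2 on a 416x416 frame maps to
#     scale_bbox(0.5, 0.5, 0.2, 0.2, 0, 0.9, 416, 416)
#       -> xmin = ymin = 166, xmax = ymax = 249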
def intersection_over_union(box_1, box_2):
    """
    Determines the extent of areal overlap between bounding boxes 1 and 2
    """
    width_of_overlap_area = min(box_1['xmax'], box_2['xmax']) - max(box_1['xmin'], box_2['xmin'])
    height_of_overlap_area = min(box_1['ymax'], box_2['ymax']) - max(box_1['ymin'], box_2['ymin'])
    if width_of_overlap_area < 0 or height_of_overlap_area < 0:
        area_of_overlap = 0
    else:
        area_of_overlap = width_of_overlap_area * height_of_overlap_area
    box_1_area = (box_1['ymax'] - box_1['ymin']) * (box_1['xmax'] - box_1['xmin'])
    box_2_area = (box_2['ymax'] - box_2['ymin']) * (box_2['xmax'] - box_2['xmin'])
    area_of_union = box_1_area + box_2_area - area_of_overlap
    if area_of_union == 0:
        return 0
    return area_of_overlap / area_of_union

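# Sanity check (values illustrative): two 1x1 boxes offset horizontally by
# half their width share an overlap area of 0.5, giving
#     IoU = 0.5 / (1 + 1 - 0.5) = 1/3 ≈ 0.33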
def parse_yolo_region(blob, resized_image_shape, original_im_shape, params, threshold):
    """
    Takes the YOLO output blob, parses validated regions and
    adds scaled bounding boxes to the list of detected objects
    """
    # Validate the output parameters
    _, _, out_blob_h, out_blob_w = blob.shape
    assert out_blob_w == out_blob_h, "Invalid size of output blob. It should be in NCHW layout and height should " \
                                     "be equal to width. Current height = {}, current width = {}" \
                                     "".format(out_blob_h, out_blob_w)
    # Extract the layer parameters
    orig_im_h, orig_im_w = original_im_shape
    resized_image_h, resized_image_w = resized_image_shape
    objects = list()
    predictions = blob.flatten()
    side_square = params.side * params.side
    # Parse the YOLO region output
    for i in range(side_square):
        row = i // params.side
        col = i % params.side
        for n in range(params.num):
            obj_index = entry_index(params.side, params.coords, params.classes, n * side_square + i, params.coords)
            scale = predictions[obj_index]
            if scale < threshold:
                continue
            box_index = entry_index(params.side, params.coords, params.classes, n * side_square + i, 0)
            # The network produces location predictions in absolute coordinates of the
            # feature map; scale them to coordinates relative to the grid
            x = (col + predictions[box_index + 0 * side_square]) / params.side
            y = (row + predictions[box_index + 1 * side_square]) / params.side
            # The argument to exp can be a very large number in some cases, so guard against overflow
            try:
                w_exp = exp(predictions[box_index + 2 * side_square])
                h_exp = exp(predictions[box_index + 3 * side_square])
            except OverflowError:
                continue
            # Depending on the topology, box size is normalised by the feature map
            # (pre-YOLOv3) or by the input shape (YOLOv3)
            w = w_exp * params.anchors[2 * n] / (resized_image_w if params.isYoloV3 else params.side)
            h = h_exp * params.anchors[2 * n + 1] / (resized_image_h if params.isYoloV3 else params.side)
            for j in range(params.classes):
                class_index = entry_index(params.side, params.coords, params.classes, n * side_square + i,
                                          params.coords + 1 + j)
                confidence = scale * predictions[class_index]
                if confidence < threshold:
                    continue
                objects.append(scale_bbox(x=x, y=y, h=h, w=w, class_id=j, confidence=confidence,
                                          h_scale=orig_im_h, w_scale=orig_im_w))
    return objects

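# For reference (shapes illustrative, assuming a 416x416 input and 80 COCO
# classes): tiny YOLOv3 emits two output blobs, of shape (1, 255, 13, 13) and
# (1, 255, 26, 26), where 255 = 3 anchors * (4 coords + 1 objectness + 80 classes).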
def infer_video(args):
    """
    Takes an input video or camera stream and performs asynchronous object detection
    inference using tiny YOLOv3 on a frame-by-frame basis
    """
    # Initialise the OpenVINO Inference Engine
    net = Network()
    # Read the object class labels (here: coco.names), if provided
    if args.l:
        with open(args.l, 'r') as f:
            labels_map = [x.strip() for x in f]
    else:
        labels_map = None
    # Load the Deep Learning Computer Vision model into the Inference Engine
    net.load_model(args.m, args.d)
    n, c, h, w = net.get_input_shape()
    # Window for stream and video inference result display
    window_name = "Processing video input"
    cv2.namedWindow(window_name)
    # Use the local camera if no input file was provided
    if args.i == 'CAM':
        args.i = 0
    # Set up the OpenCV video capture object for local camera or video file processing
    capture = cv2.VideoCapture(args.i)
    # Set up the OpenCV video writer object for output video generation using the MP42 codec
    out_file_name = "output.avi"
    codec = cv2.VideoWriter_fourcc("M", "P", "4", "2")  # Raspbian setting after FFMPEG 1.4 installation
    frame_rate = 30
    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    resolution = (width, height)
    file_out = cv2.VideoWriter(out_file_name, codec, frame_rate, resolution)
    # Process input frames until the end of the video or the process is exited by an escape keystroke
    fps = FPS().start()
    while capture.isOpened():
        flag, frame = capture.read()
        if not flag:
            break
        key_pressed = cv2.waitKey(60)
        # Pre-process the input frame as required by the Deep Learning model:
        # 1. Resize to the shape expected by the model
        # 2. Transpose the data layout from HWC (height, width, channel) to CHW
        # 3. Reshape the frame to add a "batch" dimension of 1 at the start
        pre_frame = cv2.resize(frame, (w, h))
        pre_frame = pre_frame.transpose((2, 0, 1))
        pre_frame = pre_frame.reshape((n, c, h, w))
        # Start inference on the pre-processed frame, then wait for the request to
        # complete so that the measured duration covers the whole inference
        start_time = time()
        net.async_inference(pre_frame)
        status = net.wait()
        detection_time = time() - start_time
        # Obtain the inference result
        objects = list()
        if status == 0:
            output = net.extract_output()
            for layer_name, out_blob in output.items():
                out_blob = out_blob.reshape(net.network.layers[net.network.layers[layer_name].parents[0]].shape)
                layer_params = YoloParams(net.network.layers[layer_name].params, out_blob.shape[2])
                objects += parse_yolo_region(out_blob, pre_frame.shape[2:], frame.shape[:-1], layer_params, args.t)
        # Suppress overlapping bounding boxes greedily: keep the highest-confidence box
        # and zero out any lower-confidence box whose IoU with it exceeds the threshold
        objects = sorted(objects, key=lambda obj: obj['confidence'], reverse=True)
        for i in range(len(objects)):
            if objects[i]['confidence'] == 0:
                continue
            for j in range(i + 1, len(objects)):
                if intersection_over_union(objects[i], objects[j]) > args.o:
                    objects[j]['confidence'] = 0
        # Draw the objects that satisfy the probability threshold parameter
        objects = [obj for obj in objects if obj['confidence'] >= args.t]
        origin_im_size = frame.shape[:-1]
        for obj in objects:
            if obj['xmax'] > origin_im_size[1] or obj['ymax'] > origin_im_size[0] or obj['xmin'] < 0 or obj['ymin'] < 0:
                continue
            colour = (int(min(obj['class_id'] * 12.5, 255)), min(obj['class_id'] * 7, 255), min(obj['class_id'] * 5, 255))
            det_label = labels_map[obj['class_id']] if labels_map and len(labels_map) > obj['class_id'] else \
                str(obj['class_id'])
            cv2.rectangle(frame, (obj['xmin'], obj['ymin']), (obj['xmax'], obj['ymax']), colour, 2)
            cv2.putText(frame, det_label + ' ' + str(round(obj['confidence'] * 100, 1)) + ' %',
                        (obj['xmin'], obj['ymin'] - 7), cv2.FONT_HERSHEY_COMPLEX, 0.6, colour, 1)
        cv2.putText(frame, "Inference time: {:.3f} ms".format(detection_time * 1e3), (15, 15),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
        file_out.write(frame)
        cv2.imshow(window_name, frame)
        if key_pressed == 27:
            break
        # Update the frames-per-second information
        fps.update()
    fps.stop()
    print("Elapsed time: {:.2f}".format(fps.elapsed()))
    print("Approximate FPS: {:.2f}".format(fps.fps()))
    file_out.release()
    capture.release()
    cv2.destroyAllWindows()

def main():
    args = get_args()
    infer_video(args)


if __name__ == "__main__":
    main()