forked from oulutan/ACAM_Demo
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsimple_detect_actions_on_tube.py
99 lines (61 loc) · 2.75 KB
/
simple_detect_actions_on_tube.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import cv2
import os
import numpy as np
import imageio
import action_detection.action_detector as act
def set_up_detector():
    """Create the 'soft_attn' action detector and restore its checkpoint.

    Returns:
        A dict holding the detector object under 'detector', the three
        inference inputs under 'input_seq', 'rois' and 'roi_batch_indices',
        and the model output under 'pred_probs' -- i.e. exactly the values
        returned by ``define_inference_with_placeholders``.
    """
    # Checkpoint used by this demo. Alternatives that were tried before:
    # 'model_ckpt_RGB_soft_attn-16' and 'model_ckpt_soft_attn_ava-23'.
    weights_path = os.path.join(
        'action_detection',
        'weights',
        'model_ckpt_soft_attn_pooled_cosine_drop_ava-130',
    )

    model = act.Action_Detector('soft_attn')
    # The inference graph is defined first and the weights are restored
    # afterwards, same order as before.
    input_seq, rois, roi_batch_indices, pred_probs = (
        model.define_inference_with_placeholders()
    )
    model.restore_model(weights_path)

    return dict(
        detector=model,
        input_seq=input_seq,
        rois=rois,
        roi_batch_indices=roi_batch_indices,
        pred_probs=pred_probs,
    )
def detect_on_tube(input_tube, detector_dict):
    """Run the action detector on a batch of actor tubes.

    The actor is assumed to be centered in every tube, with the surrounding
    area acting as context (e.g. for a 400x400 tube the central 200x200
    region is the actor).

    Args:
        input_tube: array of shape batch_size x 32 x 400 x 400 x 3.
        detector_dict: dict as produced by ``set_up_detector`` with the keys
            'detector', 'input_seq', 'rois', 'roi_batch_indices' and
            'pred_probs'.

    Returns:
        Whatever the detector session yields for 'pred_probs'
        (presumably per-tube class probabilities -- confirm against the model).
    """
    num_tubes = input_tube.shape[0]

    # One region of interest per tube: the central half along each axis.
    # Adjust this box if the tubes carry a different amount of context.
    centered_box = [0.25, 0.25, 0.75, 0.75]

    detector = detector_dict['detector']
    feed = {
        detector_dict['input_seq']: input_tube,
        detector_dict['rois']: np.array(
            [centered_box for _ in range(num_tubes)]
        ),
        # The i-th ROI belongs to the i-th tube of the batch.
        detector_dict['roi_batch_indices']: np.arange(num_tubes),
    }
    return detector.session.run(detector_dict['pred_probs'], feed_dict=feed)
def main():
    """Run the action detector on the demo tube and report the top classes.

    Reads the first 32 frames of ``person_0_tube.mp4``, feeds them to the
    detector as a single-element batch, prints the highest scoring action
    classes and finally shows the middle frame of the tube in an OpenCV
    window until a key is pressed.

    Raises:
        ValueError: if the video contains fewer than 32 frames.
    """
    from itertools import islice

    # Tube length stated in the detect_on_tube contract.
    num_frames = 32
    top_k = 5

    reader = imageio.get_reader("person_0_tube.mp4")
    try:
        # Decode only the frames that are actually used instead of loading
        # the whole video into memory.
        frames = list(islice(reader, num_frames))
    finally:
        # Release the underlying video resource even if decoding fails.
        reader.close()

    if len(frames) < num_frames:
        # Fail early with a clear message rather than with a shape or index
        # error further down.
        raise ValueError(
            "person_0_tube.mp4 has only %d frames, %d are required"
            % (len(frames), num_frames)
        )

    input_tube = np.stack(frames, axis=0)
    input_tube = np.expand_dims(input_tube, axis=0)  # batch dimension

    detector_dict = set_up_detector()
    prediction_probabilities = detect_on_tube(input_tube, detector_dict)

    # Scores of the only tube in the batch, best class first.
    scores = prediction_probabilities[0, :]
    top_classes = np.argsort(scores)[::-1][:top_k]

    print("Results")
    # Iterating the selected ids directly stays valid even when the model
    # exposes fewer than top_k classes.
    for class_id in top_classes:
        class_str = act.ACTION_STRINGS[class_id]
        class_prob = scores[class_id]
        print("%.10s : %.3f" % (class_str, class_prob))

    # Show the middle frame; the channel axis is reversed for display
    # (presumably RGB from imageio -> BGR expected by OpenCV).
    cv2.imshow('midframe', input_tube[0, num_frames // 2, :, :, ::-1])
    cv2.waitKey(0)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()