preprocess.py
import os
import subprocess
from time import perf_counter  # wall-clock timer; process_time would exclude time spent in ffmpeg subprocesses

import cv2
import librosa
import numpy as np
import python_speech_features


class DataPipeline:
    def __init__(self, data_path, load_limit):
        self.data_path = data_path
        self.load_limit = load_limit
        self.visual_inputs = []
        self.audio_inputs = []
        self.is_synced_labels = []
"""
* The network ingests 0.2-second clips of both audio and video inputs.
This function...
1) Splits the video at video_path into segments of 0.2 seconds each
2) Separates the visual and audio for each clip
3) Formats the visual and audio using format_input_visual and format_input_audio
4) Appends this data to instance variables self.audio_inputs self.visual_inputs
"""
def format_video(self, video_path):
tmp_video_path, tmp_audio_path = self.get_and_create_tmp_video_audio_files(video_path)
try:
mfccs, frames = self.trim_mfcc_and_visual(
self.format_input_audio(tmp_audio_path),
self.format_input_visual(tmp_video_path)
)
self.visual_inputs.extend(frames)
self.audio_inputs.extend(mfccs)
finally:
self.delete_file(tmp_video_path)
self.delete_file(tmp_audio_path)
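
    # Illustrative sketch (not from the original file): after a hypothetical call
    # like pipeline.format_video("clips/example.mp4"), self.visual_inputs and
    # self.audio_inputs grow in lockstep, one entry per 0.2-second segment of the clip.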

    @staticmethod
    def group_mfccs(mfcc):
        # number of columns in the MFCC map; at 100 MFCC frames per second,
        # 20 columns == 0.2 seconds
        num_cols = mfcc.shape[1]
        # make sure there are more than 20 columns (the clip is longer than 0.2 seconds)
        assert num_cols > 20
        # trim trailing columns so the audio is an even multiple of 0.2 seconds
        to_trim = num_cols % 20
        if to_trim > 0:
            mfcc = mfcc[:, :-to_trim]
        # group the MFCC columns into sets of 20 (integer division: hsplit needs an int)
        mfcc_groups = np.hsplit(mfcc, mfcc.shape[1] // 20)
        # add a trailing channel axis so each group is shaped like a single-channel image
        mfcc_groups = [np.expand_dims(group, axis=-1) for group in mfcc_groups]
        for mfcc_group in mfcc_groups:
            assert mfcc_group.shape == (13, 20, 1)
        return mfcc_groups
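
    # Illustrative sketch (not from the original file): a hypothetical 13 x 47 MFCC
    # map (~0.47 s of audio) loses its 7 trailing columns, and np.hsplit returns
    # 40 // 20 == 2 groups, each expanded to shape (13, 20, 1).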

    @staticmethod
    def group_frames(frames):
        # number of frames; at 25 fps, 5 frames == 0.2 seconds
        num_frames = frames.shape[0]
        # make sure there are more than 5 frames (the clip is longer than 0.2 seconds)
        assert num_frames > 5
        # trim trailing frames so the video is an even multiple of 0.2 seconds
        to_trim = num_frames % 5
        if to_trim > 0:
            frames = frames[:-to_trim, :, :]
        # group the frames into sets of 5 (integer division: split needs an int)
        frame_groups = np.split(frames, frames.shape[0] // 5)
        for frame_group in frame_groups:
            assert frame_group.shape[0] == 5
        return frame_groups
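
    # Illustrative sketch (not from the original file): a hypothetical 27-frame clip
    # (~1.08 s at 25 fps) loses its 2 trailing frames and yields 25 // 5 == 5 groups
    # of 5 consecutive resized frames each.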

    def trim_mfcc_and_visual(self, mfcc_groups, frame_groups):
        # the audio and video streams can differ slightly in length, so keep only
        # as many 0.2-second groups as both modalities can supply
        num_groups = min(len(mfcc_groups), len(frame_groups))
        mfcc_groups = mfcc_groups[:num_groups]
        frame_groups = frame_groups[:num_groups]
        assert len(mfcc_groups) == len(frame_groups)
        return mfcc_groups, frame_groups
"""
Converts a audio clip to MFCC map of N x 13 x 20 x 1
13 MFCC features representing powers at different frequency bins.
Sampled at rate of 100Hz, gives 20 time steps for a 0.2-second input signal.
"""
def format_input_audio(self, wav_file):
wav, sample_rate = librosa.load(wav_file, sr=100)
assert (sample_rate == 100)
mfccs = python_speech_features.mfcc(wav, sample_rate).transpose()
assert (mfccs.shape[0] == 13)
return self.group_mfccs(mfccs)
"""
Converts a video clip to sets of 5 frames (at the 25Hz frame rate)
Gives 5 frames for every 0.2 second
"""
def format_input_visual(self, video_path):
video = cv2.VideoCapture(video_path)
assert (video.get(cv2.CAP_PROP_FPS) == 25)
success, frame = video.read()
frames = []
while success:
frames.append(cv2.resize(frame, (100,100)))
success, frame = video.read()
frames = np.array(frames)
return self.group_frames(frames)

    def get_shuffled_and_label_data(self):
        visual_data = np.array(self.visual_inputs)
        audio_data = np.array(self.audio_inputs)
        num_data_points = visual_data.shape[0]
        half_way_mark = num_data_points // 2
        # split the data in half and shuffle the second half of each modality
        # independently, so those audio/visual pairs become "false" (out-of-sync) examples
        false_visual_data = visual_data[half_way_mark:num_data_points]
        np.random.shuffle(false_visual_data)
        visual_data[half_way_mark:num_data_points] = false_visual_data
        false_audio_data = audio_data[half_way_mark:num_data_points]
        np.random.shuffle(false_audio_data)
        audio_data[half_way_mark:num_data_points] = false_audio_data
        # label the first half 1 (in sync) and the second half 0 (out of sync)
        is_synced_labels = np.ones(num_data_points)
        is_synced_labels[half_way_mark:num_data_points] = 0
        # shuffle the pairs and their labels together with one shared permutation
        new_idx_order = np.arange(num_data_points)
        np.random.shuffle(new_idx_order)
        visual_data = visual_data[new_idx_order]
        audio_data = audio_data[new_idx_order]
        is_synced_labels = is_synced_labels[new_idx_order]
        return visual_data, audio_data, is_synced_labels
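
    # Illustrative sketch (not from the original file): with 6 clips, the labels
    # start as [1, 1, 1, 0, 0, 0] -- the last 3 audio/visual pairs were shuffled
    # out of sync -- and the shared permutation then reorders pairs and labels
    # together, e.g. to [0, 1, 0, 1, 1, 0].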

    def get_data(self):
        # returns the (visual, audio, label) tensors, loading a saved dataset when possible
        try:
            print("Retrieving saved data...")
            start = perf_counter()
            visual_data = np.load("data/frames.npy")
            audio_data = np.load("data/audio.npy")
            is_synced_labels = np.load("data/labels.npy")
            end = perf_counter()
            print("Data retrieval completed in " + str(end - start) + " seconds!")
        except FileNotFoundError:
            print("Could not find saved dataset, generating and saving new dataset...")
            if self.data_path is None:
                print("No saved dataset, you must specify a data directory to load and process data from!")
                exit()
            print("Preprocessing Raw Video Data...")
            start = perf_counter()
            reached_limit = False
            for root, dirs, files in os.walk(self.data_path):
                for file in files:
                    if file.endswith(".mp4"):
                        video_path = os.path.join(root, file)
                        print(f"Preprocessing video {video_path}...")
                        self.format_video(video_path)
                        if len(self.audio_inputs) >= self.load_limit or len(self.visual_inputs) >= self.load_limit:
                            reached_limit = True
                            break
                # stop walking further directories once the load limit is reached
                if reached_limit:
                    break
            print("Done processing raw videos! Shuffling, labeling and saving processed data...")
            visual_data, audio_data, is_synced_labels = self.get_shuffled_and_label_data()
            # save as numpy arrays (np.save appends the .npy extension)
            os.makedirs("data", exist_ok=True)
            np.save("data/frames", visual_data)
            np.save("data/audio", audio_data)
            np.save("data/labels", is_synced_labels)
            print("Saved data!")
            end = perf_counter()
            print("Preprocessing completed in " + str(end - start) + " seconds!")
        assert visual_data.shape[0] == audio_data.shape[0]
        num_data_points = len(is_synced_labels)
        print("Number of Data Points: ", num_data_points)
        print("Shape of Visual Data: ", visual_data.shape)
        print("Shape of Audio Data: ", audio_data.shape)
        self.visual_inputs = visual_data
        self.audio_inputs = audio_data
        self.is_synced_labels = is_synced_labels
        return self.visual_inputs, self.audio_inputs, self.is_synced_labels

    def get_and_create_tmp_video_audio_files(self, video_path):
        _, video_name = os.path.split(video_path)
        video_name = os.path.splitext(video_name)[0]
        tmp_video_path = os.path.join(os.path.dirname(video_path), video_name + "_25_fps.mp4")
        tmp_audio_path = os.path.join(os.path.dirname(video_path), video_name + "_audio.wav")
        self.create_tmp_video_audio_files(video_path, tmp_video_path, tmp_audio_path)
        return tmp_video_path, tmp_audio_path

    @staticmethod
    def create_tmp_video_audio_files(original_video_path, tmp_video_path, tmp_audio_path):
        # extract the audio track (equivalent to: ffmpeg -y -i <input>.mp4 <output>.wav);
        # -y overwrites any stale temp files instead of prompting
        subprocess.run(
            ["ffmpeg", "-y", "-i", original_video_path, tmp_audio_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, stdin=subprocess.DEVNULL
        )
        # re-encode the video at 25 fps (equivalent to: ffmpeg -y -i <input>.mp4 -filter:v fps=25 <output>.mp4)
        subprocess.run(
            ["ffmpeg", "-y", "-i", original_video_path, "-filter:v", "fps=25", tmp_video_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, stdin=subprocess.DEVNULL
        )

    @staticmethod
    def delete_file(path_to_delete):
        # remove the temp file directly rather than shelling out to rm; tolerate
        # a missing file (e.g. if ffmpeg failed before writing it)
        if os.path.exists(path_to_delete):
            os.remove(path_to_delete)
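

# A minimal usage sketch (not part of the original file). "data/raw" and the
# load limit of 1000 are hypothetical placeholders, not values defined elsewhere
# in this repo.
if __name__ == "__main__":
    pipeline = DataPipeline(data_path="data/raw", load_limit=1000)
    visual, audio, labels = pipeline.get_data()
    # each visual group is 5 frames of 100 x 100 pixels; each audio group is a
    # 13 x 20 x 1 MFCC map; labels are 1 (in sync) or 0 (shuffled out of sync)
    print(visual.shape, audio.shape, labels.shape)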