Commit 721e7f4 (1 parent: c7bc181). Showing 3 changed files with 133 additions and 0 deletions.
New file (+60 lines):
import csv
import random

import numpy as np


def load_landmark_openface(csv_path):
    '''
    Load OpenFace 68-point landmarks from a .csv file.
    Returns an array of shape (num_frames, 68, 2), or None if the frame
    numbers in the file are not consecutive.
    '''
    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        data_all = [row for row in reader]
    x_list = []
    y_list = []
    for row_index, row in enumerate(data_all[1:]):  # skip the header row
        frame_num = float(row[0])
        if int(frame_num) != row_index + 1:
            return None
        x_list.append([float(x) for x in row[5:5 + 68]])
        y_list.append([float(y) for y in row[5 + 68:5 + 68 + 68]])
    x_array = np.array(x_list)
    y_array = np.array(y_list)
    landmark_array = np.stack([x_array, y_array], 2)
    return landmark_array


def compute_crop_radius(video_size, landmark_data_clip, random_scale=None):
    '''
    Judge whether the face can be cropped and compute the crop radius.
    Returns (False, None) if the crop would fall outside the frame or the
    radius varies too much across the clip; otherwise (True, radius_max).
    '''
    video_w, video_h = video_size[0], video_size[1]
    landmark_max_clip = np.max(landmark_data_clip, axis=1)
    if random_scale is None:
        random_scale = random.random() / 10 + 1.05
    # Vertical radius: from the nose bridge (point 29) down to the lowest landmark.
    radius_h = (landmark_max_clip[:, 1] - landmark_data_clip[:, 29, 1]) * random_scale
    # Horizontal radius: mouth width (points 48 and 54 are the outer mouth corners).
    radius_w = (landmark_data_clip[:, 54, 0] - landmark_data_clip[:, 48, 0]) * random_scale
    radius_clip = np.max(np.stack([radius_h, radius_w], 1), 1) // 2
    radius_max = np.max(radius_clip)
    radius_max = (int(radius_max / 4) + 1) * 4  # round up to a multiple of 4
    radius_max_1_4 = radius_max // 4
    clip_min_h = landmark_data_clip[:, 29, 1] - radius_max
    clip_max_h = landmark_data_clip[:, 29, 1] + radius_max * 2 + radius_max_1_4
    clip_min_w = landmark_data_clip[:, 33, 0] - radius_max - radius_max_1_4
    clip_max_w = landmark_data_clip[:, 33, 0] + radius_max + radius_max_1_4
    if min(clip_min_h.tolist() + clip_min_w.tolist()) < 0:
        return False, None
    elif max(clip_max_h.tolist()) > video_h:
        return False, None
    elif max(clip_max_w.tolist()) > video_w:
        return False, None
    elif max(radius_clip) > min(radius_clip) * 1.5:
        return False, None
    else:
        return True, radius_max
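
For context, a minimal usage sketch of the two helpers above; the CSV path and the 1280x720 frame size are illustrative assumptions, not values from this commit:

# Hypothetical inputs: './example_openface.csv' and a 1280x720 video.
landmarks = load_landmark_openface('./example_openface.csv')
if landmarks is not None:
    # Validate a 5-frame clip and get its crop radius.
    ok, radius = compute_crop_radius((1280, 720), landmarks[0:5])
    if ok:
        print('crop radius:', radius)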
New file (+73 lines):
import warnings

import numpy as np
import resampy
import tensorflow as tf
from python_speech_features import mfcc
from scipy.io import wavfile

# Suppress specific warnings from scipy
warnings.filterwarnings("ignore", category=wavfile.WavFileWarning)


class DeepSpeech:
    def __init__(self, model_path):
        self.graph, self.logits_ph, self.input_node_ph, self.input_lengths_ph = \
            self._prepare_deepspeech_net(model_path)
        self.target_sample_rate = 16000

    def _prepare_deepspeech_net(self, deepspeech_pb_path):
        # Load the frozen DeepSpeech graph and look up its I/O tensors.
        with tf.io.gfile.GFile(deepspeech_pb_path, "rb") as f:
            graph_def = tf.compat.v1.GraphDef()
            graph_def.ParseFromString(f.read())
        graph = tf.compat.v1.get_default_graph()
        tf.import_graph_def(graph_def, name="deepspeech")
        # import_graph_def prepends the scope name to imported ops,
        # so the tensors live under the "deepspeech/" prefix.
        logits_ph = graph.get_tensor_by_name("deepspeech/logits:0")
        input_node_ph = graph.get_tensor_by_name("deepspeech/input_node:0")
        input_lengths_ph = graph.get_tensor_by_name("deepspeech/input_lengths:0")
        return graph, logits_ph, input_node_ph, input_lengths_ph

    def conv_audio_to_deepspeech_input_vector(self, audio, sample_rate, num_cepstrum, num_context):
        features = mfcc(signal=audio, samplerate=sample_rate, numcep=num_cepstrum)
        features = features[::2]  # keep every second feature (BiRNN stride = 2)
        num_strides = len(features)
        # Pad with empty context frames so every window is full-sized.
        empty_context = np.zeros((num_context, num_cepstrum), dtype=features.dtype)
        features = np.concatenate((empty_context, features, empty_context))
        window_size = 2 * num_context + 1
        # Build overlapping context windows as views, without copying data.
        train_inputs = np.lib.stride_tricks.as_strided(
            features,
            shape=(num_strides, window_size, num_cepstrum),
            strides=(features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)
        train_inputs = np.reshape(train_inputs, [num_strides, -1])
        train_inputs = np.copy(train_inputs)
        # Whiten the inputs (zero mean, unit variance).
        train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
        return train_inputs

    def compute_audio_feature(self, audio_path):
        audio_sample_rate, audio = wavfile.read(audio_path)
        if audio.ndim > 1:
            # Print the ASCII art banner in green, then reset the color.
            print("\033[32m"
                  "███████████████████████████████████████\n"
                  "█▄─▄███▄─▄█▄─▄▄─█─▄▄▄▄█▄─▄█─▄▄▄─█▄─█─▄█\n"
                  "██─██▀██─███─▄▄▄█▄▄▄▄─██─██─███▀██─▄▀██\n"
                  "▀▄▄▄▄▄▀▄▄▄▀▄▄▄▀▀▀▄▄▄▄▄▀▄▄▄▀▄▄▄▄▄▀▄▄▀▄▄▀\033[0m")
            print("Hang tight! We're processing your audio, which might take a little while depending on its length.")
            audio = audio[:, 0]  # use only the first channel if multi-channel
        if audio_sample_rate != self.target_sample_rate:
            resampled_audio = resampy.resample(audio.astype(float), sr_orig=audio_sample_rate, sr_new=self.target_sample_rate)
        else:
            resampled_audio = audio.astype(float)

        with tf.compat.v1.Session(graph=self.graph) as sess:
            input_vector = self.conv_audio_to_deepspeech_input_vector(audio=resampled_audio.astype(np.int16),
                                                                      sample_rate=self.target_sample_rate,
                                                                      num_cepstrum=26,
                                                                      num_context=9)
            network_output = sess.run(self.logits_ph,
                                      feed_dict={self.input_node_ph: input_vector[np.newaxis, ...],
                                                 self.input_lengths_ph: [input_vector.shape[0]]})
            ds_features = network_output[::2, 0, :]  # keep every second time step
        return ds_features


if __name__ == '__main__':
    audio_path = r'./00168.wav'
    model_path = r'./output_graph.pb'
    DSModel = DeepSpeech(model_path)
    ds_feature = DSModel.compute_audio_feature(audio_path)
    print(ds_feature)
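
The context windowing in conv_audio_to_deepspeech_input_vector is the trickiest step; here is a small standalone sketch of the same numpy stride trick, with toy shapes chosen purely for illustration:

import numpy as np

# 5 frames of 3 MFCC-like features, 1 frame of context per side (window = 3).
feats = np.arange(15, dtype=float).reshape(5, 3)
num_context = 1
window_size = 2 * num_context + 1
padded = np.concatenate((np.zeros((num_context, 3)), feats, np.zeros((num_context, 3))))
windows = np.lib.stride_tricks.as_strided(
    padded,
    shape=(5, window_size, 3),
    strides=(padded.strides[0], padded.strides[0], padded.strides[1]),
    writeable=False)
print(windows.shape)  # (5, 3, 3): each frame paired with its neighbors, no copy made

Each window is a view into the padded array, which is why the class copies the result with np.copy before normalizing.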
(The third changed file is invalid and could not be displayed.)