
Commit

Add files via upload
Inferencer authored Apr 23, 2024
1 parent c7bc181 commit 721e7f4
Showing 3 changed files with 133 additions and 0 deletions.
60 changes: 60 additions & 0 deletions utils/data_processing.py
@@ -0,0 +1,60 @@
import csv
import numpy as np
import random


def load_landmark_openface(csv_path):
    '''
    load OpenFace landmarks from a .csv file; returns an array of shape (frames, 68, 2)
    '''
    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        data_all = [row for row in reader]
    x_list = []
    y_list = []
    for row_index, row in enumerate(data_all[1:]):
        frame_num = float(row[0])
        # frame indices must be consecutive and start at 1; otherwise the csv is unusable
        if int(frame_num) != row_index + 1:
            return None
        x_list.append([float(x) for x in row[5:5 + 68]])
        y_list.append([float(y) for y in row[5 + 68:5 + 68 + 68]])
    x_array = np.array(x_list)
    y_array = np.array(y_list)
    landmark_array = np.stack([x_array, y_array], 2)
    return landmark_array


def compute_crop_radius(video_size, landmark_data_clip, random_scale=None):
    '''
    decide whether the face clip can be cropped and compute the crop radius
    '''
    video_w, video_h = video_size[0], video_size[1]
    landmark_max_clip = np.max(landmark_data_clip, axis=1)
    if random_scale is None:
        random_scale = random.random() / 10 + 1.05
    radius_h = (landmark_max_clip[:, 1] - landmark_data_clip[:, 29, 1]) * random_scale
    radius_w = (landmark_data_clip[:, 54, 0] - landmark_data_clip[:, 48, 0]) * random_scale
    radius_clip = np.max(np.stack([radius_h, radius_w], 1), 1) // 2
    radius_max = np.max(radius_clip)
    # round the radius up to the next multiple of 4
    radius_max = (int(radius_max / 4) + 1) * 4
    radius_max_1_4 = radius_max // 4
    clip_min_h = landmark_data_clip[:, 29, 1] - radius_max
    clip_max_h = landmark_data_clip[:, 29, 1] + radius_max * 2 + radius_max_1_4
    clip_min_w = landmark_data_clip[:, 33, 0] - radius_max - radius_max_1_4
    clip_max_w = landmark_data_clip[:, 33, 0] + radius_max + radius_max_1_4
    # reject clips whose crop window falls outside the frame or whose radius varies too much
    if min(clip_min_h.tolist() + clip_min_w.tolist()) < 0:
        return False, None
    elif max(clip_max_h.tolist()) > video_h:
        return False, None
    elif max(clip_max_w.tolist()) > video_w:
        return False, None
    elif max(radius_clip) > min(radius_clip) * 1.5:
        return False, None
    else:
        return True, radius_max
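
A minimal usage sketch of the two helpers above (not part of this commit): the csv path, video resolution, and 25-frame clip length are assumptions, and the import assumes the repository root is on the Python path.

from utils.data_processing import load_landmark_openface, compute_crop_radius

landmarks = load_landmark_openface('./example_openface.csv')   # hypothetical OpenFace export
if landmarks is not None:
    clip = landmarks[:25]                                      # one 25-frame clip, shape (25, 68, 2)
    ok, radius = compute_crop_radius((1280, 720), clip)        # (width, height) of the source video
    if ok:
        print('crop radius:', radius)
    else:
        print('clip rejected: crop window leaves the frame or the radius varies too much')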
73 changes: 73 additions & 0 deletions utils/deep_speech.py
@@ -0,0 +1,73 @@
import numpy as np
import warnings
import resampy
from scipy.io import wavfile
from python_speech_features import mfcc
import tensorflow as tf

# Suppress specific warnings from scipy
warnings.filterwarnings("ignore", category=wavfile.WavFileWarning)


class DeepSpeech():
    def __init__(self, model_path):
        self.graph, self.logits_ph, self.input_node_ph, self.input_lengths_ph = self._prepare_deepspeech_net(model_path)
        self.target_sample_rate = 16000

    def _prepare_deepspeech_net(self, deepspeech_pb_path):
        # load the frozen DeepSpeech graph and look up its input/output tensors
        with tf.io.gfile.GFile(deepspeech_pb_path, "rb") as f:
            graph_def = tf.compat.v1.GraphDef()
            graph_def.ParseFromString(f.read())
        graph = tf.compat.v1.get_default_graph()
        tf.import_graph_def(graph_def, name="deepspeech")
        logits_ph = graph.get_tensor_by_name("logits:0")
        input_node_ph = graph.get_tensor_by_name("input_node:0")
        input_lengths_ph = graph.get_tensor_by_name("input_lengths:0")
        return graph, logits_ph, input_node_ph, input_lengths_ph

    def conv_audio_to_deepspeech_input_vector(self, audio, sample_rate, num_cepstrum, num_context):
        features = mfcc(signal=audio, samplerate=sample_rate, numcep=num_cepstrum)
        features = features[::2]  # We only keep every second feature (BiRNN stride = 2)
        num_strides = len(features)
        # pad with empty context frames so every stride has a full window
        empty_context = np.zeros((num_context, num_cepstrum), dtype=features.dtype)
        features = np.concatenate((empty_context, features, empty_context))
        window_size = 2 * num_context + 1
        train_inputs = np.lib.stride_tricks.as_strided(features,
                                                       shape=(num_strides, window_size, num_cepstrum),
                                                       strides=(features.strides[0], features.strides[0], features.strides[1]),
                                                       writeable=False)
        train_inputs = np.reshape(train_inputs, [num_strides, -1])
        train_inputs = np.copy(train_inputs)
        train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
        return train_inputs

    def compute_audio_feature(self, audio_path):
        audio_sample_rate, audio = wavfile.read(audio_path)
        if audio.ndim > 1:
            # Print the ASCII art in green
            print("\033[32m"  # Green color start
                  "███████████████████████████████████████\n"
                  "█▄─▄███▄─▄█▄─▄▄─█─▄▄▄▄█▄─▄█─▄▄▄─█▄─█─▄█\n"
                  "██─██▀██─███─▄▄▄█▄▄▄▄─██─██─███▀██─▄▀██\n"
                  "▀▄▄▄▄▄▀▄▄▄▀▄▄▄▀▀▀▄▄▄▄▄▀▄▄▄▀▄▄▄▄▄▀▄▄▀▄▄▀\033[0m")  # Reset to default color
            print("Hang tight! We're processing your audio, which might take a little while depending on its length.")
            audio = audio[:, 0]  # Use only the first channel if multi-channel
        if audio_sample_rate != self.target_sample_rate:
            resampled_audio = resampy.resample(audio.astype(float), sr_orig=audio_sample_rate, sr_new=self.target_sample_rate)
        else:
            resampled_audio = audio.astype(float)

        with tf.compat.v1.Session(graph=self.graph) as sess:
            input_vector = self.conv_audio_to_deepspeech_input_vector(audio=resampled_audio.astype(np.int16),
                                                                      sample_rate=self.target_sample_rate,
                                                                      num_cepstrum=26,
                                                                      num_context=9)
            network_output = sess.run(self.logits_ph,
                                      feed_dict={self.input_node_ph: input_vector[np.newaxis, ...],
                                                 self.input_lengths_ph: [input_vector.shape[0]]})
            ds_features = network_output[::2, 0, :]
        return ds_features


if __name__ == '__main__':
    audio_path = r'./00168.wav'
    model_path = r'./output_graph.pb'
    DSModel = DeepSpeech(model_path)
    ds_feature = DSModel.compute_audio_feature(audio_path)
    print(ds_feature)
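
A small driver sketch for the class above (not part of this commit): the frozen-graph path, wav path, and output .npy path are assumptions; it simply extracts the DeepSpeech features and caches them for later use.

import numpy as np
from utils.deep_speech import DeepSpeech

model = DeepSpeech('./output_graph.pb')                          # assumed location of the frozen DeepSpeech model
features = model.compute_audio_feature('./driving_audio.wav')    # assumed input wav
print('feature array shape:', features.shape)                    # (time steps, logits dim); width depends on the model's alphabet
np.save('./driving_audio_deepspeech.npy', features)              # cache the features for reuse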
Binary file added utils/logo/LipSick.png
