
Commit

Add files via upload
Inferencer authored Apr 23, 2024
1 parent c7bc181 commit 721e7f4
Showing 3 changed files with 133 additions and 0 deletions.
60 changes: 60 additions & 0 deletions utils/data_processing.py
@@ -0,0 +1,60 @@
import csv
import numpy as np
import random


def load_landmark_openface(csv_path):
    '''
    load OpenFace landmarks from a .csv file; returns an array of shape (frames, 68, 2)
    '''
    with open(csv_path, 'r') as f:
        reader = csv.reader(f)
        data_all = [row for row in reader]
    x_list = []
    y_list = []
    for row_index, row in enumerate(data_all[1:]):
        frame_num = float(row[0])
        # frame indices must be consecutive and start at 1; otherwise the csv is unusable
        if int(frame_num) != row_index + 1:
            return None
        x_list.append([float(x) for x in row[5:5 + 68]])
        y_list.append([float(y) for y in row[5 + 68:5 + 68 + 68]])
    x_array = np.array(x_list)
    y_array = np.array(y_list)
    landmark_array = np.stack([x_array, y_array], 2)
    return landmark_array


def compute_crop_radius(video_size, landmark_data_clip, random_scale=None):
    '''
    decide whether the face clip can be cropped and compute the crop radius
    '''
    video_w, video_h = video_size[0], video_size[1]
    landmark_max_clip = np.max(landmark_data_clip, axis=1)
    if random_scale is None:
        random_scale = random.random() / 10 + 1.05
    radius_h = (landmark_max_clip[:, 1] - landmark_data_clip[:, 29, 1]) * random_scale
    radius_w = (landmark_data_clip[:, 54, 0] - landmark_data_clip[:, 48, 0]) * random_scale
    radius_clip = np.max(np.stack([radius_h, radius_w], 1), 1) // 2
    radius_max = np.max(radius_clip)
    # round the radius up to the next multiple of 4
    radius_max = (int(radius_max / 4) + 1) * 4
    radius_max_1_4 = radius_max // 4
    clip_min_h = landmark_data_clip[:, 29, 1] - radius_max
    clip_max_h = landmark_data_clip[:, 29, 1] + radius_max * 2 + radius_max_1_4
    clip_min_w = landmark_data_clip[:, 33, 0] - radius_max - radius_max_1_4
    clip_max_w = landmark_data_clip[:, 33, 0] + radius_max + radius_max_1_4
    # reject clips whose crop window falls outside the frame or whose radius varies too much
    if min(clip_min_h.tolist() + clip_min_w.tolist()) < 0:
        return False, None
    elif max(clip_max_h.tolist()) > video_h:
        return False, None
    elif max(clip_max_w.tolist()) > video_w:
        return False, None
    elif max(radius_clip) > min(radius_clip) * 1.5:
        return False, None
    else:
        return True, radius_max
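
A minimal usage sketch of the two helpers above (not part of this commit): the csv path, video resolution, and 25-frame clip length are assumptions, and the import assumes the repository root is on the Python path.

from utils.data_processing import load_landmark_openface, compute_crop_radius

landmarks = load_landmark_openface('./example_openface.csv')   # hypothetical OpenFace export
if landmarks is not None:
    clip = landmarks[:25]                                      # one 25-frame clip, shape (25, 68, 2)
    ok, radius = compute_crop_radius((1280, 720), clip)        # (width, height) of the source video
    if ok:
        print('crop radius:', radius)
    else:
        print('clip rejected: crop window leaves the frame or the radius varies too much')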
73 changes: 73 additions & 0 deletions utils/deep_speech.py
@@ -0,0 +1,73 @@
import numpy as np
import warnings
import resampy
from scipy.io import wavfile
from python_speech_features import mfcc
import tensorflow as tf

# Suppress specific warnings from scipy
warnings.filterwarnings("ignore", category=wavfile.WavFileWarning)


class DeepSpeech():
    def __init__(self, model_path):
        self.graph, self.logits_ph, self.input_node_ph, self.input_lengths_ph = self._prepare_deepspeech_net(model_path)
        self.target_sample_rate = 16000

    def _prepare_deepspeech_net(self, deepspeech_pb_path):
        # load the frozen DeepSpeech graph and look up its input/output tensors
        with tf.io.gfile.GFile(deepspeech_pb_path, "rb") as f:
            graph_def = tf.compat.v1.GraphDef()
            graph_def.ParseFromString(f.read())
        graph = tf.compat.v1.get_default_graph()
        tf.import_graph_def(graph_def, name="deepspeech")
        logits_ph = graph.get_tensor_by_name("logits:0")
        input_node_ph = graph.get_tensor_by_name("input_node:0")
        input_lengths_ph = graph.get_tensor_by_name("input_lengths:0")
        return graph, logits_ph, input_node_ph, input_lengths_ph

    def conv_audio_to_deepspeech_input_vector(self, audio, sample_rate, num_cepstrum, num_context):
        features = mfcc(signal=audio, samplerate=sample_rate, numcep=num_cepstrum)
        features = features[::2]  # We only keep every second feature (BiRNN stride = 2)
        num_strides = len(features)
        # pad with empty context frames so every stride has a full window
        empty_context = np.zeros((num_context, num_cepstrum), dtype=features.dtype)
        features = np.concatenate((empty_context, features, empty_context))
        window_size = 2 * num_context + 1
        train_inputs = np.lib.stride_tricks.as_strided(features,
                                                       shape=(num_strides, window_size, num_cepstrum),
                                                       strides=(features.strides[0], features.strides[0], features.strides[1]),
                                                       writeable=False)
        train_inputs = np.reshape(train_inputs, [num_strides, -1])
        train_inputs = np.copy(train_inputs)
        train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
        return train_inputs

    def compute_audio_feature(self, audio_path):
        audio_sample_rate, audio = wavfile.read(audio_path)
        if audio.ndim > 1:
            # Print the ASCII art in green
            print("\033[32m"  # Green color start
                  "███████████████████████████████████████\n"
                  "█▄─▄███▄─▄█▄─▄▄─█─▄▄▄▄█▄─▄█─▄▄▄─█▄─█─▄█\n"
                  "██─██▀██─███─▄▄▄█▄▄▄▄─██─██─███▀██─▄▀██\n"
                  "▀▄▄▄▄▄▀▄▄▄▀▄▄▄▀▀▀▄▄▄▄▄▀▄▄▄▀▄▄▄▄▄▀▄▄▀▄▄▀\033[0m")  # Reset to default color
            print("Hang tight! We're processing your audio, which might take a little while depending on its length.")
            audio = audio[:, 0]  # Use only the first channel if multi-channel
        if audio_sample_rate != self.target_sample_rate:
            resampled_audio = resampy.resample(audio.astype(float), sr_orig=audio_sample_rate, sr_new=self.target_sample_rate)
        else:
            resampled_audio = audio.astype(float)

        with tf.compat.v1.Session(graph=self.graph) as sess:
            input_vector = self.conv_audio_to_deepspeech_input_vector(audio=resampled_audio.astype(np.int16),
                                                                      sample_rate=self.target_sample_rate,
                                                                      num_cepstrum=26,
                                                                      num_context=9)
            network_output = sess.run(self.logits_ph,
                                      feed_dict={self.input_node_ph: input_vector[np.newaxis, ...],
                                                 self.input_lengths_ph: [input_vector.shape[0]]})
            ds_features = network_output[::2, 0, :]
        return ds_features


if __name__ == '__main__':
    audio_path = r'./00168.wav'
    model_path = r'./output_graph.pb'
    DSModel = DeepSpeech(model_path)
    ds_feature = DSModel.compute_audio_feature(audio_path)
    print(ds_feature)
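
A small driver sketch for the class above (not part of this commit): the frozen-graph path, wav path, and output .npy path are assumptions; it simply extracts the DeepSpeech features and caches them for later use.

import numpy as np
from utils.deep_speech import DeepSpeech

model = DeepSpeech('./output_graph.pb')                          # assumed location of the frozen DeepSpeech model
features = model.compute_audio_feature('./driving_audio.wav')    # assumed input wav
print('feature array shape:', features.shape)                    # (time steps, logits dim); width depends on the model's alphabet
np.save('./driving_audio_deepspeech.npy', features)              # cache the features for reuse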
Binary file added utils/logo/LipSick.png
