Skip to content

Commit fd7a48e

Browse files
committed
replacing the Python HF Hub with a C++ version
1 parent 750ff91 commit fd7a48e

File tree

13 files changed

+306
-190
lines changed

13 files changed

+306
-190
lines changed

Dockerfile

+1-8
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,12 @@ SHELL ["/bin/bash", "-c"]
77
COPY . /root/ros2_ws/src
88

99
# Install dependencies
10-
RUN apt-get update
11-
RUN apt-get -y --quiet --no-install-recommends install python3-pip
12-
RUN if [ "$ROS_DISTRO" = "jazzy" ] || [ "$ROS_DISTRO" = "rolling" ]; then \
13-
pip3 install -r src/requirements.txt --break-system-packages; \
14-
else \
15-
pip3 install -r src/requirements.txt; \
16-
fi
17-
1810
WORKDIR /root/ros2_ws/src
1911
RUN git clone https://github.com/mgonzs13/audio_common.git
2012

2113
WORKDIR /root/ros2_ws
2214
RUN source /opt/ros/${ROS_DISTRO}/setup.bash
15+
RUN apt-get update
2316
RUN rosdep update --include-eol-distros && rosdep install --from-paths src --ignore-src -r -y
2417
RUN rosdep install --from-paths src --ignore-src -r -y
2518

README.md

-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ To run whisper_ros with CUDA, first, you must install the [CUDA Toolkit](https:/
3535
cd ~/ros2_ws/src
3636
git clone https://github.com/mgonzs13/audio_common.git
3737
git clone https://github.com/mgonzs13/whisper_ros.git
38-
pip3 install -r whisper_ros/requirements.txt
3938
cd ~/ros2_ws
4039
rosdep install --from-paths src --ignore-src -r -y
4140
colcon build --cmake-args -DGGML_CUDA=ON # add this for CUDA

requirements.txt

-1
This file was deleted.

whisper_bringup/launch/silero-vad.launch.py

+10-47
Original file line numberDiff line numberDiff line change
@@ -22,25 +22,14 @@
2222

2323

2424
from launch_ros.actions import Node
25-
from launch import LaunchDescription, LaunchContext
25+
from launch import LaunchDescription
2626
from launch.substitutions import LaunchConfiguration
27-
from launch.actions import OpaqueFunction, DeclareLaunchArgument
28-
from huggingface_hub import hf_hub_download
2927

3028

3129
def generate_launch_description():
3230

33-
def run_silero_vad(context: LaunchContext, repo, file, model_path):
34-
repo = str(context.perform_substitution(repo))
35-
file = str(context.perform_substitution(file))
36-
model_path = str(context.perform_substitution(model_path))
37-
38-
if not model_path:
39-
model_path = hf_hub_download(
40-
repo_id=repo, filename=file, force_download=False
41-
)
42-
43-
return (
31+
return LaunchDescription(
32+
[
4433
Node(
4534
package="whisper_ros",
4635
executable="silero_vad_node",
@@ -49,7 +38,13 @@ def run_silero_vad(context: LaunchContext, repo, file, model_path):
4938
parameters=[
5039
{
5140
"enabled": LaunchConfiguration("enabled", default=True),
52-
"model_path": model_path,
41+
"model_repo": LaunchConfiguration(
42+
"model_repo", default="mgonzs13/silero-vad-onnx"
43+
),
44+
"model_filename": LaunchConfiguration(
45+
"model_filename", default="silero_vad.onnx"
46+
),
47+
"model_path": LaunchConfiguration("model_path", default=""),
5348
"sample_rate": LaunchConfiguration("sample_rate", default=16000),
5449
"frame_size_ms": LaunchConfiguration("frame_size_ms", default=32),
5550
"threshold": LaunchConfiguration("threshold", default=0.5),
@@ -61,37 +56,5 @@ def run_silero_vad(context: LaunchContext, repo, file, model_path):
6156
],
6257
remappings=[("audio", "/audio/in")],
6358
),
64-
)
65-
66-
model_repo = LaunchConfiguration("model_repo")
67-
model_repo_cmd = DeclareLaunchArgument(
68-
"model_repo",
69-
default_value="mgonzs13/silero-vad-onnx",
70-
description="Hugging Face model repo",
71-
)
72-
73-
model_filename = LaunchConfiguration("model_filename")
74-
model_filename_cmd = DeclareLaunchArgument(
75-
"model_filename",
76-
default_value="silero_vad.onnx",
77-
description="Hugging Face model filename",
78-
)
79-
80-
model_path = LaunchConfiguration("model_path")
81-
model_path_cmd = DeclareLaunchArgument(
82-
"model_path",
83-
default_value="",
84-
description="Local path to the model file",
85-
)
86-
87-
return LaunchDescription(
88-
[
89-
model_repo_cmd,
90-
model_filename_cmd,
91-
model_path_cmd,
92-
OpaqueFunction(
93-
function=run_silero_vad,
94-
args=[model_repo, model_filename, model_path],
95-
),
9659
]
9760
)

whisper_bringup/launch/whisper.launch.py

+72-112
Original file line numberDiff line numberDiff line change
@@ -23,126 +23,75 @@
2323

2424
import os
2525
from launch_ros.actions import Node
26-
from launch import LaunchDescription, LaunchContext
26+
from launch import LaunchDescription
2727
from launch.conditions import IfCondition, UnlessCondition
2828
from launch.substitutions import LaunchConfiguration, PythonExpression
2929
from launch.launch_description_sources import PythonLaunchDescriptionSource
30-
from launch.actions import OpaqueFunction, DeclareLaunchArgument, IncludeLaunchDescription
30+
from launch.actions import DeclareLaunchArgument, IncludeLaunchDescription
3131
from ament_index_python.packages import get_package_share_directory
32-
from huggingface_hub import hf_hub_download
3332

3433

3534
def generate_launch_description():
3635

37-
def run_whisper(context: LaunchContext, repo, file, model_path):
38-
repo = str(context.perform_substitution(repo))
39-
file = str(context.perform_substitution(file))
40-
model_path = str(context.perform_substitution(model_path))
41-
42-
if not model_path:
43-
model_path = hf_hub_download(
44-
repo_id=repo, filename=file, force_download=False
45-
)
46-
47-
params = {
48-
"sampling_strategy": LaunchConfiguration(
49-
"sampling_strategy", default="beam_search"
50-
),
51-
"model": LaunchConfiguration("model", default=model_path),
52-
"openvino_encode_device": LaunchConfiguration(
53-
"openvino_encode_device", default="CPU"
54-
),
55-
"n_threads": LaunchConfiguration("n_threads", default=4),
56-
"n_max_text_ctx": LaunchConfiguration("n_max_text_ctx", default=16384),
57-
"offset_ms": LaunchConfiguration("offset_ms", default=0),
58-
"duration_ms": LaunchConfiguration("duration_ms", default=0),
59-
"translate": LaunchConfiguration("translate", default=False),
60-
"no_context": LaunchConfiguration("no_context", default=True),
61-
"single_segment": LaunchConfiguration("single_segment", default=True),
62-
"token_timestamps": LaunchConfiguration("token_timestamps", default=False),
63-
"thold_pt": LaunchConfiguration("thold_pt", default=0.01),
64-
"thold_ptsum": LaunchConfiguration("thold_ptsum", default=0.01),
65-
"max_len": LaunchConfiguration("max_len", default=0),
66-
"split_on_word": LaunchConfiguration("split_on_word", default=False),
67-
"max_tokens": LaunchConfiguration("max_tokens", default=0),
68-
"audio_ctx": LaunchConfiguration("audio_ctx", default=0),
69-
"suppress_regex": LaunchConfiguration("suppress_regex", default=""),
70-
"language": LaunchConfiguration("language", default="en"),
71-
"detect_language": LaunchConfiguration("detect_language", default=False),
72-
"suppress_blank": LaunchConfiguration("suppress_blank", default=True),
73-
"suppress_nst": LaunchConfiguration("suppress_nst", default=False),
74-
"temperature": LaunchConfiguration("temperature", default=0.00),
75-
"max_initial_ts": LaunchConfiguration("max_initial_ts", default=1.00),
76-
"length_penalty": LaunchConfiguration("length_penalty", default=-1.00),
77-
"temperature_inc": LaunchConfiguration("temperature_inc", default=0.40),
78-
"entropy_thold": LaunchConfiguration("entropy_thold", default=2.40),
79-
"logprob_thold": LaunchConfiguration("logprob_thold", default=-1.00),
80-
"no_speech_thold": LaunchConfiguration("no_speech_thold", default=0.60),
81-
"greedy_best_of": LaunchConfiguration("greedy_best_of", default=5),
82-
"beam_search_beam_size": LaunchConfiguration(
83-
"beam_search_beam_size", default=5
84-
),
85-
"beam_search_patience": LaunchConfiguration(
86-
"beam_search_patience", default=-1.00
87-
),
88-
"n_processors": LaunchConfiguration("n_processors", default=1),
89-
"use_gpu": LaunchConfiguration("use_gpu", default=True),
90-
"gpu_device": LaunchConfiguration("gpu_device", default=0),
91-
"flash_attn": LaunchConfiguration("flash_attn", default=False),
92-
"dtw_n_top": LaunchConfiguration("dtw_n_top", default=-1),
93-
"dtw_token_timestamps": LaunchConfiguration(
94-
"dtw_token_timestamps", default=False
95-
),
96-
"dtw_aheads": LaunchConfiguration("dtw_aheads", default="none"),
97-
}
98-
99-
return (
100-
Node(
101-
package="whisper_ros",
102-
executable="whisper_server_node",
103-
name="whisper_node",
104-
namespace="whisper",
105-
parameters=[params],
106-
condition=UnlessCondition(
107-
PythonExpression([LaunchConfiguration("stream")])
108-
),
109-
),
110-
Node(
111-
package="whisper_ros",
112-
executable="whisper_node",
113-
name="whisper_node",
114-
namespace="whisper",
115-
parameters=[params],
116-
condition=IfCondition(PythonExpression([LaunchConfiguration("stream")])),
117-
),
118-
)
119-
12036
stream_cmd = DeclareLaunchArgument(
12137
"stream",
12238
default_value="False",
12339
description="Whether to launch stream or server node",
12440
)
12541

126-
model_repo = LaunchConfiguration("model_repo")
127-
model_repo_cmd = DeclareLaunchArgument(
128-
"model_repo",
129-
default_value="ggerganov/whisper.cpp",
130-
description="Hugging Face model repo for Whisper",
131-
)
132-
133-
model_filename = LaunchConfiguration("model_filename")
134-
model_filename_cmd = DeclareLaunchArgument(
135-
"model_filename",
136-
default_value="ggml-large-v3-turbo-q5_0.bin",
137-
description="Hugging Face model filename for Whisper",
138-
)
139-
140-
model_path = LaunchConfiguration("model_path")
141-
model_path_cmd = DeclareLaunchArgument(
142-
"model_path",
143-
default_value="",
144-
description="Local path to the model file for Whisper",
145-
)
42+
whisper_params = {
43+
"sampling_strategy": LaunchConfiguration(
44+
"sampling_strategy", default="beam_search"
45+
),
46+
"model_repo": LaunchConfiguration("model_repo", default="ggerganov/whisper.cpp"),
47+
"model_filename": LaunchConfiguration(
48+
"model_filename", default="ggml-large-v3-turbo-q5_0.bin"
49+
),
50+
"model": LaunchConfiguration("model", default=""),
51+
"openvino_encode_device": LaunchConfiguration(
52+
"openvino_encode_device", default="CPU"
53+
),
54+
"n_threads": LaunchConfiguration("n_threads", default=4),
55+
"n_max_text_ctx": LaunchConfiguration("n_max_text_ctx", default=16384),
56+
"offset_ms": LaunchConfiguration("offset_ms", default=0),
57+
"duration_ms": LaunchConfiguration("duration_ms", default=0),
58+
"translate": LaunchConfiguration("translate", default=False),
59+
"no_context": LaunchConfiguration("no_context", default=True),
60+
"single_segment": LaunchConfiguration("single_segment", default=True),
61+
"token_timestamps": LaunchConfiguration("token_timestamps", default=False),
62+
"thold_pt": LaunchConfiguration("thold_pt", default=0.01),
63+
"thold_ptsum": LaunchConfiguration("thold_ptsum", default=0.01),
64+
"max_len": LaunchConfiguration("max_len", default=0),
65+
"split_on_word": LaunchConfiguration("split_on_word", default=False),
66+
"max_tokens": LaunchConfiguration("max_tokens", default=0),
67+
"audio_ctx": LaunchConfiguration("audio_ctx", default=0),
68+
"suppress_regex": LaunchConfiguration("suppress_regex", default=""),
69+
"language": LaunchConfiguration("language", default="en"),
70+
"detect_language": LaunchConfiguration("detect_language", default=False),
71+
"suppress_blank": LaunchConfiguration("suppress_blank", default=True),
72+
"suppress_nst": LaunchConfiguration("suppress_nst", default=False),
73+
"temperature": LaunchConfiguration("temperature", default=0.00),
74+
"max_initial_ts": LaunchConfiguration("max_initial_ts", default=1.00),
75+
"length_penalty": LaunchConfiguration("length_penalty", default=-1.00),
76+
"temperature_inc": LaunchConfiguration("temperature_inc", default=0.40),
77+
"entropy_thold": LaunchConfiguration("entropy_thold", default=2.40),
78+
"logprob_thold": LaunchConfiguration("logprob_thold", default=-1.00),
79+
"no_speech_thold": LaunchConfiguration("no_speech_thold", default=0.60),
80+
"greedy_best_of": LaunchConfiguration("greedy_best_of", default=5),
81+
"beam_search_beam_size": LaunchConfiguration("beam_search_beam_size", default=5),
82+
"beam_search_patience": LaunchConfiguration(
83+
"beam_search_patience", default=-1.00
84+
),
85+
"n_processors": LaunchConfiguration("n_processors", default=1),
86+
"use_gpu": LaunchConfiguration("use_gpu", default=True),
87+
"gpu_device": LaunchConfiguration("gpu_device", default=0),
88+
"flash_attn": LaunchConfiguration("flash_attn", default=False),
89+
"dtw_n_top": LaunchConfiguration("dtw_n_top", default=-1),
90+
"dtw_token_timestamps": LaunchConfiguration(
91+
"dtw_token_timestamps", default=False
92+
),
93+
"dtw_aheads": LaunchConfiguration("dtw_aheads", default="none"),
94+
}
14695

14796
silero_vad_model_repo = LaunchConfiguration("silero_vad_model_repo")
14897
silero_vad_model_repo_cmd = DeclareLaunchArgument(
@@ -168,15 +117,26 @@ def run_whisper(context: LaunchContext, repo, file, model_path):
168117
return LaunchDescription(
169118
[
170119
stream_cmd,
171-
model_repo_cmd,
172-
model_filename_cmd,
173-
model_path_cmd,
174120
silero_vad_model_repo_cmd,
175121
silero_vad_model_filename_cmd,
176122
silero_vad_model_path_cmd,
177-
OpaqueFunction(
178-
function=run_whisper,
179-
args=[model_repo, model_filename, model_path],
123+
Node(
124+
package="whisper_ros",
125+
executable="whisper_server_node",
126+
name="whisper_node",
127+
namespace="whisper",
128+
parameters=[whisper_params],
129+
condition=UnlessCondition(
130+
PythonExpression([LaunchConfiguration("stream")])
131+
),
132+
),
133+
Node(
134+
package="whisper_ros",
135+
executable="whisper_node",
136+
name="whisper_node",
137+
namespace="whisper",
138+
parameters=[whisper_params],
139+
condition=IfCondition(PythonExpression([LaunchConfiguration("stream")])),
180140
),
181141
IncludeLaunchDescription(
182142
PythonLaunchDescriptionSource(

whisper_hfhub_vendor/CMakeLists.txt

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
cmake_minimum_required(VERSION 3.8)
2+
project(whisper_hfhub_vendor)
3+
4+
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
5+
add_compile_options(-Wall -Wextra -Wpedantic)
6+
endif()
7+
8+
include(FetchContent)
9+
find_package(ament_cmake REQUIRED)
10+
find_package(CURL REQUIRED) # Add CURL as a required package
11+
12+
# Declare hfhub from Git repository
13+
FetchContent_Declare(
14+
hfhub
15+
GIT_REPOSITORY https://github.com/agonzc34/huggingface-hub-cpp
16+
GIT_TAG v1.0.0
17+
GIT_SHALLOW TRUE
18+
)
19+
20+
FetchContent_MakeAvailable(hfhub)
21+
22+
# Export targets and include directories
23+
install(
24+
TARGETS hfhub
25+
EXPORT export_hfhub
26+
LIBRARY DESTINATION lib
27+
INCLUDES DESTINATION include
28+
)
29+
30+
# Export include directories and dependencies (CURL)
31+
ament_export_include_directories(include)
32+
ament_export_dependencies(CURL) # Export CURL dependency
33+
ament_export_targets(export_hfhub HAS_LIBRARY_TARGET)
34+
35+
ament_package()

whisper_hfhub_vendor/package.xml

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?xml version="1.0"?>
2+
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
3+
<package format="3">
4+
<name>whisper_hfhub_vendor</name>
5+
<version>3.0.3</version>
6+
<description>huggingface-hub-cpp vendor package for whisper_ros</description>
7+
<maintainer email="mgons@unileon.es">Miguel Ángel González Santamarta</maintainer>
8+
<license>MIT</license>
9+
10+
<buildtool_depend>ament_cmake</buildtool_depend>
11+
12+
<depend>curl</depend>
13+
14+
<export>
15+
<build_type>ament_cmake</build_type>
16+
</export>
17+
</package>

0 commit comments

Comments (0)