feat(transformers): merge musicgen functionalities to a single backend #4620

Merged: 4 commits, Jan 17, 2025

Changes from all commits
2 changes: 1 addition & 1 deletion .bruno/LocalAI Test Requests/tts/musicgen.bru

@@ -16,7 +16,7 @@ headers {

 body:json {
   {
-    "backend": "transformers-musicgen",
+    "backend": "transformers",
     "model": "facebook/musicgen-small",
     "input": "80s Synths playing Jazz"
   }
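
The only user-facing change in this request file is the backend name: the musicgen model is now served by the consolidated transformers backend. As a quick sanity check outside Bruno, the same request can be replayed from Python; this is a minimal sketch assuming a LocalAI instance listening on localhost:8080 and the third-party requests package (host, port, and output filename are illustrative):

import requests

# Same payload as the .bru request above, pointed at LocalAI's /tts endpoint
resp = requests.post(
    "http://localhost:8080/tts",
    json={
        "backend": "transformers",  # was "transformers-musicgen" before this PR
        "model": "facebook/musicgen-small",
        "input": "80s Synths playing Jazz",
    },
)
resp.raise_for_status()

# The endpoint responds with the generated audio bytes
with open("musicgen.wav", "wb") as out:
    out.write(resp.content)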
4 changes: 0 additions & 4 deletions .github/dependabot.yml

@@ -81,10 +81,6 @@ updates:
     directory: "/backend/python/transformers"
     schedule:
       interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/transformers-musicgen"
-    schedule:
-      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/vllm"
    schedule:
40 changes: 20 additions & 20 deletions .github/workflows/test-extra.yml

@@ -153,27 +153,27 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/openvoice
           make --jobs=5 --output-sync=target -C backend/python/openvoice test
-  tests-transformers-musicgen:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          # Install UV
-          curl -LsSf https://astral.sh/uv/install.sh | sh
-          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
-          sudo apt-get install -y libopencv-dev
-          pip install --user --no-cache-dir grpcio-tools==1.64.1
+  # tests-transformers-musicgen:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v4
+  #       with:
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install build-essential ffmpeg
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user --no-cache-dir grpcio-tools==1.64.1

-      - name: Test transformers-musicgen
-        run: |
-          make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
-          make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
+  #     - name: Test transformers-musicgen
+  #       run: |
+  #         make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
+  #         make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test

   # tests-bark:
   #   runs-on: ubuntu-latest
5 changes: 1 addition & 4 deletions Dockerfile

@@ -15,7 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"

 RUN apt-get update && \

@@ -448,9 +448,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
     ; fi && \
     if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
         make -C backend/python/diffusers \
-    ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/transformers-musicgen \
     ; fi

 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
13 changes: 2 additions & 11 deletions Makefile

@@ -583,10 +583,10 @@ protogen-go-clean:
 	$(RM) bin/*

 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen kokoro-protogen vllm-protogen openvoice-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen kokoro-protogen vllm-protogen openvoice-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean kokoro-protogen-clean vllm-protogen-clean openvoice-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean parler-tts-protogen-clean kokoro-protogen-clean vllm-protogen-clean openvoice-protogen-clean

 .PHONY: autogptq-protogen
 autogptq-protogen:

@@ -668,14 +668,6 @@ parler-tts-protogen:
 parler-tts-protogen-clean:
 	$(MAKE) -C backend/python/parler-tts protogen-clean

-.PHONY: transformers-musicgen-protogen
-transformers-musicgen-protogen:
-	$(MAKE) -C backend/python/transformers-musicgen protogen
-
-.PHONY: transformers-musicgen-protogen-clean
-transformers-musicgen-protogen-clean:
-	$(MAKE) -C backend/python/transformers-musicgen protogen-clean
-
 .PHONY: kokoro-protogen
 kokoro-protogen:
 	$(MAKE) -C backend/python/kokoro protogen

@@ -712,7 +704,6 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/sentencetransformers
 	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
-	$(MAKE) -C backend/python/transformers-musicgen
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/kokoro
 	$(MAKE) -C backend/python/openvoice
The entire backend/python/transformers-musicgen directory is removed:

29 changes: 0 additions & 29 deletions backend/python/transformers-musicgen/Makefile (file deleted)
5 changes: 0 additions & 5 deletions backend/python/transformers-musicgen/README.md (file deleted)
176 changes: 0 additions & 176 deletions backend/python/transformers-musicgen/backend.py (file deleted)
14 changes: 0 additions & 14 deletions backend/python/transformers-musicgen/install.sh (file deleted)
3 changes: 0 additions & 3 deletions backend/python/transformers-musicgen/requirements-cpu.txt (file deleted)
This file was deleted.
This file was deleted.
4 changes: 0 additions & 4 deletions backend/python/transformers-musicgen/requirements-hipblas.txt (file deleted)
8 changes: 0 additions & 8 deletions backend/python/transformers-musicgen/requirements-intel.txt (file deleted)
4 changes: 0 additions & 4 deletions backend/python/transformers-musicgen/requirements.txt (file deleted)
4 changes: 0 additions & 4 deletions backend/python/transformers-musicgen/run.sh (file deleted)
100 changes: 0 additions & 100 deletions backend/python/transformers-musicgen/test.py (file deleted)
6 changes: 0 additions & 6 deletions backend/python/transformers-musicgen/test.sh (file deleted)

117 changes: 106 additions & 11 deletions backend/python/transformers/backend.py

@@ -22,6 +22,8 @@

 XPU=os.environ.get("XPU", "0") == "1"
 from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
+from transformers import AutoProcessor, MusicgenForConditionalGeneration
+from scipy.io import wavfile


 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -191,6 +193,9 @@ def LoadModel(self, request, context):
                                                               export=True,
                                                               device=device_map)
                 self.OV = True
+            elif request.Type == "MusicgenForConditionalGeneration":
+                self.processor = AutoProcessor.from_pretrained(model_name)
+                self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
             else:
                 print("Automodel", file=sys.stderr)
                 self.model = AutoModel.from_pretrained(model_name,

@@ -201,19 +206,22 @@ def LoadModel(self, request, context):
                                                        torch_dtype=compute)
             if request.ContextSize > 0:
                 self.max_tokens = request.ContextSize
-            else:
+            elif request.Type != "MusicgenForConditionalGeneration":
                 self.max_tokens = self.model.config.max_position_embeddings
+            else:
+                self.max_tokens = 512

-            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
-            self.XPU = False
-
-            if XPU and self.OV == False:
-                self.XPU = True
-                try:
-                    print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
-                    self.model = ipex.optimize_transformers(self.model, inplace=True, dtype=torch.float16, device="xpu")
-                except Exception as err:
-                    print("Not using XPU:", err, file=sys.stderr)
+            if request.Type != "MusicgenForConditionalGeneration":
+                self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
+                self.XPU = False
+
+                if XPU and self.OV == False:
+                    self.XPU = True
+                    try:
+                        print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
+                        self.model = ipex.optimize_transformers(self.model, inplace=True, dtype=torch.float16, device="xpu")
+                    except Exception as err:
+                        print("Not using XPU:", err, file=sys.stderr)

         except Exception as err:
             print("Error:", err, file=sys.stderr)

@@ -380,6 +388,93 @@ async def PredictStream(self, request, context):
         finally:
             await iterations.aclose()

+    def SoundGeneration(self, request, context):
+        model_name = request.model
+        try:
+            if self.processor is None:
+                if model_name == "":
+                    return backend_pb2.Result(success=False, message="request.model is required")
+                self.processor = AutoProcessor.from_pretrained(model_name)
+            if self.model is None:
+                if model_name == "":
+                    return backend_pb2.Result(success=False, message="request.model is required")
+                self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
+            inputs = None
+            if request.text == "":
+                inputs = self.model.get_unconditional_inputs(num_samples=1)
+            elif request.HasField('src'):
+                # TODO: validate and sanitize the source path before reading it
+                sample_rate, wsamples = wavfile.read(request.src)
+
+                if request.HasField('src_divisor'):
+                    wsamples = wsamples[: len(wsamples) // request.src_divisor]
+
+                inputs = self.processor(
+                    audio=wsamples,
+                    sampling_rate=sample_rate,
+                    text=[request.text],
+                    padding=True,
+                    return_tensors="pt",
+                )
+            else:
+                inputs = self.processor(
+                    text=[request.text],
+                    padding=True,
+                    return_tensors="pt",
+                )
+
+            tokens = 256
+            if request.HasField('duration'):
+                tokens = int(request.duration * 51.2)  # 256 tokens = 5 seconds, therefore 51.2 tokens is one second
+            guidance = 3.0
+            if request.HasField('temperature'):
+                guidance = request.temperature
+            dosample = True
+            if request.HasField('sample'):
+                dosample = request.sample
+            audio_values = self.model.generate(**inputs, do_sample=dosample, guidance_scale=guidance, max_new_tokens=tokens)
+            print("[transformers-musicgen] SoundGeneration generated!", file=sys.stderr)
+            sampling_rate = self.model.config.audio_encoder.sampling_rate
+            wavfile.write(request.dst, rate=sampling_rate, data=audio_values[0, 0].numpy())
+            print("[transformers-musicgen] SoundGeneration saved to", request.dst, file=sys.stderr)
+            print("[transformers-musicgen] SoundGeneration for", file=sys.stderr)
+            print("[transformers-musicgen] SoundGeneration requested tokens", tokens, file=sys.stderr)
+            print(request, file=sys.stderr)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(success=True)
+
+    # The TTS endpoint is older and provides fewer features, but exists for compatibility reasons
+    def TTS(self, request, context):
+        model_name = request.model
+        try:
+            if self.processor is None:
+                if model_name == "":
+                    return backend_pb2.Result(success=False, message="request.model is required")
+                self.processor = AutoProcessor.from_pretrained(model_name)
+            if self.model is None:
+                if model_name == "":
+                    return backend_pb2.Result(success=False, message="request.model is required")
+                self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
+            inputs = self.processor(
+                text=[request.text],
+                padding=True,
+                return_tensors="pt",
+            )
+            tokens = 512  # No good place to set the "length" in TTS, so use 10s as a sane default
+            audio_values = self.model.generate(**inputs, max_new_tokens=tokens)
+            print("[transformers-musicgen] TTS generated!", file=sys.stderr)
+            sampling_rate = self.model.config.audio_encoder.sampling_rate
+            wavfile.write(request.dst, rate=sampling_rate, data=audio_values[0, 0].numpy())
+            print("[transformers-musicgen] TTS saved to", request.dst, file=sys.stderr)
+            print("[transformers-musicgen] TTS for", file=sys.stderr)
+            print(request, file=sys.stderr)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(success=True)
+
 async def serve(address):
     # Start asyncio gRPC server
     server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
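
SoundGeneration maps the request's duration to decoder tokens at roughly 51.2 tokens per second (256 tokens is about 5 seconds of audio) and reuses temperature as MusicGen's classifier-free guidance scale. The following standalone sketch traces the same generation path the handler follows, assuming transformers, torch, and scipy are installed (model name, prompt, and output path are illustrative):

from transformers import AutoProcessor, MusicgenForConditionalGeneration
from scipy.io import wavfile

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

inputs = processor(text=["80s Synths playing Jazz"], padding=True, return_tensors="pt")

# Same duration-to-token mapping as the handler: 256 tokens ~ 5 s, i.e. ~51.2 tokens per second
duration_seconds = 5
tokens = int(duration_seconds * 51.2)

audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3.0, max_new_tokens=tokens)

# MusicGen exposes its output sample rate through the audio encoder config
sampling_rate = model.config.audio_encoder.sampling_rate
wavfile.write("out.wav", rate=sampling_rate, data=audio_values[0, 0].numpy())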
3 changes: 2 additions & 1 deletion backend/python/transformers/requirements.txt

@@ -1,4 +1,5 @@
 grpcio==1.69.0
 protobuf
 certifi
-setuptools
+setuptools
+scipy==1.14.0
59 changes: 56 additions & 3 deletions backend/python/transformers/test.py

@@ -19,6 +19,7 @@ def setUp(self):
         This method sets up the gRPC service by starting the server
         """
         self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
+        time.sleep(10)

     def tearDown(self) -> None:
         """

@@ -31,7 +32,6 @@ def test_server_startup(self):
         """
         This method tests if the server starts up successfully
         """
-        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:

@@ -48,7 +48,6 @@ def test_load_model(self):
         """
         This method tests if the model is loaded successfully
         """
-        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:

@@ -66,7 +65,6 @@ def test_embedding(self):
         """
         This method tests if the embeddings are generated successfully
         """
-        time.sleep(10)
         try:
             self.setUp()
             with grpc.insecure_channel("localhost:50051") as channel:

@@ -80,5 +78,60 @@ def test_embedding(self):
         except Exception as err:
             print(err)
             self.fail("Embedding service failed")
         finally:
             self.tearDown()
+
+    def test_audio_load_model(self):
+        """
+        This method tests if the MusicGen model is loaded successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small", Type="MusicgenForConditionalGeneration"))
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+        finally:
+            self.tearDown()
+
+    def test_tts(self):
+        """
+        This method tests if TTS audio is generated successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small", Type="MusicgenForConditionalGeneration"))
+                self.assertTrue(response.success)
+                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
+                tts_response = stub.TTS(tts_request)
+                self.assertIsNotNone(tts_response)
+        except Exception as err:
+            print(err)
+            self.fail("TTS service failed")
+        finally:
+            self.tearDown()
+
+    def test_sound_generation(self):
+        """
+        This method tests if SoundGeneration audio is generated successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small", Type="MusicgenForConditionalGeneration"))
+                self.assertTrue(response.success)
+                sg_request = backend_pb2.SoundGenerationRequest(text="80s TV news production music hit for tonight's biggest story")
+                sg_response = stub.SoundGeneration(sg_request)
+                self.assertIsNotNone(sg_response)
+        except Exception as err:
+            print(err)
+            self.fail("SoundGeneration service failed")
+        finally:
+            self.tearDown()
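
Beyond the unit tests, the same flow can be exercised manually against a running backend process. A hedged sketch reusing the generated backend_pb2/backend_pb2_grpc stubs from the tests, assuming the server is already listening on localhost:50051; the dst path is illustrative, and the field names follow the handler code above:

import grpc
import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)

    # Load MusicGen through the merged transformers backend
    load = stub.LoadModel(backend_pb2.ModelOptions(
        Model="facebook/musicgen-small",
        Type="MusicgenForConditionalGeneration",
    ))
    assert load.success, load.message

    # Generate audio; dst is the path where the backend writes the WAV file
    result = stub.SoundGeneration(backend_pb2.SoundGenerationRequest(
        text="80s TV news production music hit for tonight's biggest story",
        dst="/tmp/musicgen.wav",
    ))
    assert result.success, result.message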