Commit

Get rid of server_args in internal interfaces
JianyuZhan committed Aug 19, 2024
1 parent 560a5ac commit b33164f
Showing 33 changed files with 830 additions and 1,728 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/e2e-test.yml
@@ -35,6 +35,14 @@ jobs:
           pip install -e "python[all]"
           pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+      - name: Set PYTHONPATH
+        run:
+          echo "PYTHONPATH=$PYTHONPATH:$(pwd)/python" >> $GITHUB_ENV
+
+      - name: Verify import
+        run: |
+          python3 -c "import sglang.srt.serving"
       - name: Benchmark Serving Throughput
         run: |
           cd test/srt
1 change: 1 addition & 0 deletions .gitignore
@@ -50,6 +50,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+human-eval/
 
 # Translations
 *.mo
20 changes: 10 additions & 10 deletions examples/usage/llava_video/srt_example_llava_v.py
@@ -184,20 +184,20 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=
         print("Invalid model path. Please specify a valid model path.")
         exit()
 
-    model_overide_args = {}
+    model_override_args = {}
 
-    model_overide_args["mm_spatial_pool_stride"] = args.mm_spatial_pool_stride
-    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
-    model_overide_args["num_frames"] = args.num_frames
-    model_overide_args["model_type"] = "llava"
+    model_override_args["mm_spatial_pool_stride"] = args.mm_spatial_pool_stride
+    model_override_args["architectures"] = ["LlavaVidForCausalLM"]
+    model_override_args["num_frames"] = args.num_frames
+    model_override_args["model_type"] = "llava"
 
     if "34b" in args.model_path.lower():
-        model_overide_args["image_token_index"] = 64002
+        model_override_args["image_token_index"] = 64002
 
     if args.num_frames == 32:
-        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
-        model_overide_args["max_sequence_length"] = 4096 * 2
-        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
+        model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_override_args["max_sequence_length"] = 4096 * 2
+        model_override_args["tokenizer_model_max_length"] = 4096 * 2
     elif args.num_frames < 32:
         pass
     else:
@@ -211,7 +211,7 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=
         tokenizer_path=tokenizer_path,
         port=cur_port,
         additional_ports=[cur_port + 1, cur_port + 2, cur_port + 3, cur_port + 4],
-        model_overide_args=model_overide_args,
+        model_override_args=model_override_args,
         tp_size=1,
     )
     sgl.set_default_backend(runtime)
4 changes: 2 additions & 2 deletions examples/usage/llm_engine.py
@@ -16,5 +16,5 @@
 outputs = llm.generate(prompts, sampling_params)
 # Print the outputs.
 for prompt, output in zip(prompts, outputs):
-    print('===============================')
-    print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+    print("===============================")
+    print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
2 changes: 1 addition & 1 deletion python/sglang/__init__.py
@@ -2,8 +2,8 @@
 
 from sglang.api import (
     LLM,
-    SamplingParams,
     Runtime,
+    SamplingParams,
     assistant,
     assistant_begin,
     assistant_end,
5 changes: 3 additions & 2 deletions python/sglang/api.py
@@ -18,8 +18,9 @@
     SglSelect,
     SglVideo,
 )
-from sglang.srt.serving.engine import LLM
 from sglang.srt.sampling_params import SamplingParams
+from sglang.srt.serving.engine import LLM
+
 
 def function(
     func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
@@ -36,7 +37,7 @@ def decorator(func):
 def Runtime(*args, **kwargs):
     # Avoid importing unnecessary dependency
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-    from sglang.srt.server import Runtime
+    from sglang.srt.serving.server import Runtime
 
     return Runtime(*args, **kwargs)
23 changes: 16 additions & 7 deletions python/sglang/bench_latency.py
@@ -49,13 +49,18 @@
 import torch
 import torch.distributed as dist
 
+from sglang.srt.config import (
+    ModelConfig,
+    OptimizationConfig,
+    ParallelConfig,
+    ScheduleConfig,
+)
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
-from sglang.srt.model_config import ModelConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling_params import SamplingParams
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.serving.server_args import ServerArgs
 from sglang.srt.utils import suppress_other_loggers
 
 
@@ -111,15 +116,19 @@ def load_model(server_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
-    model_config = ModelConfig(path=server_args.model_path)
+    model_config = ModelConfig(model_path=server_args.model_path)
+    optimization_config = OptimizationConfig()
+    parallel_config = ParallelConfig(tp_size=server_args.tp_size, nccl_ports=[28888])
+    schedule_config = ScheduleConfig(
+        mem_fraction_static=server_args.mem_fraction_static
+    )
     model_runner = ModelRunner(
         model_config=model_config,
-        mem_fraction_static=server_args.mem_fraction_static,
+        optimization_config=optimization_config,
+        parallel_config=parallel_config,
+        schedule_config=schedule_config,
         gpu_id=tp_rank,
         tp_rank=tp_rank,
-        tp_size=server_args.tp_size,
-        nccl_port=28888,
-        server_args=server_args,
     )
     rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
     tokenizer = get_tokenizer(
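
Taken together, the bench_latency.py hunks show the core of the commit: ModelRunner no longer receives a ServerArgs grab-bag plus loose keyword arguments, but four focused config objects. A minimal sketch of the new construction pattern, inferred from this diff alone (any constructor fields or defaults beyond those visible above are assumptions; the values are illustrative):

    from sglang.srt.config import (
        ModelConfig,
        OptimizationConfig,
        ParallelConfig,
        ScheduleConfig,
    )
    from sglang.srt.model_executor.model_runner import ModelRunner

    # Illustrative values, not defaults taken from the codebase.
    model_config = ModelConfig(model_path="path/to/model")  # hypothetical path
    parallel_config = ParallelConfig(tp_size=1, nccl_ports=[28888])
    schedule_config = ScheduleConfig(mem_fraction_static=0.88)

    model_runner = ModelRunner(
        model_config=model_config,
        optimization_config=OptimizationConfig(),  # defaults, as in the diff
        parallel_config=parallel_config,
        schedule_config=schedule_config,
        gpu_id=0,
        tp_rank=0,
    )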
4 changes: 2 additions & 2 deletions python/sglang/launch_server.py
@@ -2,8 +2,8 @@
 
 import argparse
 
-from sglang.srt.server import launch_server
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.serving.server import launch_server
+from sglang.srt.serving.server_args import ServerArgs
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
29 changes: 15 additions & 14 deletions python/sglang/launch_server_llavavid.py
@@ -2,28 +2,29 @@
 
 import argparse
 
-from sglang.srt.server import ServerArgs, launch_server
+from sglang.srt.serving.server import ServerArgs, launch_server
 
 if __name__ == "__main__":
-    model_overide_args = {}
-
-    model_overide_args["mm_spatial_pool_stride"] = 2
-    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
-    model_overide_args["num_frames"] = 16
-    model_overide_args["model_type"] = "llavavid"
-    if model_overide_args["num_frames"] == 32:
-        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
-        model_overide_args["max_sequence_length"] = 4096 * 2
-        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
-        model_overide_args["model_max_length"] = 4096 * 2
+    model_override_args = {}
+
+    model_override_args["mm_spatial_pool_stride"] = 2
+    model_override_args["architectures"] = ["LlavaVidForCausalLM"]
+    model_override_args["num_frames"] = 16
+    model_override_args["model_type"] = "llavavid"
+    if model_override_args["num_frames"] == 32:
+        model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_override_args["max_sequence_length"] = 4096 * 2
+        model_override_args["tokenizer_model_max_length"] = 4096 * 2
+        model_override_args["model_max_length"] = 4096 * 2
 
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     args = parser.parse_args()
 
     if "34b" in args.model_path.lower():
-        model_overide_args["image_token_index"] = 64002
+        model_override_args["image_token_index"] = 64002
 
     server_args = ServerArgs.from_cli_args(args)
+    server_args.model_override_args = model_override_args
 
-    launch_server(server_args, model_overide_args, None)
+    launch_server(server_args, None)
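
The same contract change appears here: model override dictionaries now ride on ServerArgs itself rather than being threaded through launch_server as an extra positional argument. A hedged sketch of the calling convention after this commit (the trailing None is copied verbatim from the diff; its role is not visible in this excerpt, and the override values are illustrative):

    import argparse

    from sglang.srt.serving.server import ServerArgs, launch_server

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args()

    server_args = ServerArgs.from_cli_args(args)
    # Overrides attach to ServerArgs (per this commit), not to launch_server.
    server_args.model_override_args = {"model_type": "llavavid", "num_frames": 16}
    launch_server(server_args, None)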
Empty file added python/sglang/srt/__init__.py
