llama-stack/llama_stack/templates/remote-vllm/vllm.py
Xi Yan 9d005154d7
fix vllm template (#813)
# What does this PR do?

- Fix vLLM template to resolve
https://github.com/meta-llama/llama-stack/issues/805
- Fix agents test with shields

## Test Plan

```
vllm serve meta-llama/Llama-3.1-8B-Instruct
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="meta-llama/Llama-3.1-8B-Instruct" llama stack run ./llama_stack/templates/remote-vllm/run.yaml
```

```
LLAMA_STACK_BASE_URL=http://localhost:8321 pytest -v ./tests/client-sdk/
```

<img width="1245" alt="image"
src="https://github.com/user-attachments/assets/9af27684-5a9c-4187-b338-cbfc5211bd99"
/>


- custom tool flaky due to model outputs
- /completions API not implemented

**Vision Model**
- 11B-Vision-Instruct
<img width="1240" alt="image"
src="https://github.com/user-attachments/assets/1d3b3b17-fa09-43a7-b56c-3f77263825c5"
/>


## Sources

Please link relevant resources if necessary.


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor
guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
      Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
2025-01-17 15:34:29 -08:00

157 lines
5.5 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.apis.models.models import ModelType
from llama_stack.distribution.datatypes import (
ModelInput,
Provider,
ShieldInput,
ToolGroupInput,
)
from llama_stack.providers.inline.inference.sentence_transformers import (
SentenceTransformersInferenceConfig,
)
from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::vllm"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"eval": ["inline::meta-reference"],
"datasetio": ["remote::huggingface", "inline::localfs"],
"scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
"telemetry": ["inline::meta-reference"],
"tool_runtime": [
"remote::brave-search",
"remote::tavily-search",
"inline::code-interpreter",
"inline::memory-runtime",
],
}
name = "remote-vllm"
inference_provider = Provider(
provider_id="vllm-inference",
provider_type="remote::vllm",
config=VLLMInferenceAdapterConfig.sample_run_config(
url="${env.VLLM_URL}",
),
)
embedding_provider = Provider(
provider_id="sentence-transformers",
provider_type="inline::sentence-transformers",
config=SentenceTransformersInferenceConfig.sample_run_config(),
)
memory_provider = Provider(
provider_id="faiss",
provider_type="inline::faiss",
config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="vllm-inference",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="vllm-safety",
)
embedding_model = ModelInput(
model_id="all-MiniLM-L6-v2",
provider_id="sentence-transformers",
model_type=ModelType.embedding,
metadata={
"embedding_dimension": 384,
},
)
default_tool_groups = [
ToolGroupInput(
toolgroup_id="builtin::websearch",
provider_id="tavily-search",
),
ToolGroupInput(
toolgroup_id="builtin::memory",
provider_id="memory-runtime",
),
ToolGroupInput(
toolgroup_id="builtin::code_interpreter",
provider_id="code-interpreter",
),
]
return DistributionTemplate(
name=name,
distro_type="self_hosted",
description="Use (an external) vLLM server for running LLM inference",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider, embedding_provider],
"memory": [memory_provider],
},
default_models=[inference_model, embedding_model],
default_tool_groups=default_tool_groups,
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="vllm-safety",
provider_type="remote::vllm",
config=VLLMInferenceAdapterConfig.sample_run_config(
url="${env.SAFETY_VLLM_URL}",
),
),
embedding_provider,
],
"memory": [memory_provider],
},
default_models=[
inference_model,
safety_model,
embedding_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
default_tool_groups=default_tool_groups,
),
},
run_config_env_vars={
"LLAMA_STACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the vLLM server",
),
"VLLM_URL": (
"http://host.docker.internal:5100/v1",
"URL of the vLLM server with the main inference model",
),
"MAX_TOKENS": (
"4096",
"Maximum number of tokens for generation",
),
"SAFETY_VLLM_URL": (
"http://host.docker.internal:5101/v1",
"URL of the vLLM server with the safety model",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Name of the safety (Llama-Guard) model to use",
),
},
)