Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-06-28 02:53:30 +00:00
Some templates were still using the old environment variable substitution syntax instead of the new one and were not getting substituted properly. Also, some places didn't handle the new `None` values versus the old empty-string ("") values that come from conditional environment variable substitution.

This gets the starter and remote-vllm distributions starting again. I tested various permutations of the starter, since chroma and pgvector needed some adjustments to their config classes to handle the new possible `None` values. I also had to tweak our `Provider` class to handle `None` values, for cases where providers are disabled in the starter config via environment variables.

This may not have caught everything that was missed, but I did grep around quite a bit to try to find anything lingering.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
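For context, the new substitution syntax is the `${env.VAR:=default}` form seen in the file below (for example `${env.VLLM_URL:=http://localhost:8000/v1}`), and conditional substitution can now resolve a disabled setting to `None` rather than an empty string. The following is only a minimal sketch of the kind of `None`-tolerant config handling the commit message describes, using a hypothetical pydantic model rather than the actual chroma/pgvector/`Provider` classes:

    from pydantic import BaseModel, field_validator


    class ExampleRemoteVectorIOConfig(BaseModel):
        # Hypothetical stand-in for a remote vector_io config (not the real
        # chroma or pgvector class): the URL may now arrive as None when the
        # provider is disabled via conditional env var substitution.
        url: str | None = None

        @field_validator("url", mode="before")
        @classmethod
        def _normalize_url(cls, value):
            # Treat the old empty-string value and the new None the same way.
            return value or None

The point is only that config classes now have to accept `None` in fields that previously always received a string; the real fixes live in the chroma, pgvector, and `Provider` classes mentioned above.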
157 lines · 5.6 KiB · Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.apis.models import ModelType
from llama_stack.distribution.datatypes import (
    ModelInput,
    Provider,
    ShieldInput,
    ToolGroupInput,
)
from llama_stack.providers.inline.inference.sentence_transformers import (
    SentenceTransformersInferenceConfig,
)
from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::vllm", "inline::sentence-transformers"],
        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
        "telemetry": ["inline::meta-reference"],
        "tool_runtime": [
            "remote::brave-search",
            "remote::tavily-search",
            "inline::rag-runtime",
            "remote::model-context-protocol",
            "remote::wolfram-alpha",
        ],
    }
    name = "remote-vllm"
    inference_provider = Provider(
        provider_id="vllm-inference",
        provider_type="remote::vllm",
        config=VLLMInferenceAdapterConfig.sample_run_config(
            url="${env.VLLM_URL:=http://localhost:8000/v1}",
        ),
    )
    embedding_provider = Provider(
        provider_id="sentence-transformers",
        provider_type="inline::sentence-transformers",
        config=SentenceTransformersInferenceConfig.sample_run_config(),
    )
    vector_io_provider = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
    )

    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="vllm-inference",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="vllm-safety",
    )
    embedding_model = ModelInput(
        model_id="all-MiniLM-L6-v2",
        provider_id="sentence-transformers",
        model_type=ModelType.embedding,
        metadata={
            "embedding_dimension": 384,
        },
    )
    default_tool_groups = [
        ToolGroupInput(
            toolgroup_id="builtin::websearch",
            provider_id="tavily-search",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::rag",
            provider_id="rag-runtime",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::wolfram_alpha",
            provider_id="wolfram-alpha",
        ),
    ]

    return DistributionTemplate(
        name=name,
        distro_type="self_hosted",
        description="Use (an external) vLLM server for running LLM inference",
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider, embedding_provider],
                    "vector_io": [vector_io_provider],
                },
                default_models=[inference_model, embedding_model],
                default_tool_groups=default_tool_groups,
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [
                        inference_provider,
                        Provider(
                            provider_id="vllm-safety",
                            provider_type="remote::vllm",
                            config=VLLMInferenceAdapterConfig.sample_run_config(
                                url="${env.SAFETY_VLLM_URL}",
                            ),
                        ),
                        embedding_provider,
                    ],
                    "vector_io": [vector_io_provider],
                },
                default_models=[
                    inference_model,
                    safety_model,
                    embedding_model,
                ],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
                default_tool_groups=default_tool_groups,
            ),
        },
        run_config_env_vars={
            "LLAMA_STACK_PORT": (
                "8321",
                "Port for the Llama Stack distribution server",
            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
                "Inference model loaded into the vLLM server",
            ),
            "VLLM_URL": (
                "http://host.docker.internal:5100/v1",
                "URL of the vLLM server with the main inference model",
            ),
            "MAX_TOKENS": (
                "4096",
                "Maximum number of tokens for generation",
            ),
            "SAFETY_VLLM_URL": (
                "http://host.docker.internal:5101/v1",
                "URL of the vLLM server with the safety model",
            ),
            "SAFETY_MODEL": (
                "meta-llama/Llama-Guard-3-1B",
                "Name of the safety (Llama-Guard) model to use",
            ),
        },
    )
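A quick usage sketch, not part of the file above: it assumes `get_distribution_template` from this module is already in scope and that the constructor keyword arguments are readable as attributes on the returned `DistributionTemplate`.

    # Illustrative only: build the "remote-vllm" template and inspect a few of
    # the fields populated in get_distribution_template() above.
    template = get_distribution_template()

    print(template.name)                              # "remote-vllm"
    print(sorted(template.providers))                 # agents, datasetio, eval, inference, ...
    print(list(template.run_configs))                 # ["run.yaml", "run-with-safety.yaml"]
    print(template.run_config_env_vars["VLLM_URL"])   # default URL plus its description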