add embedding model by default to distribution templates (#617)

# What does this PR do?
Adds the sentence-transformers inference provider and registers the
`all-MiniLM-L6-v2` embedding model as a default model in the run.yaml of
every distribution template.

## Test Plan
```sh
llama stack build --template together --image-type conda
llama stack run ~/.llama/distributions/llamastack-together/together-run.yaml
```
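
As a follow-up sanity check (an assumption, not part of the original test plan: it presumes the `llama-stack-client` CLI is installed and pointed at the running server), the new default model should show up in the model list:

```sh
# hypothetical verification step
llama-stack-client models list   # expect all-MiniLM-L6-v2 with model_type "embedding"
```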
Dinesh Yeduguru 2024-12-13 12:48:00 -08:00 committed by GitHub
parent e893b22868
commit 516e1a3e59
41 changed files with 473 additions and 64 deletions

```diff
@@ -249,6 +249,7 @@
         "redis",
         "scikit-learn",
         "scipy",
+        "sentence-transformers",
         "sentencepiece",
         "torch",
         "torchvision",
@@ -287,6 +288,7 @@
         "redis",
         "scikit-learn",
         "scipy",
+        "sentence-transformers",
         "sentencepiece",
         "torch",
         "torchao==0.5.0",
```

```diff
@@ -21,9 +21,10 @@ class CommonModelFields(BaseModel):
     )


-class ModelType(Enum):
+@json_schema_type
+class ModelType(str, Enum):
     llm = "llm"
-    embedding_model = "embedding"
+    embedding = "embedding"


 @json_schema_type
```
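
A note on the `(str, Enum)` change above: mixing in `str` makes each member compare and serialize as its plain string value, which the run.yaml generation and JSON schemas below rely on. A standalone illustration (plain Python, not code from this repo):

```python
from enum import Enum

class ModelType(str, Enum):
    llm = "llm"
    embedding = "embedding"

# The str mixin lets members behave like their string values.
assert ModelType.embedding == "embedding"
assert ModelType("embedding") is ModelType.embedding
```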

```diff
@@ -109,7 +109,7 @@ class InferenceRouter(Inference):
         model = await self.routing_table.get_model(model_id)
         if model is None:
             raise ValueError(f"Model '{model_id}' not found")
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             raise ValueError(
                 f"Model '{model_id}' is an embedding model and does not support chat completions"
             )
@@ -142,7 +142,7 @@ class InferenceRouter(Inference):
         model = await self.routing_table.get_model(model_id)
         if model is None:
             raise ValueError(f"Model '{model_id}' not found")
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             raise ValueError(
                 f"Model '{model_id}' is an embedding model and does not support chat completions"
             )
```

```diff
@@ -225,10 +225,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
             metadata = {}
         if model_type is None:
             model_type = ModelType.llm
-        if (
-            "embedding_dimension" not in metadata
-            and model_type == ModelType.embedding_model
-        ):
+        if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
             raise ValueError(
                 "Embedding model must have an embedding dimension in its metadata"
             )
@@ -311,8 +308,15 @@ class MemoryBanksRoutingTable(CommonRoutingTableImpl, MemoryBanks):
             )
         model = await self.get_object_by_identifier("model", params.embedding_model)
         if model is None:
-            raise ValueError(f"Model {params.embedding_model} not found")
-        if model.model_type != ModelType.embedding_model:
+            if params.embedding_model == "all-MiniLM-L6-v2":
+                raise ValueError(
+                    "Embeddings are now served via Inference providers. "
+                    "Please upgrade your run.yaml to include inline::sentence-transformer as an additional inference provider. "
+                    "See https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/together/run.yaml for an example."
+                )
+            else:
+                raise ValueError(f"Model {params.embedding_model} not found")
+        if model.model_type != ModelType.embedding:
             raise ValueError(
                 f"Model {params.embedding_model} is not an embedding model"
             )
```
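
For anyone hitting the new error message above: the upgrade it asks for is exactly the pair of run.yaml entries these templates now generate. A minimal excerpt (mirroring the template diffs below):

```yaml
providers:
  inference:
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
models:
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  provider_model_id: null
  model_type: embedding
```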

```diff
@@ -83,7 +83,7 @@ class MetaReferenceInferenceImpl(
     async def register_model(self, model: Model) -> Model:
         model = await self.model_registry_helper.register_model(model)
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             self._load_sentence_transformer_model(model.provider_resource_id)
         return model
```

```diff
@@ -4,7 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from pydantic import BaseModel


-class SentenceTransformersInferenceConfig(BaseModel): ...
+class SentenceTransformersInferenceConfig(BaseModel):
+    @classmethod
+    def sample_run_config(cls) -> Dict[str, Any]:
+        return {}
```

```diff
@@ -337,7 +337,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
     async def register_model(self, model: Model) -> Model:
         # ollama does not have embedding models running. Check if the model is in list of available models.
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             response = await self.client.list()
             available_models = [m["model"] for m in response["models"]]
             if model.provider_resource_id not in available_models:
```

```diff
@@ -207,7 +207,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         model = await self.model_store.get_model(model_id)
         kwargs = {}
-        assert model.model_type == ModelType.embedding_model
+        assert model.model_type == ModelType.embedding
         assert model.metadata.get("embedding_dimensions")
         kwargs["dimensions"] = model.metadata.get("embedding_dimensions")
         assert all(
```

```diff
@@ -238,7 +238,7 @@ async def inference_stack(request, inference_model):
     model_type = ModelType.llm
     metadata = {}
     if os.getenv("EMBEDDING_DIMENSION"):
-        model_type = ModelType.embedding_model
+        model_type = ModelType.embedding
         metadata["embedding_dimension"] = get_env_or_fail("EMBEDDING_DIMENSION")

     test_stack = await construct_stack_for_test(
```

```diff
@@ -18,7 +18,7 @@ class TestEmbeddings:
         inference_impl, models_impl = inference_stack
         model = await models_impl.get_model(inference_model)
-        if model.model_type != ModelType.embedding_model:
+        if model.model_type != ModelType.embedding:
             pytest.skip("This test is only applicable for embedding models")

         response = await inference_impl.embeddings(
@@ -39,7 +39,7 @@ class TestEmbeddings:
         inference_impl, models_impl = inference_stack
         model = await models_impl.get_model(inference_model)
-        if model.model_type != ModelType.embedding_model:
+        if model.model_type != ModelType.embedding:
             pytest.skip("This test is only applicable for embedding models")

         texts = ["Hello, world!", "This is a test", "Testing embeddings"]
```

```diff
@@ -125,7 +125,7 @@ async def memory_stack(inference_model, request):
         models=[
             ModelInput(
                 model_id=inference_model,
-                model_type=ModelType.embedding_model,
+                model_type=ModelType.embedding,
                 metadata={
                     "embedding_dimension": get_env_or_fail("EMBEDDING_DIMENSION"),
                 },
```

```diff
@@ -78,7 +78,7 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
         return None

     async def register_model(self, model: Model) -> Model:
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             # embedding models are always registered by their provider model id and does not need to be mapped to a llama model
             provider_resource_id = model.provider_resource_id
         else:
```

```diff
@@ -8,10 +8,14 @@ from pathlib import Path

 from llama_models.sku_list import all_registered_models

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig
 from llama_stack.providers.remote.inference.cerebras.cerebras import model_aliases
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -29,6 +33,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::cerebras",
         config=CerebrasImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )

     core_model_to_hf_repo = {
         m.descriptor(): m.huggingface_repo for m in all_registered_models()
@@ -37,9 +46,18 @@ def get_distribution_template() -> DistributionTemplate:
         ModelInput(
             model_id=core_model_to_hf_repo[m.llama_model],
             provider_model_id=m.provider_model_id,
+            provider_id="cerebras",
         )
         for m in model_aliases
     ]
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name="cerebras",
@@ -52,9 +70,9 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
             ),
         },
```

```diff
@@ -15,6 +15,9 @@ providers:
     config:
       base_url: https://api.cerebras.ai
       api_key: ${env.CEREBRAS_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -49,12 +52,20 @@ metadata_store:
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: null
+  provider_id: cerebras
   provider_model_id: llama3.1-8b
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: null
+  provider_id: cerebras
   provider_model_id: llama3.1-70b
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: meta-llama/Llama-Guard-3-8B
```

```diff
@@ -8,11 +8,15 @@ from pathlib import Path

 from llama_models.sku_list import all_registered_models

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig
 from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -35,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::fireworks",
         config=FireworksImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -48,9 +57,18 @@ def get_distribution_template() -> DistributionTemplate:
         ModelInput(
             model_id=core_model_to_hf_repo[m.llama_model],
             provider_model_id=m.provider_model_id,
+            provider_id="fireworks",
         )
         for m in MODEL_ALIASES
     ]
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -63,10 +81,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
             ),
         },
```

```diff
@@ -16,8 +16,11 @@ providers:
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference
+      url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -74,40 +77,55 @@ metadata_store:
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p1-8b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p1-70b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p1-405b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-1B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-1b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-3b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-11b-vision-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-90b-vision-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-guard-3-8b
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-guard-3-11b-vision
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: meta-llama/Llama-Guard-3-8B
```

```diff
@@ -4,7 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -27,6 +31,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::hf::endpoint",
         config=InferenceEndpointImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -41,6 +50,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="hf-endpoint-safety",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -53,15 +70,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                         Provider(
                             provider_id="hf-endpoint-safety",
                             provider_type="remote::hf::endpoint",
@@ -75,6 +93,7 @@ def get_distribution_template() -> DistributionTemplate:
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
```

```diff
@@ -18,6 +18,9 @@ providers:
     config:
       endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   - provider_id: hf-endpoint-safety
     provider_type: remote::hf::endpoint
     config:
@@ -81,10 +84,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-endpoint
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: hf-endpoint-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```

```diff
@@ -18,6 +18,9 @@ providers:
     config:
       endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -76,6 +79,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-endpoint
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```

```diff
@@ -4,7 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -28,6 +32,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::hf::serverless",
         config=InferenceAPIImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -42,6 +51,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="hf-serverless-safety",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -54,15 +71,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                         Provider(
                             provider_id="hf-serverless-safety",
                             provider_type="remote::hf::serverless",
@@ -76,6 +94,7 @@ def get_distribution_template() -> DistributionTemplate:
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
```

```diff
@@ -18,6 +18,9 @@ providers:
     config:
       huggingface_repo: ${env.INFERENCE_MODEL}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   - provider_id: hf-serverless-safety
     provider_type: remote::hf::serverless
     config:
@@ -81,10 +84,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-serverless
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: hf-serverless-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```

```diff
@@ -18,6 +18,9 @@ providers:
     config:
       huggingface_repo: ${env.INFERENCE_MODEL}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -76,6 +79,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-serverless
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```

```diff
@@ -6,10 +6,15 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.inference.meta_reference import (
     MetaReferenceInferenceConfig,
 )
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -34,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
             checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -44,6 +54,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.INFERENCE_MODEL}",
         provider_id="meta-reference-inference",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     safety_model = ModelInput(
         model_id="${env.SAFETY_MODEL}",
         provider_id="meta-reference-safety",
@@ -59,15 +77,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                         Provider(
                             provider_id="meta-reference-safety",
                             provider_type="inline::meta-reference",
@@ -82,6 +101,7 @@ def get_distribution_template() -> DistributionTemplate:
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
```

```diff
@@ -19,6 +19,9 @@ providers:
       model: ${env.INFERENCE_MODEL}
       max_seq_len: 4096
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   - provider_id: meta-reference-safety
     provider_type: inline::meta-reference
     config:
@@ -83,10 +86,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: meta-reference-inference
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: meta-reference-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```

```diff
@@ -19,6 +19,9 @@ providers:
       model: ${env.INFERENCE_MODEL}
       max_seq_len: 4096
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -77,6 +80,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: meta-reference-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```

```diff
@@ -6,10 +6,15 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider
 from llama_stack.providers.inline.inference.meta_reference import (
     MetaReferenceQuantizedInferenceConfig,
 )
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -34,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
             checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -44,6 +54,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.INFERENCE_MODEL}",
         provider_id="meta-reference-inference",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
@@ -54,10 +72,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
         },
         run_config_env_vars={
```

```diff
@@ -21,6 +21,9 @@ providers:
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
       quantization:
         type: fp8
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -79,6 +82,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: meta-reference-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```

```diff
@@ -6,7 +6,12 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -29,6 +34,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::ollama",
         config=OllamaImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -43,6 +53,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="ollama",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -55,21 +73,23 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                     ],
                     "memory": [memory_provider],
                 },
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
```

```diff
@@ -17,6 +17,9 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -75,10 +78,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: ollama
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: ollama
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```

```diff
@@ -17,6 +17,9 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -75,6 +78,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: ollama
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```

```diff
@@ -22,6 +22,9 @@ providers:
       url: ${env.SAFETY_VLLM_URL}
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -58,10 +61,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: vllm-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```

```diff
@@ -16,6 +16,9 @@ providers:
       url: ${env.VLLM_URL}
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -52,6 +55,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```

```diff
@@ -6,7 +6,12 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -28,6 +33,11 @@ def get_distribution_template() -> DistributionTemplate:
             url="${env.VLLM_URL}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -42,6 +52,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="vllm-safety",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -53,10 +71,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
@@ -69,12 +87,14 @@ def get_distribution_template() -> DistributionTemplate:
                                 url="${env.SAFETY_VLLM_URL}",
                             ),
                         ),
+                        embedding_provider,
                     ],
                     "memory": [memory_provider],
                 },
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
```

```diff
@@ -11,6 +11,7 @@ import jinja2
 import yaml
 from pydantic import BaseModel, Field

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import (
     Api,
     BuildConfig,
@@ -146,6 +147,13 @@ class DistributionTemplate(BaseModel):
     )

     def save_distribution(self, yaml_output_dir: Path, doc_output_dir: Path) -> None:
+        def enum_representer(dumper, data):
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data.value)
+
+        # Register YAML representer for ModelType
+        yaml.add_representer(ModelType, enum_representer)
+        yaml.SafeDumper.add_representer(ModelType, enum_representer)
+
         for output_dir in [yaml_output_dir, doc_output_dir]:
             output_dir.mkdir(parents=True, exist_ok=True)
```

```diff
@@ -79,10 +79,12 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: tgi-inference
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: tgi-safety
   provider_model_id: null
+  model_type: llm
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```

```diff
@@ -17,6 +17,9 @@ providers:
     provider_type: remote::tgi
     config:
       url: ${env.TGI_URL}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -75,6 +78,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: tgi-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```

```diff
@@ -6,7 +6,12 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import TGIImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -31,6 +36,11 @@ def get_distribution_template() -> DistributionTemplate:
             url="${env.TGI_URL}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -41,6 +51,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.INFERENCE_MODEL}",
         provider_id="tgi-inference",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     safety_model = ModelInput(
         model_id="${env.SAFETY_MODEL}",
         provider_id="tgi-safety",
@@ -57,10 +75,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
```

```diff
@@ -18,6 +18,9 @@ providers:
     config:
       url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -74,36 +77,50 @@ metadata_store:
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-Guard-3-8B
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: meta-llama/Llama-Guard-3-8B
```

```diff
@@ -8,11 +8,15 @@ from pathlib import Path

 from llama_models.sku_list import all_registered_models

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.together import TogetherImplConfig
 from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -38,6 +42,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="inline::faiss",
         config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )

     core_model_to_hf_repo = {
         m.descriptor(): m.huggingface_repo for m in all_registered_models()
@@ -46,9 +55,18 @@ def get_distribution_template() -> DistributionTemplate:
         ModelInput(
             model_id=core_model_to_hf_repo[m.llama_model],
             provider_model_id=m.provider_model_id,
+            provider_id="together",
         )
         for m in MODEL_ALIASES
     ]
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -61,10 +79,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
             ),
         },
```

```diff
@@ -21,6 +21,9 @@ providers:
       max_tokens: ${env.MAX_TOKENS:4096}
       enforce_eager: ${env.ENFORCE_EAGER:False}
       gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -79,6 +82,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```

```diff
@@ -4,7 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.inference.vllm import VLLMConfig
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -32,11 +36,24 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="inline::faiss",
         config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )

     inference_model = ModelInput(
         model_id="${env.INFERENCE_MODEL}",
         provider_id="vllm",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -49,10 +66,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
         },
         run_config_env_vars={
```