forked from phoenix-oss/llama-stack-mirror
add embedding model by default to distribution templates (#617)
# What does this PR do?

Adds the sentence-transformers inference provider and the `all-MiniLM-L6-v2` embedding model to the default models registered in `run.yaml` for all distribution templates.

## Test Plan

```
llama stack build --template together --image-type conda
llama stack run ~/.llama/distributions/llamastack-together/together-run.yaml
```
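Beyond the build/run above, a quick smoke test against the new default model might look like the sketch below. This is a hypothetical snippet, not part of the PR: it assumes the `llama_stack_client` Python package of the same vintage and a stack server on `localhost:5000`; adjust names and port to your deployment.

```python
# Hypothetical smoke test -- assumes llama_stack_client is installed and the
# stack built from the Test Plan above is running on localhost:5000.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")

# all-MiniLM-L6-v2 is now registered by default, served by the
# inline::sentence-transformers provider.
response = client.inference.embeddings(
    model_id="all-MiniLM-L6-v2",
    contents=["Hello, world!"],
)
print(len(response.embeddings[0]))  # expected: 384, per the embedding_dimension metadata
```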
This commit is contained in: parent `e893b22868` · commit `516e1a3e59`
41 changed files with 473 additions and 64 deletions
```diff
@@ -249,6 +249,7 @@
       "redis",
       "scikit-learn",
       "scipy",
+      "sentence-transformers",
       "sentencepiece",
       "torch",
       "torchvision",
@@ -287,6 +288,7 @@
       "redis",
       "scikit-learn",
       "scipy",
+      "sentence-transformers",
       "sentencepiece",
       "torch",
       "torchao==0.5.0",
```
```diff
@@ -21,9 +21,10 @@ class CommonModelFields(BaseModel):
     )
 
 
-class ModelType(Enum):
+@json_schema_type
+class ModelType(str, Enum):
     llm = "llm"
-    embedding_model = "embedding"
+    embedding = "embedding"
 
 
 @json_schema_type
```
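The switch from `Enum` to `str, Enum` is what lets the member flow through JSON (and, with the representer added later in this commit, YAML) as a plain string. A minimal standalone sketch of the behavior it buys:

```python
import json
from enum import Enum


class ModelType(str, Enum):  # mirrors the diff above
    llm = "llm"
    embedding = "embedding"


# The str mixin makes members compare and serialize as plain strings:
assert ModelType.embedding == "embedding"
assert ModelType("embedding") is ModelType.embedding
assert json.dumps({"type": ModelType.embedding}) == '{"type": "embedding"}'
```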
```diff
@@ -109,7 +109,7 @@ class InferenceRouter(Inference):
         model = await self.routing_table.get_model(model_id)
         if model is None:
             raise ValueError(f"Model '{model_id}' not found")
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             raise ValueError(
                 f"Model '{model_id}' is an embedding model and does not support chat completions"
             )
@@ -142,7 +142,7 @@ class InferenceRouter(Inference):
         model = await self.routing_table.get_model(model_id)
         if model is None:
             raise ValueError(f"Model '{model_id}' not found")
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             raise ValueError(
                 f"Model '{model_id}' is an embedding model and does not support chat completions"
             )
```
```diff
@@ -225,10 +225,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
             metadata = {}
         if model_type is None:
             model_type = ModelType.llm
-        if (
-            "embedding_dimension" not in metadata
-            and model_type == ModelType.embedding_model
-        ):
+        if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
             raise ValueError(
                 "Embedding model must have an embedding dimension in its metadata"
             )
@@ -311,8 +308,15 @@ class MemoryBanksRoutingTable(CommonRoutingTableImpl, MemoryBanks):
             )
         model = await self.get_object_by_identifier("model", params.embedding_model)
         if model is None:
-            raise ValueError(f"Model {params.embedding_model} not found")
-        if model.model_type != ModelType.embedding_model:
+            if params.embedding_model == "all-MiniLM-L6-v2":
+                raise ValueError(
+                    "Embeddings are now served via Inference providers. "
+                    "Please upgrade your run.yaml to include inline::sentence-transformer as an additional inference provider. "
+                    "See https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/together/run.yaml for an example."
+                )
+            else:
+                raise ValueError(f"Model {params.embedding_model} not found")
+        if model.model_type != ModelType.embedding:
             raise ValueError(
                 f"Model {params.embedding_model} is not an embedding model"
             )
```
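A sketch of what the tightened check above means for callers. Names are taken from the diff; note the validation fires when the model is registered through the routing table, not when the `ModelInput` itself is constructed:

```python
# Illustrative only: ModelInput construction accepts any metadata; the
# routing table's register_model is what enforces embedding_dimension.
from llama_stack.apis.models.models import ModelType
from llama_stack.distribution.datatypes import ModelInput

ok = ModelInput(
    model_id="all-MiniLM-L6-v2",
    provider_id="sentence-transformers",
    model_type=ModelType.embedding,
    metadata={"embedding_dimension": 384},  # required for embedding models
)

missing = ModelInput(
    model_id="all-MiniLM-L6-v2",
    provider_id="sentence-transformers",
    model_type=ModelType.embedding,
    metadata={},  # registering this through the routing table raises:
    # ValueError: Embedding model must have an embedding dimension in its metadata
)
```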
```diff
@@ -83,7 +83,7 @@ class MetaReferenceInferenceImpl(
 
     async def register_model(self, model: Model) -> Model:
         model = await self.model_registry_helper.register_model(model)
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             self._load_sentence_transformer_model(model.provider_resource_id)
         return model
 
```
```diff
@@ -4,7 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from typing import Any, Dict
+
 from pydantic import BaseModel
 
 
-class SentenceTransformersInferenceConfig(BaseModel): ...
+class SentenceTransformersInferenceConfig(BaseModel):
+
+    @classmethod
+    def sample_run_config(cls) -> Dict[str, Any]:
+        return {}
```
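`sample_run_config` exists so the distribution templates can splice an explicit (empty) config block into generated run.yaml files. A minimal sketch of the round trip, assuming the import path shown in the template diffs below:

```python
# Round trip, assuming the import path used by the template diffs below.
from llama_stack.providers.inline.inference.sentence_transformers import (
    SentenceTransformersInferenceConfig,
)

assert SentenceTransformersInferenceConfig.sample_run_config() == {}
# which the template generator renders into run.yaml as:
#   - provider_id: sentence-transformers
#     provider_type: inline::sentence-transformers
#     config: {}
```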
```diff
@@ -337,7 +337,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
 
     async def register_model(self, model: Model) -> Model:
         # ollama does not have embedding models running. Check if the model is in list of available models.
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             response = await self.client.list()
             available_models = [m["model"] for m in response["models"]]
             if model.provider_resource_id not in available_models:
```
```diff
@@ -207,7 +207,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         model = await self.model_store.get_model(model_id)
 
         kwargs = {}
-        assert model.model_type == ModelType.embedding_model
+        assert model.model_type == ModelType.embedding
         assert model.metadata.get("embedding_dimensions")
         kwargs["dimensions"] = model.metadata.get("embedding_dimensions")
         assert all(
```
```diff
@@ -238,7 +238,7 @@ async def inference_stack(request, inference_model):
     model_type = ModelType.llm
     metadata = {}
     if os.getenv("EMBEDDING_DIMENSION"):
-        model_type = ModelType.embedding_model
+        model_type = ModelType.embedding
         metadata["embedding_dimension"] = get_env_or_fail("EMBEDDING_DIMENSION")
 
     test_stack = await construct_stack_for_test(
```
```diff
@@ -18,7 +18,7 @@ class TestEmbeddings:
         inference_impl, models_impl = inference_stack
         model = await models_impl.get_model(inference_model)
 
-        if model.model_type != ModelType.embedding_model:
+        if model.model_type != ModelType.embedding:
             pytest.skip("This test is only applicable for embedding models")
 
         response = await inference_impl.embeddings(
@@ -39,7 +39,7 @@ class TestEmbeddings:
         inference_impl, models_impl = inference_stack
         model = await models_impl.get_model(inference_model)
 
-        if model.model_type != ModelType.embedding_model:
+        if model.model_type != ModelType.embedding:
             pytest.skip("This test is only applicable for embedding models")
 
         texts = ["Hello, world!", "This is a test", "Testing embeddings"]
```
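These tests go through the inference stack, but the inline provider ultimately delegates to the `sentence-transformers` library (which the build dependencies above now pull in). A standalone sketch of the equivalent computation:

```python
# Standalone equivalent of what the inline::sentence-transformers provider
# computes for the test inputs above.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
vectors = model.encode(["Hello, world!", "This is a test", "Testing embeddings"])
print(vectors.shape)  # (3, 384) -- one 384-dim vector per input
```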
```diff
@@ -125,7 +125,7 @@ async def memory_stack(inference_model, request):
         models=[
             ModelInput(
                 model_id=inference_model,
-                model_type=ModelType.embedding_model,
+                model_type=ModelType.embedding,
                 metadata={
                     "embedding_dimension": get_env_or_fail("EMBEDDING_DIMENSION"),
                 },
```
```diff
@@ -78,7 +78,7 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
         return None
 
     async def register_model(self, model: Model) -> Model:
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             # embedding models are always registered by their provider model id and does not need to be mapped to a llama model
             provider_resource_id = model.provider_resource_id
         else:
```
```diff
@@ -8,10 +8,14 @@ from pathlib import Path
 
 from llama_models.sku_list import all_registered_models
 
+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig
 from llama_stack.providers.remote.inference.cerebras.cerebras import model_aliases
-
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 
@@ -29,6 +33,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::cerebras",
         config=CerebrasImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
 
     core_model_to_hf_repo = {
         m.descriptor(): m.huggingface_repo for m in all_registered_models()
@@ -37,9 +46,18 @@ def get_distribution_template() -> DistributionTemplate:
         ModelInput(
             model_id=core_model_to_hf_repo[m.llama_model],
             provider_model_id=m.provider_model_id,
+            provider_id="cerebras",
         )
         for m in model_aliases
     ]
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
 
     return DistributionTemplate(
         name="cerebras",
@@ -52,9 +70,9 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
             ),
         },
```
```diff
@@ -15,6 +15,9 @@ providers:
     config:
       base_url: https://api.cerebras.ai
      api_key: ${env.CEREBRAS_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -49,12 +52,20 @@ metadata_store:
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: null
+  provider_id: cerebras
   provider_model_id: llama3.1-8b
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: null
+  provider_id: cerebras
   provider_model_id: llama3.1-70b
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: meta-llama/Llama-Guard-3-8B
```
```diff
@@ -8,11 +8,15 @@ from pathlib import Path
 
 from llama_models.sku_list import all_registered_models
 
+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig
 from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES
-
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 
@@ -35,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::fireworks",
         config=FireworksImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -48,9 +57,18 @@ def get_distribution_template() -> DistributionTemplate:
         ModelInput(
             model_id=core_model_to_hf_repo[m.llama_model],
             provider_model_id=m.provider_model_id,
+            provider_id="fireworks",
         )
         for m in MODEL_ALIASES
     ]
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
 
     return DistributionTemplate(
         name=name,
@@ -63,10 +81,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
             ),
         },
```
```diff
@@ -16,8 +16,11 @@ providers:
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference
+      url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -74,40 +77,55 @@ metadata_store:
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p1-8b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p1-70b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p1-405b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-1B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-1b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-3b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-11b-vision-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-90b-vision-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-guard-3-8b
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-guard-3-11b-vision
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: meta-llama/Llama-Guard-3-8B
```
```diff
@@ -4,7 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -27,6 +31,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::hf::endpoint",
         config=InferenceEndpointImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -41,6 +50,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="hf-endpoint-safety",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
 
     return DistributionTemplate(
         name=name,
@@ -53,15 +70,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                         Provider(
                             provider_id="hf-endpoint-safety",
                             provider_type="remote::hf::endpoint",
@@ -75,6 +93,7 @@ def get_distribution_template() -> DistributionTemplate:
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
```
```diff
@@ -18,6 +18,9 @@ providers:
     config:
       endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   - provider_id: hf-endpoint-safety
     provider_type: remote::hf::endpoint
     config:
@@ -81,10 +84,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-endpoint
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: hf-endpoint-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```
```diff
@@ -18,6 +18,9 @@ providers:
     config:
       endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -76,6 +79,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-endpoint
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```
```diff
@@ -4,7 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -28,6 +32,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::hf::serverless",
         config=InferenceAPIImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -42,6 +51,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="hf-serverless-safety",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
 
     return DistributionTemplate(
         name=name,
@@ -54,15 +71,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                         Provider(
                             provider_id="hf-serverless-safety",
                             provider_type="remote::hf::serverless",
@@ -76,6 +94,7 @@ def get_distribution_template() -> DistributionTemplate:
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
```
```diff
@@ -18,6 +18,9 @@ providers:
     config:
       huggingface_repo: ${env.INFERENCE_MODEL}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   - provider_id: hf-serverless-safety
     provider_type: remote::hf::serverless
     config:
@@ -81,10 +84,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-serverless
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: hf-serverless-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```
```diff
@@ -18,6 +18,9 @@ providers:
     config:
       huggingface_repo: ${env.INFERENCE_MODEL}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -76,6 +79,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-serverless
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```
```diff
@@ -6,10 +6,15 @@
 
 from pathlib import Path
 
+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.inference.meta_reference import (
     MetaReferenceInferenceConfig,
 )
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
@@ -34,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
             checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -44,6 +54,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.INFERENCE_MODEL}",
         provider_id="meta-reference-inference",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     safety_model = ModelInput(
         model_id="${env.SAFETY_MODEL}",
         provider_id="meta-reference-safety",
@@ -59,15 +77,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                         Provider(
                             provider_id="meta-reference-safety",
                             provider_type="inline::meta-reference",
@@ -82,6 +101,7 @@ def get_distribution_template() -> DistributionTemplate:
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
```
```diff
@@ -19,6 +19,9 @@ providers:
       model: ${env.INFERENCE_MODEL}
       max_seq_len: 4096
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   - provider_id: meta-reference-safety
     provider_type: inline::meta-reference
     config:
@@ -83,10 +86,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: meta-reference-inference
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: meta-reference-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```
```diff
@@ -19,6 +19,9 @@ providers:
       model: ${env.INFERENCE_MODEL}
       max_seq_len: 4096
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -77,6 +80,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: meta-reference-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```
```diff
@@ -6,10 +6,15 @@
 
 from pathlib import Path
 
+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider
 from llama_stack.providers.inline.inference.meta_reference import (
     MetaReferenceQuantizedInferenceConfig,
 )
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
@@ -34,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
             checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -44,6 +54,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.INFERENCE_MODEL}",
         provider_id="meta-reference-inference",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
@@ -54,10 +72,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
         },
         run_config_env_vars={
```
```diff
@@ -21,6 +21,9 @@ providers:
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
       quantization:
         type: fp8
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -79,6 +82,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: meta-reference-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```
```diff
@@ -6,7 +6,12 @@
 
 from pathlib import Path
 
+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -29,6 +34,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::ollama",
         config=OllamaImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -43,6 +53,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="ollama",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
 
     return DistributionTemplate(
         name=name,
@@ -55,21 +73,23 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                     ],
                     "memory": [memory_provider],
                 },
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
```
```diff
@@ -17,6 +17,9 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -75,10 +78,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: ollama
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: ollama
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```
```diff
@@ -17,6 +17,9 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -75,6 +78,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: ollama
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```
```diff
@@ -22,6 +22,9 @@ providers:
       url: ${env.SAFETY_VLLM_URL}
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -58,10 +61,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: vllm-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```
```diff
@@ -16,6 +16,9 @@ providers:
       url: ${env.VLLM_URL}
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -52,6 +55,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```
```diff
@@ -6,7 +6,12 @@
 
 from pathlib import Path
 
+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -28,6 +33,11 @@ def get_distribution_template() -> DistributionTemplate:
             url="${env.VLLM_URL}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -42,6 +52,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="vllm-safety",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
 
     return DistributionTemplate(
         name=name,
@@ -53,10 +71,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
@@ -69,12 +87,14 @@ def get_distribution_template() -> DistributionTemplate:
                             url="${env.SAFETY_VLLM_URL}",
                         ),
                     ),
+                    embedding_provider,
                 ],
                 "memory": [memory_provider],
             },
             default_models=[
                 inference_model,
                 safety_model,
+                embedding_model,
             ],
             default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
         ),
```
```diff
@@ -11,6 +11,7 @@ import jinja2
 import yaml
 from pydantic import BaseModel, Field
 
+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import (
     Api,
     BuildConfig,
@@ -146,6 +147,13 @@ class DistributionTemplate(BaseModel):
         )
 
     def save_distribution(self, yaml_output_dir: Path, doc_output_dir: Path) -> None:
+        def enum_representer(dumper, data):
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data.value)
+
+        # Register YAML representer for ModelType
+        yaml.add_representer(ModelType, enum_representer)
+        yaml.SafeDumper.add_representer(ModelType, enum_representer)
+
         for output_dir in [yaml_output_dir, doc_output_dir]:
             output_dir.mkdir(parents=True, exist_ok=True)
 
```
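Why the representer registration is needed: PyYAML does not know how to emit an `Enum` subclass as a plain scalar, so without it the generated run.yaml would carry a Python object tag instead of `model_type: embedding`. A standalone sketch, using a hypothetical module-level stand-in for the imported `ModelType`:

```python
from enum import Enum

import yaml


class ModelType(str, Enum):  # stand-in for llama_stack.apis.models.models.ModelType
    llm = "llm"
    embedding = "embedding"


def enum_representer(dumper, data):
    return dumper.represent_scalar("tag:yaml.org,2002:str", data.value)


print(yaml.dump({"model_type": ModelType.embedding}))
# -> model_type: !!python/object/apply:__main__.ModelType ... (object tag)

yaml.add_representer(ModelType, enum_representer)
yaml.SafeDumper.add_representer(ModelType, enum_representer)

print(yaml.dump({"model_type": ModelType.embedding}))
# -> model_type: embedding
```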
```diff
@@ -79,10 +79,12 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: tgi-inference
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: tgi-safety
   provider_model_id: null
+  model_type: llm
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
```
```diff
@@ -17,6 +17,9 @@ providers:
     provider_type: remote::tgi
     config:
       url: ${env.TGI_URL}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -75,6 +78,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: tgi-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```
```diff
@@ -6,7 +6,12 @@
 
 from pathlib import Path
 
+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import TGIImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -31,6 +36,11 @@ def get_distribution_template() -> DistributionTemplate:
             url="${env.TGI_URL}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -41,6 +51,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.INFERENCE_MODEL}",
         provider_id="tgi-inference",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     safety_model = ModelInput(
         model_id="${env.SAFETY_MODEL}",
         provider_id="tgi-safety",
@@ -57,10 +75,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
```
```diff
@@ -18,6 +18,9 @@ providers:
     config:
       url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -74,36 +77,50 @@ metadata_store:
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-Guard-3-8B
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: meta-llama/Llama-Guard-3-8B
```
|
@ -8,11 +8,15 @@ from pathlib import Path
|
|||
|
||||
from llama_models.sku_list import all_registered_models
|
||||
|
||||
from llama_stack.apis.models.models import ModelType
|
||||
|
||||
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
|
||||
from llama_stack.providers.inline.inference.sentence_transformers import (
|
||||
SentenceTransformersInferenceConfig,
|
||||
)
|
||||
from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
|
||||
from llama_stack.providers.remote.inference.together import TogetherImplConfig
|
||||
from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES
|
||||
|
||||
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
|
||||
|
||||
|
||||
|
@ -38,6 +42,11 @@ def get_distribution_template() -> DistributionTemplate:
|
|||
provider_type="inline::faiss",
|
||||
config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
|
||||
)
|
||||
embedding_provider = Provider(
|
||||
provider_id="sentence-transformers",
|
||||
provider_type="inline::sentence-transformers",
|
||||
config=SentenceTransformersInferenceConfig.sample_run_config(),
|
||||
)
|
||||
|
||||
core_model_to_hf_repo = {
|
||||
m.descriptor(): m.huggingface_repo for m in all_registered_models()
|
||||
|
@ -46,9 +55,18 @@ def get_distribution_template() -> DistributionTemplate:
|
|||
ModelInput(
|
||||
model_id=core_model_to_hf_repo[m.llama_model],
|
||||
provider_model_id=m.provider_model_id,
|
||||
provider_id="together",
|
||||
)
|
||||
for m in MODEL_ALIASES
|
||||
]
|
||||
embedding_model = ModelInput(
|
||||
model_id="all-MiniLM-L6-v2",
|
||||
provider_id="sentence-transformers",
|
||||
model_type=ModelType.embedding,
|
||||
metadata={
|
||||
"embedding_dimension": 384,
|
||||
},
|
||||
)
|
||||
|
||||
return DistributionTemplate(
|
||||
name=name,
|
||||
|
@ -61,10 +79,10 @@ def get_distribution_template() -> DistributionTemplate:
|
|||
run_configs={
|
||||
"run.yaml": RunConfigSettings(
|
||||
provider_overrides={
|
||||
"inference": [inference_provider],
|
||||
"inference": [inference_provider, embedding_provider],
|
||||
"memory": [memory_provider],
|
||||
},
|
||||
default_models=default_models,
|
||||
default_models=default_models + [embedding_model],
|
||||
default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
|
||||
),
|
||||
},
|
||||
|
|
|
```diff
@@ -21,6 +21,9 @@ providers:
       max_tokens: ${env.MAX_TOKENS:4096}
       enforce_eager: ${env.ENFORCE_EAGER:False}
       gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -79,6 +82,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
```
```diff
@@ -4,7 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.inference.vllm import VLLMConfig
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -32,11 +36,24 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="inline::faiss",
         config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
 
     inference_model = ModelInput(
         model_id="${env.INFERENCE_MODEL}",
         provider_id="vllm",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
 
     return DistributionTemplate(
         name=name,
@@ -49,10 +66,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
         },
         run_config_env_vars={
```