From 7e30b5a466c967c052c0374463511667075d200b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?= <seb@redhat.com>
Date: Tue, 3 Jun 2025 18:00:27 +0200
Subject: [PATCH] fix: remove sentence-transformers from remote vllm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

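vLLM can generate embeddings itself, so the remote-vllm template no
longer needs the extra inline::sentence-transformers provider. The
all-MiniLM-L6-v2 embedding model is now registered against the
vllm-inference provider instead.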

Signed-off-by: Sébastien Han <seb@redhat.com>
---
 .../self_hosted_distro/remote-vllm.md             |  2 +-
 llama_stack/templates/remote-vllm/build.yaml      |  1 -
 .../templates/remote-vllm/run-with-safety.yaml    |  5 +----
 llama_stack/templates/remote-vllm/run.yaml        |  5 +----
 llama_stack/templates/remote-vllm/vllm.py         | 15 +++------------
 5 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md
index 6e7cf410d..719a40690 100644
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@@ -17,7 +17,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::vllm`, `inline::sentence-transformers` |
+| inference | `remote::vllm` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml
index 16fe5d4fd..136014e18 100644
--- a/llama_stack/templates/remote-vllm/build.yaml
+++ b/llama_stack/templates/remote-vllm/build.yaml
@@ -4,7 +4,6 @@ distribution_spec:
   providers:
     inference:
     - remote::vllm
-    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb
diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml
index 64f71087a..60fffa3ce 100644
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@@ -26,9 +26,6 @@ providers:
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:true}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -133,7 +130,7 @@ models:
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
+  provider_id: vllm-inference
   model_type: embedding
 shields:
 - shield_id: ${env.SAFETY_MODEL}
diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml
index 353b9902d..d7961b159 100644
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@@ -19,9 +19,6 @@ providers:
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:true}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -122,7 +119,7 @@ models:
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
+  provider_id: vllm-inference
   model_type: embedding
 shields: []
 vector_dbs: []
diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py
index 2782a3ea0..269bc3a59 100644
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@@ -13,9 +13,6 @@ from llama_stack.distribution.datatypes import (
     ShieldInput,
     ToolGroupInput,
 )
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -23,7 +20,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::vllm", "inline::sentence-transformers"],
+        "inference": ["remote::vllm"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
@@ -47,11 +44,6 @@ def get_distribution_template() -> DistributionTemplate:
             url="${env.VLLM_URL:http://localhost:8000/v1}",
         ),
     )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
     vector_io_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -68,7 +60,7 @@ def get_distribution_template() -> DistributionTemplate:
     )
     embedding_model = ModelInput(
         model_id="all-MiniLM-L6-v2",
-        provider_id="sentence-transformers",
+        provider_id="vllm-inference",
         model_type=ModelType.embedding,
         metadata={
             "embedding_dimension": 384,
@@ -98,7 +90,7 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
+                    "inference": [inference_provider],
                     "vector_io": [vector_io_provider],
                 },
                 default_models=[inference_model, embedding_model],
@@ -115,7 +107,6 @@ def get_distribution_template() -> DistributionTemplate:
                                 url="${env.SAFETY_VLLM_URL}",
                             ),
                         ),
-                        embedding_provider,
                     ],
                     "vector_io": [vector_io_provider],
                 },