From fffdab4f5c23b4ac045b898bdb1e69edda3a3498 Mon Sep 17 00:00:00 2001
From: Chacksu <connorhack10@gmail.com>
Date: Wed, 13 Aug 2025 09:18:25 -0400
Subject: [PATCH] fix: Dell distribution missing kvstore (#3113)

# What does this PR do?

- Added kvstore config to ChromaDB provider config for Dell distribution
similar to [starter
config](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/distributions/starter/run.yaml#L110-L112)
- Fixed
[error](https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/inference/_generated/_async_client.py#L3424-L3425)
getting endpoint information by adding `hf-inference` as the provider to
the `AsyncInferenceClient` (TGI client).

## Test Plan
```
export INFERENCE_PORT=8181
export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CHROMADB_HOST=localhost
export CHROMADB_PORT=8000
export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
export CUDA_VISIBLE_DEVICES=0
export LLAMA_STACK_PORT=8321
export HF_TOKEN=[redacted]

# TGI Server
docker run --rm -it \
  --pull always \
  --network host \
  -v $HOME/.cache/huggingface:/data \
  -e HF_TOKEN=$HF_TOKEN \
  -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  -p $INFERENCE_PORT:$INFERENCE_PORT \
  --gpus all \
  ghcr.io/huggingface/text-generation-inference:latest \
  --dtype float16 \
  --usage-stats off \
  --sharded false \
  --cuda-memory-fraction 0.8 \
  --model-id meta-llama/Llama-3.2-3B-Instruct \
  --port $INFERENCE_PORT \
  --hostname 0.0.0.0

# Chrome DB
docker run --rm -it \
  --name chromadb \
  --net=host  -p 8000:8000 \
  -v ~/chroma:/chroma/chroma \
  -e IS_PERSISTENT=TRUE \
  -e ANONYMIZED_TELEMETRY=FALSE \
  chromadb/chroma:latest

# Llama Stack
llama stack run dell \
 --port $LLAMA_STACK_PORT \
 --env INFERENCE_MODEL=$INFERENCE_MODEL \
 --env DEH_URL=$DEH_URL \
 --env CHROMA_URL=$CHROMA_URL
```

---------

Co-authored-by: Connor Hack <connorhack@fb.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
---
 llama_stack/distributions/dell/dell.py              | 8 +++++---
 llama_stack/distributions/dell/run-with-safety.yaml | 5 ++++-
 llama_stack/distributions/dell/run.yaml             | 5 ++++-
 llama_stack/providers/remote/inference/tgi/tgi.py   | 4 +---
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/llama_stack/distributions/dell/dell.py b/llama_stack/distributions/dell/dell.py
index b561ea00e..e3bf0ee03 100644
--- a/llama_stack/distributions/dell/dell.py
+++ b/llama_stack/distributions/dell/dell.py
@@ -16,6 +16,7 @@ from llama_stack.distributions.template import DistributionTemplate, RunConfigSe
 from llama_stack.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
 )
+from llama_stack.providers.remote.vector_io.chroma import ChromaVectorIOConfig
 
 
 def get_distribution_template() -> DistributionTemplate:
@@ -71,9 +72,10 @@ def get_distribution_template() -> DistributionTemplate:
     chromadb_provider = Provider(
         provider_id="chromadb",
         provider_type="remote::chromadb",
-        config={
-            "url": "${env.CHROMA_URL}",
-        },
+        config=ChromaVectorIOConfig.sample_run_config(
+            f"~/.llama/distributions/{name}/",
+            url="${env.CHROMADB_URL:=}",
+        ),
     )
 
     inference_model = ModelInput(
diff --git a/llama_stack/distributions/dell/run-with-safety.yaml b/llama_stack/distributions/dell/run-with-safety.yaml
index ecc6729eb..d89c92aa1 100644
--- a/llama_stack/distributions/dell/run-with-safety.yaml
+++ b/llama_stack/distributions/dell/run-with-safety.yaml
@@ -26,7 +26,10 @@ providers:
   - provider_id: chromadb
     provider_type: remote::chromadb
     config:
-      url: ${env.CHROMA_URL}
+      url: ${env.CHROMADB_URL:=}
+      kvstore:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell/}/chroma_remote_registry.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
diff --git a/llama_stack/distributions/dell/run.yaml b/llama_stack/distributions/dell/run.yaml
index fc2553526..7397410ba 100644
--- a/llama_stack/distributions/dell/run.yaml
+++ b/llama_stack/distributions/dell/run.yaml
@@ -22,7 +22,10 @@ providers:
   - provider_id: chromadb
     provider_type: remote::chromadb
     config:
-      url: ${env.CHROMA_URL}
+      url: ${env.CHROMADB_URL:=}
+      kvstore:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell/}/chroma_remote_registry.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index a5bb079ef..323831845 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -308,9 +308,7 @@ class TGIAdapter(_HfAdapter):
         if not config.url:
             raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
         log.info(f"Initializing TGI client with url={config.url}")
-        self.client = AsyncInferenceClient(
-            model=config.url,
-        )
+        self.client = AsyncInferenceClient(model=config.url, provider="hf-inference")
         endpoint_info = await self.client.get_endpoint_info()
         self.max_tokens = endpoint_info["max_total_tokens"]
         self.model_id = endpoint_info["model_id"]