From 0899f789430655e48ae6a9095767b9a8817dc8b0 Mon Sep 17 00:00:00 2001
From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com>
Date: Wed, 3 Dec 2025 10:56:24 +0100
Subject: [PATCH] fix: Avoid model_limits KeyError (backport #4060) (#4283)
# What does this PR do?
It avoids a `model_limits` KeyError when retrieving embedding models for
watsonx.
Closes https://github.com/llamastack/llama-stack/issues/4059
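The failure and the fix can be illustrated with a minimal sketch; the `model_spec` dict below is a hypothetical example of a spec missing `model_limits`, not an actual watsonx response:
```python
# Minimal sketch of the KeyError and the defensive .get() fix.
# "model_spec" is a hypothetical example, not a real watsonx API response.
model_spec = {"model_id": "example-embedding-model"}  # no "model_limits" key

# Old behavior: direct indexing raises KeyError when "model_limits" is absent.
try:
    embedding_dimension = model_spec["model_limits"]["embedding_dimension"]
except KeyError as exc:
    print(f"KeyError: {exc}")  # KeyError: 'model_limits'

# New behavior: chained .get() calls fall back to 0 instead of raising.
embedding_dimension = model_spec.get("model_limits", {}).get("embedding_dimension", 0)
context_length = model_spec.get("model_limits", {}).get("max_sequence_length", 0)
print(embedding_dimension, context_length)  # 0 0
```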
## Test Plan
Start the server with the watsonx distro:
```bash
llama stack list-deps watsonx | xargs -L1 uv pip install
uv run llama stack run watsonx
```
Then run:
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url=base_url)  # base_url of the running watsonx distro
client.models.list()
```
Check whether any embedding models are available (currently there are none).
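One way to perform that check from the snippet above is to filter the listed models by type; this assumes the returned Model objects expose a `model_type` field ("llm" or "embedding"), as in the Models API:
```python
# Continues from the snippet above; assumes Model objects carry a
# "model_type" field, as in the Models API.
embedding_models = [m for m in client.models.list() if m.model_type == "embedding"]
print(embedding_models)  # with the fix applied, listing succeeds without a KeyError
```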
This is an automatic backport of pull request #4060
done by [Mergify](https://mergify.com).
Co-authored-by: Wojciech-Rebisz <147821486+Wojciech-Rebisz@users.noreply.github.com>
---
llama_stack/providers/remote/inference/watsonx/watsonx.py | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py
index 2c051719b..8689ead8e 100644
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -283,8 +283,8 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
# ...
provider_resource_id = f"{self.__provider_id__}/{model_spec['model_id']}"
if "embedding" in functions:
- embedding_dimension = model_spec["model_limits"]["embedding_dimension"]
- context_length = model_spec["model_limits"]["max_sequence_length"]
+ embedding_dimension = model_spec.get("model_limits", {}).get("embedding_dimension", 0)
+ context_length = model_spec.get("model_limits", {}).get("max_sequence_length", 0)
embedding_metadata = {
"embedding_dimension": embedding_dimension,
"context_length": context_length,
@@ -306,10 +306,6 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
metadata={},
model_type=ModelType.llm,
)
- # In theory, I guess it is possible that a model could be both an embedding model and a text chat model.
- # In that case, the cache will record the generator Model object, and the list which we return will have
- # both the generator Model object and the text chat Model object. That's fine because the cache is
- # only used for check_model_availability() anyway.
self._model_cache[provider_resource_id] = model
models.append(model)
return models