mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-20 03:40:05 +00:00
feat(vllm): periodically refresh models (#2823)
Just like #2805, but for vLLM. We also make the `VLLM_URL` environment variable optional (not required): if it is not specified, the provider silently sits idle and only complains when someone eventually tries to call a completion on it. This is done to allow this provider to be present in the `starter` distribution.

## Test Plan

Set up vLLM, copy the starter template, set `{ refresh_models: true, refresh_models_interval: 10 }` for the vllm provider, and then run:

```
ENABLE_VLLM=vllm VLLM_URL=http://localhost:8000/v1 \
  uv run llama stack run --image-type venv /tmp/starter.yaml
```

Verify that `llama-stack-client models list` brings up the model correctly from vLLM.
This commit is contained in:
parent
ade075152e
commit
199f859eec
7 changed files with 98 additions and 14 deletions
|
@@ -159,18 +159,18 @@ class OllamaInferenceAdapter(
|
|||
models = []
|
||||
for m in response.models:
|
||||
model_type = ModelType.embedding if m.details.family in ["bert"] else ModelType.llm
|
||||
# unfortunately, ollama does not provide embedding dimension in the model list :(
|
||||
# we should likely add a hard-coded mapping of model name to embedding dimension
|
||||
if model_type == ModelType.embedding:
|
||||
continue
|
||||
models.append(
|
||||
Model(
|
||||
identifier=m.model,
|
||||
provider_resource_id=m.model,
|
||||
provider_id=provider_id,
|
||||
metadata={"embedding_dimension": 384} if model_type == ModelType.embedding else {},
|
||||
metadata={},
|
||||
model_type=model_type,
|
||||
)
|
||||
)
|
||||
await self.model_store.update_registered_models(provider_id, models)
|
||||
await self.model_store.update_registered_llm_models(provider_id, models)
|
||||
logger.debug(f"ollama refreshed model list ({len(models)} models)")
|
||||
|
||||
await asyncio.sleep(self.config.refresh_models_interval)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue