Mirror of https://github.com/meta-llama/llama-stack.git
Fix issue #183: Pre-download models during server initialization to prevent HTTP timeouts

This commit moves the model downloading logic from the `chat_completion` method to the `initialize` method in `OllamaInferenceAdapter`. By pre-loading required models during server startup, we ensure that large models (e.g., 16GB) are downloaded before serving requests, thus preventing HTTP request timeouts and aborted downloads during the first inference request.

Closes #183.
This commit is contained in:
parent 53d440e952
commit de80f66470
1 changed file with 4 additions and 4 deletions
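The `ollama.py` changes that actually perform the preloading are not part of the single file shown in the diff below; only the provider's `__init__.py` is touched here. As a rough sketch of what the initialize-time preloading described in the commit message could look like, assuming the adapter wraps the `ollama` Python package's `AsyncClient` and ignoring the mapping from Llama Stack model identifiers to Ollama tags:

```python
# Hypothetical sketch of the adapter side of this change (not part of this diff).
# Assumes the adapter wraps ollama's AsyncClient; model-name mapping is omitted.
from typing import List

from ollama import AsyncClient


class OllamaInferenceAdapter:
    def __init__(self, url: str, preload_models: List[str]) -> None:
        self.url = url
        self.preload_models = preload_models
        self.client = AsyncClient(host=url)

    async def initialize(self) -> None:
        # Pull every configured model up front so the first chat_completion
        # request does not block on a multi-gigabyte download and time out.
        for model in self.preload_models:
            await self.client.pull(model)
```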
```diff
@@ -4,16 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from typing import List
+
 from llama_stack.distribution.datatypes import RemoteProviderConfig
 
 
 class OllamaImplConfig(RemoteProviderConfig):
     port: int = 11434
+    preload_models: List[str] = ["Llama3.1-8B-Instruct"]
 
 
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
 
-    impl = OllamaInferenceAdapter(config.url)
+    impl = OllamaInferenceAdapter(config.url, config.preload_models)
     await impl.initialize()
     return impl
```
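For illustration only, here is one way the updated config and factory might be exercised; the import path and the config values are assumptions based on the repository layout at the time of this commit, not something this diff adds:

```python
# Hypothetical usage example; import path and values are assumptions, not part
# of this commit.
import asyncio

from llama_stack.providers.adapters.inference.ollama import (
    OllamaImplConfig,
    get_adapter_impl,
)


async def main() -> None:
    config = OllamaImplConfig(
        port=11434,
        preload_models=["Llama3.1-8B-Instruct"],
    )
    # get_adapter_impl() awaits initialize(), so the configured models are
    # pulled before the adapter is returned and begins serving requests.
    impl = await get_adapter_impl(config, {})
    print(impl)


asyncio.run(main())
```

Because `initialize()` now blocks on the downloads, server startup takes longer, but the first inference request no longer risks timing out in the middle of a multi-gigabyte pull.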