From de80f664700543266a13e50a74c11a8f386978f7 Mon Sep 17 00:00:00 2001
From: Ezreal
Date: Tue, 8 Oct 2024 01:15:28 +0800
Subject: [PATCH] Fix issue #183: Pre-download models during server
 initialization to prevent HTTP timeouts

This commit moves the model downloading logic from the `chat_completion`
method to the `initialize` method in `OllamaInferenceAdapter`. By
pre-loading required models during server startup, we ensure that large
models (e.g., 16GB) are downloaded before serving requests, thus
preventing HTTP request timeouts and aborted downloads during the first
inference request.

Closes #183.
---
 .../providers/adapters/inference/ollama/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llama_stack/providers/adapters/inference/ollama/__init__.py b/llama_stack/providers/adapters/inference/ollama/__init__.py
index 7763af8d1..68c8077ff 100644
--- a/llama_stack/providers/adapters/inference/ollama/__init__.py
+++ b/llama_stack/providers/adapters/inference/ollama/__init__.py
@@ -4,16 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from typing import List
 from llama_stack.distribution.datatypes import RemoteProviderConfig
 
-
 class OllamaImplConfig(RemoteProviderConfig):
     port: int = 11434
+    preload_models: List[str] = ["Llama3.1-8B-Instruct"]
 
-
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
 
-    impl = OllamaInferenceAdapter(config.url)
+    impl = OllamaInferenceAdapter(config.url, config.preload_models)
     await impl.initialize()
     return impl
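
Note: the companion change to OllamaInferenceAdapter in
llama_stack/providers/adapters/inference/ollama/ollama.py is not shown
in this diff. Below is a minimal sketch of what the preloading in
initialize() might look like, assuming the adapter wraps
ollama.AsyncClient. The constructor shape and the name-mapping comment
are illustrative assumptions, not code from this patch:

    from typing import List

    from ollama import AsyncClient


    class OllamaInferenceAdapter:
        def __init__(self, url: str, preload_models: List[str]) -> None:
            self.url = url
            self.preload_models = preload_models
            self.client = AsyncClient(host=url)

        async def initialize(self) -> None:
            # Pull each configured model before the server starts taking
            # requests, so the first chat_completion call does not block
            # on a multi-GB download and hit an HTTP timeout.
            # (The real adapter would map llama-stack model IDs such as
            # "Llama3.1-8B-Instruct" to Ollama tags before pulling.)
            for model in self.preload_models:
                await self.client.pull(model)

With this in place, get_adapter_impl() awaits impl.initialize() during
server startup, so any download failure surfaces at boot rather than
mid-request.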