Fix issue #183: Pre-download models during server initialization to prevent HTTP timeouts

This commit moves the model-downloading logic from the `chat_completion` method to the `initialize` method of `OllamaInferenceAdapter`. By preloading the required models at server startup, large models (e.g., a ~16 GB checkpoint) finish downloading before any request is served, which prevents the HTTP request timeouts and aborted downloads that previously occurred on the first inference request.

Closes #183.
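
For context, a minimal sketch of what the adapter side of this change might look like, assuming the adapter wraps the `ollama` Python client (`AsyncClient`); this is an illustration of the preload-at-startup idea, not the actual llama-stack implementation:

from typing import List

from ollama import AsyncClient


class OllamaInferenceAdapter:
    def __init__(self, url: str, preload_models: List[str]) -> None:
        self.url = url
        self.preload_models = preload_models
        self.client = AsyncClient(host=url)

    async def initialize(self) -> None:
        # Pull every configured model at startup so the first
        # chat_completion call never blocks on a multi-gigabyte download.
        # Assumes entries in preload_models are valid Ollama tags; the real
        # adapter would map Llama Stack model names to Ollama tags first.
        for model in self.preload_models:
            await self.client.pull(model)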
Ezreal committed 2024-10-08 01:15:28 +08:00 (via GitHub)
parent 53d440e952
commit de80f66470

@@ -4,16 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from typing import List
+
 from llama_stack.distribution.datatypes import RemoteProviderConfig
 
 
 class OllamaImplConfig(RemoteProviderConfig):
     port: int = 11434
+    preload_models: List[str] = ["Llama3.1-8B-Instruct"]
 
 
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
 
-    impl = OllamaInferenceAdapter(config.url)
+    impl = OllamaInferenceAdapter(config.url, config.preload_models)
     await impl.initialize()
     return impl
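
A hypothetical usage sketch, assuming `RemoteProviderConfig` supplies a default host and derives the `url` used in `get_adapter_impl` above:

import asyncio


async def main() -> None:
    # Startup now blocks until every preload model is present locally,
    # trading a slower boot for reliable first requests.
    config = OllamaImplConfig(port=11434, preload_models=["Llama3.1-8B-Instruct"])
    impl = await get_adapter_impl(config, {})


asyncio.run(main())

One trade-off of defaulting `preload_models` to `["Llama3.1-8B-Instruct"]` is that a cold start blocks on a large download; deployments that prefer a fast startup can set the list to be empty.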