Mirror of https://github.com/meta-llama/llama-stack.git
Fix issue #183: Pre-download models during server initialization to prevent HTTP timeouts

This commit moves the model downloading logic from the `chat_completion` method to the `initialize` method in `OllamaInferenceAdapter`. By pre-loading required models during server startup, we ensure that large models (e.g., 16GB) are downloaded before serving requests, thus preventing HTTP request timeouts and aborted downloads during the first inference request.

Closes #183.
This commit is contained in:
parent 53d440e952
commit de80f66470
1 changed file with 4 additions and 4 deletions
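The `ollama.py` changes that actually perform the preloading are not part of the single file shown in the diff below; only the provider's `__init__.py` is touched here. As a rough sketch of what the initialize-time preloading described in the commit message could look like, assuming the adapter wraps the `ollama` Python package's `AsyncClient` and ignoring the mapping from Llama Stack model identifiers to Ollama tags:

```python
# Hypothetical sketch of the adapter side of this change (not part of this diff).
# Assumes the adapter wraps ollama's AsyncClient; model-name mapping is omitted.
from typing import List

from ollama import AsyncClient


class OllamaInferenceAdapter:
    def __init__(self, url: str, preload_models: List[str]) -> None:
        self.url = url
        self.preload_models = preload_models
        self.client = AsyncClient(host=url)

    async def initialize(self) -> None:
        # Pull every configured model up front so the first chat_completion
        # request does not block on a multi-gigabyte download and time out.
        for model in self.preload_models:
            await self.client.pull(model)
```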
```diff
@@ -4,16 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from typing import List
+
 from llama_stack.distribution.datatypes import RemoteProviderConfig
 
 
 class OllamaImplConfig(RemoteProviderConfig):
     port: int = 11434
+    preload_models: List[str] = ["Llama3.1-8B-Instruct"]
 
 
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
 
-    impl = OllamaInferenceAdapter(config.url)
+    impl = OllamaInferenceAdapter(config.url, config.preload_models)
     await impl.initialize()
     return impl
```
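For illustration only, here is one way the updated config and factory might be exercised; the import path and the config values are assumptions based on the repository layout at the time of this commit, not something this diff adds:

```python
# Hypothetical usage example; import path and values are assumptions, not part
# of this commit.
import asyncio

from llama_stack.providers.adapters.inference.ollama import (
    OllamaImplConfig,
    get_adapter_impl,
)


async def main() -> None:
    config = OllamaImplConfig(
        port=11434,
        preload_models=["Llama3.1-8B-Instruct"],
    )
    # get_adapter_impl() awaits initialize(), so the configured models are
    # pulled before the adapter is returned and begins serving requests.
    impl = await get_adapter_impl(config, {})
    print(impl)


asyncio.run(main())
```

Because `initialize()` now blocks on the downloads, server startup takes longer, but the first inference request no longer risks timing out in the middle of a multi-gigabyte pull.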