Several smaller fixes to make adapters work

Also, reorganized the pattern of __init__ inside providers so configuration can stay lightweight
2025-10-04 12:07:34 +00:00 · 2024-08-28 09:42:08 -07:00 · 2024-08-28 09:42:08 -07:00 · 45987996c4
commit 45987996c4
parent 2a1552a5eb
23 changed files with 164 additions and 160 deletions
--- a/llama_toolchain/inference/meta_reference/inference.py
+++ b/llama_toolchain/inference/meta_reference/inference.py
@@ -6,12 +6,11 @@

 import asyncio

-from typing import AsyncIterator, Dict, Union
+from typing import AsyncIterator, Union

 from llama_models.llama3.api.datatypes import StopReason
 from llama_models.sku_list import resolve_model

-from llama_toolchain.distribution.datatypes import Api, ProviderSpec
 from llama_toolchain.inference.api import (
    ChatCompletionRequest,
    ChatCompletionResponse,
@@ -27,18 +26,6 @@ from .config import MetaReferenceImplConfig
 from .model_parallel import LlamaModelParallelGenerator


-async def get_provider_impl(
-    config: MetaReferenceImplConfig, _deps: Dict[Api, ProviderSpec]
-):
-    assert isinstance(
-        config, MetaReferenceImplConfig
-    ), f"Unexpected config type: {type(config)}"
-
-    impl = MetaReferenceInferenceImpl(config)
-    await impl.initialize()
-    return impl
-
-
 # there's a single model parallel process running serving the model. for now,
 # we don't support multiple concurrent requests to this process.
 SEMAPHORE = asyncio.Semaphore(1)