Make each inference provider into its own subdirectory

Ashwin Bharambe 2024-08-05 15:13:52 -07:00
parent f64668319c
commit 0de5a807c7
42 changed files with 123 additions and 103 deletions
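
The diff below reorganizes llama_toolchain.inference so that each provider lives in its own subpackage instead of the previous flat module. Shared request/response datatypes and the Inference endpoint definitions are imported from llama_toolchain.inference.api, while meta_reference/ and ollama/ each carry their own config.py and expose a get_provider_impl factory from __init__.py. File paths are not shown in this diff view, so the layout sketched here is inferred from the changed imports rather than taken verbatim from the commit:

# Assumed layout after this commit (inferred from the import changes below):
#   llama_toolchain/inference/api/             shared datatypes and Inference endpoint definitions
#   llama_toolchain/inference/meta_reference/  __init__.py, config.py, inference.py, generation.py, model_parallel.py, parallel_utils.py
#   llama_toolchain/inference/ollama/          __init__.py, config.py, ollama.py

# Old flat imports such as `from .api.config import ...` become either intra-package
# imports (`from .config import ...`) or absolute imports of the shared API package:
from llama_toolchain.inference.api import ChatCompletionRequest, Inference
from llama_toolchain.inference.meta_reference import MetaReferenceImplConfig
from llama_toolchain.inference.ollama import OllamaImplConfig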

View file

@@ -1,22 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# from .api.config import ImplType, InferenceConfig
# async def get_inference_api_instance(config: InferenceConfig):
# if config.impl_config.impl_type == ImplType.inline.value:
# from .inference import InferenceImpl
# return InferenceImpl(config.impl_config)
# elif config.impl_config.impl_type == ImplType.ollama.value:
# from .ollama import OllamaInference
# return OllamaInference(config.impl_config)
# from .client import InferenceClient
# return InferenceClient(config.impl_config.url)

View file

@@ -0,0 +1,8 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import MetaReferenceImplConfig # noqa
from .inference import get_provider_impl # noqa
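
This new __init__.py makes the subpackage the provider's public surface: downstream code imports the config class and the get_provider_impl factory from llama_toolchain.inference.meta_reference instead of reaching into internal modules. A hypothetical direct use, assuming get_provider_impl is an async factory returning an Inference implementation and that the config accepts a model field (neither detail is visible in this hunk):

import asyncio

from llama_toolchain.inference.meta_reference import (
    MetaReferenceImplConfig,
    get_provider_impl,
)

async def main() -> None:
    # Field values are illustrative; only torch_seed, max_seq_len and max_batch_size
    # appear elsewhere in this diff, so the `model` field is an assumption.
    config = MetaReferenceImplConfig(model="Meta-Llama3.1-8B-Instruct", max_seq_len=4096)
    impl = await get_provider_impl(config)  # assumed async factory -> Inference

asyncio.run(main())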

View file

@@ -13,7 +13,7 @@ from pydantic import BaseModel, Field
from strong_typing.schema import json_schema_type
from typing_extensions import Annotated
from .datatypes import QuantizationConfig
from llama_toolchain.inference.api import QuantizationConfig
@json_schema_type
@@ -63,9 +63,3 @@ class MetaReferenceImplConfig(BaseModel):
torch_seed: Optional[int] = None
max_seq_len: int
max_batch_size: int = 1
@json_schema_type
class OllamaImplConfig(BaseModel):
model: str = Field(..., description="The name of the model in ollama catalog")
url: str = Field(..., description="The URL for the ollama server")

View file

@@ -29,8 +29,9 @@ from llama_models.llama3_1.api.model import Transformer
from llama_models.llama3_1.api.tokenizer import Tokenizer
from termcolor import cprint
from .api.config import CheckpointType, MetaReferenceImplConfig
from .api.datatypes import QuantizationType
from llama_toolchain.inference.api import QuantizationType
from .config import CheckpointType, MetaReferenceImplConfig
@dataclass

View file

@@ -12,20 +12,18 @@ from llama_models.llama3_1.api.datatypes import StopReason
from llama_models.sku_list import resolve_model
from llama_toolchain.distribution.datatypes import Api, ProviderSpec
from .api.config import MetaReferenceImplConfig
from .api.datatypes import (
from llama_toolchain.inference.api import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseEvent,
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
Inference,
ToolCallDelta,
ToolCallParseStatus,
)
from .api.endpoints import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
Inference,
)
from .config import MetaReferenceImplConfig
from .model_parallel import LlamaModelParallelGenerator
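
With the two old .api.datatypes and .api.endpoints imports collapsed into a single llama_toolchain.inference.api import, a caller only needs that one package to build requests and consume streamed responses. A hedged consumer sketch, assuming the Inference protocol exposes an async chat_completion(request) method that yields ChatCompletionResponseStreamChunk objects when streaming; the method signature, the chunk.event / event_type / delta field names, and the progress member are assumptions not shown in this diff:

from llama_toolchain.inference.api import (
    ChatCompletionRequest,
    ChatCompletionResponseEventType,
    Inference,
)

async def stream_answer(impl: Inference, request: ChatCompletionRequest) -> str:
    # Accumulate text deltas from progress events; the exact payload shape is an assumption.
    text = ""
    async for chunk in impl.chat_completion(request):
        event = chunk.event
        if event.event_type == ChatCompletionResponseEventType.progress:
            text += str(event.delta)
    return text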

View file

@@ -13,7 +13,7 @@ from llama_models.llama3_1.api.chat_format import ChatFormat
from llama_models.llama3_1.api.datatypes import Message
from llama_models.llama3_1.api.tokenizer import Tokenizer
from .api.config import MetaReferenceImplConfig
from .config import MetaReferenceImplConfig
from .generation import Llama
from .parallel_utils import ModelParallelProcessGroup

View file

@@ -0,0 +1,8 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import OllamaImplConfig # noqa
from .ollama import get_provider_impl # noqa

View file

@@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pydantic import BaseModel, Field
from strong_typing.schema import json_schema_type
@json_schema_type
class OllamaImplConfig(BaseModel):
model: str = Field(..., description="The name of the model in ollama catalog")
url: str = Field(..., description="The URL for the ollama server")
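
The Ollama provider now carries its own config module, mirroring the meta-reference layout. A minimal instantiation sketch; both values are placeholders (http://localhost:11434 is the conventional local Ollama endpoint, but nothing in this diff pins either value):

from llama_toolchain.inference.ollama import OllamaImplConfig

# Illustrative values: `model` must name an entry in the ollama catalog and
# `url` must point at a running ollama server.
config = OllamaImplConfig(model="llama3.1:8b", url="http://localhost:11434")
print(config)  # a pydantic BaseModel, so it validates and prints like any other config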

View file

@@ -22,21 +22,20 @@ from llama_models.sku_list import resolve_model
from ollama import AsyncClient
from .api.config import OllamaImplConfig
from .api.datatypes import (
ChatCompletionResponseEvent,
ChatCompletionResponseEventType,
ToolCallDelta,
ToolCallParseStatus,
)
from .api.endpoints import (
from llama_toolchain.inference.api import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseEvent,
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
CompletionRequest,
Inference,
ToolCallDelta,
ToolCallParseStatus,
)
from .config import OllamaImplConfig
# TODO: Eventually this will move to the llama cli model list command
# mapping of Model SKUs to ollama models
OLLAMA_SUPPORTED_SKUS = {

View file

@@ -18,8 +18,8 @@ def available_inference_providers() -> List[ProviderSpec]:
"torch",
"zmq",
],
module="llama_toolchain.inference.inference",
config_class="llama_toolchain.inference.inference.MetaReferenceImplConfig",
module="llama_toolchain.inference.meta_reference",
config_class="llama_toolchain.inference.meta_reference.MetaReferenceImplConfig",
),
InlineProviderSpec(
api=Api.inference,
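
The registry now points the meta-reference provider at the llama_toolchain.inference.meta_reference subpackage by dotted path rather than at the old flat module. Presumably the distribution layer resolves these strings at runtime; below is a minimal sketch of that resolution, assuming each provider package exports an async get_provider_impl and that config_class names a pydantic model. The actual resolver lives in llama_toolchain.distribution and is not part of this diff.

import importlib
from typing import Any, Dict

async def instantiate_provider(module_path: str, config_class_path: str, run_config: Dict[str, Any]):
    # e.g. module_path="llama_toolchain.inference.meta_reference",
    #      config_class_path="llama_toolchain.inference.meta_reference.MetaReferenceImplConfig"
    module = importlib.import_module(module_path)
    config_module_path, _, class_name = config_class_path.rpartition(".")
    config_cls = getattr(importlib.import_module(config_module_path), class_name)
    config = config_cls(**run_config)               # validate the raw dict with the provider's config model
    return await module.get_provider_impl(config)   # assumed async factory exported by each provider package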