Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-29 03:14:19 +00:00)
Add an option to not use elastic agents for meta-reference inference (#269)

commit 33afd34e6f (parent be3c5c034d)
2 changed files with 33 additions and 8 deletions
File 1: the meta-reference inference config (the module defining MetaReferenceInferenceConfig, imported later in this diff as .config)

@@ -17,13 +17,18 @@ from llama_stack.providers.utils.inference import supported_inference_models
 class MetaReferenceInferenceConfig(BaseModel):
     model: str = Field(
-        default="Llama3.1-8B-Instruct",
+        default="Llama3.2-3B-Instruct",
         description="Model descriptor from `llama model list`",
     )
     torch_seed: Optional[int] = None
     max_seq_len: int = 4096
     max_batch_size: int = 1

+    # when this is False, we assume that the distributed process group is setup by someone
+    # outside of this code (e.g., when run inside `torchrun`). that is useful for clients
+    # (including our testing code) who might be using llama-stack as a library.
+    create_distributed_process_group: bool = True
+
     @field_validator("model")
     @classmethod
     def validate_model(cls, model: str) -> str:
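To illustrate the new flag, here is a minimal sketch, not part of the commit, of how a client embedding llama-stack as a library might construct this config; the module path in the import is an assumption and may differ from the actual package layout.

# Sketch only: the import path below is assumed, not taken from the diff.
from llama_stack.providers.impls.meta_reference.inference.config import (
    MetaReferenceInferenceConfig,
)

# When the process group is managed outside llama-stack (e.g. the script is
# launched with `torchrun`), opt out of the provider creating its own group.
config = MetaReferenceInferenceConfig(
    model="Llama3.2-3B-Instruct",
    max_seq_len=4096,
    max_batch_size=1,
    create_distributed_process_group=False,
)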
File 2: the meta-reference inference implementation (the module defining MetaReferenceInferenceImpl)

@@ -18,6 +18,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 )

 from .config import MetaReferenceInferenceConfig
+from .generation import Llama
 from .model_parallel import LlamaModelParallelGenerator

 # there's a single model parallel process running serving the model. for now,
@@ -36,8 +37,11 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):

     async def initialize(self) -> None:
         print(f"Loading model `{self.model.descriptor()}`")
-        self.generator = LlamaModelParallelGenerator(self.config)
-        self.generator.start()
+        if self.config.create_distributed_process_group:
+            self.generator = LlamaModelParallelGenerator(self.config)
+            self.generator.start()
+        else:
+            self.generator = Llama.build(self.config)

     async def register_model(self, model: ModelDef) -> None:
         raise ValueError("Dynamic model registration is not supported")
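The else branch only makes sense when the distributed environment already exists, as the config comment above states. A hedged sketch of what a torchrun-launched caller looks like (file name, backend choice, and invocation are illustrative, not part of the commit):

# run_inference.py -- launched as: torchrun --nproc-per-node 1 run_inference.py
# torchrun exports RANK / WORLD_SIZE / MASTER_ADDR / MASTER_PORT, so the process
# group is joined here rather than created by the inference provider.
import os

import torch.distributed as dist

if not dist.is_initialized():
    dist.init_process_group(backend="gloo")  # backend is an illustrative choice

print(f"rank {os.environ['RANK']} of {os.environ['WORLD_SIZE']} joined the group")
# ... build MetaReferenceInferenceConfig(create_distributed_process_group=False)
# and initialize the provider from here ...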
@@ -51,6 +55,7 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
         ]

     async def shutdown(self) -> None:
-        self.generator.stop()
+        if self.config.create_distributed_process_group:
+            self.generator.stop()

     def completion(
@@ -99,6 +104,7 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
                 f"Model mismatch: {request.model} != {self.model.descriptor()}"
             )

-        if SEMAPHORE.locked():
-            raise RuntimeError("Only one concurrent request is supported")
+        if self.config.create_distributed_process_group:
+            if SEMAPHORE.locked():
+                raise RuntimeError("Only one concurrent request is supported")
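SEMAPHORE is pre-existing module state in this file, presumably an asyncio.Semaphore(1) guarding the single model-parallel group; after this change the guard is applied only when the provider owns the process group. A standalone sketch of the guard pattern itself (names and structure are illustrative):

import asyncio

SEMAPHORE = asyncio.Semaphore(1)  # at most one in-flight inference request

async def guarded(handler):
    # Fail fast instead of queueing a second concurrent request.
    if SEMAPHORE.locked():
        raise RuntimeError("Only one concurrent request is supported")
    async with SEMAPHORE:
        return await handler()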
@@ -110,7 +116,7 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
     async def _nonstream_chat_completion(
         self, request: ChatCompletionRequest
     ) -> ChatCompletionResponse:
-        async with SEMAPHORE:
+        def impl():
             messages = chat_completion_request_to_messages(request)

             tokens = []
@@ -154,10 +160,16 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
                 logprobs=logprobs if request.logprobs else None,
             )

+        if self.config.create_distributed_process_group:
+            async with SEMAPHORE:
+                return impl()
+        else:
+            return impl()
+
     async def _stream_chat_completion(
         self, request: ChatCompletionRequest
     ) -> AsyncGenerator:
-        async with SEMAPHORE:
+        def impl():
             messages = chat_completion_request_to_messages(request)

             yield ChatCompletionResponseStreamChunk(
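This hunk and the previous one apply one refactor: the body that used to run under async with SEMAPHORE: becomes a local impl() closure, and the caller takes the semaphore only when the provider manages its own process group. A self-contained sketch of the non-streaming shape (class and attribute names are illustrative, not the provider's):

import asyncio

SEMAPHORE = asyncio.Semaphore(1)

class Example:
    def __init__(self, serialize_requests: bool):
        # Stands in for config.create_distributed_process_group.
        self.serialize_requests = serialize_requests

    async def nonstream(self, request: str) -> str:
        def impl() -> str:
            # Synchronous generation work happens here.
            return f"response to {request}"

        if self.serialize_requests:
            async with SEMAPHORE:
                return impl()
        else:
            return impl()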
@@ -272,6 +284,14 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
                     )
                 )

+        if self.config.create_distributed_process_group:
+            async with SEMAPHORE:
+                for x in impl():
+                    yield x
+        else:
+            for x in impl():
+                yield x
+
     async def embeddings(
         self,
         model: str,
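The streaming variant follows the same shape, except impl() is a synchronous generator whose chunks are re-yielded from the async generator, so the semaphore (when used) is held for the whole stream. A minimal sketch under the same illustrative names as above:

import asyncio
from typing import AsyncGenerator

SEMAPHORE = asyncio.Semaphore(1)

async def stream(serialize_requests: bool, request: str) -> AsyncGenerator[str, None]:
    def impl():
        # Synchronous generator producing stream chunks.
        for i in range(3):
            yield f"chunk {i} for {request}"

    if serialize_requests:
        async with SEMAPHORE:
            # Hold the semaphore for the duration of the stream.
            for x in impl():
                yield x
    else:
        for x in impl():
            yield x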