## What does this PR do?
In this PR, we refactor the meta reference inference logic to support:
- loading the model during model registration instead of at server startup
- running inference on a fine-tuned model checkpoint on top of a native llama model
## Why these changes are needed
They address the following pain points:
- Users cannot lazy-load the model or hot-swap the inference checkpoint after the server has started.
  - This blocks running inference and eval on the same server for a fine-tuned checkpoint after post training.
- Users cannot run inference on a fine-tuned checkpoint built on top of native llama models.
## Expected user experience changes
- The inference model is no longer loaded when the server spins up. Instead, it is loaded during model registration. If the user adds the model as a models resource in run.yaml, it is registered and loaded automatically when the server starts. An optional 'skip_initialize' flag in the model metadata skips model loading during registration (see the sketch after this list).
- An optional 'llama_model' flag in the model metadata identifies the base llama model used for validation and for initializing the model architecture. The model identifier no longer needs to be a native llama model.
- The default inference model name changes from 'meta-llama/Llama-3.2-3B-Instruct' to 'Llama3.2-3B-Instruct':
  - It aligns with the checkpoint folder name created by 'llama model download'.
  - It aligns with the descriptor name defined in the llama-models SKU list bf5b0c4fe7/models/datatypes.py (L95)
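As a rough illustration of the registration flags, here is a minimal sketch using the llama-stack-client Python SDK. This is not part of the PR, and the exact keyword names (`model_id`, `metadata`) are assumed from the client of this era and may differ:

```python
# Hedged sketch: register a model without loading its weights immediately,
# using the optional 'skip_initialize' metadata flag described above.
# Assumes a llama-stack server listening on localhost:5000.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")
client.models.register(
    model_id="Llama3.2-3B-Instruct",
    metadata={"skip_initialize": True},
)
```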
## Tests
Run `python llama_stack/scripts/distro_codegen.py`.

**Run unit tests**
- `torchrun $CONDA_PREFIX/bin/pytest -v -s -k "meta_reference" --inference-model="Llama3.1-8B-Instruct" ./llama_stack/providers/tests/inference/test_text_inference.py`
- `torchrun $CONDA_PREFIX/bin/pytest -v -s -k "meta_reference" --inference-model="Llama3.1-8B-Instruct" ./llama_stack/providers/tests/inference/test_model_registration.py`
**Test the post-training experience**
On the server side, run: `llama stack run llama_stack/templates/experimental-post-training/run.yaml`
The server spins up without loading a model.
<img width="812" alt="Screenshot 2024-12-17 at 1 24 50 PM"
src="https://github.com/user-attachments/assets/ce1f606b-3b6f-452f-b48e-b3761ffd90f3"
/>
On the client side, run: `llama-stack-client --endpoint http://devgpu018.nha2.facebook.com:5000 models register Llama3.2-3B-Instruct`
The model is registered successfully and loaded.
<img width="1111" alt="Screenshot 2024-12-17 at 1 26 30 PM"
src="https://github.com/user-attachments/assets/56e02131-cf7d-4de5-8f63-fbdcb8c55c26"
/>
<img width="1541" alt="Screenshot 2024-12-17 at 1 26 09 PM"
src="https://github.com/user-attachments/assets/a83255a1-20f5-40a2-af51-55641410a115"
/>
if add "skip_initialize" in metadata, model is registered but isn't
loaded
On the client side, run: `llama-stack-client --endpoint http://devgpu018.nha2.facebook.com:5000 inference chat-completion --message "hello, what model are you?"`
Inference against the model succeeds.
<img width="1121" alt="Screenshot 2024-12-17 at 1 27 33 PM"
src="https://github.com/user-attachments/assets/8e708545-3fe7-4a73-8754-1470fa5f1e75"
/>
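For completeness, a hedged Python-SDK equivalent of the CLI call above (keyword and attribute names are assumed and may differ between client versions):

```python
# Hedged sketch: chat completion against the registered model via the
# llama-stack-client Python SDK instead of the CLI.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")
response = client.inference.chat_completion(
    model_id="Llama3.2-3B-Instruct",
    messages=[{"role": "user", "content": "hello, what model are you?"}],
)
# Attribute path assumed from the llama-stack response schema of this era.
print(response.completion_message.content)
```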
**Test the inference experience**
Run: `llama stack run llama_stack/templates/meta-reference-gpu/run.yaml`
The model is loaded at startup since it is listed as a models resource in run.yaml.
<img width="1537" alt="Screenshot 2024-12-17 at 1 30 19 PM"
src="https://github.com/user-attachments/assets/5c8af817-66eb-43f8-bf4c-f5e24b0a12c6"
/>
On the client side, run: `llama-stack-client --endpoint http://devgpu018.nha2.facebook.com:5000 inference chat-completion --message "hello, what model are you?"`
Inference succeeds.
<img width="1123" alt="Screenshot 2024-12-17 at 1 31 08 PM"
src="https://github.com/user-attachments/assets/471809aa-c65e-46dc-a37e-7094fb857f97"
/>
## Inference on a fine-tuned model
**Register a fine-tuned model produced by the post-training API (torchtune)** (a hedged sketch of the registration call follows at the end of this section)
- The model is registered and loaded successfully.
- The model shows up in the model list.
<img width="974" alt="Screenshot 2024-12-18 at 3 56 33 PM"
src="https://github.com/user-attachments/assets/2994b4f5-4fa9-40c6-acc6-4b971479f3e2"
/>
**Run inference**
<img width="977" alt="Screenshot 2024-12-18 at 3 57 59 PM"
src="https://github.com/user-attachments/assets/d117abbc-b2a0-41d8-a028-1a13128787b2"
/>
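A hedged sketch of registering such a fine-tuned checkpoint (the model name and keyword names below are illustrative assumptions, not taken from this PR):

```python
# Hedged sketch: register a fine-tuned checkpoint on top of a native llama
# base model. The optional 'llama_model' metadata flag names the base model;
# the checkpoint itself is assumed to sit where model_checkpoint_dir() would
# resolve for this (hypothetical) model id.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")
client.models.register(
    model_id="Llama3.2-3B-Instruct-sft-0",             # hypothetical fine-tuned checkpoint name
    metadata={"llama_model": "Llama3.2-3B-Instruct"},  # base model used for validation/arch init
)
```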
For reference, the model-parallel generator module (115 lines, 3.8 KiB, Python):
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
from copy import deepcopy
from functools import partial
from typing import Any, Generator

from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import Model
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.sku_list import resolve_model

from llama_stack.apis.inference import ChatCompletionRequest, CompletionRequest

from .config import MetaReferenceInferenceConfig
from .generation import Llama, model_checkpoint_dir
from .parallel_utils import ModelParallelProcessGroup


class ModelRunner:
    def __init__(self, llama):
        self.llama = llama

    # the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
    def __call__(self, req: Any):
        if isinstance(req, ChatCompletionRequest):
            return self.llama.chat_completion(req)
        elif isinstance(req, CompletionRequest):
            return self.llama.completion(req)
        else:
            raise ValueError(f"Unexpected task type {type(req)}")


def init_model_cb(
    config: MetaReferenceInferenceConfig,
    model_id: str,
    llama_model: Model,
):
    llama = Llama.build(config, model_id, llama_model)
    return ModelRunner(llama)


class LlamaModelParallelGenerator:
    """
    This abstraction exists so
    - we can run model parallel code without needing to run the CLIs via torchrun
    - this also enables using model parallel code within a notebook context.

    A Context Manager is used to ensure that the model parallel process is started and stopped
    correctly. This does make the ergonomics a little awkward, because it isn't immediately
    clear at the callsite why we need to use a context manager.
    """

    def __init__(
        self,
        config: MetaReferenceInferenceConfig,
        model_id: str,
        llama_model: Model,
    ):
        self.config = config
        self.model_id = model_id
        self.llama_model = llama_model

        # this is a hack because Agent's loop uses this to tokenize and check if input is too long
        # while the tool-use loop is going
        resolved_model = resolve_model(model_id)
        if resolved_model is None:
            # if the model is not a native llama model, get the default checkpoint_dir based on model id
            checkpoint_dir = model_checkpoint_dir(model_id)
        else:
            # if the model is a native llama model, get the default checkpoint_dir based on model core_model_id value
            checkpoint_dir = model_checkpoint_dir(resolved_model.descriptor())
        tokenizer_path = os.path.join(checkpoint_dir, "tokenizer.model")
        self.formatter = ChatFormat(Tokenizer(tokenizer_path))

    def start(self):
        self.__enter__()

    def stop(self):
        self.__exit__(None, None, None)

    def __enter__(self):
        model_parallel_size = self.llama_model.pth_file_count

        self.group = ModelParallelProcessGroup(
            model_parallel_size,
            init_model_cb=partial(
                init_model_cb, self.config, self.model_id, self.llama_model
            ),
        )
        self.group.start()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.group.stop()

    def completion(
        self,
        request: CompletionRequest,
    ) -> Generator:
        req_obj = deepcopy(request)
        gen = self.group.run_inference(req_obj)
        yield from gen

    def chat_completion(
        self,
        request: ChatCompletionRequest,
    ) -> Generator:
        req_obj = deepcopy(request)
        gen = self.group.run_inference(req_obj)
        yield from gen
```
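To make the context-manager contract above concrete, here is a minimal usage sketch (not part of the PR). It assumes `MetaReferenceInferenceConfig` can be built with defaults and that `request` is a `ChatCompletionRequest` constructed elsewhere by the inference provider:

```python
# Hedged usage sketch for LlamaModelParallelGenerator, driven roughly the way
# the meta reference provider would drive it after model registration.
from llama_models.sku_list import resolve_model

config = MetaReferenceInferenceConfig()  # assumption: default-constructible
model_id = "Llama3.2-3B-Instruct"
llama_model = resolve_model(model_id)    # resolve the native llama base model

generator = LlamaModelParallelGenerator(config, model_id, llama_model)
generator.start()  # spawns the model-parallel process group and loads the model
try:
    # `request` is assumed to be a ChatCompletionRequest built by the caller.
    for chunk in generator.chat_completion(request):
        print(chunk)
finally:
    generator.stop()  # tears down the process group
```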