Support for Llama3.2 models and Swift SDK (#98)

2025-12-04 10:10:36 +00:00 · 2024-09-25 10:29:58 -07:00 · 2024-09-25 10:29:58 -07:00 · 56aed59eb4
commit 56aed59eb4
parent 95abbf576b
56 changed files with 3745 additions and 630 deletions
--- a/llama_stack/providers/impls/meta_reference/agents/agent_instance.py
+++ b/llama_stack/providers/impls/meta_reference/agents/agent_instance.py
@ -398,7 +398,11 @@ class ChatAgent(ShieldRunnerMixin):
                color = "yellow"
            else:
                color = None
-            cprint(f"{str(msg)}", color=color)
+            if len(str(msg)) > 1000:
+                msg_str = f"{str(msg)[:500]}...<more>...{str(msg)[-500:]}"
+            else:
+                msg_str = str(msg)
+            cprint(f"{msg_str}", color=color)

            step_id = str(uuid.uuid4())
            yield AgentTurnResponseStreamChunk(
@ -466,6 +470,13 @@ class ChatAgent(ShieldRunnerMixin):
                        stop_reason = event.stop_reason

            stop_reason = stop_reason or StopReason.out_of_tokens
+
+            # If tool calls were parsed successfully, clear the content. Otherwise
+            # the tool call string would also remain in the content, and the
+            # tokens would include the tool call syntax twice.
+            if tool_calls:
+                content = ""
+
            message = CompletionMessage(
                content=content,
                stop_reason=stop_reason,
--- a/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py
+++ b/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py
@ -10,13 +10,14 @@ from jinja2 import Template
 from llama_models.llama3.api import *  # noqa: F403


+from termcolor import cprint  # noqa: F401
+
 from llama_stack.apis.agents import (
    DefaultMemoryQueryGeneratorConfig,
    LLMMemoryQueryGeneratorConfig,
    MemoryQueryGenerator,
    MemoryQueryGeneratorConfig,
 )
-from termcolor import cprint  # noqa: F401
 from llama_stack.apis.inference import *  # noqa: F403


--- a/llama_stack/providers/impls/meta_reference/inference/config.py
+++ b/llama_stack/providers/impls/meta_reference/inference/config.py
@ -16,7 +16,7 @@ from pydantic import BaseModel, Field, field_validator

 class MetaReferenceImplConfig(BaseModel):
    model: str = Field(
-        default="Meta-Llama3.1-8B-Instruct",
+        default="Llama3.1-8B-Instruct",
        description="Model descriptor from `llama model list`",
    )
    quantization: Optional[QuantizationConfig] = None
@ -30,7 +30,7 @@ class MetaReferenceImplConfig(BaseModel):
        permitted_models = [
            m.descriptor()
            for m in all_registered_models()
-            if m.model_family == ModelFamily.llama3_1
+            if m.model_family in {ModelFamily.llama3_1, ModelFamily.llama3_2}
            or m.core_model_id == CoreModelId.llama_guard_3_8b
        ]
        if model not in permitted_models:
@ -42,14 +42,9 @@ class MetaReferenceImplConfig(BaseModel):

    @property
    def model_parallel_size(self) -> int:
-        # HUGE HACK ALERT: this will be fixed when we move inference configuration
+        # HACK ALERT: this will be fixed when we move inference configuration
        # to ModelsRegistry and we can explicitly ask for `model_parallel_size`
        # as configuration there
-        gpu_count = 1
        resolved = resolve_model(self.model)
        assert resolved is not None
-        descriptor = resolved.descriptor().lower()
-        if "-70b" in descriptor or "-405b" in descriptor:
-            gpu_count = 8
-
-        return gpu_count
+        return resolved.pth_file_count
--- a/llama_stack/providers/impls/meta_reference/inference/generation.py
+++ b/llama_stack/providers/impls/meta_reference/inference/generation.py
@ -24,21 +24,31 @@ from fairscale.nn.model_parallel.initialize import (
 )
 from llama_models.llama3.api.args import ModelArgs
 from llama_models.llama3.api.chat_format import ChatFormat, ModelInput
-from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
+from llama_models.llama3.api.datatypes import (
+    InterleavedTextMedia,
+    Message,
+    ToolPromptFormat,
+)
 from llama_models.llama3.api.tokenizer import Tokenizer
 from llama_models.llama3.reference_impl.model import Transformer
+from llama_models.llama3.reference_impl.multimodal.model import (
+    CrossAttentionTransformer,
+)
 from llama_models.sku_list import resolve_model
+from termcolor import cprint
+
 from llama_stack.apis.inference import QuantizationType

 from llama_stack.distribution.utils.model_utils import model_local_dir
-from termcolor import cprint

 from .config import MetaReferenceImplConfig


 def model_checkpoint_dir(model) -> str:
    checkpoint_dir = Path(model_local_dir(model.descriptor()))
-    if not Path(checkpoint_dir / "consolidated.00.pth").exists():
+
+    paths = [Path(checkpoint_dir / f"consolidated.{ext}") for ext in ["pth", "00.pth"]]
+    if not any(p.exists() for p in paths):
        checkpoint_dir = checkpoint_dir / "original"

    assert checkpoint_dir.exists(), (
@ -134,7 +144,11 @@ class Llama:
            # load on CPU in bf16 so that fp8 conversion does not find an
            # unexpected (fp32, e.g.) datatype
            torch.set_default_tensor_type(torch.BFloat16Tensor)
-            model = Transformer(model_args)
+            if model_args.vision_chunk_size > 0:
+                model = CrossAttentionTransformer(model_args)
+                model.setup_cache(model_args.max_batch_size, torch.bfloat16)
+            else:
+                model = Transformer(model_args)
            model.load_state_dict(state_dict, strict=False)
            model = convert_to_quantized_model(model, config)
        else:
@ -142,7 +156,11 @@ class Llama:
                torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
            else:
                torch.set_default_tensor_type(torch.cuda.HalfTensor)
-            model = Transformer(model_args)
+            if model_args.vision_chunk_size > 0:
+                model = CrossAttentionTransformer(model_args)
+                model.setup_cache(model_args.max_batch_size, torch.bfloat16)
+            else:
+                model = Transformer(model_args)
            model.load_state_dict(state_dict, strict=False)

        print(f"Loaded in {time.time() - start_time:.2f} seconds")
@ -167,7 +185,11 @@ class Llama:
    ) -> Generator:
        params = self.model.params

-        # cprint("Input to model -> " + self.tokenizer.decode(model_input.tokens), "red")
+        # input_tokens = [
+        #     self.formatter.vision_token if t == 128256 else t
+        #     for t in model_input.tokens
+        # ]
+        # cprint("Input to model -> " + self.tokenizer.decode(input_tokens), "red")
        prompt_tokens = [model_input.tokens]

        bsz = 1
@ -183,6 +205,21 @@ class Llama:
            return

        total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
+
+        is_vision = isinstance(self.model, CrossAttentionTransformer)
+        if is_vision:
+            images = model_input.vision.images if model_input.vision is not None else []
+            mask = model_input.vision.mask if model_input.vision is not None else []
+
+            # the method works for bsz > 1 so add a batch dimension
+            xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = (
+                self.model.compute_vision_tokens_masks(
+                    batch_images=[images],
+                    batch_masks=[mask],
+                    total_len=total_len,
+                )
+            )
+
        pad_id = self.tokenizer.pad_id
        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
        for k, t in enumerate(prompt_tokens):
@ -206,7 +243,19 @@ class Llama:
        stop_tokens = torch.tensor(self.tokenizer.stop_tokens)

        for cur_pos in range(min_prompt_len, total_len):
-            logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
+            if is_vision:
+                position_ids = torch.arange(
+                    prev_pos, cur_pos, dtype=torch.long, device="cuda"
+                )
+                logits = self.model.forward(
+                    position_ids,
+                    tokens,
+                    cross_attention_masks,
+                    full_text_row_masked_out_mask,
+                    xattn_caches,
+                )
+            else:
+                logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)

            if temperature > 0:
                probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
@ -222,6 +271,18 @@ class Llama:
            tokens[:, cur_pos] = next_token

            target = tokens[:, prev_pos + 1 : cur_pos + 1]
+            if is_vision:
+                # The logits space (num_classes) is designed to never contain a
+                # media_token, but our input token stream does contain them. Zero them
+                # out here, or else the CUDA kernels will crash with an illegal memory access.
+                vision_tokens = [self.tokenizer.special_tokens["<|image|>"], 128256]
+                masks = [target.eq(t) for t in vision_tokens]
+                if len(masks) > 1:
+                    mask = torch.logical_or(*masks)
+                else:
+                    mask = masks[0]
+                target[mask] = 0
+
            if logprobs:
                token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
                    input=logits.transpose(1, 2),
@ -248,7 +309,7 @@ class Llama:

    def text_completion(
        self,
-        prompt: str,
+        content: InterleavedTextMedia,
        temperature: float = 0.6,
        top_p: float = 0.9,
        max_gen_len: Optional[int] = None,
@ -262,10 +323,10 @@ class Llama:
        ):
            max_gen_len = self.model.params.max_seq_len - 1

-        prompt_tokens = self.tokenizer.encode(prompt, bos=True, eos=False)
+        model_input = self.formatter.encode_content(content)

        yield from self.generate(
-            model_input=ModelInput(tokens=prompt_tokens),
+            model_input=model_input,
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
--- a/llama_stack/providers/impls/meta_reference/inference/inference.py
+++ b/llama_stack/providers/impls/meta_reference/inference/inference.py
@ -21,7 +21,9 @@ from llama_stack.apis.inference import (
    ToolCallDelta,
    ToolCallParseStatus,
 )
-from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+from llama_stack.providers.utils.inference.augment_messages import (
+    augment_messages_for_tools,
+)

 from .config import MetaReferenceImplConfig
 from .model_parallel import LlamaModelParallelGenerator
@ -57,7 +59,7 @@ class MetaReferenceInferenceImpl(Inference):
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        tools: Optional[List[ToolDefinition]] = [],
+        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
@ -70,14 +72,14 @@ class MetaReferenceInferenceImpl(Inference):
            model=model,
            messages=messages,
            sampling_params=sampling_params,
-            tools=tools,
+            tools=tools or [],
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            stream=stream,
            logprobs=logprobs,
        )

-        messages = prepare_messages(request)
+        messages = augment_messages_for_tools(request)
        model = resolve_model(request.model)
        if model is None:
            raise RuntimeError(
--- a/llama_stack/providers/impls/meta_reference/safety/init.py
+++ b/llama_stack/providers/impls/meta_reference/safety/init.py
@ -7,11 +7,11 @@
 from .config import SafetyConfig


-async def get_provider_impl(config: SafetyConfig, _deps):
+async def get_provider_impl(config: SafetyConfig, deps):
    from .safety import MetaReferenceSafetyImpl

    assert isinstance(config, SafetyConfig), f"Unexpected config type: {type(config)}"

-    impl = MetaReferenceSafetyImpl(config)
+    impl = MetaReferenceSafetyImpl(config, deps)
    await impl.initialize()
    return impl
--- a/llama_stack/providers/impls/meta_reference/safety/config.py
+++ b/llama_stack/providers/impls/meta_reference/safety/config.py
@ -31,7 +31,10 @@ class LlamaGuardShieldConfig(BaseModel):
        permitted_models = [
            m.descriptor()
            for m in safety_models()
-            if m.core_model_id == CoreModelId.llama_guard_3_8b
+            if (
+                m.core_model_id
+                in {CoreModelId.llama_guard_3_8b, CoreModelId.llama_guard_3_11b_vision}
+            )
        ]
        if model not in permitted_models:
            raise ValueError(
--- a/llama_stack/providers/impls/meta_reference/safety/safety.py
+++ b/llama_stack/providers/impls/meta_reference/safety/safety.py
@ -7,8 +7,10 @@
 from llama_models.sku_list import resolve_model

 from llama_stack.distribution.utils.model_utils import model_local_dir
+from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.apis.safety import *  # noqa: F403
 from llama_models.llama3.api.datatypes import *  # noqa: F403
+from llama_stack.distribution.datatypes import Api

 from llama_stack.providers.impls.meta_reference.safety.shields.base import (
    OnViolationAction,
@ -34,20 +36,11 @@ def resolve_and_get_path(model_name: str) -> str:


 class MetaReferenceSafetyImpl(Safety):
-    def __init__(self, config: SafetyConfig) -> None:
+    def __init__(self, config: SafetyConfig, deps) -> None:
        self.config = config
+        self.inference_api = deps[Api.inference]

    async def initialize(self) -> None:
-        shield_cfg = self.config.llama_guard_shield
-        if shield_cfg is not None:
-            model_dir = resolve_and_get_path(shield_cfg.model)
-            _ = LlamaGuardShield.instance(
-                model_dir=model_dir,
-                excluded_categories=shield_cfg.excluded_categories,
-                disable_input_check=shield_cfg.disable_input_check,
-                disable_output_check=shield_cfg.disable_output_check,
-            )
-
        shield_cfg = self.config.prompt_guard_shield
        if shield_cfg is not None:
            model_dir = resolve_and_get_path(shield_cfg.model)
@ -91,11 +84,18 @@ class MetaReferenceSafetyImpl(Safety):
    def get_shield_impl(self, typ: MetaReferenceShieldType) -> ShieldBase:
        cfg = self.config
        if typ == MetaReferenceShieldType.llama_guard:
+            cfg = cfg.llama_guard_shield
            assert (
-                cfg.llama_guard_shield is not None
+                cfg is not None
            ), "Cannot use LlamaGuardShield since not present in config"
-            model_dir = resolve_and_get_path(cfg.llama_guard_shield.model)
-            return LlamaGuardShield.instance(model_dir=model_dir)
+
+            return LlamaGuardShield(
+                model=cfg.model,
+                inference_api=self.inference_api,
+                excluded_categories=cfg.excluded_categories,
+                disable_input_check=cfg.disable_input_check,
+                disable_output_check=cfg.disable_output_check,
+            )
        elif typ == MetaReferenceShieldType.jailbreak_shield:
            assert (
                cfg.prompt_guard_shield is not None
--- a/llama_stack/providers/impls/meta_reference/safety/shields/llama_guard.py
+++ b/llama_stack/providers/impls/meta_reference/safety/shields/llama_guard.py
@ -9,9 +9,8 @@ import re
 from string import Template
 from typing import List, Optional

-import torch
 from llama_models.llama3.api.datatypes import Message, Role
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from llama_stack.apis.inference import *  # noqa: F403

 from .base import CANNED_RESPONSE_TEXT, OnViolationAction, ShieldBase, ShieldResponse

@ -100,39 +99,17 @@ PROMPT_TEMPLATE = Template(


 class LlamaGuardShield(ShieldBase):
-    @staticmethod
-    def instance(
-        on_violation_action=OnViolationAction.RAISE,
-        model_dir: str = None,
-        excluded_categories: List[str] = None,
-        disable_input_check: bool = False,
-        disable_output_check: bool = False,
-    ) -> "LlamaGuardShield":
-        global _INSTANCE
-        if _INSTANCE is None:
-            _INSTANCE = LlamaGuardShield(
-                on_violation_action,
-                model_dir,
-                excluded_categories,
-                disable_input_check,
-                disable_output_check,
-            )
-        return _INSTANCE
-
    def __init__(
        self,
-        on_violation_action: OnViolationAction = OnViolationAction.RAISE,
-        model_dir: str = None,
+        model: str,
+        inference_api: Inference,
        excluded_categories: List[str] = None,
        disable_input_check: bool = False,
        disable_output_check: bool = False,
+        on_violation_action: OnViolationAction = OnViolationAction.RAISE,
    ):
        super().__init__(on_violation_action)

-        dtype = torch.bfloat16
-
-        assert model_dir is not None, "Llama Guard model_dir is None"
-
        if excluded_categories is None:
            excluded_categories = []

@ -140,18 +117,12 @@ class LlamaGuardShield(ShieldBase):
            x in SAFETY_CATEGORIES_TO_CODE_MAP.values() for x in excluded_categories
        ), "Invalid categories in excluded categories. Expected format is ['S1', 'S2', ..]"

-        self.device = "cuda"
+        self.model = model
+        self.inference_api = inference_api
        self.excluded_categories = excluded_categories
        self.disable_input_check = disable_input_check
        self.disable_output_check = disable_output_check

-        # load model
-        torch_dtype = torch.bfloat16
-        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_dir, torch_dtype=torch_dtype, device_map=self.device
-        )
-
    def check_unsafe_response(self, response: str) -> Optional[str]:
        match = re.match(r"^unsafe\n(.*)$", response)
        if match:
@ -212,26 +183,21 @@ class LlamaGuardShield(ShieldBase):
            )
        else:
            prompt = self.build_prompt(messages)
-            llama_guard_input = {
-                "role": "user",
-                "content": prompt,
-            }
-            input_ids = self.tokenizer.apply_chat_template(
-                [llama_guard_input], return_tensors="pt", tokenize=True
-            ).to(self.device)
-            prompt_len = input_ids.shape[1]
-            output = self.model.generate(
-                input_ids=input_ids,
-                max_new_tokens=20,
-                output_scores=True,
-                return_dict_in_generate=True,
-                pad_token_id=0,
-            )
-            generated_tokens = output.sequences[:, prompt_len:]

-            response = self.tokenizer.decode(
-                generated_tokens[0], skip_special_tokens=True
-            )
-            response = response.strip()
-            shield_response = self.get_shield_response(response)
+            # TODO: llama-stack inference protocol has issues with non-streaming inference code
+            content = ""
+            async for chunk in self.inference_api.chat_completion(
+                model=self.model,
+                messages=[
+                    UserMessage(content=prompt),
+                ],
+                stream=True,
+            ):
+                event = chunk.event
+                if event.event_type == ChatCompletionResponseEventType.progress:
+                    assert isinstance(event.delta, str)
+                    content += event.delta
+
+            content = content.strip()
+            shield_response = self.get_shield_response(content)
            return shield_response