# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# type: ignore
import collections

from llama_stack.log import get_logger

log = get_logger(name=__name__, category="llama")

try:
    import fbgemm_gpu.experimental.gen_ai  # noqa: F401

    log.info("Using efficient FP8 or INT4 operators in FBGEMM.")
except ImportError:
    log.error("No efficient FP8 or INT4 operators. Please install FBGEMM.")
    raise

import torch
from torch import Tensor, nn


class Fp8ScaledWeights:
    # TODO: Ugly trick so torch allows us to replace parameters
    # with our custom Fp8Weights instance. Do this properly.
    @property
    def __class__(self) -> type[nn.parameter.Parameter]:
        return nn.Parameter

    @property
    def grad_fn(self) -> None:
        return None


# pyre-fixme[4]: Attribute annotation cannot be `Any`.
# pyre-fixme[2]: Parameter annotation cannot be `Any`.
class Fp8RowwiseWeights(
    Fp8ScaledWeights,
    collections.namedtuple(
        "Fp8RowwiseWeights",
        ["weight", "scale", "shape", "activation_scale_ub"],
    ),
):
    pass


class Int4ScaledWeights:
    # TODO: Ugly trick so torch allows us to replace parameters
    # with our custom Int4Weights instance. Do this properly.
    @property
    def __class__(self) -> type[nn.parameter.Parameter]:
        return nn.Parameter

    @property
    def grad_fn(self) -> None:
        return None


# pyre-fixme[4]: Attribute annotation cannot be `Any`.
# pyre-fixme[2]: Parameter annotation cannot be `Any`.
class Int4Weights(
    Int4ScaledWeights,
    collections.namedtuple(
        "Int4Weights",
        ["weight", "scale", "zero_point", "shape"],
    ),
):
    pass


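# Note: Fp8RowwiseWeights and Int4Weights are plain namedtuple containers; the
# `__class__` / `grad_fn` property trick above lets instances be assigned where
# torch expects an nn.Parameter (e.g. a module's weight) without copying data.

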
def int4_row_quantize(
    x: torch.Tensor,
    group_size: int = 128,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    n_bit = 4  # Number of target bits.
    to_quant = x.reshape(-1, group_size).to(torch.float)

    max_val = to_quant.amax(dim=1, keepdim=True)
    min_val = to_quant.amin(dim=1, keepdim=True)
    max_int = 2**n_bit - 1
    min_int = 0
    scales = (max_val - min_val).clamp(min=1e-6) / max_int

    zeros = min_val + scales * (2 ** (n_bit - 1))

    out = to_quant.sub(min_val).div(scales).round().clamp_(min_int, max_int)

    # Recenter output and move to int8.
    out = (out - 2 ** (n_bit - 1)).to(dtype=torch.int8).reshape(x.shape)

    # Cutlass expects column major layout for scale and zero point,
    # so we transpose here and make them contiguous.
    scales = scales.view(x.shape[0], -1).t().contiguous()
    zeros = zeros.view(x.shape[0], -1).t().contiguous()

    return out, scales, zeros


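# Worked example (illustrative numbers, not taken from the code): for a group with
# min_val = -1.0 and max_val = 1.0, scales = 2.0 / 15 ≈ 0.1333 and
# zeros = -1.0 + 0.1333 * 8 ≈ 0.0667. A value of 0.5 maps to
# round((0.5 - (-1.0)) / 0.1333) = 11, recentered to 11 - 8 = 3, which dequantizes
# back to roughly 3 * 0.1333 + 0.0667 ≈ 0.47.

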
def pack_int4(x: torch.Tensor) -> torch.Tensor:
    # Given int8 x, pack adjacent int4 values into a single int8.
    low_x = x[:, ::2]
    high_x = x[:, 1::2]

    # High bits need to left shift, this also masks off extra bits.
    high_x = torch.bitwise_left_shift(high_x, 4)
    # Low bits need to have sign bits removed.
    low_x = torch.bitwise_and(low_x, 0xF)

    # Recombine into a single value with bitwise or.
    return torch.bitwise_or(low_x, high_x).contiguous()


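# Worked example (illustrative): packing the pair (low = -3, high = 2), i.e. int8
# bytes 0xFD and 0x02: (-3 & 0xF) = 0x0D and (2 << 4) = 0x20, so the packed byte
# is 0x2D.

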
def bmm_nt(
    x: Tensor,
    w: Fp8RowwiseWeights | Int4Weights,
    num_tokens: Tensor | None = None,
) -> Tensor:
    if isinstance(w, Fp8ScaledWeights):
        xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(x, num_tokens, w.activation_scale_ub)
        return torch.ops.fbgemm.f8f8bf16_rowwise_batched(xq, w.weight, x_scale, w.scale)
    elif isinstance(w, Int4ScaledWeights):
        return torch.ops.fbgemm.bf16i4bf16_rowwise_batched(x, w.weight, w.scale, w.zero_point)
    else:
        raise ValueError("Unsupported quantization type")


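# bmm_nt dispatches a batched matmul against the quantized weights to the matching
# FBGEMM row-wise batched kernel: FP8 with on-the-fly activation quantization, or
# INT4 with the activations left in high precision.

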
def ffn_swiglu(
    x: Tensor,
    w1: Fp8RowwiseWeights | Int4Weights,
    w3: Fp8RowwiseWeights | Int4Weights,
    w2: Fp8RowwiseWeights | Int4Weights,
    num_tokens: Tensor | None = None,
    is_memory_bounded: bool = False,
) -> Tensor:
    if (isinstance(w1, Fp8ScaledWeights) and isinstance(w3, Fp8ScaledWeights) and isinstance(w2, Fp8ScaledWeights)) or (
        isinstance(w1, Int4ScaledWeights) and isinstance(w3, Int4ScaledWeights) and isinstance(w2, Int4ScaledWeights)
    ):
        return ffn_swiglu_dynamic(x, w1, w3, w2, w1.activation_scale_ub, num_tokens, is_memory_bounded)

    (B, T, D) = x.shape  # noqa: N806
    (HD_L, D_) = w1.shape  # noqa: N806
    assert D_ == D

    assert isinstance(w1, Tensor)
    assert isinstance(w3, Tensor)
    x1 = x.view(B * T, D) @ w1.T
    x2 = x.view(B * T, D) @ w3.T
    z = torch.nn.functional.silu(x1) * x2
    del x1, x2
    assert isinstance(w2, Tensor)
    return (z @ w2.T).view(B, T, D)


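# ffn_swiglu computes the standard SwiGLU feed-forward,
#     out = (silu(x @ w1.T) * (x @ w3.T)) @ w2.T,
# either through the FBGEMM FP8/INT4 row-wise kernels (ffn_swiglu_dynamic) when all
# three weights are quantized, or with plain torch matmuls as a fallback.

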
@torch.inference_mode()
def quantize_fp8(
    w: Tensor,
    fp8_activation_scale_ub: float,
    output_device: torch.device | None = None,
) -> Fp8RowwiseWeights:
    """Quantize [n, k] weight tensor.

    Args:
        w (Tensor): [n, k] input high precision tensor to quantize.
        fp8_activation_scale_ub (float): Upper bound for activation max.
    """
    activation_scale_ub = torch.tensor(
        [fp8_activation_scale_ub],
        dtype=torch.float,
        device=output_device,
    )
    wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
    del w
    return Fp8RowwiseWeights(
        weight=wq,
        scale=w_scale,
        shape=wq.shape,
        activation_scale_ub=activation_scale_ub,
    )


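# Usage sketch (illustrative values; assumes FBGEMM and a CUDA device are available):
#
#   w = torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda")
#   fp8_w = quantize_fp8(w, fp8_activation_scale_ub=1200.0, output_device=torch.device("cuda"))
#   # fp8_w.weight holds the FP8 rows, fp8_w.scale one scale per row.

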
@torch.inference_mode()
def quantize_int4(
    w: Tensor,
    output_device: torch.device | None = None,
) -> Int4Weights:
    """Quantize [n, k] weight tensor into a packed INT4 [n, k/2] representation.

    Args:
        w (Tensor): [n, k] input high precision tensor to quantize.
    """
    if w.ndim >= 3:
        wq, scale, zero_point = zip(*[int4_row_quantize(i) for i in w], strict=False)
        wq = torch.stack([pack_int4(i) for i in wq], dim=0)
        scale = torch.stack(scale, dim=0)
        zero_point = torch.stack(zero_point, dim=0)
    else:
        wq, scale, zero_point = int4_row_quantize(w)
        wq = pack_int4(wq)
    del w
    return Int4Weights(
        weight=wq.to(output_device),
        scale=scale.to(output_device),
        zero_point=zero_point.to(output_device),
        shape=wq.shape,
    )


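# Usage sketch (illustrative values; assumes k is divisible by the 128 group size):
#
#   w = torch.randn(4096, 4096, dtype=torch.bfloat16)
#   int4_w = quantize_int4(w, output_device=torch.device("cuda"))
#   # int4_w.weight packs two 4-bit values per int8, so its last dimension is k / 2.

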
@torch.inference_mode()
def load_fp8(
    w: Tensor,
    w_scale: Tensor,
    fp8_activation_scale_ub: float,
    output_device: torch.device | None = None,
) -> Fp8RowwiseWeights:
    """Load FP8 [n, k] weight tensor.

    Args:
        w (Tensor): [n, k] input FP8.
        fp8_activation_scale_ub (float): Upper bound for activation max.
    """
    activation_scale_ub = torch.tensor(
        [fp8_activation_scale_ub],
        dtype=torch.float,
        device=output_device,
    )
    return Fp8RowwiseWeights(
        weight=w.to(torch.float8_e4m3fn).to(device=output_device),
        scale=w_scale.to(device=output_device),
        shape=w.shape,
        activation_scale_ub=activation_scale_ub,
    )


@torch.inference_mode()
def load_int4(
    w: Tensor,
    scale: Tensor,
    zero_point: Tensor,
    output_device: torch.device | None = None,
) -> Int4Weights:
    """Load INT4 [n, k/2] weight tensor.

    Args:
        w (Tensor): [n, k/2] input INT4.
    """
    return Int4Weights(
        weight=w.to(torch.int8).to(device=output_device),
        scale=scale.to(device=output_device),
        zero_point=zero_point.to(device=output_device),
        shape=w.shape,
    )


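# load_fp8 / load_int4 wrap tensors that were quantized ahead of time (e.g. read from
# a checkpoint) into the same container types as quantize_fp8 / quantize_int4, casting
# and moving them to the target device without re-quantizing.

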
def fc_dynamic(
    x: Tensor,
    w: Fp8RowwiseWeights | Int4Weights,
    activation_scale_ub: Tensor | None = None,
    num_tokens: Tensor | None = None,
    is_memory_bounded: bool = False,
) -> Tensor:
    """
    Single w8a8 fc layer with dynamic row-wise scaling, or w4a16 fc layer with dynamic row-wise scaling.
    """
    if isinstance(w, Int4Weights):
        y = torch.ops.fbgemm.bf16i4bf16_rowwise(x, w.weight, w.scale, w.zero_point)
    else:
        xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(x, num_tokens, activation_scale_ub)
        y = torch.ops.fbgemm.f8f8bf16_rowwise(xq, w.weight, x_scale, w.scale, use_fast_accum=True)
        del xq
    return y


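# In the FP8 (w8a8) path the activations are quantized row-wise on the fly, with
# activation_scale_ub bounding the activation maximum (see the docstrings above); in
# the INT4 (w4a16) path the activations stay in high precision and only the weights
# are quantized.

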
def ffn_swiglu_dynamic(
    x: Tensor,
    w1: Fp8RowwiseWeights | Int4Weights,
    w3: Fp8RowwiseWeights | Int4Weights,
    w2: Fp8RowwiseWeights | Int4Weights,
    activation_scale_ub: Tensor | None = None,
    num_tokens: Tensor | None = None,
    is_memory_bounded: bool = False,
) -> Tensor:
    assert x.dim() == 3 or x.dim() == 2
    if x.dim() == 3:
        (B, T, D) = x.shape  # noqa: N806
    else:
        (T, D) = x.shape  # noqa: N806
        B = 1  # noqa: N806

    HD_L = w1.shape[0]  # noqa: N806
    assert HD_L == w3.shape[0]
    x1 = fc_dynamic(
        x.view(B * T, D),
        w1,
        activation_scale_ub,
        num_tokens,
        is_memory_bounded,
    )
    x2 = fc_dynamic(
        x.view(B * T, D),
        w3,
        activation_scale_ub,
        num_tokens,
        is_memory_bounded,
    )
    z = torch.nn.functional.silu(x1) * x2
    del x1, x2

    z_ = fc_dynamic(z, w2, activation_scale_ub, num_tokens, is_memory_bounded)

    if x.dim() == 3:
        return z_.view(B, T, D)
    else:
        return z_
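

# End-to-end sketch (illustrative only; the shapes, the 1200.0 scale bound, and the
# CUDA device are assumptions, not values taken from this module):
#
#   w1 = quantize_fp8(torch.randn(11008, 4096, dtype=torch.bfloat16, device="cuda"), 1200.0)
#   w3 = quantize_fp8(torch.randn(11008, 4096, dtype=torch.bfloat16, device="cuda"), 1200.0)
#   w2 = quantize_fp8(torch.randn(4096, 11008, dtype=torch.bfloat16, device="cuda"), 1200.0)
#   x = torch.randn(1, 16, 4096, dtype=torch.bfloat16, device="cuda")
#   y = ffn_swiglu(x, w1, w3, w2)  # -> [1, 16, 4096]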