# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.

# Copyright (c) Meta Platforms, Inc. and its affiliates.
import math

import torch
import torch.nn.functional as F

from llama_stack.log import get_logger

from .utils import get_negative_inf_value, to_2tuple

logger = get_logger(name=__name__, category="models::llama")


def resize_local_position_embedding(orig_pos_embed, grid_size):
    """
    Resize position embedding for vision encoder.
    Original position embedding is [n_tiles * n_tiles + 1, dim]
    New position embedding will be [grid_size[0] * grid_size[1] + 1, dim]
    """
    new_grid_size = to_2tuple(grid_size)
    orig_grid_size = to_2tuple(int(math.sqrt(len(orig_pos_embed) - 1)))

    # split off the class-token embedding from the per-patch embeddings
    new_pos_emb_tok, new_pos_emb_img = (
        orig_pos_embed[:1],
        orig_pos_embed[1:],
    )
    logger.info(f"resizing position embedding grid-size from {orig_grid_size} to {new_grid_size}")

    # [ntok, dim] -> [1, dim, grid_h, grid_w] so the grid can be bilinearly interpolated
    new_pos_emb_img = new_pos_emb_img.reshape(1, orig_grid_size[0], orig_grid_size[1], -1).permute(0, 3, 1, 2)

    new_pos_emb_img = F.interpolate(
        new_pos_emb_img,
        size=new_grid_size,
        mode="bilinear",
        align_corners=True,
    )
    # back to [new_ntok, dim], then re-attach the class token
    new_pos_emb_img = new_pos_emb_img.permute(0, 2, 3, 1).reshape(1, new_grid_size[0] * new_grid_size[1], -1)[0]
    new_pos_embed = torch.cat([new_pos_emb_tok, new_pos_emb_img], dim=0)
    return new_pos_embed


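# Illustrative usage (a sketch; the grid sizes and embedding dim below are assumed
# values, not taken from this repo): growing a pretrained 16x16 patch grid to 32x32
# keeps the leading class token and bilinearly interpolates the rest.
#
#   orig = torch.randn(16 * 16 + 1, 1280)
#   resized = resize_local_position_embedding(orig, grid_size=(32, 32))
#   assert resized.shape == (32 * 32 + 1, 1280)

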
def initialize_global_position_embedding_from_local(pos_and_cls_embed, grid_size, x_scale, y_scale):
    """
    Takes a local position embedding for vision encoder and uses it
    to initialize the global position embedding.
    Input: local position embedding of shape [grid_size[0] * grid_size[1] + 1, dim]
    Returns: global position embedding of shape [x_scale, y_scale, grid_size[0] * grid_size[1] + 1, dim]
    Here x_scale and y_scale are the number of tiles along x-axis and y-axis respectively.
    """
    # split off the class-token embedding, then upsample the patch grid across all tiles
    pos_embed = pos_and_cls_embed[1:]
    cls_embed = pos_and_cls_embed[0].view(1, 1, 1, -1)
    grid_size = to_2tuple(grid_size)
    new_pos_emb_img = pos_embed.reshape(1, grid_size[0], grid_size[1], -1).permute(0, 3, 1, 2)
    new_grid_size = (x_scale * grid_size[0], y_scale * grid_size[1])
    new_pos_emb_img = F.interpolate(
        new_pos_emb_img,
        size=new_grid_size,
        mode="bilinear",
        align_corners=True,
    )
    # regroup the upsampled grid into per-tile [grid_size[0] * grid_size[1], dim] blocks
    new_pos_emb_img = new_pos_emb_img.permute(0, 2, 3, 1)
    new_pos_emb_img = new_pos_emb_img.view(x_scale, grid_size[0], y_scale, grid_size[1], -1)
    new_pos_emb_img = new_pos_emb_img.permute(0, 2, 1, 3, 4).contiguous()
    new_pos_emb_img = new_pos_emb_img.reshape(x_scale, y_scale, grid_size[0] * grid_size[1], -1)
    # broadcast the class token to every tile and prepend it
    cls_embed = cls_embed.expand(x_scale, y_scale, -1, -1)
    pos_and_cls_embed = torch.cat([cls_embed, new_pos_emb_img], dim=2)
    return pos_and_cls_embed


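# Illustrative usage (a sketch; the tile counts and embedding dim are assumed values):
# seeding a global embedding for a 4x4 tile layout from a single-tile local embedding.
#
#   local = torch.randn(16 * 16 + 1, 1280)
#   global_pe = initialize_global_position_embedding_from_local(local, grid_size=16, x_scale=4, y_scale=4)
#   assert global_pe.shape == (4, 4, 16 * 16 + 1, 1280)

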
def resize_global_position_embedding(pos_and_cls_embed, grid_size, x_scale, y_scale):
    """
    Takes a global position embedding for vision encoder and resizes it to new size.
    Input: global position embedding of shape [x_old, y_old, old_grid_size[0] * old_grid_size[1] + 1, dim]
    Returns: global position embedding of shape [x_scale, y_scale, grid_size[0] * grid_size[1] + 1, dim]
    Here x_scale and y_scale are the number of tiles along x-axis and y-axis respectively.
    """
    # first remove cls token
    pos_embed = pos_and_cls_embed[:, :, 1:]
    cls_embed = pos_and_cls_embed[:, :, 0].unsqueeze(2)

    xs_old, ys_old, ntok, dim = pos_embed.shape
    old_grid_size = int(math.sqrt(ntok))

    # move to correct form for interpolation
    pos_embed = pos_embed.view(xs_old, ys_old, old_grid_size, old_grid_size, dim)
    pos_embed = pos_embed.permute(0, 2, 1, 3, 4).contiguous()
    pos_embed = pos_embed.view(xs_old * old_grid_size, ys_old * old_grid_size, dim)
    pos_embed = pos_embed.unsqueeze(0)

    # interpolate
    new_size = (grid_size[0] * x_scale, grid_size[1] * y_scale)
    pos_embed = pos_embed.permute(0, 3, 1, 2)
    pos_embed_resized = F.interpolate(
        pos_embed,
        size=new_size,
        mode="bilinear",
        align_corners=True,
    )
    pos_embed = pos_embed_resized.permute(0, 2, 3, 1)[0]

    # move it back in place
    pos_embed = pos_embed.view(x_scale, grid_size[0], y_scale, grid_size[1], dim)
    pos_embed = pos_embed.permute(0, 2, 1, 3, 4).contiguous()
    pos_embed = pos_embed.view(x_scale, y_scale, grid_size[0] * grid_size[1], dim)

    # interpolate cls token
    cls_embed = cls_embed.permute(2, 3, 0, 1)
    cls_embed_resized = F.interpolate(
        cls_embed,
        size=(x_scale, y_scale),
        mode="bilinear",
        align_corners=True,
    )
    cls_embed = cls_embed_resized.permute(2, 3, 0, 1)
    # add cls token back in
    pos_and_cls_embed = torch.cat([cls_embed, pos_embed], dim=2)

    return pos_and_cls_embed


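# Illustrative usage (a sketch; all sizes below are assumed): resizing a checkpoint's
# 4x4-tile global embedding over a 16x16 grid to a 2x2-tile layout over a 32x32 grid.
#
#   old = torch.randn(4, 4, 16 * 16 + 1, 1280)
#   new = resize_global_position_embedding(old, grid_size=(32, 32), x_scale=2, y_scale=2)
#   assert new.shape == (2, 2, 32 * 32 + 1, 1280)

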
def build_encoder_attention_mask(
    x: torch.Tensor,
    ar: torch.Tensor,
    ntok: int,
    num_chunks: int,
    n_heads: int,
):
    """
    Build vision encoder attention mask that omits padding tokens.
    """
    masks = []
    for arx in ar:
        # 0 marks real tokens (the first arx[0] * arx[1] chunks, first ntok tokens); 1 marks padding
        mask_i = torch.ones((num_chunks, x.shape[2], 1), dtype=x.dtype)
        mask_i[: arx[0] * arx[1], :ntok] = 0
        mask_i = mask_i.view(num_chunks * x.shape[2], -1)
        # outer product: -inf wherever both the query and key positions are padding
        mask_i = mask_i @ mask_i.T * get_negative_inf_value(x.dtype)
        mask_i = mask_i.unsqueeze(0)
        masks.append(mask_i)
    masks = torch.stack(masks).to(x.device).expand(-1, n_heads, -1, -1)
    return masks


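# Illustrative shapes (a sketch; the sizes below are assumed, not taken from this repo):
# one image with a 1x2 tile aspect ratio (2 of the 4 allocated chunks in use), 1025 real
# tokens padded to 1032, and 16 heads.
#
#   x = torch.randn(1, 4, 1032, 1280)
#   ar = torch.tensor([[1, 2]])
#   mask = build_encoder_attention_mask(x, ar, ntok=1025, num_chunks=4, n_heads=16)
#   assert mask.shape == (1, 16, 4 * 1032, 4 * 1032)

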
def expand_num_tokens_to_mult8(x):
    # pad the token dimension with zero rows so its length becomes a multiple of 8
    # (note: if it already is a multiple of 8, this still appends a full block of 8 padding tokens)
    num_pad_tokens = 8 - (x.shape[-2] % 8)
    if num_pad_tokens == 0:
        return x, 0
    else:
        return (
            torch.cat(
                [
                    x,
                    torch.zeros(
                        (x.shape[0], x.shape[1], num_pad_tokens, x.shape[-1]),
                        dtype=x.dtype,
                        device=x.device,
                    ),
                ],
                dim=-2,
            ),
            num_pad_tokens,
        )


def contract_num_tokens_from_mult8(x, num_pad_tokens):
    if num_pad_tokens == 0:
        return x
    return x[:, :, :-num_pad_tokens]


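# Illustrative round trip (a sketch; shapes assumed): 1025 tokens are padded with 7 zero
# rows to reach 1032 (a multiple of 8), and contract_num_tokens_from_mult8 drops them again.
#
#   x = torch.randn(1, 4, 1025, 1280)
#   padded, n_pad = expand_num_tokens_to_mult8(x)
#   assert padded.shape == (1, 4, 1032, 1280) and n_pad == 7
#   assert contract_num_tokens_from_mult8(padded, n_pad).shape == x.shape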