From eba9d1ea14a2307375dfe7aeab7783103a953b55 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 10 Oct 2024 14:21:59 -0400 Subject: [PATCH 01/10] ci: Run pre-commit checks in CI (#176) Run the pre-commit checks in a github workflow to validate that a PR or a direct push to the repo does not introduce new errors. --- .flake8 | 10 +++---- .github/workflows/pre-commit.yml | 45 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/pre-commit.yml diff --git a/.flake8 b/.flake8 index af5005b0d..7cadda2a9 100644 --- a/.flake8 +++ b/.flake8 @@ -21,11 +21,11 @@ ignore = optional-ascii-coding = True exclude = ./.git, - ./docs - ./build + ./docs/*, + ./build, ./scripts, ./venv, - *.pyi - .pre-commit-config.yaml - *.md + *.pyi, + .pre-commit-config.yaml, + *.md, .flake8 diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 000000000..502753976 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,45 @@ +name: Pre-commit + +on: [pull_request] + +jobs: + pre-commit: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + + - name: Set up Python + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: '3.11' + cache: pip + cache-dependency-path: | + **/requirements*.txt + .pre-commit-config.yaml + + - name: Install pre-commit + run: | + python -m pip install --upgrade pip + pip install pre-commit + + - name: Fetch base branch + run: git fetch origin ${{ github.event.pull_request.base.ref }}:refs/remotes/origin/${{ github.event.pull_request.base.ref }} + + - name: Fetch head commit from PR + run: git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }} + + - name: Get changed files + id: changed-files + run: | + git diff --name-only origin/${{ github.event.pull_request.base.ref }} pr-${{ github.event.pull_request.number }} > changed_files.txt + cat changed_files.txt + + - name: Run pre-commit + run: | + if [ -s changed_files.txt ]; then + pre-commit run --files $(cat changed_files.txt | tr '\n' ' ') + else + echo "No changed files to run pre-commit on." + fi From a3e65d58a9b1297aa6b0167ba3cbb7a4332845ba Mon Sep 17 00:00:00 2001 From: Dalton Flanagan <6599399+dltn@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:04:21 -0400 Subject: [PATCH 02/10] Add logo --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 050a71aff..2a76913d6 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +Llama Stack Logo + # Llama Stack [![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/) From 7ff5800dea3feb328df2b73e26e56087061f6848 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 15:30:34 -0700 Subject: [PATCH 03/10] generate openapi --- docs/resources/llama-stack-spec.html | 66 ++++++++++++++-------------- docs/resources/llama-stack-spec.yaml | 26 +++++------ 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 96ef7e4bb..a2f92b6e4 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-09 21:10:09.073430" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109" }, "servers": [ { @@ -355,7 +355,7 @@ "200": { "description": "OK", "content": { - "application/json": { + "text/event-stream": { "schema": { "$ref": "#/components/schemas/AgentTurnResponseStreamChunk" } @@ -6074,36 +6074,6 @@ } ], "tags": [ - { - "name": "RewardScoring" - }, - { - "name": "Memory" - }, - { - "name": "SyntheticDataGeneration" - }, - { - "name": "Models" - }, - { - "name": "Safety" - }, - { - "name": "BatchInference" - }, - { - "name": "Agents" - }, - { - "name": "MemoryBanks" - }, - { - "name": "Shields" - }, - { - "name": "Datasets" - }, { "name": "Evaluations" }, @@ -6111,14 +6081,44 @@ "name": "Inspect" }, { - "name": "PostTraining" + "name": "RewardScoring" + }, + { + "name": "Datasets" + }, + { + "name": "Models" }, { "name": "Telemetry" }, + { + "name": "PostTraining" + }, + { + "name": "SyntheticDataGeneration" + }, + { + "name": "BatchInference" + }, { "name": "Inference" }, + { + "name": "Agents" + }, + { + "name": "Memory" + }, + { + "name": "Safety" + }, + { + "name": "Shields" + }, + { + "name": "MemoryBanks" + }, { "name": "BuiltinTool", "description": "" diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 9307ee47b..c9822d6ca 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -2507,7 +2507,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. The specification is still in\ - \ draft and subject to change.\n Generated at 2024-10-09 21:10:09.073430" + \ draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -2693,7 +2693,7 @@ paths: responses: '200': content: - application/json: + text/event-stream: schema: $ref: '#/components/schemas/AgentTurnResponseStreamChunk' description: OK @@ -3712,21 +3712,21 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: RewardScoring -- name: Memory -- name: SyntheticDataGeneration -- name: Models -- name: Safety -- name: BatchInference -- name: Agents -- name: MemoryBanks -- name: Shields -- name: Datasets - name: Evaluations - name: Inspect -- name: PostTraining +- name: RewardScoring +- name: Datasets +- name: Models - name: Telemetry +- name: PostTraining +- name: SyntheticDataGeneration +- name: BatchInference - name: Inference +- name: Agents +- name: Memory +- name: Safety +- name: Shields +- name: MemoryBanks - description: name: BuiltinTool - description: Date: Thu, 10 Oct 2024 15:54:08 -0700 Subject: [PATCH 04/10] Split off meta-reference-quantized provider --- llama_stack/cli/stack/build.py | 6 +++- .../agents/tests/test_chat_agent.py | 4 +-- .../meta_reference/inference/__init__.py | 13 ++++---- .../impls/meta_reference/inference/config.py | 11 +++---- .../meta_reference/inference/generation.py | 33 ++++++------------- .../meta_reference/inference/inference.py | 4 +-- .../inference/model_parallel.py | 7 ++-- .../inference/parallel_utils.py | 4 +-- .../inference/quantization/loader.py | 13 ++------ llama_stack/providers/registry/inference.py | 17 +++++++++- 10 files changed, 54 insertions(+), 58 deletions(-) diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index 3fe615e6e..3c59e8c20 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -149,6 +149,7 @@ class StackBuild(Subcommand): def _run_template_list_cmd(self, args: argparse.Namespace) -> None: import json + from llama_stack.cli.table import print_table # eventually, this should query a registry at llama.meta.com/llamastack/distributions @@ -175,6 +176,7 @@ class StackBuild(Subcommand): def _run_stack_build_command(self, args: argparse.Namespace) -> None: import textwrap + import yaml from llama_stack.distribution.distribution import get_provider_registry from prompt_toolkit import prompt @@ -256,7 +258,9 @@ class StackBuild(Subcommand): providers = dict() for api, providers_for_api in get_provider_registry().items(): available_providers = [ - x for x in providers_for_api.keys() if x != "remote" + x + for x in providers_for_api.keys() + if x not in ("remote", "remote::sample") ] api_provider = prompt( "> Enter provider for API {}: ".format(api.value), diff --git a/llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py b/llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py index 9d941edc9..46423814b 100644 --- a/llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py +++ b/llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py @@ -16,7 +16,7 @@ from llama_stack.apis.agents import * # noqa: F403 from ..agents import ( AGENT_INSTANCES_BY_ID, MetaReferenceAgentsImpl, - MetaReferenceImplConfig, + MetaReferenceInferenceConfig, ) @@ -166,7 +166,7 @@ def mock_memory_api(): @pytest.fixture async def chat_agent(mock_inference_api, mock_safety_api, mock_memory_api): impl = MetaReferenceAgentsImpl( - config=MetaReferenceImplConfig(), + config=MetaReferenceInferenceConfig(), inference_api=mock_inference_api, safety_api=mock_safety_api, memory_api=mock_memory_api, diff --git a/llama_stack/providers/impls/meta_reference/inference/__init__.py b/llama_stack/providers/impls/meta_reference/inference/__init__.py index 64d315e79..9c923490d 100644 --- a/llama_stack/providers/impls/meta_reference/inference/__init__.py +++ b/llama_stack/providers/impls/meta_reference/inference/__init__.py @@ -4,16 +4,17 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from .config import MetaReferenceImplConfig # noqa +from typing import Union + +from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig -async def get_provider_impl(config: MetaReferenceImplConfig, _deps): +async def get_provider_impl( + config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig], + _deps, +): from .inference import MetaReferenceInferenceImpl - assert isinstance( - config, MetaReferenceImplConfig - ), f"Unexpected config type: {type(config)}" - impl = MetaReferenceInferenceImpl(config) await impl.initialize() return impl diff --git a/llama_stack/providers/impls/meta_reference/inference/config.py b/llama_stack/providers/impls/meta_reference/inference/config.py index ba5eddd53..901a8c7fb 100644 --- a/llama_stack/providers/impls/meta_reference/inference/config.py +++ b/llama_stack/providers/impls/meta_reference/inference/config.py @@ -15,12 +15,11 @@ from pydantic import BaseModel, Field, field_validator from llama_stack.providers.utils.inference import supported_inference_models -class MetaReferenceImplConfig(BaseModel): +class MetaReferenceInferenceConfig(BaseModel): model: str = Field( default="Llama3.1-8B-Instruct", description="Model descriptor from `llama model list`", ) - quantization: Optional[QuantizationConfig] = None torch_seed: Optional[int] = None max_seq_len: int = 4096 max_batch_size: int = 1 @@ -38,9 +37,9 @@ class MetaReferenceImplConfig(BaseModel): @property def model_parallel_size(self) -> int: - # HACK ALERT: this will be fixed when we move inference configuration - # to ModelsRegistry and we can explicitly ask for `model_parallel_size` - # as configuration there resolved = resolve_model(self.model) - assert resolved is not None return resolved.pth_file_count + + +class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig): + quantization: QuantizationConfig diff --git a/llama_stack/providers/impls/meta_reference/inference/generation.py b/llama_stack/providers/impls/meta_reference/inference/generation.py index 37aef5ede..8d94a20d1 100644 --- a/llama_stack/providers/impls/meta_reference/inference/generation.py +++ b/llama_stack/providers/impls/meta_reference/inference/generation.py @@ -11,9 +11,8 @@ import json import os import sys import time -from dataclasses import dataclass from pathlib import Path -from typing import Generator, List, Optional +from typing import Generator, List, Optional, Union import torch import torch.nn.functional as F @@ -36,14 +35,12 @@ from llama_models.llama3.reference_impl.multimodal.model import ( ) from llama_models.sku_list import resolve_model -from llama_stack.apis.inference import QuantizationType - -from llama_stack.distribution.utils.model_utils import model_local_dir - from pydantic import BaseModel from termcolor import cprint -from .config import MetaReferenceImplConfig +from llama_stack.distribution.utils.model_utils import model_local_dir + +from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig def model_checkpoint_dir(model) -> str: @@ -68,7 +65,11 @@ class TokenResult(BaseModel): class Llama: @staticmethod - def build(config: MetaReferenceImplConfig): + def build( + config: Union[ + MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig + ] + ): """ Build a Llama instance by initializing and loading a model checkpoint. @@ -78,15 +79,6 @@ class Llama: """ model = resolve_model(config.model) - if ( - config.quantization - and config.quantization.type == QuantizationType.fp8.value - ): - from .quantization.loader import is_fbgemm_available - - if not is_fbgemm_available(): - raise ImportError("fbgemm-gpu is required for FP8 quantization") - if not torch.distributed.is_initialized(): torch.distributed.init_process_group("nccl") @@ -134,12 +126,7 @@ class Llama: model_args.vocab_size == tokenizer.n_words ), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}" - fp8 = ( - config.quantization - and config.quantization.type == QuantizationType.fp8.value - ) - - if fp8: + if isinstance(config, MetaReferenceQuantizedInferenceConfig): from .quantization.loader import convert_to_quantized_model # load on CPU in bf16 so that fp8 conversion does not find an diff --git a/llama_stack/providers/impls/meta_reference/inference/inference.py b/llama_stack/providers/impls/meta_reference/inference/inference.py index a8afcea54..6696762c9 100644 --- a/llama_stack/providers/impls/meta_reference/inference/inference.py +++ b/llama_stack/providers/impls/meta_reference/inference/inference.py @@ -17,7 +17,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( chat_completion_request_to_messages, ) -from .config import MetaReferenceImplConfig +from .config import MetaReferenceInferenceConfig from .model_parallel import LlamaModelParallelGenerator # there's a single model parallel process running serving the model. for now, @@ -26,7 +26,7 @@ SEMAPHORE = asyncio.Semaphore(1) class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate): - def __init__(self, config: MetaReferenceImplConfig) -> None: + def __init__(self, config: MetaReferenceInferenceConfig) -> None: self.config = config model = resolve_model(config.model) if model is None: diff --git a/llama_stack/providers/impls/meta_reference/inference/model_parallel.py b/llama_stack/providers/impls/meta_reference/inference/model_parallel.py index 798fadcbe..e8f483f30 100644 --- a/llama_stack/providers/impls/meta_reference/inference/model_parallel.py +++ b/llama_stack/providers/impls/meta_reference/inference/model_parallel.py @@ -6,7 +6,6 @@ import os from copy import deepcopy -from dataclasses import dataclass from functools import partial from typing import Generator, List, Optional @@ -15,7 +14,7 @@ from llama_models.llama3.api.datatypes import Message, ToolPromptFormat from llama_models.llama3.api.tokenizer import Tokenizer from llama_models.sku_list import resolve_model -from .config import MetaReferenceImplConfig +from .config import MetaReferenceInferenceConfig from .generation import Llama, model_checkpoint_dir from .parallel_utils import InferenceArgs, ModelParallelProcessGroup @@ -36,7 +35,7 @@ class ModelRunner: ) -def init_model_cb(config: MetaReferenceImplConfig): +def init_model_cb(config: MetaReferenceInferenceConfig): llama = Llama.build(config) return ModelRunner(llama) @@ -52,7 +51,7 @@ class LlamaModelParallelGenerator: clear at the callsite why we need to use a context manager. """ - def __init__(self, config: MetaReferenceImplConfig): + def __init__(self, config: MetaReferenceInferenceConfig): self.config = config self.model = resolve_model(self.config.model) # this is a hack because Agent's loop uses this to tokenize and check if input is too long diff --git a/llama_stack/providers/impls/meta_reference/inference/parallel_utils.py b/llama_stack/providers/impls/meta_reference/inference/parallel_utils.py index c6eacc73c..7dbedd0f0 100644 --- a/llama_stack/providers/impls/meta_reference/inference/parallel_utils.py +++ b/llama_stack/providers/impls/meta_reference/inference/parallel_utils.py @@ -11,7 +11,7 @@ import tempfile import time import uuid from enum import Enum -from typing import Any, Callable, Generator, List, Literal, Optional, Union +from typing import Callable, Generator, List, Literal, Optional, Union import torch @@ -317,7 +317,7 @@ def start_model_parallel_process( request_socket.send(encode_msg(ReadyRequest())) response = request_socket.recv() - print(f"Finished model load {response}") + print("Loaded model...") return request_socket, process diff --git a/llama_stack/providers/impls/meta_reference/inference/quantization/loader.py b/llama_stack/providers/impls/meta_reference/inference/quantization/loader.py index 1df86cb84..92b3a6ce3 100644 --- a/llama_stack/providers/impls/meta_reference/inference/quantization/loader.py +++ b/llama_stack/providers/impls/meta_reference/inference/quantization/loader.py @@ -22,19 +22,10 @@ from torch import Tensor from llama_stack.apis.inference import QuantizationType from llama_stack.providers.impls.meta_reference.inference.config import ( - MetaReferenceImplConfig, + MetaReferenceQuantizedInferenceConfig, ) -def is_fbgemm_available() -> bool: - try: - import fbgemm_gpu.experimental.gen_ai # noqa: F401 - - return True - except ImportError: - return False - - def swiglu_wrapper( self, x: Tensor, @@ -47,7 +38,7 @@ def swiglu_wrapper( def convert_to_quantized_model( model: Transformer, - config: MetaReferenceImplConfig, + config: MetaReferenceQuantizedInferenceConfig, fp8_activation_scale_ub: Optional[float] = 1200.0, ) -> Transformer: if config.quantization.type == QuantizationType.bf16.value: diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index ddfd4ff40..686fc273b 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -14,6 +14,21 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.inference, provider_type="meta-reference", + pip_packages=[ + "accelerate", + "blobfile", + "fairscale", + "torch", + "torchvision", + "transformers", + "zmq", + ], + module="llama_stack.providers.impls.meta_reference.inference", + config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceInferenceConfig", + ), + InlineProviderSpec( + api=Api.inference, + provider_type="meta-reference-quantized", pip_packages=[ "accelerate", "blobfile", @@ -25,7 +40,7 @@ def available_providers() -> List[ProviderSpec]: "zmq", ], module="llama_stack.providers.impls.meta_reference.inference", - config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceImplConfig", + config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceQuantizedInferenceConfig", ), remote_provider_spec( api=Api.inference, From ca29980c6b591c0cd500aebe7bf6e4620fb51c40 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 10 Oct 2024 20:17:29 -0700 Subject: [PATCH 05/10] fix agents context retriever --- .../impls/meta_reference/agents/rag/context_retriever.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py b/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py index 6b59479b3..b668dc0d6 100644 --- a/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py +++ b/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py @@ -63,13 +63,12 @@ async def llm_rag_query_generator( model = config.model message = UserMessage(content=content) - response = inference_api.chat_completion( + response = await inference_api.chat_completion( model=model, messages=[message], stream=False, ) - async for chunk in response: - query = chunk.completion_message.content + query = response.completion_message.content return query From 9fbe8852aa2ecf6e029e0bfe16177883fc7f831f Mon Sep 17 00:00:00 2001 From: Dalton Flanagan <6599399+dltn@users.noreply.github.com> Date: Thu, 10 Oct 2024 23:39:25 -0400 Subject: [PATCH 06/10] Add Swift Package Index badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a76913d6..238475840 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ The `llama` CLI makes it easy to work with the Llama Stack set of tools. Please | **Language** | **Client SDK** | **Package** | | :----: | :----: | :----: | | Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/) -| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | +| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift) | Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client) | Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | From 2128e61da2d3f660deee7e8c2fdf454dc0168a2d Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 11 Oct 2024 11:47:57 -0400 Subject: [PATCH 07/10] Fix incorrect completion() signature for Databricks provider (#236) --- docs/getting_started.md | 2 +- .../providers/adapters/inference/databricks/__init__.py | 3 ++- .../providers/adapters/inference/databricks/config.py | 3 +-- .../adapters/inference/databricks/databricks.py | 9 ++++++++- .../providers/impls/meta_reference/safety/llama_guard.py | 2 +- llama_stack/providers/impls/vllm/__init__.py | 6 ++++++ 6 files changed, 19 insertions(+), 6 deletions(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 32f4d2d15..6c8c902c0 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -73,7 +73,7 @@ docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack-local ``` > [!NOTE] -> `~/.llama` should be the path containing downloaded weights of Llama models. +> `~/.llama` should be the path containing downloaded weights of Llama models. #### Via conda diff --git a/llama_stack/providers/adapters/inference/databricks/__init__.py b/llama_stack/providers/adapters/inference/databricks/__init__.py index 097579d25..ca2a0a103 100644 --- a/llama_stack/providers/adapters/inference/databricks/__init__.py +++ b/llama_stack/providers/adapters/inference/databricks/__init__.py @@ -7,10 +7,11 @@ from .config import DatabricksImplConfig from .databricks import DatabricksInferenceAdapter + async def get_adapter_impl(config: DatabricksImplConfig, _deps): assert isinstance( config, DatabricksImplConfig ), f"Unexpected config type: {type(config)}" impl = DatabricksInferenceAdapter(config) await impl.initialize() - return impl \ No newline at end of file + return impl diff --git a/llama_stack/providers/adapters/inference/databricks/config.py b/llama_stack/providers/adapters/inference/databricks/config.py index 927bb474c..ae2b056ea 100644 --- a/llama_stack/providers/adapters/inference/databricks/config.py +++ b/llama_stack/providers/adapters/inference/databricks/config.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field @@ -19,4 +18,4 @@ class DatabricksImplConfig(BaseModel): api_token: str = Field( default=None, description="The Databricks API token", - ) \ No newline at end of file + ) diff --git a/llama_stack/providers/adapters/inference/databricks/databricks.py b/llama_stack/providers/adapters/inference/databricks/databricks.py index 2d7427253..7e8263dbf 100644 --- a/llama_stack/providers/adapters/inference/databricks/databricks.py +++ b/llama_stack/providers/adapters/inference/databricks/databricks.py @@ -48,7 +48,14 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference): async def shutdown(self) -> None: pass - def completion(self, request: CompletionRequest) -> AsyncGenerator: + def completion( + self, + model: str, + content: InterleavedTextMedia, + sampling_params: Optional[SamplingParams] = SamplingParams(), + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, + ) -> AsyncGenerator: raise NotImplementedError() def chat_completion( diff --git a/llama_stack/providers/impls/meta_reference/safety/llama_guard.py b/llama_stack/providers/impls/meta_reference/safety/llama_guard.py index 19a20a899..a6f450fae 100644 --- a/llama_stack/providers/impls/meta_reference/safety/llama_guard.py +++ b/llama_stack/providers/impls/meta_reference/safety/llama_guard.py @@ -170,7 +170,7 @@ class LlamaGuardShield(ShieldBase): for i in range(1, len(messages)): if messages[i].role == messages[i - 1].role: raise ValueError( - f"Messages must alternate between user and assistant. Message {i} has the same role as message {i-1}" + f"Messages must alternate between user and assistant. Message {i} has the same role as message {i - 1}" ) return messages diff --git a/llama_stack/providers/impls/vllm/__init__.py b/llama_stack/providers/impls/vllm/__init__.py index 3d5a81ad9..aa0c4b101 100644 --- a/llama_stack/providers/impls/vllm/__init__.py +++ b/llama_stack/providers/impls/vllm/__init__.py @@ -1,3 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + from typing import Any from .config import VLLMConfig From 05282d12349d9c1034b93f5833e4c8840aa48c15 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 11 Oct 2024 13:03:59 -0400 Subject: [PATCH 08/10] Enable pre-commit on main branch (#237) --- .github/workflows/pre-commit.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 502753976..871e91f4a 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -1,6 +1,9 @@ name: Pre-commit -on: [pull_request] +on: + pull_request: + push: + branches: [main] jobs: pre-commit: From a2b87ed0cb5be83022e13f99bf0b5c17a6524072 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 11 Oct 2024 14:09:11 -0400 Subject: [PATCH 09/10] Switch to pre-commit/action (#239) --- .github/workflows/pre-commit.yml | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 871e91f4a..dd1a5c6cd 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -22,27 +22,4 @@ jobs: **/requirements*.txt .pre-commit-config.yaml - - name: Install pre-commit - run: | - python -m pip install --upgrade pip - pip install pre-commit - - - name: Fetch base branch - run: git fetch origin ${{ github.event.pull_request.base.ref }}:refs/remotes/origin/${{ github.event.pull_request.base.ref }} - - - name: Fetch head commit from PR - run: git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }} - - - name: Get changed files - id: changed-files - run: | - git diff --name-only origin/${{ github.event.pull_request.base.ref }} pr-${{ github.event.pull_request.number }} > changed_files.txt - cat changed_files.txt - - - name: Run pre-commit - run: | - if [ -s changed_files.txt ]; then - pre-commit run --files $(cat changed_files.txt | tr '\n' ' ') - else - echo "No changed files to run pre-commit on." - fi + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1 From 209cd3d35ed34c7db550969539d9b7d29e09f01b Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 14 Oct 2024 11:13:04 -0700 Subject: [PATCH 10/10] Bump version to 0.0.42 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3a24cff21..767f06be8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.41 +llama-models>=0.0.42 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index 4f4ea7713..466ca655f 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.41", + version="0.0.42", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack",