mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-05 20:27:35 +00:00
llama_toolchain -> llama_stack
This commit is contained in:
parent
f372355409
commit
2cf731faea
175 changed files with 300 additions and 279 deletions
5
llama_stack/__init__.py
Normal file
5
llama_stack/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
5
llama_stack/agentic_system/__init__.py
Normal file
5
llama_stack/agentic_system/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
7
llama_stack/agentic_system/api/__init__.py
Normal file
7
llama_stack/agentic_system/api/__init__.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .api import * # noqa: F401 F403
|
467
llama_stack/agentic_system/api/api.py
Normal file
467
llama_stack/agentic_system/api/api.py
Normal file
|
@ -0,0 +1,467 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Literal, Optional, Protocol, Union
|
||||
|
||||
from llama_models.schema_utils import json_schema_type, webmethod
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||
from llama_stack.common.deployment_types import * # noqa: F403
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
from llama_stack.safety.api import * # noqa: F403
|
||||
from llama_stack.memory.api import * # noqa: F403
|
||||
|
||||
|
||||
# Content handed to the agent together with the MIME type describing it.
# The content is either interleaved text/media or a URL.
@json_schema_type
class Attachment(BaseModel):
    content: Union[InterleavedTextMedia, URL]
    mime_type: str
|
||||
|
||||
|
||||
class AgenticSystemTool(Enum):
    # Names used as the `type` discriminator of the tool definitions below.
    brave_search = "brave_search"
    wolfram_alpha = "wolfram_alpha"
    photogen = "photogen"
    code_interpreter = "code_interpreter"

    # Caller-described function tools and the memory tool.
    function_call = "function_call"
    memory = "memory"
|
||||
|
||||
|
||||
class ToolDefinitionCommon(BaseModel):
    # Fields shared by every tool definition: shields attached to the tool's
    # input and output. Both default to a fresh empty list.
    input_shields: Optional[List[ShieldDefinition]] = Field(
        default_factory=list,
    )
    output_shields: Optional[List[ShieldDefinition]] = Field(
        default_factory=list,
    )
|
||||
|
||||
|
||||
class SearchEngineType(Enum):
    # Search backends selectable through SearchToolDefinition.engine.
    bing = "bing"
    brave = "brave"
|
||||
|
||||
|
||||
@json_schema_type
class SearchToolDefinition(ToolDefinitionCommon):
    # NOTE: the discriminator is always "brave_search", whatever `engine` is
    # chosen, because the model always uses brave_search as the tool call name.
    type: Literal[AgenticSystemTool.brave_search.value] = AgenticSystemTool.brave_search.value
    engine: SearchEngineType = SearchEngineType.brave
    remote_execution: Optional[RestAPIExecutionConfig] = None
|
||||
|
||||
|
||||
# Tool definition discriminated by type == "wolfram_alpha".
@json_schema_type
class WolframAlphaToolDefinition(ToolDefinitionCommon):
    type: Literal[AgenticSystemTool.wolfram_alpha.value] = AgenticSystemTool.wolfram_alpha.value
    remote_execution: Optional[RestAPIExecutionConfig] = None
|
||||
|
||||
|
||||
# Tool definition discriminated by type == "photogen".
@json_schema_type
class PhotogenToolDefinition(ToolDefinitionCommon):
    type: Literal[AgenticSystemTool.photogen.value] = (
        AgenticSystemTool.photogen.value
    )
    remote_execution: Optional[RestAPIExecutionConfig] = None
|
||||
|
||||
|
||||
# Tool definition discriminated by type == "code_interpreter".
@json_schema_type
class CodeInterpreterToolDefinition(ToolDefinitionCommon):
    type: Literal[AgenticSystemTool.code_interpreter.value] = AgenticSystemTool.code_interpreter.value
    # Inline code execution is on unless explicitly disabled.
    enable_inline_code_execution: bool = True
    remote_execution: Optional[RestAPIExecutionConfig] = None
|
||||
|
||||
|
||||
# Caller-described function tool: a name, a description and a mapping of
# parameter name -> ToolParamDefinition. Discriminated by type == "function_call".
@json_schema_type
class FunctionCallToolDefinition(ToolDefinitionCommon):
    type: Literal[AgenticSystemTool.function_call.value] = AgenticSystemTool.function_call.value
    function_name: str
    description: str
    parameters: Dict[str, ToolParamDefinition]
    remote_execution: Optional[RestAPIExecutionConfig] = None
|
||||
|
||||
|
||||
class _MemoryBankConfigCommon(BaseModel):
    # Base for the per-bank-type configs below; every config names its bank.
    bank_id: str
|
||||
|
||||
|
||||
class AgenticSystemVectorMemoryBankConfig(_MemoryBankConfigCommon):
    # Discriminated by the value of MemoryBankType.vector.
    type: Literal[MemoryBankType.vector.value] = (
        MemoryBankType.vector.value
    )
|
||||
|
||||
|
||||
class AgenticSystemKeyValueMemoryBankConfig(_MemoryBankConfigCommon):
    # Discriminated by the value of MemoryBankType.keyvalue.
    type: Literal[MemoryBankType.keyvalue.value] = (
        MemoryBankType.keyvalue.value
    )
    # Which keys to focus on.
    keys: List[str]
|
||||
|
||||
|
||||
class AgenticSystemKeywordMemoryBankConfig(_MemoryBankConfigCommon):
    # Discriminated by the value of MemoryBankType.keyword.
    type: Literal[MemoryBankType.keyword.value] = (
        MemoryBankType.keyword.value
    )
|
||||
|
||||
|
||||
class AgenticSystemGraphMemoryBankConfig(_MemoryBankConfigCommon):
    # Discriminated by the value of MemoryBankType.graph.
    type: Literal[MemoryBankType.graph.value] = (
        MemoryBankType.graph.value
    )
    # Which entities to focus on.
    entities: List[str]
|
||||
|
||||
|
||||
# Tagged union of the memory-bank configs; the `type` field selects the member.
MemoryBankConfig = Annotated[
    Union[
        AgenticSystemVectorMemoryBankConfig,
        AgenticSystemKeyValueMemoryBankConfig,
        AgenticSystemKeywordMemoryBankConfig,
        AgenticSystemGraphMemoryBankConfig,
    ],
    Field(discriminator="type"),
]
|
||||
|
||||
|
||||
class MemoryQueryGenerator(Enum):
    # Discriminator values for the memory query generator configs.
    default = "default"
    llm = "llm"
    custom = "custom"
|
||||
|
||||
|
||||
class DefaultMemoryQueryGeneratorConfig(BaseModel):
    # Discriminated by type == "default".
    type: Literal[MemoryQueryGenerator.default.value] = MemoryQueryGenerator.default.value
    # Separator string; defaults to a single space.
    sep: str = " "
|
||||
|
||||
|
||||
class LLMMemoryQueryGeneratorConfig(BaseModel):
    # Discriminated by type == "llm"; names a model and a template.
    type: Literal[MemoryQueryGenerator.llm.value] = (
        MemoryQueryGenerator.llm.value
    )
    model: str
    template: str
|
||||
|
||||
|
||||
class CustomMemoryQueryGeneratorConfig(BaseModel):
    # Discriminated by type == "custom"; carries no further settings.
    type: Literal[MemoryQueryGenerator.custom.value] = (
        MemoryQueryGenerator.custom.value
    )
|
||||
|
||||
|
||||
# Tagged union of the query-generator configs; `type` selects the member.
_MemoryQueryGeneratorConfigUnion = Union[
    DefaultMemoryQueryGeneratorConfig,
    LLMMemoryQueryGeneratorConfig,
    CustomMemoryQueryGeneratorConfig,
]
MemoryQueryGeneratorConfig = Annotated[
    _MemoryQueryGeneratorConfigUnion,
    Field(discriminator="type"),
]
|
||||
|
||||
|
||||
class MemoryToolDefinition(ToolDefinitionCommon):
    # Discriminated by type == "memory".
    type: Literal[AgenticSystemTool.memory.value] = (
        AgenticSystemTool.memory.value
    )
    memory_bank_configs: List[MemoryBankConfig] = Field(default_factory=list)
    # Defines how a query is generated from the messages for memory bank
    # retrieval.
    query_generator_config: MemoryQueryGeneratorConfig = Field(default=DefaultMemoryQueryGeneratorConfig())
    max_tokens_in_context: int = 4096
    max_chunks: int = 10
|
||||
|
||||
|
||||
# Tagged union of every tool definition; the `type` field selects the member.
AgenticSystemToolDefinition = Annotated[
    Union[
        SearchToolDefinition,
        WolframAlphaToolDefinition,
        PhotogenToolDefinition,
        CodeInterpreterToolDefinition,
        FunctionCallToolDefinition,
        MemoryToolDefinition,
    ],
    Field(discriminator="type"),
]
|
||||
|
||||
|
||||
class StepCommon(BaseModel):
    # Identification shared by every step type.
    turn_id: str
    step_id: str
    # Optional timestamps; both default to None.
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
|
||||
|
||||
|
||||
class StepType(Enum):
    # Discriminator values for the `Step` union members.
    inference = "inference"
    tool_execution = "tool_execution"
    shield_call = "shield_call"
    memory_retrieval = "memory_retrieval"
|
||||
|
||||
|
||||
# Step carrying the model's completion message.
@json_schema_type
class InferenceStep(StepCommon):
    # Empty protected_namespaces so the `model_response` field name is
    # accepted without pydantic's "model_" namespace warning.
    model_config = ConfigDict(protected_namespaces=())

    step_type: Literal[StepType.inference.value] = (
        StepType.inference.value
    )
    model_response: CompletionMessage
|
||||
|
||||
|
||||
# Step carrying tool calls together with their responses.
@json_schema_type
class ToolExecutionStep(StepCommon):
    step_type: Literal[StepType.tool_execution.value] = (
        StepType.tool_execution.value
    )
    tool_calls: List[ToolCall]
    tool_responses: List[ToolResponse]
|
||||
|
||||
|
||||
# Step carrying the response of a shield call.
@json_schema_type
class ShieldCallStep(StepCommon):
    step_type: Literal[StepType.shield_call.value] = (
        StepType.shield_call.value
    )
    response: ShieldResponse
|
||||
|
||||
|
||||
# Step carrying the ids of the banks consulted and the context inserted.
@json_schema_type
class MemoryRetrievalStep(StepCommon):
    step_type: Literal[StepType.memory_retrieval.value] = StepType.memory_retrieval.value
    memory_bank_ids: List[str]
    inserted_context: InterleavedTextMedia
|
||||
|
||||
|
||||
# Tagged union of all step kinds; the `step_type` field selects the member.
Step = Annotated[
    Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep],
    Field(discriminator="step_type"),
]
|
||||
|
||||
|
||||
@json_schema_type
class Turn(BaseModel):
    """A single turn in an interaction with an Agentic System."""

    # Identity of the turn and the session it belongs to.
    turn_id: str
    session_id: str

    # Input messages, the steps recorded for the turn, and the final output.
    input_messages: List[Union[UserMessage, ToolResponseMessage]]
    steps: List[Step]
    output_message: CompletionMessage
    output_attachments: List[Attachment] = Field(default_factory=list)

    # Timestamps; completed_at defaults to None.
    started_at: datetime
    completed_at: Optional[datetime] = None
|
||||
|
||||
|
||||
@json_schema_type
class Session(BaseModel):
    """A single session of an interaction with an Agentic System."""

    # Identity, display name, recorded turns and start time of the session.
    session_id: str
    session_name: str
    turns: List[Turn]
    started_at: datetime

    # Optional memory bank attached to the session.
    memory_bank: Optional[MemoryBank] = None
|
||||
|
||||
|
||||
class AgentConfigCommon(BaseModel):
    # Settings shared by the agent-level config and the per-turn overrides.
    sampling_params: Optional[SamplingParams] = SamplingParams()

    # Shields applied to input and output; both default to an empty list.
    input_shields: Optional[List[ShieldDefinition]] = Field(
        default_factory=list,
    )
    output_shields: Optional[List[ShieldDefinition]] = Field(
        default_factory=list,
    )

    # Tools and how they are selected / formatted in the prompt.
    tools: Optional[List[AgenticSystemToolDefinition]] = Field(
        default_factory=list,
    )
    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=ToolPromptFormat.json)
|
||||
|
||||
|
||||
# Full agent configuration: the common settings plus a required model name
# and required instructions.
@json_schema_type
class AgentConfig(AgentConfigCommon):
    model: str
    instructions: str
|
||||
|
||||
|
||||
class AgentConfigOverridablePerTurn(AgentConfigCommon):
    # Common settings plus optional instructions (None when not supplied).
    instructions: Optional[str] = None
|
||||
|
||||
|
||||
class AgenticSystemTurnResponseEventType(Enum):
    # Step-level events.
    step_start = "step_start"
    step_complete = "step_complete"
    step_progress = "step_progress"

    # Turn-level events.
    turn_start = "turn_start"
    turn_complete = "turn_complete"
|
||||
|
||||
|
||||
# Payload for event_type == "step_start".
@json_schema_type
class AgenticSystemTurnResponseStepStartPayload(BaseModel):
    event_type: Literal[
        AgenticSystemTurnResponseEventType.step_start.value
    ] = AgenticSystemTurnResponseEventType.step_start.value
    step_type: StepType
    step_id: str
    # Free-form metadata; defaults to a fresh empty dict.
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# Payload for event_type == "step_complete"; carries the finished step.
@json_schema_type
class AgenticSystemTurnResponseStepCompletePayload(BaseModel):
    event_type: Literal[
        AgenticSystemTurnResponseEventType.step_complete.value
    ] = AgenticSystemTurnResponseEventType.step_complete.value
    step_type: StepType
    step_details: Step
|
||||
|
||||
|
||||
# Payload for event_type == "step_progress"; carries incremental deltas.
@json_schema_type
class AgenticSystemTurnResponseStepProgressPayload(BaseModel):
    # Empty protected_namespaces so `model_response_text_delta` is accepted
    # without pydantic's "model_" namespace warning.
    model_config = ConfigDict(protected_namespaces=())

    event_type: Literal[
        AgenticSystemTurnResponseEventType.step_progress.value
    ] = AgenticSystemTurnResponseEventType.step_progress.value
    step_type: StepType
    step_id: str

    # Each delta is optional and defaults to None.
    model_response_text_delta: Optional[str] = None
    tool_call_delta: Optional[ToolCallDelta] = None
    tool_response_text_delta: Optional[str] = None
|
||||
|
||||
|
||||
# Payload for event_type == "turn_start".
@json_schema_type
class AgenticSystemTurnResponseTurnStartPayload(BaseModel):
    event_type: Literal[
        AgenticSystemTurnResponseEventType.turn_start.value
    ] = AgenticSystemTurnResponseEventType.turn_start.value
    turn_id: str
|
||||
|
||||
|
||||
# Payload for event_type == "turn_complete"; carries the completed turn.
@json_schema_type
class AgenticSystemTurnResponseTurnCompletePayload(BaseModel):
    event_type: Literal[
        AgenticSystemTurnResponseEventType.turn_complete.value
    ] = AgenticSystemTurnResponseEventType.turn_complete.value
    turn: Turn
|
||||
|
||||
|
||||
@json_schema_type
class AgenticSystemTurnResponseEvent(BaseModel):
    """Streamed agent execution response."""

    # One of the five payload kinds, selected by its `event_type` field.
    payload: Annotated[
        Union[
            AgenticSystemTurnResponseStepStartPayload,
            AgenticSystemTurnResponseStepProgressPayload,
            AgenticSystemTurnResponseStepCompletePayload,
            AgenticSystemTurnResponseTurnStartPayload,
            AgenticSystemTurnResponseTurnCompletePayload,
        ],
        Field(discriminator="event_type"),
    ]
|
||||
|
||||
|
||||
# Return type of create_agentic_system: the id of the agent.
@json_schema_type
class AgenticSystemCreateResponse(BaseModel):
    agent_id: str
|
||||
|
||||
|
||||
# Return type of create_agentic_system_session: the id of the session.
@json_schema_type
class AgenticSystemSessionCreateResponse(BaseModel):
    session_id: str
|
||||
|
||||
|
||||
# Request for a new turn: per-turn config overrides plus the target agent and
# session, the input messages, optional attachments and the stream flag.
@json_schema_type
class AgenticSystemTurnCreateRequest(AgentConfigOverridablePerTurn):
    agent_id: str
    session_id: str

    # TODO: figure out how to simplify this, and make clear why
    # ToolResponseMessage needs to be accepted here (it is function call
    # execution from outside the system).
    messages: List[Union[UserMessage, ToolResponseMessage]]
    attachments: Optional[List[Attachment]] = None

    stream: Optional[bool] = False
|
||||
|
||||
|
||||
# One chunk of a turn response stream; wraps a single event.
@json_schema_type
class AgenticSystemTurnResponseStreamChunk(BaseModel):
    event: AgenticSystemTurnResponseEvent
|
||||
|
||||
|
||||
# Return type of get_agentic_system_step; wraps a single step.
@json_schema_type
class AgenticSystemStepResponse(BaseModel):
    step: Step
|
||||
|
||||
|
||||
class AgenticSystem(Protocol):
    # Interface of the agentic-system API. Every method is declared only
    # (body is `...`) and is bound to a route through @webmethod.

    # --- agent lifecycle ---------------------------------------------------
    @webmethod(route="/agentic_system/create")
    async def create_agentic_system(self, agent_config: AgentConfig) -> AgenticSystemCreateResponse: ...

    # --- turns and steps ---------------------------------------------------
    @webmethod(route="/agentic_system/turn/create")
    async def create_agentic_system_turn(
        self,
        agent_id: str,
        session_id: str,
        messages: List[Union[UserMessage, ToolResponseMessage]],
        attachments: Optional[List[Attachment]] = None,
        stream: Optional[bool] = False,
    ) -> AgenticSystemTurnResponseStreamChunk: ...

    @webmethod(route="/agentic_system/turn/get")
    async def get_agentic_system_turn(self, agent_id: str, turn_id: str) -> Turn: ...

    @webmethod(route="/agentic_system/step/get")
    async def get_agentic_system_step(
        self,
        agent_id: str,
        turn_id: str,
        step_id: str,
    ) -> AgenticSystemStepResponse: ...

    # --- sessions ----------------------------------------------------------
    @webmethod(route="/agentic_system/session/create")
    async def create_agentic_system_session(
        self, agent_id: str, session_name: str
    ) -> AgenticSystemSessionCreateResponse: ...

    @webmethod(route="/agentic_system/session/get")
    async def get_agentic_system_session(
        self,
        agent_id: str,
        session_id: str,
        turn_ids: Optional[List[str]] = None,
    ) -> Session: ...

    @webmethod(route="/agentic_system/session/delete")
    async def delete_agentic_system_session(
        self,
        agent_id: str,
        session_id: str,
    ) -> None: ...

    @webmethod(route="/agentic_system/delete")
    async def delete_agentic_system(self, agent_id: str) -> None: ...
|
212
llama_stack/agentic_system/client.py
Normal file
212
llama_stack/agentic_system/client.py
Normal file
|
@ -0,0 +1,212 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import fire
|
||||
|
||||
import httpx
|
||||
|
||||
from pydantic import BaseModel
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||
from llama_stack.core.datatypes import RemoteProviderConfig
|
||||
|
||||
from .api import * # noqa: F403
|
||||
from .event_logger import EventLogger
|
||||
|
||||
|
||||
async def get_client_impl(config: RemoteProviderConfig, _deps):
    """Build an ``AgenticSystemClient`` pointed at ``config.url``.

    ``_deps`` is accepted but not used.
    """
    client = AgenticSystemClient(config.url)
    return client
|
||||
|
||||
|
||||
def encodable_dict(d: BaseModel):
    """Return ``d`` as a plain, JSON-encodable Python structure.

    The model is serialised to a JSON string and parsed back, so the result
    contains only JSON-native types.

    Uses ``model_dump_json()``: the file targets pydantic v2 (it imports
    ``ConfigDict``), where ``BaseModel.json()`` is deprecated.
    """
    return json.loads(d.model_dump_json())
|
||||
|
||||
|
||||
class AgenticSystemClient(AgenticSystem):
    """HTTP client for a remote agentic-system server rooted at ``base_url``."""

    def __init__(self, base_url: str):
        self.base_url = base_url

    async def create_agentic_system(
        self, agent_config: AgentConfig
    ) -> AgenticSystemCreateResponse:
        """POST ``agent_config`` to ``/agentic_system/create``.

        Raises ``httpx.HTTPStatusError`` on a non-success status.
        """
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/agentic_system/create",
                json={
                    "agent_config": encodable_dict(agent_config),
                },
                headers={"Content-Type": "application/json"},
            )
            response.raise_for_status()
            return AgenticSystemCreateResponse(**response.json())

    async def create_agentic_system_session(
        self,
        agent_id: str,
        session_name: str,
    ) -> AgenticSystemSessionCreateResponse:
        """POST to ``/agentic_system/session/create`` for ``agent_id``.

        Raises ``httpx.HTTPStatusError`` on a non-success status.
        """
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/agentic_system/session/create",
                json={
                    "agent_id": agent_id,
                    "session_name": session_name,
                },
                headers={"Content-Type": "application/json"},
            )
            response.raise_for_status()
            return AgenticSystemSessionCreateResponse(**response.json())

    async def create_agentic_system_turn(
        self,
        request: AgenticSystemTurnCreateRequest,
    ) -> AsyncGenerator:
        """Stream a turn and yield one chunk per ``data:`` line received.

        Lines that do not start with ``data:`` are ignored. Payloads that
        contain an ``"error"`` key are printed in red and skipped. Payloads
        that fail to parse or validate are printed and skipped (best effort).
        """
        async with httpx.AsyncClient() as client:
            async with client.stream(
                "POST",
                f"{self.base_url}/agentic_system/turn/create",
                json=encodable_dict(request),
                headers={"Content-Type": "application/json"},
                timeout=20,
            ) as response:
                async for line in response.aiter_lines():
                    if not line.startswith("data:"):
                        continue

                    # Strip exactly the prefix that was tested for, then at
                    # most one separating space. Slicing len("data: ") here
                    # would eat the first payload character of "data:{...}".
                    data = line[len("data:") :]
                    if data.startswith(" "):
                        data = data[1:]

                    try:
                        jdata = json.loads(data)
                        if "error" in jdata:
                            cprint(data, "red")
                            continue

                        yield AgenticSystemTurnResponseStreamChunk(**jdata)
                    except Exception as e:
                        print(data)
                        print(f"Error with parsing or validation: {e}")
|
||||
|
||||
|
||||
async def _run_agent(api, tool_definitions, user_prompts, attachments=None):
    """Create an agent and a session, then run each prompt as a streamed turn.

    Every prompt is echoed to the terminal, and each non-None log entry that
    ``EventLogger`` produces for the turn is printed.
    """
    config = AgentConfig(
        model="Meta-Llama3.1-8B-Instruct",
        instructions="You are a helpful assistant",
        sampling_params=SamplingParams(temperature=1.0, top_p=0.9),
        tools=tool_definitions,
        tool_choice=ToolChoice.auto,
        tool_prompt_format=ToolPromptFormat.function_tag,
    )

    created = await api.create_agentic_system(config)
    session = await api.create_agentic_system_session(
        agent_id=created.agent_id,
        session_name="test_session",
    )

    for prompt in user_prompts:
        cprint(f"User> {prompt}", color="white", attrs=["bold"])

        turn_request = AgenticSystemTurnCreateRequest(
            agent_id=created.agent_id,
            session_id=session.session_id,
            messages=[UserMessage(content=prompt)],
            attachments=attachments,
            stream=True,
        )
        chunks = api.create_agentic_system_turn(turn_request)

        async for _event, entry in EventLogger().log(chunks):
            if entry is None:
                continue
            entry.print()
|
||||
|
||||
|
||||
async def run_main(host: str, port: int):
    """Run the demo prompts against the server at ``http://host:port``.

    The agent gets search (bing), Wolfram Alpha, code-interpreter and one
    function-call tool definition.
    """
    api = AgenticSystemClient(f"http://{host}:{port}")

    boiling_point_tool = FunctionCallToolDefinition(
        function_name="get_boiling_point",
        description="Get the boiling point of a imaginary liquids (eg. polyjuice)",
        parameters={
            "liquid_name": ToolParamDefinition(
                param_type="str",
                description="The name of the liquid",
                required=True,
            ),
            "celcius": ToolParamDefinition(
                param_type="str",
                description="Whether to return the boiling point in Celcius",
                required=False,
            ),
        },
    )
    tool_definitions = [
        SearchToolDefinition(engine=SearchEngineType.bing),
        WolframAlphaToolDefinition(),
        CodeInterpreterToolDefinition(),
        boiling_point_tool,
    ]

    user_prompts = [
        "Who are you?",
        "what is the 100th prime number?",
        "Search web for who was 44th President of USA?",
        "Write code to check if a number is prime. Use that to check if 7 is prime",
        "What is the boiling point of polyjuicepotion ?",
    ]
    await _run_agent(api, tool_definitions, user_prompts)
|
||||
|
||||
|
||||
async def run_rag(host: str, port: int):
    """Run the memory-tool demo against the server at ``http://host:port``.

    Six torchtune tutorial pages are passed as plain-text URL attachments,
    and the agent is given a single memory tool with no pre-configured banks.
    """
    api = AgenticSystemClient(f"http://{host}:{port}")

    urls = [
        "memory_optimizations.rst",
        "chat.rst",
        "llama3.rst",
        "datasets.rst",
        "qat_finetune.rst",
        "lora_finetune.rst",
    ]
    # Iterate the names directly; the index from enumerate() was never used.
    attachments = [
        Attachment(
            content=URL(
                uri=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}"
            ),
            mime_type="text/plain",
        )
        for url in urls
    ]

    # Alternatively, you can pre-populate the memory bank with documents for example,
    # using `llama_stack.memory.client`. Then you can grab the bank_id
    # from the output of that run.
    tool_definitions = [
        MemoryToolDefinition(
            max_tokens_in_context=2048,
            memory_bank_configs=[],
        ),
    ]

    user_prompts = [
        "How do I use Lora?",
        "Tell me briefly about llama3 and torchtune",
    ]

    await _run_agent(api, tool_definitions, user_prompts, attachments)
|
||||
|
||||
|
||||
def main(host: str, port: int, rag: bool = False):
    """Run ``run_rag`` when ``rag`` is true, otherwise ``run_main``."""
    if rag:
        entry_point = run_rag
    else:
        entry_point = run_main
    asyncio.run(entry_point(host, port))
|
||||
|
||||
|
||||
# Script entry point: hand `main` to python-fire to run it as a CLI.
if __name__ == "__main__":
    fire.Fire(main)
|
184
llama_stack/agentic_system/event_logger.py
Normal file
184
llama_stack/agentic_system/event_logger.py
Normal file
|
@ -0,0 +1,184 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||
from llama_models.llama3.api.tool_utils import ToolUtils
|
||||
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.agentic_system.api import AgenticSystemTurnResponseEventType, StepType
|
||||
|
||||
|
||||
class LogEvent:
    """A printable log entry: optional role prefix, text, colour, line ending."""

    def __init__(
        self,
        role: Optional[str] = None,
        content: str = "",
        end: str = "\n",
        color="white",
    ):
        # An explicit None for `end` falls back to a newline.
        if end is None:
            end = "\n"

        self.role = role
        self.content = content
        self.color = color
        self.end = end

    def __str__(self):
        """Return ``"<role>> <content>"``, or just the content without a role."""
        if self.role is None:
            return f"{self.content}"
        return f"{self.role}> {self.content}"

    def print(self, flush=True):
        """Print the entry with ``cprint`` in its colour and line ending."""
        text = str(self)
        cprint(text, color=self.color, end=self.end, flush=flush)
|
||||
|
||||
|
||||
# Short module-local alias for the turn response event-type enum.
EventType = AgenticSystemTurnResponseEventType
|
||||
|
||||
|
||||
class EventLogger:
    """Turn a stream of agent turn chunks into printable ``LogEvent`` entries."""

    async def log(
        self,
        event_generator,
        stream=True,
        tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
    ):
        """Yield ``(event, log_entry)`` pairs for the chunks of a turn.

        Args:
            event_generator: async iterable of stream chunks. Items without
                an ``event`` attribute are skipped, except
                ``ToolResponseMessage`` instances, which are logged as
                custom-tool output.
            stream: when true, inference is logged incrementally from
                step_start / step_progress events; otherwise only the
                completed inference step is logged.
            tool_prompt_format: format used to encode a tool call when
                logging a completed, non-streamed inference step.

        Yields:
            ``(event, LogEvent)`` pairs, or ``(event, None)`` for turn-level
            events, which are not logged. For custom tool messages the first
            element is the message itself.
        """
        previous_event_type = None
        previous_step_type = None

        async for chunk in event_generator:
            if not hasattr(chunk, "event"):
                # Need to check for custom tool first
                # since it does not produce event but instead
                # a Message
                if isinstance(chunk, ToolResponseMessage):
                    yield chunk, LogEvent(
                        role="CustomTool", content=chunk.content, color="grey"
                    )
                continue

            event = chunk.event
            event_type = event.payload.event_type
            if event_type in {
                EventType.turn_start.value,
                EventType.turn_complete.value,
            }:
                # Currently not logging any turn related info
                yield event, None
                continue

            step_type = event.payload.step_type
            # handle safety
            if (
                step_type == StepType.shield_call
                and event_type == EventType.step_complete.value
            ):
                response = event.payload.step_details.response
                if not response.is_violation:
                    yield event, LogEvent(
                        role=step_type, content="No Violation", color="magenta"
                    )
                else:
                    yield event, LogEvent(
                        role=step_type,
                        content=f"{response.violation_type} {response.violation_return_message}",
                        color="red",
                    )

            # handle inference
            if step_type == StepType.inference:
                if stream:
                    if event_type == EventType.step_start.value:
                        # TODO: Currently this event is never received
                        yield event, LogEvent(
                            role=step_type, content="", end="", color="yellow"
                        )
                    elif event_type == EventType.step_progress.value:
                        # HACK: if previous was not step/event was not inference's step_progress
                        # this is the first time we are getting model inference response
                        # aka equivalent to step_start for inference. Hence,
                        # start with "Model>".
                        if (
                            previous_event_type != EventType.step_progress.value
                            and previous_step_type != StepType.inference
                        ):
                            yield event, LogEvent(
                                role=step_type, content="", end="", color="yellow"
                            )

                        if event.payload.tool_call_delta:
                            # Only string deltas are printable as-is.
                            if isinstance(event.payload.tool_call_delta.content, str):
                                yield event, LogEvent(
                                    role=None,
                                    content=event.payload.tool_call_delta.content,
                                    end="",
                                    color="cyan",
                                )
                        else:
                            yield event, LogEvent(
                                role=None,
                                content=event.payload.model_response_text_delta,
                                end="",
                                color="yellow",
                            )
                    else:
                        # step_complete: emit an empty entry to end the line
                        yield event, LogEvent(role=None, content="")

                else:
                    # Not streaming
                    if event_type == EventType.step_complete.value:
                        response = event.payload.step_details.model_response
                        if response.tool_calls:
                            content = ToolUtils.encode_tool_call(
                                response.tool_calls[0], tool_prompt_format
                            )
                        else:
                            content = response.content
                        yield event, LogEvent(
                            role=step_type,
                            content=content,
                            color="yellow",
                        )

            # handle tool_execution
            if (
                step_type == StepType.tool_execution
                and
                # Only print tool calls and responses at the step_complete event
                event_type == EventType.step_complete.value
            ):
                details = event.payload.step_details
                for t in details.tool_calls:
                    yield event, LogEvent(
                        role=step_type,
                        content=f"Tool:{t.tool_name} Args:{t.arguments}",
                        color="green",
                    )
                for r in details.tool_responses:
                    yield event, LogEvent(
                        role=step_type,
                        content=f"Tool:{r.tool_name} Response:{r.content}",
                        color="green",
                    )

            # handle memory retrieval
            if (
                step_type == StepType.memory_retrieval
                and event_type == EventType.step_complete.value
            ):
                details = event.payload.step_details
                content = interleaved_text_media_as_str(details.inserted_context)
                # Keep the log readable: show at most the first 200 characters.
                content = content[:200] + "..." if len(content) > 200 else content

                yield event, LogEvent(
                    role=step_type,
                    content=f"Retrieved context from banks: {details.memory_bank_ids}.\n====\n{content}\n>",
                    color="cyan",
                )

            # Remember what was just handled so the inference branch can tell
            # whether the next step_progress starts a new model response.
            # (This was misspelled `preivous_event_type`, so the tracked
            # value never changed from None.)
            previous_event_type = event_type
            previous_step_type = step_type
|
96
llama_stack/agentic_system/execute_with_custom_tools.py
Normal file
96
llama_stack/agentic_system/execute_with_custom_tools.py
Normal file
|
@ -0,0 +1,96 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import AsyncGenerator, List
|
||||
|
||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||
from llama_stack.agentic_system.api import * # noqa: F403
|
||||
from llama_stack.memory.api import * # noqa: F403
|
||||
from llama_stack.safety.api import * # noqa: F403
|
||||
|
||||
from llama_stack.agentic_system.api import (
|
||||
AgenticSystemTurnResponseEventType as EventType,
|
||||
)
|
||||
from llama_stack.tools.custom.datatypes import CustomTool
|
||||
|
||||
|
||||
class AgentWithCustomToolExecutor:
    """Drive agent turns, executing custom tool calls on the caller's side."""

    def __init__(
        self,
        api: AgenticSystem,
        agent_id: str,
        session_id: str,
        agent_config: AgentConfig,
        custom_tools: List[CustomTool],
    ):
        self.api = api
        self.agent_id = agent_id
        self.session_id = session_id
        self.agent_config = agent_config
        self.custom_tools = custom_tools

    async def execute_turn(
        self,
        messages: List[Message],
        attachments: Optional[List[Attachment]] = None,
        max_iters: int = 5,
        stream: bool = True,
    ) -> AsyncGenerator:
        """Run up to ``max_iters`` turns, feeding tool results back in.

        Chunks other than ``turn_complete`` are yielded as they arrive. When
        a turn completes with no tool calls, or stopped because it ran out
        of tokens, the ``turn_complete`` chunk is yielded and the generator
        ends. Otherwise the first tool call is executed (an unknown tool name
        produces an error-style ``ToolResponseMessage``), the resulting
        message is yielded, and it becomes the sole input of the next turn.

        Raises:
            RuntimeError: if a turn stream ends without a ``turn_complete``
                event.
        """
        tools_dict = {t.get_name(): t for t in self.custom_tools}

        current_messages = messages.copy()
        n_iter = 0
        while n_iter < max_iters:
            n_iter += 1

            request = AgenticSystemTurnCreateRequest(
                agent_id=self.agent_id,
                session_id=self.session_id,
                messages=current_messages,
                attachments=attachments,
                stream=stream,
            )

            # Keep the turn_complete chunk itself instead of relying on the
            # loop variable after the loop: the last chunk received is not
            # guaranteed to be the completion chunk, and an empty stream
            # would leave the loop variable undefined.
            completion_chunk = None
            async for chunk in self.api.create_agentic_system_turn(request):
                if chunk.event.payload.event_type != EventType.turn_complete.value:
                    yield chunk
                else:
                    completion_chunk = chunk

            if completion_chunk is None:
                raise RuntimeError(
                    "Turn stream ended without a turn_complete event"
                )

            message = completion_chunk.event.payload.turn.output_message
            if len(message.tool_calls) == 0:
                yield completion_chunk
                return

            if message.stop_reason == StopReason.out_of_tokens:
                yield completion_chunk
                return

            # Only the first tool call of the message is handled.
            tool_call = message.tool_calls[0]
            if tool_call.tool_name not in tools_dict:
                next_message = ToolResponseMessage(
                    call_id=tool_call.call_id,
                    tool_name=tool_call.tool_name,
                    content=f"Unknown tool `{tool_call.tool_name}` was called. Try again with something else",
                )
            else:
                tool = tools_dict[tool_call.tool_name]
                result_messages = await execute_custom_tool(tool, message)
                next_message = result_messages[0]

            yield next_message
            current_messages = [next_message]
|
||||
|
||||
|
||||
async def execute_custom_tool(tool: CustomTool, message: Message) -> List[Message]:
    """Run ``tool`` on a single message and return its response messages.

    The tool is invoked with a one-element list; exactly one response message
    is expected back, which is enforced with an assertion.
    """
    responses = await tool.run([message])
    response_count = len(responses)
    assert response_count == 1, f"Expected single message, got {len(responses)}"
    return responses
|
30
llama_stack/agentic_system/meta_reference/__init__.py
Normal file
30
llama_stack/agentic_system/meta_reference/__init__.py
Normal file
|
@ -0,0 +1,30 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from llama_stack.core.datatypes import Api, ProviderSpec
|
||||
|
||||
from .config import MetaReferenceImplConfig
|
||||
|
||||
|
||||
async def get_provider_impl(
    config: MetaReferenceImplConfig, deps: Dict[Api, ProviderSpec]
):
    """Build and initialize the meta-reference agentic system implementation.

    The implementation class is imported inside the function, and is wired up
    with the inference, memory and safety entries looked up in ``deps``.
    """
    from .agentic_system import MetaReferenceAgenticSystemImpl

    assert isinstance(
        config, MetaReferenceImplConfig
    ), f"Unexpected config type: {type(config)}"

    inference = deps[Api.inference]
    memory = deps[Api.memory]
    safety = deps[Api.safety]

    provider = MetaReferenceAgenticSystemImpl(config, inference, memory, safety)
    await provider.initialize()
    return provider
|
797
llama_stack/agentic_system/meta_reference/agent_instance.py
Normal file
797
llama_stack/agentic_system/meta_reference/agent_instance.py
Normal file
|
@ -0,0 +1,797 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import os
|
||||
import secrets
|
||||
import shutil
|
||||
import string
|
||||
import tempfile
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import AsyncGenerator, List, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.agentic_system.api import * # noqa: F403
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
from llama_stack.memory.api import * # noqa: F403
|
||||
from llama_stack.safety.api import * # noqa: F403
|
||||
|
||||
from llama_stack.tools.base import BaseTool
|
||||
from llama_stack.tools.builtin import (
|
||||
interpret_content_as_attachment,
|
||||
SingleMessageBuiltinTool,
|
||||
)
|
||||
|
||||
from .rag.context_retriever import generate_rag_query
|
||||
from .safety import SafetyException, ShieldRunnerMixin
|
||||
|
||||
|
||||
def make_random_string(length: int = 8):
    """Return ``length`` random ASCII letters/digits drawn with ``secrets``."""
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(length):
        picked.append(secrets.choice(alphabet))
    return "".join(picked)
|
||||
|
||||
|
||||
class ChatAgent(ShieldRunnerMixin):
|
||||
def __init__(
    self,
    agent_config: AgentConfig,
    inference_api: Inference,
    memory_api: Memory,
    safety_api: Safety,
    builtin_tools: List[SingleMessageBuiltinTool],
    max_infer_iters: int = 10,
):
    """Store the agent's collaborators and set up per-agent state.

    Args:
        agent_config: Agent configuration (model, tools, shields, sampling).
        inference_api: Inference API used for chat completions.
        memory_api: Memory API used for retrieval.
        safety_api: Safety API handed to the shield runner mixin.
        builtin_tools: Builtin tools, indexed below by ``get_name()``.
        max_infer_iters: Upper bound on inference iterations per turn.
    """
    self.agent_config = agent_config
    self.inference_api = inference_api
    self.memory_api = memory_api
    self.safety_api = safety_api

    self.max_infer_iters = max_infer_iters

    # Index the builtin tools by name for lookup when the model calls one.
    tools_by_name = {}
    for builtin_tool in builtin_tools:
        tools_by_name[builtin_tool.get_name()] = builtin_tool
    self.tools_dict = tools_by_name

    # Scratch directory for downloaded attachments; removed in __del__.
    self.tempdir = tempfile.mkdtemp()
    self.sessions = {}

    ShieldRunnerMixin.__init__(
        self,
        safety_api,
        input_shields=agent_config.input_shields,
        output_shields=agent_config.output_shields,
    )
|
||||
|
||||
def __del__(self):
    """Remove the agent's scratch directory, tolerating partial construction."""
    # __del__ also runs when __init__ failed before `tempdir` was assigned,
    # so look the attribute up defensively.
    tempdir = getattr(self, "tempdir", None)
    if tempdir:
        # A finalizer must not raise: the directory may already be gone.
        shutil.rmtree(tempdir, ignore_errors=True)
|
||||
|
||||
def turn_to_messages(self, turn: Turn) -> List[Message]:
    """Flatten a stored turn into the message list it represents.

    Input messages are copied, and user messages have their ``context``
    cleared so RAG context is not re-added on every turn. Each step then
    contributes: the model response (inference), one tool response message
    per tool response (tool execution), or a completion message carrying the
    violation text (shield call that reported a violation).
    """
    history = []

    # We do not want to keep adding RAG context to the input messages.
    # Maybe this should be a parameter of the agentic instance
    # that can define its behavior in a custom way.
    for original in turn.input_messages:
        clone = original.copy()
        if isinstance(clone, UserMessage):
            clone.context = None
        history.append(clone)

    for step in turn.steps:
        if step.step_type == StepType.inference.value:
            history.append(step.model_response)
            continue

        if step.step_type == StepType.tool_execution.value:
            history.extend(
                ToolResponseMessage(
                    call_id=tool_response.call_id,
                    tool_name=tool_response.tool_name,
                    content=tool_response.content,
                )
                for tool_response in step.tool_responses
            )
            continue

        if step.step_type == StepType.shield_call.value:
            shield_response = step.response
            if shield_response.is_violation:
                # The violation text stands in for the assistant's reply.
                history.append(
                    CompletionMessage(
                        content=shield_response.violation_return_message,
                        stop_reason=StopReason.end_of_turn,
                    )
                )

    return history
|
||||
|
||||
def create_session(self, name: str) -> Session:
    """Create an empty session with a fresh UUID and register it on the agent."""
    new_session = Session(
        session_id=str(uuid.uuid4()),
        session_name=name,
        turns=[],
        started_at=datetime.now(),
    )
    self.sessions[new_session.session_id] = new_session
    return new_session
|
||||
|
||||
async def create_and_execute_turn(
    self, request: AgenticSystemTurnCreateRequest
) -> AsyncGenerator:
    """Execute one turn in an existing session and stream its events.

    The session's previous turns are replayed as messages ahead of the
    request's messages. A turn-start chunk is yielded first, then every
    stream chunk produced by ``run``; the final ``CompletionMessage`` from
    ``run`` is captured (not yielded) and stored on the ``Turn`` that is
    appended to the session and yielded in a turn-complete chunk.
    """
    assert (
        request.session_id in self.sessions
    ), f"Session {request.session_id} not found"

    session = self.sessions[request.session_id]

    # History from earlier turns, followed by this request's messages.
    dialog = []
    for past_turn in session.turns:
        dialog.extend(self.turn_to_messages(past_turn))
    dialog.extend(request.messages)

    turn_id = str(uuid.uuid4())
    started_at = datetime.now()
    start_payload = AgenticSystemTurnResponseTurnStartPayload(
        turn_id=turn_id,
    )
    yield AgenticSystemTurnResponseStreamChunk(
        event=AgenticSystemTurnResponseEvent(payload=start_payload)
    )

    completed_steps = []
    final_message = None
    run_stream = self.run(
        session=session,
        turn_id=turn_id,
        input_messages=dialog,
        attachments=request.attachments or [],
        sampling_params=self.agent_config.sampling_params,
        stream=request.stream,
    )
    async for item in run_stream:
        if isinstance(item, CompletionMessage):
            # The final assistant message: log it and keep it for the Turn.
            cprint(
                f"{item.role.capitalize()}: {item.content}",
                "white",
                attrs=["bold"],
            )
            final_message = item
            continue

        assert isinstance(
            item, AgenticSystemTurnResponseStreamChunk
        ), f"Unexpected type {type(item)}"

        payload = item.event.payload
        step_complete = AgenticSystemTurnResponseEventType.step_complete.value
        if payload.event_type == step_complete:
            completed_steps.append(payload.step_details)

        yield item

    assert final_message is not None

    finished_turn = Turn(
        turn_id=turn_id,
        session_id=request.session_id,
        input_messages=request.messages,
        output_message=final_message,
        started_at=started_at,
        completed_at=datetime.now(),
        steps=completed_steps,
    )
    session.turns.append(finished_turn)

    yield AgenticSystemTurnResponseStreamChunk(
        event=AgenticSystemTurnResponseEvent(
            payload=AgenticSystemTurnResponseTurnCompletePayload(
                turn=finished_turn,
            )
        )
    )
|
||||
|
||||
async def run(
    self,
    session: Session,
    turn_id: str,
    input_messages: List[Message],
    attachments: List[Attachment],
    sampling_params: SamplingParams,
    stream: bool = False,
) -> AsyncGenerator:
    """Run input shields, the inference/tool loop, then output shields.

    Yields stream chunks as they are produced and, last, the final
    ``CompletionMessage``. A ``bool`` coming out of any inner generator is a
    sentinel meaning that stage aborted; it ends this generator without
    being forwarded.
    """
    # Doing async generators makes downstream code much simpler and everything amenable to
    # streaming. However, it also makes things complicated here because AsyncGenerators cannot
    # return a "final value" for the `yield from` statement. we simulate that by yielding a
    # final boolean (to see whether an exception happened) and then explicitly testing for it.

    async for res in self.run_shields_wrapper(
        turn_id, input_messages, self.input_shields, "user-input"
    ):
        if isinstance(res, bool):
            return
        else:
            yield res

    # Initialised up front so that a `_run` stream which ends without a
    # CompletionMessage trips the assertion below instead of failing with
    # UnboundLocalError.
    final_response = None
    async for res in self._run(
        session, turn_id, input_messages, attachments, sampling_params, stream
    ):
        if isinstance(res, bool):
            return
        elif isinstance(res, CompletionMessage):
            final_response = res
            break
        else:
            yield res

    assert final_response is not None, "Inference loop produced no final message"
    # for output shields run on the full input and output combination
    messages = input_messages + [final_response]

    async for res in self.run_shields_wrapper(
        turn_id, messages, self.output_shields, "assistant-output"
    ):
        if isinstance(res, bool):
            return
        else:
            yield res

    yield final_response
|
||||
|
||||
async def run_shields_wrapper(
    self,
    turn_id: str,
    messages: List[Message],
    shields: List[ShieldDefinition],
    touchpoint: str,
) -> AsyncGenerator:
    """Run ``shields`` over ``messages`` and stream shield-call step events.

    With no shields, nothing is yielded. Otherwise a step-start chunk is
    yielded and the shields are run. On success a step-complete chunk
    reporting no violation follows. On ``SafetyException`` the generator
    yields a step-complete chunk carrying the exception's response, a
    ``CompletionMessage`` with the exception text, and finally ``False`` as
    the abort sentinel.
    """
    if not shields:
        return

    step_id = str(uuid.uuid4())
    try:
        yield AgenticSystemTurnResponseStreamChunk(
            event=AgenticSystemTurnResponseEvent(
                payload=AgenticSystemTurnResponseStepStartPayload(
                    step_type=StepType.shield_call.value,
                    step_id=step_id,
                    metadata=dict(touchpoint=touchpoint),
                )
            )
        )
        await self.run_shields(messages, shields)

    except SafetyException as e:
        yield AgenticSystemTurnResponseStreamChunk(
            event=AgenticSystemTurnResponseEvent(
                payload=AgenticSystemTurnResponseStepCompletePayload(
                    step_type=StepType.shield_call.value,
                    step_details=ShieldCallStep(
                        step_id=step_id,
                        turn_id=turn_id,
                        response=e.response,
                    ),
                )
            )
        )

        yield CompletionMessage(
            content=str(e),
            stop_reason=StopReason.end_of_turn,
        )
        yield False
        # Stop here: falling through would emit a contradictory
        # "no violation" step if the generator were ever resumed.
        return

    yield AgenticSystemTurnResponseStreamChunk(
        event=AgenticSystemTurnResponseEvent(
            payload=AgenticSystemTurnResponseStepCompletePayload(
                step_type=StepType.shield_call.value,
                step_details=ShieldCallStep(
                    step_id=step_id,
                    turn_id=turn_id,
                    response=ShieldResponse(
                        # TODO: fix this, give each shield a shield type method and
                        # fire one event for each shield run
                        shield_type=BuiltinShield.llama_guard,
                        is_violation=False,
                    ),
                ),
            )
        )
    )
|
||||
|
||||
async def _run(
    self,
    session: Session,
    turn_id: str,
    input_messages: List[Message],
    attachments: List[Attachment],
    sampling_params: SamplingParams,
    stream: bool = False,
) -> AsyncGenerator:
    """Core loop of a turn: optional retrieval, then inference and tool calls.

    Before the loop, either retrieved context is attached to the last input
    message or, when the code interpreter is enabled, a message pointing at
    the attachment files is appended. Each loop iteration runs one chat
    completion, emits inference step events, and then finishes the turn,
    keeps going with a partial message, or executes the first builtin tool
    call and feeds its result back to the model.

    Yields:
        Stream chunks for step start/progress/complete events, then a
        ``CompletionMessage`` once the turn ends. After a
        ``SafetyException`` from tool execution it yields a completion
        message with the exception text followed by ``False``.

    Note:
        ``input_messages`` may be mutated in place (context set on its last
        message, or an attachment message appended).
    """
    enabled_tools = set(t.type for t in self.agent_config.tools)
    need_rag_context = await self._should_retrieve_context(
        input_messages, attachments
    )
    if need_rag_context:
        step_id = str(uuid.uuid4())
        yield AgenticSystemTurnResponseStreamChunk(
            event=AgenticSystemTurnResponseEvent(
                payload=AgenticSystemTurnResponseStepStartPayload(
                    step_type=StepType.memory_retrieval.value,
                    step_id=step_id,
                )
            )
        )

        # TODO: find older context from the session and either replace it
        # or append with a sliding window. this is really a very simplistic implementation
        rag_context, bank_ids = await self._retrieve_context(
            session, input_messages, attachments
        )

        # NOTE(review): a new id is generated here, so the step_complete event
        # does not share the step_id of the step_start event above. Kept as-is;
        # confirm whether consumers correlate the two.
        step_id = str(uuid.uuid4())
        yield AgenticSystemTurnResponseStreamChunk(
            event=AgenticSystemTurnResponseEvent(
                payload=AgenticSystemTurnResponseStepCompletePayload(
                    step_type=StepType.memory_retrieval.value,
                    step_id=step_id,
                    step_details=MemoryRetrievalStep(
                        turn_id=turn_id,
                        step_id=step_id,
                        memory_bank_ids=bank_ids,
                        inserted_context=rag_context or "",
                    ),
                )
            )
        )

        if rag_context:
            last_message = input_messages[-1]
            last_message.context = "\n".join(rag_context)

    elif attachments and AgenticSystemTool.code_interpreter.value in enabled_tools:
        urls = [a.content for a in attachments if isinstance(a.content, URL)]
        msg = await attachment_message(self.tempdir, urls)
        input_messages.append(msg)

    # Attachments produced by tools during this turn.
    output_attachments = []

    n_iter = 0
    while True:
        msg = input_messages[-1]
        if msg.role == Role.user.value:
            color = "blue"
        elif msg.role == Role.ipython.value:
            color = "yellow"
        else:
            color = None
        cprint(f"{str(msg)}", color=color)

        step_id = str(uuid.uuid4())
        yield AgenticSystemTurnResponseStreamChunk(
            event=AgenticSystemTurnResponseEvent(
                payload=AgenticSystemTurnResponseStepStartPayload(
                    step_type=StepType.inference.value,
                    step_id=step_id,
                )
            )
        )

        tool_calls = []
        content = ""
        stop_reason = None
        # Inference is always streamed; `stream` only controls whether the
        # progress deltas are forwarded to the caller.
        async for chunk in self.inference_api.chat_completion(
            self.agent_config.model,
            input_messages,
            tools=self._get_tools(),
            tool_prompt_format=self.agent_config.tool_prompt_format,
            stream=True,
            sampling_params=sampling_params,
        ):
            event = chunk.event
            if event.event_type == ChatCompletionResponseEventType.start:
                continue
            elif event.event_type == ChatCompletionResponseEventType.complete:
                stop_reason = StopReason.end_of_turn
                continue

            delta = event.delta
            if isinstance(delta, ToolCallDelta):
                if delta.parse_status == ToolCallParseStatus.success:
                    tool_calls.append(delta.content)

                if stream:
                    yield AgenticSystemTurnResponseStreamChunk(
                        event=AgenticSystemTurnResponseEvent(
                            payload=AgenticSystemTurnResponseStepProgressPayload(
                                step_type=StepType.inference.value,
                                step_id=step_id,
                                model_response_text_delta="",
                                tool_call_delta=delta,
                            )
                        )
                    )

            elif isinstance(delta, str):
                content += delta
                if stream and event.stop_reason is None:
                    yield AgenticSystemTurnResponseStreamChunk(
                        event=AgenticSystemTurnResponseEvent(
                            payload=AgenticSystemTurnResponseStepProgressPayload(
                                step_type=StepType.inference.value,
                                step_id=step_id,
                                model_response_text_delta=event.delta,
                            )
                        )
                    )
            else:
                raise ValueError(f"Unexpected delta type {type(delta)}")

            if event.stop_reason is not None:
                stop_reason = event.stop_reason

        # No stop reason reported at all is treated as running out of tokens.
        stop_reason = stop_reason or StopReason.out_of_tokens
        message = CompletionMessage(
            content=content,
            stop_reason=stop_reason,
            tool_calls=tool_calls,
        )

        yield AgenticSystemTurnResponseStreamChunk(
            event=AgenticSystemTurnResponseEvent(
                payload=AgenticSystemTurnResponseStepCompletePayload(
                    step_type=StepType.inference.value,
                    step_id=step_id,
                    step_details=InferenceStep(
                        # somewhere deep, we are re-assigning message or closing over some
                        # variable which causes message to mutate later on. fix with a
                        # `deepcopy` for now, but this is symptomatic of a deeper issue.
                        step_id=step_id,
                        turn_id=turn_id,
                        model_response=copy.deepcopy(message),
                    ),
                )
            )
        )

        if n_iter >= self.max_infer_iters:
            cprint("Done with MAX iterations, exiting.")
            yield message
            break

        if stop_reason == StopReason.out_of_tokens:
            cprint("Out of token budget, exiting.")
            yield message
            break

        if len(message.tool_calls) == 0:
            if stop_reason == StopReason.end_of_turn:
                # TODO: UPDATE RETURN TYPE TO SEND A TUPLE OF (MESSAGE, ATTACHMENTS)
                if len(output_attachments) > 0:
                    # Attach the files generated by tools during this turn
                    # (not the caller's input attachments) to the final message.
                    if isinstance(message.content, list):
                        message.content += output_attachments
                    else:
                        message.content = [message.content] + output_attachments
                yield message
            else:
                cprint(f"Partial message: {str(message)}", color="green")
                input_messages = input_messages + [message]
        else:
            cprint(f"{str(message)}", color="green")
            try:
                tool_call = message.tool_calls[0]

                name = tool_call.tool_name
                if not isinstance(name, BuiltinTool):
                    # Not a builtin tool: hand the message back so the caller
                    # can execute the tool itself.
                    yield message
                    return

                step_id = str(uuid.uuid4())
                yield AgenticSystemTurnResponseStreamChunk(
                    event=AgenticSystemTurnResponseEvent(
                        payload=AgenticSystemTurnResponseStepStartPayload(
                            step_type=StepType.tool_execution.value,
                            step_id=step_id,
                        )
                    )
                )
                yield AgenticSystemTurnResponseStreamChunk(
                    event=AgenticSystemTurnResponseEvent(
                        payload=AgenticSystemTurnResponseStepProgressPayload(
                            step_type=StepType.tool_execution.value,
                            step_id=step_id,
                            tool_call=tool_call,
                        )
                    )
                )

                result_messages = await execute_tool_call_maybe(
                    self.tools_dict,
                    [message],
                )
                assert (
                    len(result_messages) == 1
                ), "Currently not supporting multiple messages"
                result_message = result_messages[0]

                yield AgenticSystemTurnResponseStreamChunk(
                    event=AgenticSystemTurnResponseEvent(
                        payload=AgenticSystemTurnResponseStepCompletePayload(
                            step_type=StepType.tool_execution.value,
                            step_details=ToolExecutionStep(
                                step_id=step_id,
                                turn_id=turn_id,
                                tool_calls=[tool_call],
                                tool_responses=[
                                    ToolResponse(
                                        call_id=result_message.call_id,
                                        tool_name=result_message.tool_name,
                                        content=result_message.content,
                                    )
                                ],
                            ),
                        )
                    )
                )

                # TODO: add tool-input touchpoint and a "start" event for this step also
                # but that needs a lot more refactoring of Tool code potentially
                yield AgenticSystemTurnResponseStreamChunk(
                    event=AgenticSystemTurnResponseEvent(
                        payload=AgenticSystemTurnResponseStepCompletePayload(
                            step_type=StepType.shield_call.value,
                            step_details=ShieldCallStep(
                                step_id=str(uuid.uuid4()),
                                turn_id=turn_id,
                                response=ShieldResponse(
                                    # TODO: fix this, give each shield a shield type method and
                                    # fire one event for each shield run
                                    shield_type=BuiltinShield.llama_guard,
                                    is_violation=False,
                                ),
                            ),
                        )
                    )
                )

            except SafetyException as e:
                yield AgenticSystemTurnResponseStreamChunk(
                    event=AgenticSystemTurnResponseEvent(
                        payload=AgenticSystemTurnResponseStepCompletePayload(
                            step_type=StepType.shield_call.value,
                            step_details=ShieldCallStep(
                                step_id=str(uuid.uuid4()),
                                turn_id=turn_id,
                                response=e.response,
                            ),
                        )
                    )
                )

                yield CompletionMessage(
                    content=str(e),
                    stop_reason=StopReason.end_of_turn,
                )
                yield False
                return

            if out_attachment := interpret_content_as_attachment(
                result_message.content
            ):
                # NOTE: when we push this message back to the model, the model may ignore the
                # attached file path etc. since the model is trained to only provide a user message
                # with the summary. We keep all generated attachments and then attach them to final message
                output_attachments.append(out_attachment)

            input_messages = input_messages + [message, result_message]

        n_iter += 1
|
||||
|
||||
async def _ensure_memory_bank(self, session: Session) -> MemoryBank:
    """Return the session's memory bank, creating it on first use."""
    if session.memory_bank is not None:
        return session.memory_bank

    bank_name = f"memory_bank_{session.session_id}"
    bank_config = VectorMemoryBankConfig(
        embedding_model="sentence-transformer/all-MiniLM-L6-v2",
        chunk_size_in_tokens=512,
    )
    session.memory_bank = await self.memory_api.create_memory_bank(
        name=bank_name,
        config=bank_config,
    )
    return session.memory_bank
|
||||
|
||||
async def _should_retrieve_context(
    self, messages: List[Message], attachments: List[Attachment]
) -> bool:
    """Decide whether this turn should run memory retrieval.

    Without attachments, retrieval runs only if the memory tool is enabled.
    With attachments, retrieval runs unless the code interpreter is enabled
    and the agent's tool choice is ``required``.
    """
    enabled = set(t.type for t in self.agent_config.tools)

    if not attachments:
        return AgenticSystemTool.memory.value in enabled

    interpreter_takes_attachments = (
        AgenticSystemTool.code_interpreter.value in enabled
        and self.agent_config.tool_choice == ToolChoice.required
    )
    return not interpreter_takes_attachments
|
||||
|
||||
def _memory_tool_definition(self) -> Optional[MemoryToolDefinition]:
    """Return the first configured tool of the memory type, or ``None``."""
    memory_type = AgenticSystemTool.memory.value
    return next(
        (tool for tool in self.agent_config.tools if tool.type == memory_type),
        None,
    )
|
||||
|
||||
async def _retrieve_context(
    self, session: Session, messages: List[Message], attachments: List[Attachment]
) -> Tuple[Optional[List[str]], List[int]]:  # (rag_context, bank_ids)
    """Query the relevant memory banks and build the context to insert.

    Banks come from the memory tool's configured banks plus the per-session
    bank; attachments, if any, are inserted into the per-session bank first.
    All banks are queried concurrently, the chunks are ranked by score, and
    chunks are picked until the memory tool's chunk or token limit is hit.

    Returns:
        ``(rag_context, bank_ids)``. ``rag_context`` is ``None`` when there
        are no banks to query or no chunk was retrieved; otherwise a list of
        strings (header, picked chunks, footer).

    Raises:
        AssertionError: If no memory tool is configured on the agent.
    """
    bank_ids = []

    memory = self._memory_tool_definition()
    assert memory is not None, "Memory tool not configured"
    bank_ids.extend(c.bank_id for c in memory.memory_bank_configs)

    if attachments:
        bank = await self._ensure_memory_bank(session)
        bank_ids.append(bank.bank_id)

        documents = [
            MemoryBankDocument(
                document_id=str(uuid.uuid4()),
                content=a.content,
                mime_type=a.mime_type,
                metadata={},
            )
            for a in attachments
        ]
        await self.memory_api.insert_documents(bank.bank_id, documents)
    elif session.memory_bank:
        bank_ids.append(session.memory_bank.bank_id)

    if not bank_ids:
        # this can happen if the per-session memory bank is not yet populated
        # (i.e., no prior turns uploaded an Attachment)
        return None, []

    query = await generate_rag_query(
        memory.query_generator_config, messages, inference_api=self.inference_api
    )
    tasks = [
        self.memory_api.query_documents(
            bank_id=bank_id,
            query=query,
            params={
                "max_chunks": 5,
            },
        )
        for bank_id in bank_ids
    ]
    results: List[QueryDocumentsResponse] = await asyncio.gather(*tasks)
    chunks = [c for r in results for c in r.chunks]
    scores = [s for r in results for s in r.scores]

    # Rank (chunk, score) pairs by score, best first. The pairs are kept
    # zipped: unzipping an empty ranking with `zip(*...)` cannot be unpacked
    # into two names and would raise before the emptiness check below.
    ranked = sorted(zip(chunks, scores), key=lambda x: x[1], reverse=True)
    if not ranked:
        return None, bank_ids

    tokens = 0
    picked = []
    for c, _score in ranked[: memory.max_chunks]:
        tokens += c.token_count
        if tokens > memory.max_tokens_in_context:
            cprint(
                f"Using {len(picked)} chunks; reached max tokens in context: {tokens}",
                "red",
            )
            break
        picked.append(f"id:{c.document_id}; content:{c.content}")

    return [
        "Here are the retrieved documents for relevant context:\n=== START-RETRIEVED-CONTEXT ===\n",
        *picked,
        "\n=== END-RETRIEVED-CONTEXT ===\n",
    ], bank_ids
|
||||
|
||||
def _get_tools(self) -> List[ToolDefinition]:
    """Translate the agent's configured tools into inference tool definitions.

    Builtin tool configurations map to a definition named after the matching
    ``BuiltinTool``; function-call configurations carry their own name,
    description and parameters. Any other configuration type is skipped.
    """
    # Checked in order; the first matching configuration class wins.
    builtin_by_config = (
        (SearchToolDefinition, BuiltinTool.brave_search),
        (WolframAlphaToolDefinition, BuiltinTool.wolfram_alpha),
        (PhotogenToolDefinition, BuiltinTool.photogen),
        (CodeInterpreterToolDefinition, BuiltinTool.code_interpreter),
    )

    definitions = []
    for tool_config in self.agent_config.tools:
        for config_cls, builtin in builtin_by_config:
            if isinstance(tool_config, config_cls):
                definitions.append(ToolDefinition(tool_name=builtin))
                break
        else:
            if isinstance(tool_config, FunctionCallToolDefinition):
                definitions.append(
                    ToolDefinition(
                        tool_name=tool_config.function_name,
                        description=tool_config.description,
                        parameters=tool_config.parameters,
                    )
                )
    return definitions
|
||||
|
||||
|
||||
async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessage:
    """Make attachments available as local files and describe them to the model.

    ``file://`` URIs are used in place. URIs starting with ``http`` are
    downloaded into ``tempdir`` under a randomized file name. The returned
    tool response message (attributed to the code interpreter) contains one
    line per file giving its local path.

    Raises:
        ValueError: If a URI is neither ``file://`` nor ``http``-prefixed.
    """
    content = []

    for url in urls:
        uri = url.uri
        if uri.startswith("file://"):
            filepath = uri[len("file://") :]
        elif uri.startswith("http"):
            path = urlparse(uri).path
            basename = os.path.basename(path)
            filepath = f"{tempdir}/{make_random_string() + basename}"
            print(f"Downloading {url} -> {filepath}")

            async with httpx.AsyncClient() as client:
                r = await client.get(uri)
                # Write the raw bytes: decoding to text and re-encoding with
                # the platform default would corrupt binary attachments.
                with open(filepath, "wb") as fp:
                    fp.write(r.content)
        else:
            raise ValueError(f"Unsupported URL {url}")

        content.append(f'# There is a file accessible to you at "{filepath}"\n')

    return ToolResponseMessage(
        call_id="",
        tool_name=BuiltinTool.code_interpreter,
        content=content,
    )
|
||||
|
||||
|
||||
async def execute_tool_call_maybe(
    tools_dict: Dict[str, BaseTool], messages: List[CompletionMessage]
) -> List[ToolResponseMessage]:
    """Run the builtin tool requested by the first tool call of the message.

    Exactly one message is expected. Its first tool call must name a
    ``BuiltinTool`` whose value is a key of ``tools_dict``; these
    preconditions are enforced with assertions.
    """
    # While the Tools.run interface takes a list of messages,
    # all tools currently only run on a single message.
    # When this changes, we can drop this assert.
    # Whether to call tools on each message and aggregate,
    # or aggregate and call the tool once, remains to be seen.
    assert len(messages) == 1, "Expected single message"
    completion = messages[0]

    requested = completion.tool_calls[0].tool_name
    assert isinstance(requested, BuiltinTool)

    # The tools are keyed by the enum's value.
    tool_key = requested.value
    assert tool_key in tools_dict, f"Tool {tool_key} not found"

    return await tools_dict[tool_key].run(messages)
|
||||
|
||||
|
||||
def print_dialog(messages: List[Message]):
    """Print each message's index and first 100 characters, colored by role."""
    color_by_role = {
        Role.user.value: "red",
        Role.assistant.value: "white",
        Role.ipython.value: "yellow",
        Role.system.value: "green",
    }
    for index, message in enumerate(messages):
        # Unrecognized roles fall back to white.
        role_color = color_by_role.get(message.role, "white")
        text = str(message)
        cprint(f"{index} ::: {text[:100]}...", color=role_color)
|
145
llama_stack/agentic_system/meta_reference/agentic_system.py
Normal file
145
llama_stack/agentic_system/meta_reference/agentic_system.py
Normal file
|
@ -0,0 +1,145 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
import uuid
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from llama_stack.inference.api import Inference
|
||||
from llama_stack.memory.api import Memory
|
||||
from llama_stack.safety.api import Safety
|
||||
from llama_stack.agentic_system.api import * # noqa: F403
|
||||
from llama_stack.tools.builtin import (
|
||||
CodeInterpreterTool,
|
||||
PhotogenTool,
|
||||
SearchTool,
|
||||
WolframAlphaTool,
|
||||
)
|
||||
from llama_stack.tools.safety import with_safety
|
||||
|
||||
from .agent_instance import ChatAgent
|
||||
from .config import MetaReferenceImplConfig
|
||||
|
||||
|
||||
# NOTE: getLogger() is called without a name, which returns the root logger;
# the level below is therefore set on the root logger when this module is imported.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


# Module-level registry of agents, keyed by the agent_id generated in
# create_agentic_system(). No code in this file removes entries.
AGENT_INSTANCES_BY_ID = {}
|
||||
|
||||
|
||||
class MetaReferenceAgenticSystemImpl(AgenticSystem):
    """Agentic system implementation that keeps its agents in module memory.

    Agents are stored in ``AGENT_INSTANCES_BY_ID``; sessions live on each
    agent instance.
    """

    def __init__(
        self,
        config: MetaReferenceImplConfig,
        inference_api: Inference,
        memory_api: Memory,
        safety_api: Safety,
    ):
        """Keep the provider config and the APIs handed to every agent."""
        self.config = config
        self.inference_api = inference_api
        self.memory_api = memory_api
        self.safety_api = safety_api

    async def initialize(self) -> None:
        """Nothing to set up."""
        pass

    def _instantiate_builtin_tool(self, tool_defn):
        """Build the builtin tool for ``tool_defn``; ``None`` if it has none.

        Raises:
            ValueError: If the tool needs an API key that the config lacks.
        """
        if isinstance(tool_defn, WolframAlphaToolDefinition):
            wolfram_key = self.config.wolfram_api_key
            if not wolfram_key:
                raise ValueError("Wolfram API key not defined in config")
            return WolframAlphaTool(wolfram_key)

        if isinstance(tool_defn, SearchToolDefinition):
            search_key = None
            if tool_defn.engine == SearchEngineType.brave:
                search_key = self.config.brave_search_api_key
            elif tool_defn.engine == SearchEngineType.bing:
                search_key = self.config.bing_search_api_key
            if not search_key:
                raise ValueError("API key not defined in config")
            return SearchTool(tool_defn.engine, search_key)

        if isinstance(tool_defn, CodeInterpreterToolDefinition):
            return CodeInterpreterTool()

        if isinstance(tool_defn, PhotogenToolDefinition):
            return PhotogenTool(dump_dir=tempfile.mkdtemp())

        return None

    async def create_agentic_system(
        self,
        agent_config: AgentConfig,
    ) -> AgenticSystemCreateResponse:
        """Create an agent for ``agent_config`` and return its generated id.

        Each supported builtin tool in the config is instantiated and wrapped
        with that tool's input/output shields; other tool definitions are
        skipped here.
        """
        agent_id = str(uuid.uuid4())

        guarded_tools = []
        for tool_defn in agent_config.tools:
            builtin_tool = self._instantiate_builtin_tool(tool_defn)
            if builtin_tool is None:
                continue

            guarded_tools.append(
                with_safety(
                    builtin_tool,
                    self.safety_api,
                    tool_defn.input_shields,
                    tool_defn.output_shields,
                )
            )

        AGENT_INSTANCES_BY_ID[agent_id] = ChatAgent(
            agent_config=agent_config,
            inference_api=self.inference_api,
            safety_api=self.safety_api,
            memory_api=self.memory_api,
            builtin_tools=guarded_tools,
        )

        return AgenticSystemCreateResponse(
            agent_id=agent_id,
        )

    async def create_agentic_system_session(
        self,
        agent_id: str,
        session_name: str,
    ) -> AgenticSystemSessionCreateResponse:
        """Create a named session on an existing agent and return its id."""
        assert agent_id in AGENT_INSTANCES_BY_ID, f"System {agent_id} not found"

        new_session = AGENT_INSTANCES_BY_ID[agent_id].create_session(session_name)
        return AgenticSystemSessionCreateResponse(
            session_id=new_session.session_id,
        )

    async def create_agentic_system_turn(
        self,
        agent_id: str,
        session_id: str,
        messages: List[
            Union[
                UserMessage,
                ToolResponseMessage,
            ]
        ],
        attachments: Optional[List[Attachment]] = None,
        stream: Optional[bool] = False,
    ) -> AsyncGenerator:
        """Execute a turn on an agent's session, yielding its events."""
        # wrapper request to make it easier to pass around (internal only, not exposed to API)
        request = AgenticSystemTurnCreateRequest(
            agent_id=agent_id,
            session_id=session_id,
            messages=messages,
            attachments=attachments,
            stream=stream,
        )

        agent_id = request.agent_id
        assert agent_id in AGENT_INSTANCES_BY_ID, f"System {agent_id} not found"
        chat_agent = AGENT_INSTANCES_BY_ID[agent_id]

        assert (
            request.session_id in chat_agent.sessions
        ), f"Session {request.session_id} not found"

        turn_events = chat_agent.create_and_execute_turn(request)
        async for turn_event in turn_events:
            yield turn_event
|
15
llama_stack/agentic_system/meta_reference/config.py
Normal file
15
llama_stack/agentic_system/meta_reference/config.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class MetaReferenceImplConfig(BaseModel):
    """Configuration model for the meta-reference agentic-system provider.

    Holds three optional API keys, each defaulting to ``None``. How the keys
    are consumed is not visible in this file.
    """

    # Optional key for Brave search; None when not configured.
    brave_search_api_key: Optional[str] = None
    # Optional key for Bing search; None when not configured.
    bing_search_api_key: Optional[str] = None
    # Optional key for Wolfram; None when not configured.
    wolfram_api_key: Optional[str] = None
|
|
@ -0,0 +1,76 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import List
|
||||
|
||||
from jinja2 import Template
|
||||
from llama_models.llama3.api import * # noqa: F403
|
||||
|
||||
|
||||
from llama_stack.agentic_system.api import (
|
||||
DefaultMemoryQueryGeneratorConfig,
|
||||
LLMMemoryQueryGeneratorConfig,
|
||||
MemoryQueryGenerator,
|
||||
MemoryQueryGeneratorConfig,
|
||||
)
|
||||
from termcolor import cprint # noqa: F401
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
|
||||
|
||||
async def generate_rag_query(
    config: MemoryQueryGeneratorConfig,
    messages: List[Message],
    **kwargs,
):
    """
    Generates a query that will be used for
    retrieving relevant information from the memory bank.

    Dispatches on ``config.type``: the default generator joins the message
    contents, the LLM generator asks a model to write the query. Extra
    keyword arguments are forwarded to the selected generator.

    Raises:
        NotImplementedError: if ``config.type`` matches neither generator.
    """
    generator_type = config.type

    if generator_type == MemoryQueryGenerator.default.value:
        return await default_rag_query_generator(config, messages, **kwargs)

    if generator_type == MemoryQueryGenerator.llm.value:
        return await llm_rag_query_generator(config, messages, **kwargs)

    raise NotImplementedError(f"Unsupported memory query generator {config.type}")
|
||||
|
||||
|
||||
async def default_rag_query_generator(
    config: DefaultMemoryQueryGeneratorConfig,
    messages: List[Message],
    **kwargs,
):
    """Build the query by joining every message's content with ``config.sep``.

    Each message's ``content`` is converted to text with
    ``interleaved_text_media_as_str``. ``kwargs`` is accepted for signature
    parity with the other generators and is not used.
    """
    message_texts = [
        interleaved_text_media_as_str(message.content) for message in messages
    ]
    return config.sep.join(message_texts)
|
||||
|
||||
|
||||
async def llm_rag_query_generator(
    config: LLMMemoryQueryGeneratorConfig,
    messages: List[Message],
    **kwargs,
):
    """Ask a model to write the retrieval query.

    Renders ``config.template`` (a Jinja2 template) with the dumped messages
    under the ``messages`` key, sends the result as a single user message to
    ``kwargs["inference_api"].chat_completion`` for ``config.model`` with
    streaming disabled, and returns the content of the last chunk received.

    Raises:
        AssertionError: if ``inference_api`` is not supplied in ``kwargs``.
    """
    assert "inference_api" in kwargs, "LLMRAGQueryGenerator needs inference_api"
    inference_api = kwargs["inference_api"]

    # Dump the messages first, then render them into the prompt template.
    template_vars = {"messages": [m.model_dump() for m in messages]}
    prompt = Template(config.template).render(template_vars)

    completion_request = ChatCompletionRequest(
        model=config.model,
        messages=[UserMessage(content=prompt)],
        stream=False,
    )

    # The response is consumed as an async iterator; the last chunk wins.
    async for chunk in inference_api.chat_completion(completion_request):
        query = chunk.completion_message.content

    return query
|
65
llama_stack/agentic_system/meta_reference/safety.py
Normal file
65
llama_stack/agentic_system/meta_reference/safety.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import List
|
||||
|
||||
from llama_models.llama3.api.datatypes import Message, Role, UserMessage
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.safety.api import (
|
||||
OnViolationAction,
|
||||
RunShieldRequest,
|
||||
Safety,
|
||||
ShieldDefinition,
|
||||
ShieldResponse,
|
||||
)
|
||||
|
||||
|
||||
class SafetyException(Exception):  # noqa: N818
    """Raised when a shield reports a violation that must abort processing.

    The exception message is the response's ``violation_return_message``;
    the full ``ShieldResponse`` is kept on ``self.response``.
    """

    def __init__(self, response: ShieldResponse):
        super().__init__(response.violation_return_message)
        self.response = response
|
||||
|
||||
|
||||
class ShieldRunnerMixin:
    """Mixin that runs shields over messages through a ``Safety`` API."""

    def __init__(
        self,
        safety_api: Safety,
        input_shields: List[ShieldDefinition] = None,
        output_shields: List[ShieldDefinition] = None,
    ):
        """Store the safety API and the (possibly ``None``) shield lists."""
        self.safety_api = safety_api
        self.input_shields, self.output_shields = input_shields, output_shields

    async def run_shields(
        self, messages: List[Message], shields: List[ShieldDefinition]
    ) -> List[ShieldResponse]:
        """Run ``shields`` over ``messages`` and return one response per shield.

        For each violating response, the matching shield's
        ``on_violation_action`` decides the outcome: ``RAISE`` raises
        ``SafetyException`` and ``WARN`` prints a red warning.

        Raises:
            SafetyException: on the first violation whose shield action is
                ``OnViolationAction.RAISE``.
        """
        # Work on a shallow copy so the caller's list is left untouched.
        checked = messages.copy()

        # some shields like llama-guard require the first message to be a user message
        # since this might be a tool call, first role might not be user
        if checked and checked[0].role != Role.user.value:
            checked[0] = UserMessage(content=checked[0].content)

        shield_request = RunShieldRequest(
            messages=checked,
            shields=shields,
        )
        outcome = await self.safety_api.run_shields(shield_request)
        responses = outcome.responses

        # Responses are paired with shields by position.
        for shield, response in zip(shields, responses):
            if not response.is_violation:
                continue

            if shield.on_violation_action == OnViolationAction.RAISE:
                raise SafetyException(response)

            if shield.on_violation_action == OnViolationAction.WARN:
                cprint(
                    f"[Warn]{shield.__class__.__name__} raised a warning",
                    color="red",
                )

        return responses
|
34
llama_stack/agentic_system/providers.py
Normal file
34
llama_stack/agentic_system/providers.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import List
|
||||
|
||||
from llama_stack.core.datatypes import Api, InlineProviderSpec, ProviderSpec
|
||||
|
||||
|
||||
def available_providers() -> List[ProviderSpec]:
    """Return the provider specs registered for the agentic-system API.

    Currently a single inline ``meta-reference`` provider, which depends on
    the inference, safety and memory APIs.
    """
    required_packages = [
        "codeshield",
        "matplotlib",
        "pillow",
        "pandas",
        "scikit-learn",
        "torch",
        "transformers",
    ]
    required_apis = [
        Api.inference,
        Api.safety,
        Api.memory,
    ]

    meta_reference = InlineProviderSpec(
        api=Api.agentic_system,
        provider_id="meta-reference",
        pip_packages=required_packages,
        module="llama_stack.agentic_system.meta_reference",
        config_class="llama_stack.agentic_system.meta_reference.MetaReferenceImplConfig",
        api_dependencies=required_apis,
    )
    return [meta_reference]
|
5
llama_stack/batch_inference/__init__.py
Normal file
5
llama_stack/batch_inference/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
7
llama_stack/batch_inference/api/__init__.py
Normal file
7
llama_stack/batch_inference/api/__init__.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .api import * # noqa: F401 F403
|
71
llama_stack/batch_inference/api/api.py
Normal file
71
llama_stack/batch_inference/api/api.py
Normal file
|
@ -0,0 +1,71 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import List, Optional, Protocol
|
||||
|
||||
from llama_models.schema_utils import json_schema_type, webmethod
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
|
||||
|
||||
@json_schema_type
class BatchCompletionRequest(BaseModel):
    """Request body for a batched completion call."""

    # Identifier of the model to run.
    model: str
    # One content item per completion to generate.
    content_batch: List[InterleavedTextMedia]
    # Sampling settings; defaults to a default-constructed SamplingParams.
    sampling_params: Optional[SamplingParams] = SamplingParams()
    # Optional log-probability configuration; None when not provided.
    logprobs: Optional[LogProbConfig] = None
|
||||
|
||||
|
||||
@json_schema_type
class BatchCompletionResponse(BaseModel):
    """Response body for a batched completion call."""

    # Completion messages for the batch (presumably one per input item — confirm with implementations).
    completion_message_batch: List[CompletionMessage]
|
||||
|
||||
|
||||
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
    """Request body for a batched chat-completion call."""

    # Identifier of the model to run.
    model: str
    # One message list (conversation) per chat completion to generate.
    messages_batch: List[List[Message]]
    # Sampling settings; defaults to a default-constructed SamplingParams.
    sampling_params: Optional[SamplingParams] = SamplingParams()

    # zero-shot tool definitions as input to the model
    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
    # Tool selection mode; defaults to ToolChoice.auto.
    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
    # Format used to present tools in the prompt; defaults to ToolPromptFormat.json.
    tool_prompt_format: Optional[ToolPromptFormat] = Field(
        default=ToolPromptFormat.json
    )
    # Optional log-probability configuration; None when not provided.
    logprobs: Optional[LogProbConfig] = None
|
||||
|
||||
|
||||
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
    """Response body for a batched chat-completion call."""

    # Completion messages for the batch (presumably one per conversation — confirm with implementations).
    completion_message_batch: List[CompletionMessage]
|
||||
|
||||
|
||||
class BatchInference(Protocol):
    """Protocol describing the batch inference API.

    Both methods are declared with ``webmethod`` routes and have no body
    here; concrete providers supply the implementation.
    """

    @webmethod(route="/batch_inference/completion")
    async def batch_completion(
        self,
        model: str,
        content_batch: List[InterleavedTextMedia],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        logprobs: Optional[LogProbConfig] = None,
    ) -> BatchCompletionResponse:
        """Generate completions for every item of ``content_batch``."""
        ...

    @webmethod(route="/batch_inference/chat_completion")
    async def batch_chat_completion(
        self,
        model: str,
        messages_batch: List[List[Message]],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        # zero-shot tool definitions as input to the model
        # Default is None ("no tools"). A bare `list` here would make the
        # default the builtin type object itself, not an empty list.
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        logprobs: Optional[LogProbConfig] = None,
    ) -> BatchChatCompletionResponse:
        """Generate chat completions for every conversation of ``messages_batch``."""
        ...
|
5
llama_stack/cli/__init__.py
Normal file
5
llama_stack/cli/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
339
llama_stack/cli/download.py
Normal file
339
llama_stack/cli/download.py
Normal file
|
@ -0,0 +1,339 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
from datetime import datetime
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
import httpx
|
||||
from pydantic import BaseModel
|
||||
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
|
||||
|
||||
class Download(Subcommand):
    """Llama cli for downloading llama toolchain assets"""

    def __init__(self, subparsers: argparse._SubParsersAction):
        """Register the ``download`` sub-command and its arguments."""
        super().__init__()
        parser_options = dict(
            prog="llama download",
            description="Download a model from llama.meta.com or Hugging Face Hub",
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self.parser = subparsers.add_parser("download", **parser_options)
        setup_download_parser(self.parser)
|
||||
|
||||
|
||||
def setup_download_parser(parser: argparse.ArgumentParser) -> None:
    """Add the download command's arguments and handler to ``parser``.

    Registers ``--source`` (required), ``--model-id``, ``--hf-token``,
    ``--meta-url``, ``--ignore-patterns`` and ``--manifest-file``, then sets
    the parser's ``func`` default to ``run_download_cmd`` bound to this
    parser so the handler can report errors through it.

    The model registry is deliberately not loaded here: nothing in the
    argument setup uses it, and this runs on every CLI invocation.
    """
    parser.add_argument(
        "--source",
        choices=["meta", "huggingface"],
        required=True,
    )
    parser.add_argument(
        "--model-id",
        required=False,
        help="See `llama model list` or `llama model list --show-all` for the list of available models",
    )
    parser.add_argument(
        "--hf-token",
        type=str,
        required=False,
        default=None,
        help="Hugging Face API token. Needed for gated models like llama2/3. Will also try to read environment variable `HF_TOKEN` as default.",
    )
    parser.add_argument(
        "--meta-url",
        type=str,
        required=False,
        help="For source=meta, URL obtained from llama.meta.com after accepting license terms",
    )
    parser.add_argument(
        "--ignore-patterns",
        type=str,
        required=False,
        default="*.safetensors",
        help="""
For source=huggingface, files matching any of the patterns are not downloaded. Defaults to ignoring
safetensors files to avoid downloading duplicate weights.
""",
    )
    parser.add_argument(
        "--manifest-file",
        type=str,
        help="For source=meta, you can download models from a manifest file containing a file => URL mapping",
        required=False,
    )
    parser.set_defaults(func=partial(run_download_cmd, parser=parser))
|
||||
|
||||
|
||||
def _hf_download(
    model: "Model",
    hf_token: str,
    ignore_patterns: str,
    parser: argparse.ArgumentParser,
):
    """Download a model snapshot from the Hugging Face Hub.

    The snapshot of ``model.huggingface_repo`` is written to the directory
    returned by ``model_local_dir`` for the model's descriptor, skipping
    files that match ``ignore_patterns``.

    Args:
        model: model whose ``huggingface_repo`` and ``descriptor()`` are used.
        hf_token: token passed to ``snapshot_download``.
        ignore_patterns: pattern(s) of files not to download.
        parser: argument parser used to report download failures.

    Raises:
        ValueError: if the model has no Hugging Face repo id.

    Any failure of ``snapshot_download`` is reported through
    ``parser.error``.
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

    from llama_stack.common.model_utils import model_local_dir

    repo_id = model.huggingface_repo
    if repo_id is None:
        raise ValueError(f"No repo id found for model {model.descriptor()}")

    output_dir = model_local_dir(model.descriptor())
    os.makedirs(output_dir, exist_ok=True)
    try:
        true_output_dir = snapshot_download(
            repo_id,
            local_dir=output_dir,
            ignore_patterns=ignore_patterns,
            token=hf_token,
            library_name="llama-toolchain",
        )
    except GatedRepoError:
        parser.error(
            "It looks like you are trying to access a gated repository. Please ensure you "
            "have access to the repository and have provided the proper Hugging Face API token "
            "using the option `--hf-token` or by running `huggingface-cli login`. "
            "You can find your token by visiting https://huggingface.co/settings/tokens"
        )
    except RepositoryNotFoundError:
        # Report the repo id resolved above; no `args` object exists in this
        # function, so referencing one here would raise NameError.
        parser.error(f"Repository '{repo_id}' not found on the Hugging Face Hub.")
    except Exception as e:
        parser.error(str(e))

    print(f"\nSuccessfully downloaded model to {true_output_dir}")
|
||||
|
||||
|
||||
def _meta_download(model: "Model", meta_url: str):
    """Download every file of ``model`` using a signed URL.

    The ``*`` placeholder in ``meta_url`` is replaced by
    ``<folder>/<file>`` for each file listed by ``llama_meta_net_info``.
    Files are fetched one after another with ``ResumableDownloader``.
    """
    from llama_models.sku_list import llama_meta_net_info

    from llama_stack.common.model_utils import model_local_dir

    target_dir = Path(model_local_dir(model.descriptor()))
    os.makedirs(target_dir, exist_ok=True)

    net_info = llama_meta_net_info(model)

    # I believe we can use some concurrency here if needed but not sure it is worth it
    for filename in net_info.files:
        destination = str(target_dir / filename)
        file_url = meta_url.replace("*", f"{net_info.folder}/{filename}")

        # Only "consolidated" files get a size up front; 0 makes the
        # downloader look the size up itself.
        if "consolidated" in filename:
            expected_size = net_info.pth_size
        else:
            expected_size = 0

        cprint(f"Downloading `{filename}`...", "white")
        asyncio.run(
            ResumableDownloader(file_url, destination, expected_size).download()
        )

    print(f"\nSuccessfully downloaded model to {target_dir}")
    cprint(f"\nMD5 Checksums are at: {target_dir / 'checklist.chk'}", "white")
|
||||
|
||||
|
||||
def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
    """Entry point of the download command.

    A manifest file, when given, takes precedence over everything else.
    Otherwise the model id is resolved and the download is routed to
    Hugging Face or to the signed-URL path depending on ``args.source``.
    For the signed-URL path the URL is prompted for when ``--meta-url`` was
    not supplied.
    """
    from llama_models.sku_list import resolve_model

    if args.manifest_file:
        _download_from_manifest(args.manifest_file)
        return

    if args.model_id is None:
        parser.error("Please provide a model id")
        return

    model = resolve_model(args.model_id)
    if model is None:
        parser.error(f"Model {args.model_id} not found")
        return

    if args.source == "huggingface":
        _hf_download(model, args.hf_token, args.ignore_patterns, parser)
        return

    # Signed-URL path: fall back to an interactive prompt when no URL was given.
    signed_url = args.meta_url or input(
        "Please provide the signed URL you received via email (e.g., https://llama3-1.llamameta.net/*?Policy...): "
    )
    assert signed_url is not None and "llamameta.net" in signed_url
    _meta_download(model, signed_url)
|
||||
|
||||
|
||||
class ModelEntry(BaseModel):
    """One model listed in a download manifest."""

    # Identifier of the model; used to pick the local output directory.
    model_id: str
    # Mapping of file name -> URL to download that file from.
    files: Dict[str, str]

    class Config:
        # Empty tuple: no protected namespaces (the `model_id` field starts with `model_`).
        protected_namespaces = ()
|
||||
|
||||
|
||||
class Manifest(BaseModel):
    """Download manifest: a list of models plus an expiry timestamp."""

    # Models (and their file URLs) covered by this manifest.
    models: List[ModelEntry]
    # Timestamp compared against datetime.now() before downloading.
    expires_on: datetime
|
||||
|
||||
|
||||
def _download_from_manifest(manifest_file: str):
    """Download every model listed in a JSON manifest file.

    For each entry, files go into the model's local directory. If that
    directory already has content the user is asked whether to continue
    the download or wipe the directory and restart.

    Raises:
        ValueError: if the manifest's ``expires_on`` is in the past.
    """
    from llama_stack.common.model_utils import model_local_dir

    with open(manifest_file, "r") as f:
        raw_manifest = json.load(f)
    manifest = Manifest(**raw_manifest)

    if datetime.now() > manifest.expires_on:
        raise ValueError(f"Manifest URLs have expired on {manifest.expires_on}")

    for entry in manifest.models:
        print(f"Downloading model {entry.model_id}...")
        target_dir = Path(model_local_dir(entry.model_id))
        os.makedirs(target_dir, exist_ok=True)

        if any(target_dir.iterdir()):
            cprint(f"Output directory {target_dir} is not empty.", "red")

            # Keep asking until the answer is recognised.
            while True:
                answer = input(
                    "Do you want to (C)ontinue download or (R)estart completely? (continue/restart): "
                ).lower()
                if answer in ("restart", "r"):
                    shutil.rmtree(target_dir)
                    os.makedirs(target_dir, exist_ok=True)
                    break
                if answer in ("continue", "c"):
                    print("Continuing download...")
                    break
                cprint("Invalid response. Please try again.", "red")

        for fname, url in entry.files.items():
            destination = str(target_dir / fname)
            asyncio.run(ResumableDownloader(url, destination).download())
|
||||
|
||||
|
||||
class ResumableDownloader:
    """Download a URL to a file with HTTP range requests, resuming partial files.

    If ``output_file`` already exists, its size is taken as the number of
    bytes already downloaded and the transfer continues from there by
    appending to the file.
    """

    def __init__(
        self,
        url: str,
        output_file: str,
        total_size: int = 0,
        buffer_size: int = 32 * 1024,
    ):
        """
        Args:
            url: URL to download from.
            output_file: path of the local file to write (appended to).
            total_size: expected size in bytes; 0 means "look it up with a
                HEAD request".
            buffer_size: chunk size used when streaming the response body.
        """
        self.url = url
        self.output_file = output_file
        self.buffer_size = buffer_size
        self.total_size = total_size
        self.downloaded_size = 0
        self.start_size = 0
        self.start_time = 0

    async def get_file_info(self, client: httpx.AsyncClient) -> None:
        """Determine ``total_size`` via a HEAD request unless already known.

        Also updates ``self.url`` to the final URL after redirects.

        Raises:
            ValueError: if the response carries no usable Content-Length.
        """
        if self.total_size > 0:
            return

        # Force disable compression when trying to retrieve file size
        response = await client.head(
            self.url, follow_redirects=True, headers={"Accept-Encoding": "identity"}
        )
        response.raise_for_status()
        self.url = str(response.url)  # Update URL in case of redirects
        self.total_size = int(response.headers.get("Content-Length", 0))
        if self.total_size == 0:
            raise ValueError(
                "Unable to determine file size. The server might not support range requests."
            )

    async def download(self) -> None:
        """Download the remaining bytes of the file.

        Returns early if the file is already complete. If a request fails,
        the error is printed and the method returns; re-running resumes
        from the bytes already on disk.

        Raises:
            ValueError: if the file size cannot be determined or there is
                not enough free disk space for the remaining bytes.
        """
        self.start_time = time.time()
        async with httpx.AsyncClient(follow_redirects=True) as client:
            await self.get_file_info(client)

            if os.path.exists(self.output_file):
                self.downloaded_size = os.path.getsize(self.output_file)
                self.start_size = self.downloaded_size
                if self.downloaded_size >= self.total_size:
                    print(f"Already downloaded `{self.output_file}`, skipping...")
                    return

            additional_size = self.total_size - self.downloaded_size
            if not self.has_disk_space(additional_size):
                M = 1024 * 1024  # noqa
                print(
                    f"Not enough disk space to download `{self.output_file}`. "
                    f"Required: {(additional_size // M):.2f} MB"
                )
                raise ValueError(
                    f"Not enough disk space to download `{self.output_file}`"
                )

            while self.downloaded_size < self.total_size:
                # Cloudfront has a max-size limit
                max_chunk_size = 27_000_000_000
                request_size = min(
                    self.total_size - self.downloaded_size, max_chunk_size
                )
                # HTTP byte ranges are inclusive on both ends, so the last
                # requested byte is start + size - 1.
                range_end = self.downloaded_size + request_size - 1
                headers = {"Range": f"bytes={self.downloaded_size}-{range_end}"}
                print(f"Downloading `{self.output_file}`....{headers}")
                try:
                    async with client.stream(
                        "GET", self.url, headers=headers
                    ) as response:
                        response.raise_for_status()
                        with open(self.output_file, "ab") as file:
                            async for chunk in response.aiter_bytes(self.buffer_size):
                                file.write(chunk)
                                self.downloaded_size += len(chunk)
                                self.print_progress()
                except httpx.HTTPError as e:
                    print(f"\nDownload interrupted: {e}")
                    print("You can resume the download by running the script again.")
                    # Stop here: looping again would re-issue the same
                    # failing request forever with no delay.
                    return
                except Exception as e:
                    print(f"\nAn error occurred: {e}")
                    return

            print(f"\nFinished downloading `{self.output_file}`....")

    def print_progress(self) -> None:
        """Redraw the single-line progress bar with percentage and speed."""
        percent = (self.downloaded_size / self.total_size) * 100
        bar_length = 50
        filled_length = int(bar_length * self.downloaded_size // self.total_size)
        bar = "█" * filled_length + "-" * (bar_length - filled_length)

        elapsed_time = time.time() - self.start_time
        M = 1024 * 1024  # noqa

        # Speed counts only bytes fetched in this run, not bytes that were
        # already on disk when the download started.
        speed = (
            (self.downloaded_size - self.start_size) / (elapsed_time * M)
            if elapsed_time > 0
            else 0
        )
        print(
            f"\rProgress: |{bar}| {percent:.2f}% "
            f"({self.downloaded_size // M}/{self.total_size // M} MB) "
            f"Speed: {speed:.2f} MiB/s",
            end="",
            flush=True,
        )

    def has_disk_space(self, file_size: int) -> bool:
        """Return True if the output file's directory has more than ``file_size`` bytes free."""
        dir_path = os.path.dirname(os.path.abspath(self.output_file))
        free_space = shutil.disk_usage(dir_path).free
        return free_space > file_size
|
58
llama_stack/cli/llama.py
Normal file
58
llama_stack/cli/llama.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
|
||||
from .download import Download
|
||||
from .model import ModelParser
|
||||
from .stack import StackParser
|
||||
|
||||
|
||||
class LlamaCLIParser:
    """Defines CLI parser for Llama CLI"""

    def __init__(self):
        """Build the root ``llama`` parser and attach every sub-command."""
        self.parser = argparse.ArgumentParser(
            prog="llama",
            description="Welcome to the Llama CLI",
            add_help=True,
        )

        # Default command is to print help
        self.parser.set_defaults(func=lambda args: self.parser.print_help())

        subcommand_parsers = self.parser.add_subparsers(title="subcommands")

        # Add sub-commands
        for subcommand in (Download, ModelParser, StackParser):
            subcommand.create(subcommand_parsers)

        # Import sub-commands from agentic_system if they exist
        try:
            from llama_agentic_system.cli.subcommand_modules import SUBCOMMAND_MODULES

            for extra_subcommand in SUBCOMMAND_MODULES:
                extra_subcommand.create(subcommand_parsers)

        except ImportError:
            pass

    def parse_args(self) -> argparse.Namespace:
        """Parse the process's command-line arguments."""
        return self.parser.parse_args()

    def run(self, args: argparse.Namespace) -> None:
        """Invoke the handler stored on ``args.func`` with ``args``."""
        handler = args.func
        handler(args)
|
||||
|
||||
|
||||
def main():
    """Parse the command line and run the selected sub-command."""
    cli = LlamaCLIParser()
    parsed_args = cli.parse_args()
    cli.run(parsed_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
7
llama_stack/cli/model/__init__.py
Normal file
7
llama_stack/cli/model/__init__.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .model import ModelParser # noqa
|
75
llama_stack/cli/model/describe.py
Normal file
75
llama_stack/cli/model/describe.py
Normal file
|
@ -0,0 +1,75 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from llama_models.sku_list import resolve_model
|
||||
|
||||
from termcolor import colored
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
from llama_stack.cli.table import print_table
|
||||
from llama_stack.common.serialize import EnumEncoder
|
||||
|
||||
|
||||
class ModelDescribe(Subcommand):
    """Show details about a model"""

    def __init__(self, subparsers: argparse._SubParsersAction):
        """Register the ``describe`` sub-command and its handler."""
        super().__init__()
        self.parser = subparsers.add_parser(
            "describe",
            prog="llama model describe",
            description="Show details about a llama model",
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._run_model_describe_cmd)

    def _add_arguments(self):
        """Add the required ``-m/--model-id`` argument."""
        self.parser.add_argument(
            "-m",
            "--model-id",
            type=str,
            required=True,
        )

    def _run_model_describe_cmd(self, args: argparse.Namespace) -> None:
        """Print a table describing the model named by ``args.model_id``.

        Reports an error through the parser when the model id cannot be
        resolved.
        """
        model = resolve_model(args.model_id)
        if model is None:
            self.parser.error(
                f"Model {args.model_id} not found; try 'llama model list' for a list of available models."
            )
            return

        rows = [
            (
                colored("Model", "white", attrs=["bold"]),
                colored(model.descriptor(), "white", attrs=["bold"]),
            ),
            ("HuggingFace ID", model.huggingface_repo or "<Not Available>"),
            ("Description", model.description_markdown),
            ("Context Length", f"{model.max_seq_length // 1024}K tokens"),
            ("Weights format", model.quantization_format.value),
            ("Model params.json", json.dumps(model.model_args, indent=4)),
        ]

        if model.recommended_sampling_params is not None:
            # model_dump() is the pydantic v2 spelling of the deprecated dict().
            sampling_params = model.recommended_sampling_params.model_dump()
            # These two keys are left out of the table; pop with a default so
            # a params model lacking one of them does not raise KeyError.
            for k in ("max_tokens", "repetition_penalty"):
                sampling_params.pop(k, None)
            rows.append(
                (
                    "Recommended sampling params",
                    json.dumps(sampling_params, cls=EnumEncoder, indent=4),
                )
            )

        print_table(
            rows,
            separate_rows=True,
        )
|
24
llama_stack/cli/model/download.py
Normal file
24
llama_stack/cli/model/download.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
|
||||
|
||||
class ModelDownload(Subcommand):
    """``llama model download``: same arguments and handler as ``llama download``."""

    def __init__(self, subparsers: argparse._SubParsersAction):
        """Register the ``download`` sub-command under ``llama model``."""
        super().__init__()

        # Imported here rather than at module top, as in the original.
        from llama_stack.cli.download import setup_download_parser

        parser_options = dict(
            prog="llama model download",
            description="Download a model from llama.meta.com or Hugging Face Hub",
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self.parser = subparsers.add_parser("download", **parser_options)
        setup_download_parser(self.parser)
|
60
llama_stack/cli/model/list.py
Normal file
60
llama_stack/cli/model/list.py
Normal file
|
@ -0,0 +1,60 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
|
||||
from llama_models.sku_list import all_registered_models
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
from llama_stack.cli.table import print_table
|
||||
|
||||
|
||||
class ModelList(Subcommand):
    """List available llama models"""

    def __init__(self, subparsers: argparse._SubParsersAction):
        """Register the ``list`` sub-command and its handler."""
        super().__init__()
        self.parser = subparsers.add_parser(
            "list",
            prog="llama model list",
            description="Show available llama models",
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._run_model_list_cmd)

    def _add_arguments(self):
        """Add the ``--show-all`` flag."""
        self.parser.add_argument(
            "--show-all",
            action="store_true",
            help="Show all models (not just defaults)",
        )

    def _run_model_list_cmd(self, args: argparse.Namespace) -> None:
        """Print a table of registered models.

        Only featured models are listed unless ``--show-all`` was given.
        """
        column_titles = [
            "Model Descriptor",
            "HuggingFace Repo",
            "Context Length",
        ]

        table_rows = [
            [
                model.descriptor(),
                model.huggingface_repo,
                f"{model.max_seq_length // 1024}K",
            ]
            for model in all_registered_models()
            if args.show_all or model.is_featured
        ]

        print_table(
            table_rows,
            column_titles,
            separate_rows=True,
        )
|
34
llama_stack/cli/model/model.py
Normal file
34
llama_stack/cli/model/model.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
|
||||
from llama_stack.cli.model.describe import ModelDescribe
|
||||
from llama_stack.cli.model.download import ModelDownload
|
||||
from llama_stack.cli.model.list import ModelList
|
||||
from llama_stack.cli.model.template import ModelTemplate
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
|
||||
|
||||
class ModelParser(Subcommand):
    """Llama cli for model interface apis"""

    def __init__(self, subparsers: argparse._SubParsersAction):
        """Register ``llama model`` and its nested sub-commands."""
        super().__init__()
        self.parser = subparsers.add_parser(
            "model",
            prog="llama model",
            description="Work with llama models",
        )

        model_subparsers = self.parser.add_subparsers(title="model_subcommands")

        # Add sub-commands
        for subcommand in (ModelDownload, ModelList, ModelTemplate, ModelDescribe):
            subcommand.create(model_subparsers)
|
113
llama_stack/cli/model/template.py
Normal file
113
llama_stack/cli/model/template.py
Normal file
|
@ -0,0 +1,113 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
import textwrap
|
||||
|
||||
from termcolor import colored
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
|
||||
|
||||
class ModelTemplate(Subcommand):
    """`llama model template`: list the known prompt templates or render one of them."""

    def __init__(self, subparsers: argparse._SubParsersAction):
        super().__init__()
        self.parser = subparsers.add_parser(
            "template",
            prog="llama model template",
            description="Show llama model message formats",
            epilog=textwrap.dedent(
                """
                Example:
                    llama model template <options>
                """
            ),
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._run_model_template_cmd)

    def _prompt_type(self, value):
        """Convert a user-supplied string into a `ToolPromptFormat`.

        Raises:
            argparse.ArgumentTypeError: if `value` (case-insensitive) is not a
                valid `ToolPromptFormat` value.
        """
        from llama_models.llama3.api.datatypes import ToolPromptFormat

        try:
            return ToolPromptFormat(value.lower())
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"{value} is not a valid ToolPromptFormat. Choose from {', '.join(t.value for t in ToolPromptFormat)}"
            ) from None

    def _add_arguments(self):
        """Register the command-line options of `llama model template`."""
        # NOTE: --model-family is accepted but not read by _run_model_template_cmd.
        self.parser.add_argument(
            "-m",
            "--model-family",
            type=str,
            default="llama3_1",
            help="Model Family (llama3_1, llama3_X, etc.)",
        )
        self.parser.add_argument(
            "--name",
            type=str,
            help="Usecase template name (system_message, user_message, assistant_message, tool_message)...",
            required=False,
        )
        self.parser.add_argument(
            "--format",
            type=str,
            help="ToolPromptFormat (json or function_tag). This flag is used to print the template in a specific formats.",
            required=False,
            default="json",
        )
        self.parser.add_argument(
            "--raw",
            action="store_true",
            help="If set to true, don't pretty-print into a table. Useful to copy-paste.",
        )

    def _run_model_template_cmd(self, args: argparse.Namespace) -> None:
        """List all templates, or render the one selected with `--name`.

        Without `--name`, a table of (role, template name) is printed. With
        `--name`, the template is rendered in the requested `--format`; special
        tokens are highlighted, and the result is shown either as a table or,
        with `--raw`, as plain text.
        """
        from llama_models.llama3.api.interface import (
            list_jinja_templates,
            render_jinja_template,
        )

        from llama_stack.cli.table import print_table

        if not args.name:
            templates = list_jinja_templates()
            headers = ["Role", "Template Name"]
            print_table(
                [(t.role, t.template_name) for t in templates],
                headers,
            )
            return

        try:
            tool_prompt_format = self._prompt_type(args.format)
        except argparse.ArgumentTypeError as e:
            # The conversion runs after argparse has finished parsing, so turn the
            # failure into a regular usage error instead of an uncaught traceback.
            self.parser.error(str(e))
            return

        template, tokens_info = render_jinja_template(args.name, tool_prompt_format)
        rendered = "".join(
            colored(tok, "yellow", attrs=["bold"]) if is_special else tok
            for tok, is_special in tokens_info
        )

        if args.raw:
            print("Template: ", template.template_name)
            print("=" * 40)
            print(rendered)
            return

        # Make line breaks visible inside the table cell.
        rendered = rendered.replace("\n", "↵\n")
        print_table(
            [
                (
                    "Name",
                    colored(template.template_name, "white", attrs=["bold"]),
                ),
                ("Template", rendered),
                ("Notes", template.notes),
            ],
            separate_rows=True,
        )
|
5
llama_stack/cli/scripts/__init__.py
Normal file
5
llama_stack/cli/scripts/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
38
llama_stack/cli/scripts/install-wheel-from-presigned.sh
Executable file
38
llama_stack/cli/scripts/install-wheel-from-presigned.sh
Executable file
|
@ -0,0 +1,38 @@
|
|||
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Download a wheel from a presigned URL and install it with pip.
# Usage: install-wheel-from-presigned.sh <url>
# The wheel's file name is taken from the x-manifold-obj-canonicalpath response header.

set -euo pipefail

if [ $# -eq 0 ]; then
  echo "Please provide a URL as an argument."
  exit 1
fi

URL=$1

HEADERS_FILE=$(mktemp)
# Remove the temporary headers file on every exit path, not only on success.
trap 'rm -f "$HEADERS_FILE"' EXIT

curl -s -I "$URL" >"$HEADERS_FILE"
# `|| true`: with `set -e -o pipefail`, a grep that finds no header would abort the
# script right here, before the explanatory message below could be printed.
FILENAME=$(grep -i "x-manifold-obj-canonicalpath:" "$HEADERS_FILE" | sed -E 's/.*nodes\/[^\/]+\/(.+)/\1/' | tr -d "\r\n" || true)

if [ -z "$FILENAME" ]; then
  echo "Could not find the x-manifold-obj-canonicalpath header."
  echo "HEADERS_FILE contents: "
  cat "$HEADERS_FILE"
  echo ""
  exit 1
fi

echo "Downloading $FILENAME..."

# -f: fail on HTTP errors instead of saving an error page as the wheel;
# -S: still print the reason even though -s silences progress output.
curl -fsSL -o "$FILENAME" "$URL"

echo "Installing $FILENAME..."
pip install "$FILENAME"
echo "Successfully installed $FILENAME"

rm -f "$FILENAME"
|
18
llama_stack/cli/scripts/run.py
Normal file
18
llama_stack/cli/scripts/run.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def install_wheel_from_presigned():
    """Run the bundled `install-wheel-from-presigned.sh` with the CLI arguments.

    The script sits next to this module and receives `sys.argv[1:]` unchanged.
    It is a bash script (bash shebang, `set -o pipefail`), so it is started with
    `bash`; a plain POSIX `sh` such as dash rejects `pipefail`.

    Exits the interpreter with status 1 if the script fails or cannot be started.
    """
    file = "install-wheel-from-presigned.sh"
    script_path = os.path.join(os.path.dirname(__file__), file)
    try:
        subprocess.run(["bash", script_path, *sys.argv[1:]], check=True)
    except (subprocess.CalledProcessError, OSError) as exc:
        # Report why the installation failed instead of exiting silently.
        print(f"Failed to run {file}: {exc}", file=sys.stderr)
        sys.exit(1)
|
7
llama_stack/cli/stack/__init__.py
Normal file
7
llama_stack/cli/stack/__init__.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .stack import StackParser # noqa
|
94
llama_stack/cli/stack/build.py
Normal file
94
llama_stack/cli/stack/build.py
Normal file
|
@ -0,0 +1,94 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
from llama_stack.core.datatypes import * # noqa: F403
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
class StackBuild(Subcommand):
    """`llama stack build`: build a Llama Stack image from a build-config YAML file."""

    def __init__(self, subparsers: argparse._SubParsersAction):
        super().__init__()
        self.parser = subparsers.add_parser(
            "build",
            prog="llama stack build",
            description="Build a Llama stack container",
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._run_stack_build_command)

    def _add_arguments(self):
        """Register the positional config path and the optional `--name` override."""
        self.parser.add_argument(
            "config",
            type=str,
            help="Path to a config file to use for the build. You may find example configs in llama_stack/configs/distributions",
        )

        self.parser.add_argument(
            "--name",
            type=str,
            help="Name of the llama stack build to override from template config",
        )

    def _run_stack_build_command_from_build_config(
        self, build_config: BuildConfig
    ) -> None:
        """Persist `build_config` as `<name>-build.yaml` and build the image from it."""
        import json
        import os

        from llama_stack.common.config_dirs import DISTRIBS_BASE_DIR
        from llama_stack.common.serialize import EnumEncoder
        from llama_stack.core.package import build_image, ImageType
        from termcolor import cprint

        # save build.yaml spec for building same distribution again
        if build_config.image_type == ImageType.docker.value:
            # docker needs build file to be in the llama-stack repo dir to be able to copy over to the image
            llama_stack_path = Path(os.path.relpath(__file__)).parent.parent.parent
            build_dir = (
                llama_stack_path / "configs/distributions" / build_config.image_type
            )
        else:
            build_dir = DISTRIBS_BASE_DIR / build_config.image_type

        os.makedirs(build_dir, exist_ok=True)
        build_file_path = build_dir / f"{build_config.name}-build.yaml"

        with open(build_file_path, "w") as f:
            # Round-trip through JSON so enum members are written as plain values.
            to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder))
            f.write(yaml.dump(to_write, sort_keys=False))

        build_image(build_config, build_file_path)

        cprint(
            f"Build spec configuration saved at {str(build_file_path)}",
            color="green",
        )

    def _run_stack_build_command(self, args: argparse.Namespace) -> None:
        """Load the build config given on the command line and run the build.

        A missing, unreadable or invalid config file is reported through
        `parser.error` (usage message, exit status 2) rather than a traceback.
        """
        if not args.config:
            self.parser.error(
                "No config file specified. Please use `llama stack build /path/to/*-build.yaml`. Example config files can be found in llama_stack/configs/distributions"
            )
            return

        try:
            # Opening the file is inside the try so a missing path is reported
            # the same way as a malformed file.
            with open(args.config, "r") as f:
                build_config = BuildConfig(**yaml.safe_load(f))
        except Exception as e:
            self.parser.error(f"Could not parse config file {args.config}: {e}")
            return

        if args.name:
            build_config.name = args.name
        self._run_stack_build_command_from_build_config(build_config)
|
137
llama_stack/cli/stack/configure.py
Normal file
137
llama_stack/cli/stack/configure.py
Normal file
|
@ -0,0 +1,137 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pkg_resources
|
||||
|
||||
import yaml
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
from llama_stack.common.config_dirs import BUILDS_BASE_DIR
|
||||
|
||||
from llama_stack.common.exec import run_with_pty
|
||||
from llama_stack.core.datatypes import * # noqa: F403
|
||||
import os
|
||||
|
||||
|
||||
class StackConfigure(Subcommand):
    """`llama stack configure`: generate the run config for a built distribution."""

    def __init__(self, subparsers: argparse._SubParsersAction):
        super().__init__()
        self.parser = subparsers.add_parser(
            "configure",
            prog="llama stack configure",
            description="configure a llama stack distribution",
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._run_stack_configure_cmd)

    def _add_arguments(self):
        """Register the positional build-config path and `--output-dir`."""
        self.parser.add_argument(
            "config",
            type=str,
            help="Path to the build config file (e.g. ~/.llama/builds/<image_type>/<name>-build.yaml). For docker, this could also be the name of the docker image. ",
        )

        self.parser.add_argument(
            "--output-dir",
            type=str,
            help="Path to the output directory to store generated run.yaml config file. If not specified, will use ~/.llama/builds/<image_type>/<name>-run.yaml",
        )

    def _run_stack_configure_cmd(self, args: argparse.Namespace) -> None:
        """Configure from a build-config file, or from a docker image name.

        If `args.config` is not an existing file it is treated as a docker image
        name and configuration is delegated to `core/configure_container.sh`.
        Otherwise the file is parsed as a `BuildConfig` and configured locally.
        """
        from llama_stack.core.package import ImageType

        docker_image = None
        build_config_file = Path(args.config)
        if not build_config_file.exists():
            cprint(
                f"Could not find {build_config_file}. Trying docker image name instead...",
                color="green",
            )
            docker_image = args.config

            builds_dir = BUILDS_BASE_DIR / ImageType.docker.value
            if args.output_dir:
                # Was `Path(output_dir)`: that name is undefined in this method.
                builds_dir = Path(args.output_dir)
            os.makedirs(builds_dir, exist_ok=True)

            script = pkg_resources.resource_filename(
                "llama_stack", "core/configure_container.sh"
            )
            script_args = [script, docker_image, str(builds_dir)]

            return_code = run_with_pty(script_args)

            # we have regenerated the build config file with script, now check if it exists
            if return_code != 0:
                self.parser.error(
                    f"Can not find {build_config_file}. Please run llama stack build first or check if docker image exists"
                )

            build_name = docker_image.removeprefix("llamastack-")
            cprint(
                f"YAML configuration has been written to {builds_dir / f'{build_name}-run.yaml'}",
                color="green",
            )
            return

        with open(build_config_file, "r") as f:
            build_config = BuildConfig(**yaml.safe_load(f))

        self._configure_llama_distribution(build_config, args.output_dir)

    def _configure_llama_distribution(
        self,
        build_config: BuildConfig,
        output_dir: Optional[str] = None,
    ):
        """Create or update `<image_name>-run.yaml` for `build_config`.

        Args:
            build_config: the parsed build configuration.
            output_dir: directory for the run config; defaults to
                `BUILDS_BASE_DIR / build_config.image_type`.

        An existing run config is loaded and used as the starting point, then
        overwritten with the result of `configure_api_providers`.
        """
        from llama_stack.common.serialize import EnumEncoder
        from llama_stack.core.configure import configure_api_providers

        builds_dir = BUILDS_BASE_DIR / build_config.image_type
        if output_dir:
            builds_dir = Path(output_dir)
        os.makedirs(builds_dir, exist_ok=True)
        image_name = build_config.name.replace("::", "-")
        run_config_file = builds_dir / f"{image_name}-run.yaml"

        if run_config_file.exists():
            cprint(
                f"Configuration already exists for {build_config.name}. Will overwrite...",
                "yellow",
                attrs=["bold"],
            )
            config = StackRunConfig(**yaml.safe_load(run_config_file.read_text()))
        else:
            config = StackRunConfig(
                built_at=datetime.now(),
                image_name=image_name,
                apis_to_serve=[],
                provider_map={},
            )

        config = configure_api_providers(config, build_config.distribution_spec)

        # Exactly one of docker_image / conda_env is set, depending on the image type.
        config.docker_image = (
            image_name if build_config.image_type == "docker" else None
        )
        config.conda_env = image_name if build_config.image_type == "conda" else None

        with open(run_config_file, "w") as f:
            # Round-trip through JSON so enum members are written as plain values.
            to_write = json.loads(json.dumps(config.dict(), cls=EnumEncoder))
            f.write(yaml.dump(to_write, sort_keys=False))

        cprint(
            f"> YAML configuration has been written to {run_config_file}",
            color="blue",
        )
|
47
llama_stack/cli/stack/list_apis.py
Normal file
47
llama_stack/cli/stack/list_apis.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
|
||||
|
||||
class StackListApis(Subcommand):
    """`llama stack list-apis`: print the APIs returned by `stack_apis()`."""

    def __init__(self, subparsers: argparse._SubParsersAction):
        super().__init__()
        self.parser = subparsers.add_parser(
            "list-apis",
            prog="llama stack list-apis",
            description="List APIs part of the Llama Stack implementation",
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._run_apis_list_cmd)

    def _add_arguments(self):
        """This command takes no arguments."""

    def _run_apis_list_cmd(self, args: argparse.Namespace) -> None:
        """Print a one-column table with the value of every stack API."""
        from llama_stack.cli.table import print_table
        from llama_stack.core.distribution import stack_apis

        # eventually, this should query a registry at llama.meta.com/llamastack/distributions
        column_titles = ["API"]
        table_rows = [[api.value] for api in stack_apis()]
        print_table(table_rows, column_titles, separate_rows=True)
|
60
llama_stack/cli/stack/list_providers.py
Normal file
60
llama_stack/cli/stack/list_providers.py
Normal file
|
@ -0,0 +1,60 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
|
||||
|
||||
class StackListProviders(Subcommand):
    """`llama stack list-providers <api>`: show the providers registered for an API."""

    def __init__(self, subparsers: argparse._SubParsersAction):
        super().__init__()
        self.parser = subparsers.add_parser(
            "list-providers",
            prog="llama stack list-providers",
            description="Show available Llama Stack Providers for an API",
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._run_providers_list_cmd)

    def _add_arguments(self):
        """Register the positional `api` argument, restricted to the known API values."""
        from llama_stack.core.distribution import stack_apis

        api_values = [a.value for a in stack_apis()]
        self.parser.add_argument(
            "api",
            type=str,
            choices=api_values,
            help=f"API to list providers for (one of: {api_values})",
        )

    def _run_providers_list_cmd(self, args: argparse.Namespace) -> None:
        """Print each provider of the selected API with its pip dependencies."""
        from llama_stack.cli.table import print_table
        from llama_stack.core.distribution import Api, api_providers

        provider_specs = api_providers()[Api(args.api)]

        # eventually, this should query a registry at llama.meta.com/llamastack/distributions
        column_titles = ["Provider Type", "PIP Package Dependencies"]
        table_rows = [
            [spec.provider_id, ",".join(spec.pip_packages)]
            for spec in provider_specs.values()
        ]
        print_table(table_rows, column_titles, separate_rows=True)
|
88
llama_stack/cli/stack/run.py
Normal file
88
llama_stack/cli/stack/run.py
Normal file
|
@ -0,0 +1,88 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pkg_resources
|
||||
import yaml
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
from llama_stack.core.datatypes import * # noqa: F403
|
||||
|
||||
|
||||
class StackRun(Subcommand):
    """`llama stack run`: start the server described by a run-config YAML file."""

    def __init__(self, subparsers: argparse._SubParsersAction):
        super().__init__()
        self.parser = subparsers.add_parser(
            "run",
            prog="llama stack run",
            description="""start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.""",
            formatter_class=argparse.RawTextHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._run_stack_run_cmd)

    def _add_arguments(self):
        """Register the config path plus the `--port` and `--disable-ipv6` options."""
        self.parser.add_argument(
            "config",
            type=str,
            help="Path to config file to use for the run",
        )
        self.parser.add_argument(
            "--port",
            type=int,
            help="Port to run the server on. Defaults to 5000",
            default=5000,
        )
        self.parser.add_argument(
            "--disable-ipv6",
            action="store_true",
            help="Disable IPv6 support",
            default=False,
        )

    def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
        """Load the run config and launch the matching start script in a pty.

        A config with `docker_image` set uses `core/start_container.sh`;
        otherwise `core/start_conda_env.sh` is run with the config's `conda_env`.
        """
        from llama_stack.common.exec import run_with_pty

        if not args.config:
            self.parser.error("Must specify a config file to run")
            return

        config_file = Path(args.config)
        if not config_file.exists():
            self.parser.error(
                f"File {str(config_file)} does not exist. Did you run `llama stack build`?"
            )
            return

        with open(config_file, "r") as f:
            config = StackRunConfig(**yaml.safe_load(f))

        # Pick the launcher script and its first argument from the image kind.
        if config.docker_image:
            script_name, target = "core/start_container.sh", config.docker_image
        else:
            script_name, target = "core/start_conda_env.sh", config.conda_env

        launcher = pkg_resources.resource_filename("llama_stack", script_name)
        run_args = [launcher, target, str(config_file), str(args.port)]
        if args.disable_ipv6:
            run_args.append("--disable-ipv6")

        run_with_pty(run_args)
|
34
llama_stack/cli/stack/stack.py
Normal file
34
llama_stack/cli/stack/stack.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import argparse
|
||||
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
|
||||
from .build import StackBuild
|
||||
from .configure import StackConfigure
|
||||
from .list_apis import StackListApis
|
||||
from .list_providers import StackListProviders
|
||||
from .run import StackRun
|
||||
|
||||
|
||||
class StackParser(Subcommand):
    """Entry point for `llama stack`; groups the stack/distribution sub-commands."""

    def __init__(self, subparsers: argparse._SubParsersAction):
        super().__init__()
        self.parser = subparsers.add_parser(
            "stack",
            prog="llama stack",
            description="Operations for the Llama Stack / Distributions",
        )

        # Each sub-command registers its own parser on this nested group.
        stack_subparsers = self.parser.add_subparsers(title="stack_subcommands")

        # Add sub-commands
        for command_cls in (
            StackBuild,
            StackConfigure,
            StackListApis,
            StackListProviders,
            StackRun,
        ):
            command_cls.create(stack_subparsers)
|
19
llama_stack/cli/subcommand.py
Normal file
19
llama_stack/cli/subcommand.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
|
||||
class Subcommand:
    """Base class that every llama CLI subcommand derives from."""

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; subclasses do the real setup."""

    @classmethod
    def create(cls, *args, **kwargs):
        """Build an instance of `cls`, forwarding all arguments to its constructor."""
        instance = cls(*args, **kwargs)
        return instance

    def _add_arguments(self):
        """Hook for subclasses to register argparse arguments; no-op by default."""
|
77
llama_stack/cli/table.py
Normal file
77
llama_stack/cli/table.py
Normal file
|
@ -0,0 +1,77 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import re
|
||||
import textwrap
|
||||
|
||||
from termcolor import cprint
|
||||
|
||||
|
||||
def strip_ansi_colors(text):
    """Return `text` with ANSI escape sequences (ESC-introduced codes) removed."""
    # ESC followed by either a single-character code or a "[ ... final byte" sequence.
    return re.sub(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])", "", text)
|
||||
|
||||
|
||||
def format_row(row, col_widths):
    """Render one table row as `| cell | cell |` text lines.

    Each cell is wrapped to its column width (existing newlines are kept, long
    words are not broken). The row becomes as tall as its tallest cell and
    shorter cells are padded with blanks. Padding is computed on the visible
    text, i.e. after stripping ANSI color codes.
    """

    def wrap(text, width):
        pieces = []
        for raw_line in text.split("\n"):
            if not raw_line.strip():
                # Keep blank lines as empty entries instead of dropping them.
                pieces.append("")
                continue
            pieces += textwrap.wrap(
                raw_line, width, break_long_words=False, replace_whitespace=False
            )
        return pieces

    cells = [wrap(item, width) for item, width in zip(row, col_widths)]
    height = max(len(cell) for cell in cells)

    rendered = []
    for idx in range(height):
        padded = []
        for cell, width in zip(cells, col_widths):
            value = cell[idx] if idx < len(cell) else ""
            visible_len = len(strip_ansi_colors(value))
            padded.append(value + " " * (width - visible_len))
        rendered.append("| " + " | ".join(padded) + " |")

    return "\n".join(rendered)
|
||||
|
||||
|
||||
def print_table(rows, headers=None, separate_rows: bool = False):
    """Print `rows` as an ASCII table to stdout.

    Args:
        rows: iterable of rows; each row is a sequence of strings. Falsy cells
            (e.g. None) are shown as empty strings.
        headers: optional column titles, printed in bold above the rows.
        separate_rows: if True, draw a separator line after every row;
            otherwise only one closing line is drawn after the last row.

    Column widths follow the widest visible line of any cell (ANSI color codes
    are ignored) and are capped at 80 characters. A table with headers but no
    rows is rendered with widths taken from the headers alone.
    """

    def itemlen(item):
        return max([len(line) for line in strip_ansi_colors(item).split("\n")])

    rows = [[x or "" for x in row] for row in rows]
    if not headers:
        col_widths = [max(itemlen(item) for item in col) for col in zip(*rows)]
    else:
        # zip(*rows) yields no columns for an empty table, which used to leave
        # col_widths empty and crash format_row; fall back to empty columns.
        columns = list(zip(*rows)) or [()] * len(headers)
        col_widths = [
            max([itemlen(header), *(itemlen(item) for item in col)])
            for header, col in zip(headers, columns)
        ]
    col_widths = [min(w, 80) for w in col_widths]

    header_line = "+".join("-" * (width + 2) for width in col_widths)
    header_line = f"+{header_line}+"

    if headers:
        print(header_line)
        cprint(format_row(headers, col_widths), "white", attrs=["bold"])

    print(header_line)
    for row in rows:
        print(format_row(row, col_widths))
        if separate_rows:
            print(header_line)

    if not separate_rows:
        print(header_line)
|
5
llama_stack/common/__init__.py
Normal file
5
llama_stack/common/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
17
llama_stack/common/config_dirs.py
Normal file
17
llama_stack/common/config_dirs.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Root directory for llama stack state: `~/.llama/` with `~` expanded.
LLAMA_STACK_CONFIG_DIR = Path(os.path.expanduser("~/.llama/"))

# Subdirectory "distributions" of the config dir.
DISTRIBS_BASE_DIR = LLAMA_STACK_CONFIG_DIR / "distributions"

# Subdirectory "checkpoints" of the config dir.
DEFAULT_CHECKPOINT_DIR = LLAMA_STACK_CONFIG_DIR / "checkpoints"

# Subdirectory "builds" of the config dir.
BUILDS_BASE_DIR = LLAMA_STACK_CONFIG_DIR / "builds"
|
31
llama_stack/common/deployment_types.py
Normal file
31
llama_stack/common/deployment_types.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from llama_models.llama3.api.datatypes import URL
|
||||
|
||||
from llama_models.schema_utils import json_schema_type
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# HTTP methods that a REST API execution config may use.
# Each member's value is the method name as an upper-case string.
@json_schema_type
class RestAPIMethod(Enum):
    GET = "GET"
    POST = "POST"
    PUT = "PUT"
    DELETE = "DELETE"
|
||||
|
||||
|
||||
# Description of a REST call: target URL, HTTP method, and optional
# params / headers / body dictionaries (each defaults to None).
# NOTE(review): presumably `params` are query parameters — confirm against callers.
@json_schema_type
class RestAPIExecutionConfig(BaseModel):
    url: URL
    method: RestAPIMethod
    params: Optional[Dict[str, Any]] = None
    headers: Optional[Dict[str, Any]] = None
    body: Optional[Dict[str, Any]] = None
|
105
llama_stack/common/exec.py
Normal file
105
llama_stack/common/exec.py
Normal file
|
@ -0,0 +1,105 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import errno
|
||||
import os
|
||||
import pty
|
||||
import select
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import termios
|
||||
|
||||
from termcolor import cprint
|
||||
|
||||
|
||||
# run a command in a pseudo-terminal, with interrupt handling,
# useful when you want to run interactive things
def run_with_pty(command):
    """Run `command` on a pseudo-terminal, relaying I/O to and from this terminal.

    While the child runs, echo and canonical mode are switched off on stdin and
    SIGINT is intercepted so that Ctrl-C stops the relay loop. The terminal
    settings and the previous SIGINT handler are always restored afterwards, and
    a child that is still running is terminated.

    Args:
        command: argument list passed to `subprocess.Popen`.

    Returns:
        The child's return code, or 1 if the child was never started because
        set-up was interrupted.

    Raises:
        OSError: any OS error other than EIO, including failure to start `command`.
    """
    master, slave = pty.openpty()

    old_settings = termios.tcgetattr(sys.stdin)
    original_sigint = signal.getsignal(signal.SIGINT)

    ctrl_c_pressed = False
    # Stays None if Popen never succeeds; the clean-up below must cope with that.
    process = None

    def sigint_handler(signum, frame):
        nonlocal ctrl_c_pressed
        ctrl_c_pressed = True
        cprint("\nCtrl-C detected. Aborting...", "white", attrs=["bold"])

    try:
        # Set up the signal handler
        signal.signal(signal.SIGINT, sigint_handler)

        new_settings = termios.tcgetattr(sys.stdin)
        new_settings[3] = new_settings[3] & ~termios.ECHO  # Disable echo
        new_settings[3] = new_settings[3] & ~termios.ICANON  # Disable canonical mode
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, new_settings)

        process = subprocess.Popen(
            command,
            stdin=slave,
            stdout=slave,
            stderr=slave,
            universal_newlines=True,
            preexec_fn=os.setsid,
        )

        # Close the slave file descriptor as it's now owned by the subprocess
        os.close(slave)
        slave = None

        def handle_io():
            while not ctrl_c_pressed:
                try:
                    rlist, _, _ = select.select([sys.stdin, master], [], [], 0.1)

                    if sys.stdin in rlist:
                        data = os.read(sys.stdin.fileno(), 1024)
                        if not data:
                            break
                        os.write(master, data)

                    if master in rlist:
                        data = os.read(master, 1024)
                        if not data:
                            break
                        sys.stdout.buffer.write(data)
                        sys.stdout.flush()

                except KeyboardInterrupt:
                    # This will be raised when Ctrl+C is pressed
                    break

                if process.poll() is not None:
                    break

        handle_io()
    except (EOFError, KeyboardInterrupt):
        pass
    except OSError as e:
        # EIO is tolerated; every other OS error is propagated to the caller.
        if e.errno != errno.EIO:
            raise
    finally:
        # Clean up
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        signal.signal(signal.SIGINT, original_sigint)

        os.close(master)
        if slave is not None:
            # Popen never took over the slave end; close it here to avoid a leak.
            os.close(slave)
        if process is not None and process.poll() is None:
            process.terminate()
            process.wait()

    if process is None:
        return 1
    return process.returncode
|
||||
|
||||
|
||||
def run_command(command):
    """Run `command`, capture its output and return stdout decoded as UTF-8.

    If the command exits with a non-zero status, its stderr is printed with an
    "Error: " prefix and the interpreter exits with status 1.
    """
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout_bytes, stderr_bytes = proc.communicate()
    if proc.returncode == 0:
        return stdout_bytes.decode("utf-8")

    print(f"Error: {stderr_bytes.decode('utf-8')}")
    sys.exit(1)
|
13
llama_stack/common/model_utils.py
Normal file
13
llama_stack/common/model_utils.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import os
|
||||
|
||||
from .config_dirs import DEFAULT_CHECKPOINT_DIR
|
||||
|
||||
|
||||
def model_local_dir(descriptor: str) -> str:
    """Return the path of `descriptor` joined onto the default checkpoint directory."""
    checkpoint_root = DEFAULT_CHECKPOINT_DIR
    return os.path.join(checkpoint_root, descriptor)
|
309
llama_stack/common/prompt_for_config.py
Normal file
309
llama_stack/common/prompt_for_config.py
Normal file
|
@ -0,0 +1,309 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import inspect
|
||||
import json
|
||||
from enum import Enum
|
||||
|
||||
from typing import Any, get_args, get_origin, List, Literal, Optional, Type, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic.fields import FieldInfo
|
||||
from pydantic_core import PydanticUndefinedType
|
||||
|
||||
from typing_extensions import Annotated
|
||||
|
||||
|
||||
def is_list_of_primitives(field_type):
    """Return True if `field_type` is a list parameterized by int, float, str or bool."""
    origin = get_origin(field_type)
    if not (origin is List or origin is list):
        return False

    # Exactly one type argument is required, and it must be a primitive type.
    type_args = get_args(field_type)
    return len(type_args) == 1 and type_args[0] in (int, float, str, bool)
|
||||
|
||||
|
||||
def is_basemodel_without_fields(typ):
    """Return True if `typ` is a pydantic `BaseModel` subclass that declares no fields.

    Anything that is not a class yields False.
    """
    # `model_fields` replaces `__fields__`, which pydantic v2 deprecates.
    return (
        inspect.isclass(typ)
        and issubclass(typ, BaseModel)
        and len(typ.model_fields) == 0
    )
|
||||
|
||||
|
||||
def can_recurse(typ):
    """Return True if `typ` is a pydantic `BaseModel` subclass with at least one field.

    Anything that is not a class yields False.
    """
    # `model_fields` replaces `__fields__`, which pydantic v2 deprecates.
    return (
        inspect.isclass(typ)
        and issubclass(typ, BaseModel)
        and len(typ.model_fields) > 0
    )
|
||||
|
||||
|
||||
def get_literal_values(field):
    """Return the allowed values if `field.annotation` is a `Literal[...]`, else None."""
    annotation = field.annotation
    return get_args(annotation) if get_origin(annotation) is Literal else None
|
||||
|
||||
|
||||
def is_optional(field_type):
    """Check if a field type is Optional, i.e. a union that includes ``None``.

    Both spellings are recognised: ``Optional[X]`` / ``Union[X, None]`` and the
    PEP 604 form ``X | None``, whose origin is ``types.UnionType`` on the
    interpreters that distinguish it from ``typing.Union``.

    Returns:
        True when ``field_type`` is a union with ``None`` among its members,
        False otherwise.
    """
    import types

    origin = get_origin(field_type)
    # ``types.UnionType`` does not exist before Python 3.10; fall back to Union
    # so the comparison below stays valid on older interpreters.
    pep604_union = getattr(types, "UnionType", Union)
    if origin is not Union and origin is not pep604_union:
        return False
    return type(None) in get_args(field_type)
|
||||
|
||||
|
||||
def get_non_none_type(field_type):
    """Return the first type argument of ``field_type`` that is not ``NoneType``.

    Intended for Optional types. Raises ``StopIteration`` when there is no
    such argument.
    """
    none_type = type(None)
    for member in get_args(field_type):
        if member is not none_type:
            return member
    raise StopIteration
|
||||
|
||||
|
||||
def manually_validate_field(model: Type[BaseModel], field_name: str, value: Any):
    """Run the model's field validators registered for ``field_name`` on ``value``.

    Every validator in ``model.__pydantic_decorators__.field_validators`` whose
    declared fields include ``field_name`` is called with ``value``. The
    validators' return values are ignored; exceptions they raise propagate.

    Returns:
        ``value`` unchanged.
    """
    registered = model.__pydantic_decorators__.field_validators
    for validator in registered.values():
        if field_name not in validator.info.fields:
            continue
        validator.func(value)

    return value
|
||||
|
||||
|
||||
def is_discriminated_union(typ) -> bool:
    """Tell whether ``typ`` describes a discriminated union.

    ``typ`` may be a pydantic ``FieldInfo`` (checked via its ``discriminator``)
    or an ``Annotated[...]`` type whose first metadata item carries a
    ``discriminator`` attribute.

    Returns:
        A real ``bool``, as annotated. Metadata without a ``discriminator``
        attribute (for example a plain string) counts as "not discriminated"
        instead of raising ``AttributeError``.
    """
    if isinstance(typ, FieldInfo):
        return bool(typ.discriminator)

    if get_origin(typ) is not Annotated:
        return False

    args = get_args(typ)
    # Only the first metadata item is inspected, matching what
    # prompt_for_discriminated_union reads.
    return len(args) >= 2 and bool(getattr(args[1], "discriminator", None))
|
||||
|
||||
|
||||
def prompt_for_discriminated_union(
    field_name,
    typ,
    existing_value,
):
    """Ask which union member to configure, then prompt for that member's fields.

    Args:
        field_name: Name shown in the prompt.
        typ: Either a ``FieldInfo`` or an ``Annotated[Union[...], meta]`` type;
            the union and the discriminator field name are read from it.
        existing_value: Previously configured value, reused as defaults only if
            its discriminator matches the user's choice.

    Returns:
        The configured instance of the chosen member type, with its
        discriminator attribute set to the chosen value.
    """
    if isinstance(typ, FieldInfo):
        inner_type = typ.annotation
        discriminator = typ.discriminator
    else:
        annotated_args = get_args(typ)
        inner_type = annotated_args[0]
        discriminator = annotated_args[1].discriminator

    # Map every literal discriminator value to the member type that declares it.
    type_map = {}
    for member in get_args(inner_type):
        tags = get_literal_values(member.__fields__[discriminator])
        if not tags:
            continue
        for tag in tags:
            type_map[tag] = member

    options = ", ".join(type_map.keys())
    prompt = f"Enter `{discriminator}` for {field_name} (options: {options}): "

    while True:
        discriminator_value = input(prompt)
        if discriminator_value not in type_map:
            print(f"Invalid {discriminator}. Please try again.")
            continue

        chosen_type = type_map[discriminator_value]
        print(f"\nConfiguring {chosen_type.__name__}:")

        # An existing value of a different member type is useless as defaults.
        if existing_value and (
            getattr(existing_value, discriminator) != discriminator_value
        ):
            existing_value = None

        sub_config = prompt_for_config(chosen_type, existing_value)
        # Set the discriminator field in the sub-config
        setattr(sub_config, discriminator, discriminator_value)
        return sub_config
|
||||
|
||||
|
||||
# This is somewhat elaborate, but does not purport to be comprehensive in any way.
|
||||
# We should add handling for the most common cases to tide us over.
|
||||
#
|
||||
# doesn't support List[nested_class] yet or Dicts of any kind. needs a bunch of
|
||||
# unit tests for coverage.
|
||||
def prompt_for_config(
    config_type: type[BaseModel], existing_config: Optional[BaseModel] = None
) -> BaseModel:
    """
    Recursively prompt the user for configuration values based on a Pydantic BaseModel.

    Fields are visited in declaration order. ``Literal`` fields are skipped,
    field-less ``BaseModel`` types are instantiated without prompting, enums
    are chosen by member name, nested models and discriminated unions recurse,
    and every other field is read as a single line of input.

    Args:
        config_type: A Pydantic BaseModel class representing the configuration structure.
        existing_config: Optional instance whose values are offered as defaults;
            pressing Enter keeps them.

    Returns:
        An instance of the config_type with user-provided values.
    """
    config_data = {}

    for field_name, field in config_type.__fields__.items():
        field_type = field.annotation
        existing_value = (
            getattr(existing_config, field_name) if existing_config else None
        )
        # Compare against None so falsy existing values (0, False, "") are
        # kept, matching the "(existing: ...)" hint shown in the prompt.
        if existing_value is not None:
            default_value = existing_value
        else:
            default_value = (
                field.default
                if not isinstance(field.default, PydanticUndefinedType)
                else None
            )
        # FieldInfo.is_required is a method; the bare attribute is always truthy.
        is_required = field.is_required()

        # Skip fields with Literal type
        if get_origin(field_type) is Literal:
            continue

        # A BaseModel with no fields needs no input: instantiate it directly
        if is_basemodel_without_fields(field_type):
            config_data[field_name] = field_type()
            continue

        if inspect.isclass(field_type) and issubclass(field_type, Enum):
            prompt = f"Choose {field_name} (options: {', '.join(e.name for e in field_type)}):"
            while True:
                # this branch does not handle existing and default values yet
                user_input = input(prompt + " ")
                try:
                    value = field_type[user_input]
                except KeyError:
                    print(
                        f"Invalid choice. Please choose from: {', '.join(e.name for e in field_type)}"
                    )
                    continue

                try:
                    # Validators are looked up by field *name*, not FieldInfo.
                    validated_value = manually_validate_field(
                        config_type, field_name, value
                    )
                except ValueError as e:
                    print(f"Validation error: {str(e)}")
                    continue

                config_data[field_name] = validated_value
                break
            continue

        if is_discriminated_union(field):
            config_data[field_name] = prompt_for_discriminated_union(
                field_name, field, existing_value
            )
            continue

        if is_optional(field_type) and can_recurse(get_non_none_type(field_type)):
            prompt = f"Do you want to configure {field_name}? (y/n): "
            if input(prompt).lower() == "n":
                config_data[field_name] = None
                continue
            nested_type = get_non_none_type(field_type)
            print(f"Entering sub-configuration for {field_name}:")
            config_data[field_name] = prompt_for_config(nested_type, existing_value)
        elif is_optional(field_type) and is_discriminated_union(
            get_non_none_type(field_type)
        ):
            prompt = f"Do you want to configure {field_name}? (y/n): "
            if input(prompt).lower() == "n":
                config_data[field_name] = None
                continue
            nested_type = get_non_none_type(field_type)
            config_data[field_name] = prompt_for_discriminated_union(
                field_name,
                nested_type,
                existing_value,
            )
        elif can_recurse(field_type):
            print(f"\nEntering sub-configuration for {field_name}:")
            config_data[field_name] = prompt_for_config(
                field_type,
                existing_value,
            )
        else:
            prompt = f"Enter value for {field_name}"
            if existing_value is not None:
                prompt += f" (existing: {existing_value})"
            elif default_value is not None:
                prompt += f" (default: {default_value})"
            if is_optional(field_type):
                prompt += " (optional)"
            elif is_required:
                prompt += " (required)"
            prompt += ": "

            while True:
                user_input = input(prompt)
                if user_input == "":
                    if default_value is not None:
                        config_data[field_name] = default_value
                        break
                    if not is_required:
                        # Leave the key unset so pydantic applies the field's
                        # own default / default_factory.
                        break
                    if is_optional(field_type):
                        config_data[field_name] = None
                        break
                    print("This field is required. Please provide a value.")
                    continue

                try:
                    # Handle Optional types: pass the raw text through (pydantic
                    # coerces it when the model is built); "none" clears it.
                    # field_type is deliberately not rebound here, so a retry
                    # behaves like the first attempt.
                    if is_optional(field_type):
                        if user_input.lower() == "none":
                            value = None
                        else:
                            value = user_input

                    # Handle List of primitives
                    elif is_list_of_primitives(field_type):
                        try:
                            value = json.loads(user_input)
                            if not isinstance(value, list):
                                raise ValueError(
                                    "Input must be a JSON-encoded list"
                                )
                            element_type = get_args(field_type)[0]
                            value = [element_type(item) for item in value]

                        except json.JSONDecodeError:
                            print(
                                "Invalid JSON. Please enter a valid JSON-encoded list."
                            )
                            continue
                        except ValueError as e:
                            print(f"{str(e)}")
                            continue

                    elif get_origin(field_type) is dict:
                        try:
                            value = json.loads(user_input)
                            if not isinstance(value, dict):
                                raise ValueError(
                                    "Input must be a JSON-encoded dictionary"
                                )

                        except json.JSONDecodeError:
                            print(
                                "Invalid JSON. Please enter a valid JSON-encoded dict."
                            )
                            continue

                    # Convert the input to the correct type
                    elif inspect.isclass(field_type) and issubclass(
                        field_type, BaseModel
                    ):
                        # For nested BaseModels, we assume a dictionary-like string input
                        import ast

                        value = field_type(**ast.literal_eval(user_input))
                    else:
                        value = field_type(user_input)

                # literal_eval reports malformed input as SyntaxError.
                except (ValueError, SyntaxError):
                    print(
                        f"Invalid input. Expected type: {getattr(field_type, '__name__', str(field_type))}"
                    )
                    continue

                try:
                    # Validate the field using our manual validation function
                    validated_value = manually_validate_field(
                        config_type, field_name, value
                    )
                    config_data[field_name] = validated_value
                    break
                except ValueError as e:
                    print(f"Validation error: {str(e)}")

    return config_type(**config_data)
|
18
llama_stack/common/serialize.py
Normal file
18
llama_stack/common/serialize.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class EnumEncoder(json.JSONEncoder):
    """JSON encoder that additionally serializes ``Enum`` and ``datetime`` objects."""

    def default(self, obj):
        """Encode enum members as their value and datetimes as ISO-8601 text.

        Every other object is handed to ``json.JSONEncoder.default``, which
        raises ``TypeError`` for unsupported types.
        """
        if not isinstance(obj, (Enum, datetime)):
            return super().default(obj)
        # Enum takes precedence, as in a plain if/elif chain.
        return obj.value if isinstance(obj, Enum) else obj.isoformat()
|
16
llama_stack/common/training_types.py
Normal file
16
llama_stack/common/training_types.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from llama_models.llama3.api.datatypes import URL
|
||||
from llama_models.schema_utils import json_schema_type
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
@json_schema_type(schema={"description": "Checkpoint created during training runs"})
class Checkpoint(BaseModel):
    # Record describing a single checkpoint created during a training run.
    # NOTE(review): presumably the iteration count when it was saved — confirm
    iters: int
    # Location of the checkpoint, as a URL
    path: URL
    # NOTE(review): presumably the epoch when it was saved — confirm
    epoch: int
|
|
@ -0,0 +1,10 @@
|
|||
name: local-conda-example
|
||||
distribution_spec:
|
||||
description: Use code from `llama_stack` itself to serve all llama stack APIs
|
||||
providers:
|
||||
inference: meta-reference
|
||||
memory: meta-reference-faiss
|
||||
safety: meta-reference
|
||||
agentic_system: meta-reference
|
||||
telemetry: console
|
||||
image_type: conda
|
|
@ -0,0 +1,10 @@
|
|||
name: local-fireworks-conda-example
|
||||
distribution_spec:
|
||||
description: Use Fireworks.ai for running LLM inference
|
||||
providers:
|
||||
inference: remote::fireworks
|
||||
memory: meta-reference-faiss
|
||||
safety: meta-reference
|
||||
agentic_system: meta-reference
|
||||
telemetry: console
|
||||
image_type: conda
|
|
@ -0,0 +1,10 @@
|
|||
name: local-ollama-conda-example
|
||||
distribution_spec:
|
||||
description: Like local, but use ollama for running LLM inference
|
||||
providers:
|
||||
inference: remote::ollama
|
||||
memory: meta-reference-faiss
|
||||
safety: meta-reference
|
||||
agentic_system: meta-reference
|
||||
telemetry: console
|
||||
image_type: conda
|
|
@ -0,0 +1,10 @@
|
|||
name: local-tgi-conda-example
|
||||
distribution_spec:
|
||||
description: Use TGI (local or with Hugging Face Inference Endpoints) for running LLM inference. When using HF Inference Endpoints, you must provide the name of the endpoint.
|
||||
providers:
|
||||
inference: remote::tgi
|
||||
memory: meta-reference-faiss
|
||||
safety: meta-reference
|
||||
agentic_system: meta-reference
|
||||
telemetry: console
|
||||
image_type: conda
|
|
@ -0,0 +1,10 @@
|
|||
name: local-together-conda-example
|
||||
distribution_spec:
|
||||
description: Use Together.ai for running LLM inference
|
||||
providers:
|
||||
inference: remote::together
|
||||
memory: meta-reference-faiss
|
||||
safety: meta-reference
|
||||
agentic_system: meta-reference
|
||||
telemetry: console
|
||||
image_type: conda
|
|
@ -0,0 +1,10 @@
|
|||
name: local-docker-example
|
||||
distribution_spec:
|
||||
description: Use code from `llama_stack` itself to serve all llama stack APIs
|
||||
providers:
|
||||
inference: meta-reference
|
||||
memory: meta-reference-faiss
|
||||
safety: meta-reference
|
||||
agentic_system: meta-reference
|
||||
telemetry: console
|
||||
image_type: docker
|
5
llama_stack/core/__init__.py
Normal file
5
llama_stack/core/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
115
llama_stack/core/build_conda_env.sh
Executable file
115
llama_stack/core/build_conda_env.sh
Executable file
|
@ -0,0 +1,115 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# Optional overrides: local source checkouts to install from, and a
# test-pypi version to install instead of the released packages.
LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_TOOLCHAIN_DIR=${LLAMA_TOOLCHAIN_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}

if [ -n "$LLAMA_TOOLCHAIN_DIR" ]; then
  echo "Using llama-toolchain-dir=$LLAMA_TOOLCHAIN_DIR"
fi
if [ -n "$LLAMA_MODELS_DIR" ]; then
  echo "Using llama-models-dir=$LLAMA_MODELS_DIR"
fi

set -euo pipefail

# Exactly two arguments are accepted; the usage text lists exactly those two.
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <build_name> <pip_dependencies>" >&2
  echo "Example: $0 mybuild 'numpy pandas scipy'" >&2
  exit 1
fi

build_name="$1"
env_name="llamastack-$build_name"
pip_dependencies="$2"

# Define color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

# this is set if we actually create a new conda in which case we need to clean up
ENVNAME=""

SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
source "$SCRIPT_DIR/common.sh"
|
||||
|
||||
# Make sure a conda environment named $1 exists with Python 3.10, activate it,
# and install the llama packages plus the space-separated pip dependencies in $2.
#
# Variable data is passed to printf as arguments rather than embedded in the
# format string, so a '%' or backslash in a name or path cannot corrupt output.
ensure_conda_env_python310() {
  local env_name="$1"
  local pip_dependencies="$2"
  local python_version="3.10"

  # Check if conda command is available
  if ! command -v conda &>/dev/null; then
    printf "${RED}%s${NC}\n" "Error: conda command not found. Is Conda installed and in your PATH?" >&2
    exit 1
  fi

  # Check if the environment exists
  if conda env list | grep -q "^${env_name} "; then
    printf "Conda environment '%s' exists. Checking Python version...\n" "$env_name"

    # Check Python version in the environment
    current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)

    if [ "$current_version" = "$python_version" ]; then
      printf "Environment '%s' already has Python %s. No action needed.\n" "$env_name" "$python_version"
    else
      printf "Updating environment '%s' to Python %s...\n" "$env_name" "$python_version"
      conda install -n "${env_name}" python="${python_version}" -y
    fi
  else
    printf "Conda environment '%s' does not exist. Creating with Python %s...\n" "$env_name" "$python_version"
    conda create -n "${env_name}" python="${python_version}" -y

    # Remember that this run created the environment.
    ENVNAME="${env_name}"
    # setup_cleanup_handlers
  fi

  eval "$(conda shell.bash hook)"
  conda deactivate && conda activate "${env_name}"

  if [ -n "$TEST_PYPI_VERSION" ]; then
    # these packages are damaged in test-pypi, so install them first
    pip install fastapi libcst
    # $pip_dependencies is intentionally unquoted: it is a space-separated list.
    pip install --extra-index-url https://test.pypi.org/simple/ "llama-models==$TEST_PYPI_VERSION" "llama-toolchain==$TEST_PYPI_VERSION" $pip_dependencies
  else
    # Re-installing llama-toolchain in the new conda environment
    if [ -n "$LLAMA_TOOLCHAIN_DIR" ]; then
      if [ ! -d "$LLAMA_TOOLCHAIN_DIR" ]; then
        printf "${RED}%s${NC}\n" "Warning: LLAMA_TOOLCHAIN_DIR is set but directory does not exist: $LLAMA_TOOLCHAIN_DIR" >&2
        exit 1
      fi

      printf "Installing from LLAMA_TOOLCHAIN_DIR: %s\n" "$LLAMA_TOOLCHAIN_DIR"
      pip install --no-cache-dir -e "$LLAMA_TOOLCHAIN_DIR"
    else
      pip install --no-cache-dir llama-toolchain
    fi

    if [ -n "$LLAMA_MODELS_DIR" ]; then
      if [ ! -d "$LLAMA_MODELS_DIR" ]; then
        printf "${RED}%s${NC}\n" "Warning: LLAMA_MODELS_DIR is set but directory does not exist: $LLAMA_MODELS_DIR" >&2
        exit 1
      fi

      printf "Installing from LLAMA_MODELS_DIR: %s\n" "$LLAMA_MODELS_DIR"
      pip uninstall -y llama-models
      pip install --no-cache-dir -e "$LLAMA_MODELS_DIR"
    fi

    # Install pip dependencies
    if [ -n "$pip_dependencies" ]; then
      printf "Installing pip dependencies: %s\n" "$pip_dependencies"
      # Intentionally unquoted so each package becomes its own argument.
      pip install $pip_dependencies
    fi
  fi
}
|
||||
|
||||
ensure_conda_env_python310 "$env_name" "$pip_dependencies"
|
117
llama_stack/core/build_container.sh
Executable file
117
llama_stack/core/build_container.sh
Executable file
|
@ -0,0 +1,117 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Optional overrides: local source checkouts to install from inside the image.
LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_TOOLCHAIN_DIR=${LLAMA_TOOLCHAIN_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}

# Four positional arguments are required. The usage strings are properly
# terminated and list all four, including the build file path.
if [ "$#" -ne 4 ]; then
  echo "Usage: $0 <build_name> <docker_base> <build_file_path> <pip_dependencies>" >&2
  echo "Example: $0 my-fastapi-app python:3.9-slim ./my-build.yaml 'fastapi uvicorn'" >&2
  exit 1
fi

build_name="$1"
image_name="llamastack-$build_name"
docker_base=$2
build_file_path=$3
pip_dependencies=$4

# Define color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

set -euo pipefail

SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
REPO_DIR=$(dirname "$(dirname "$SCRIPT_DIR")")
DOCKER_BINARY=${DOCKER_BINARY:-docker}
DOCKER_OPTS=${DOCKER_OPTS:-}

# Scratch directory that will hold the generated Dockerfile.
TEMP_DIR=$(mktemp -d)
|
||||
|
||||
# Append to the generated Dockerfile in $TEMP_DIR.
#
#   add_to_docker "RUN ..."     appends the single line given as $1
#   add_to_docker <<EOF ... EOF appends everything read from stdin
#
# The mode is chosen by whether an argument was passed, not by whether stdin is
# a terminal: under a non-interactive caller (CI, pipes) stdin is never a
# terminal, and an argument call would otherwise block reading stdin and drop
# its line.
add_to_docker() {
  local output_file="$TEMP_DIR/Dockerfile"
  if [ "$#" -gt 0 ]; then
    printf '%s\n' "$1" >>"$output_file"
  else
    # No argument: read the content from stdin (heredoc)
    cat >>"$output_file"
  fi
}
|
||||
|
||||
# Base image plus networking / debugging tools.
add_to_docker <<EOF
FROM $docker_base
WORKDIR /app

RUN apt-get update && apt-get install -y \
       iputils-ping net-tools iproute2 dnsutils telnet \
       curl wget telnet \
       procps psmisc lsof \
       traceroute \
       bubblewrap \
       && rm -rf /var/lib/apt/lists/*

EOF

# In-container paths where local source checkouts are mounted at build time.
toolchain_mount="/app/llama-toolchain-source"
models_mount="/app/llama-models-source"

if [ -n "$LLAMA_TOOLCHAIN_DIR" ]; then
  if [ ! -d "$LLAMA_TOOLCHAIN_DIR" ]; then
    # printf interprets the colour escapes; plain echo prints them literally.
    printf "${RED}%s${NC}\n" "Warning: LLAMA_TOOLCHAIN_DIR is set but directory does not exist: $LLAMA_TOOLCHAIN_DIR" >&2
    exit 1
  fi
  add_to_docker "RUN pip install $toolchain_mount"
else
  add_to_docker "RUN pip install llama-toolchain"
fi

if [ -n "$LLAMA_MODELS_DIR" ]; then
  if [ ! -d "$LLAMA_MODELS_DIR" ]; then
    printf "${RED}%s${NC}\n" "Warning: LLAMA_MODELS_DIR is set but directory does not exist: $LLAMA_MODELS_DIR" >&2
    exit 1
  fi

  add_to_docker <<EOF
RUN pip uninstall -y llama-models
RUN pip install $models_mount

EOF
fi

if [ -n "$pip_dependencies" ]; then
  add_to_docker "RUN pip install $pip_dependencies"
fi

add_to_docker <<EOF

# This would be good in production but for debugging flexibility lets not add it right now
# We need a more solid production ready entrypoint.sh anyway
#
# ENTRYPOINT ["python", "-m", "llama_stack.core.server"]

EOF

add_to_docker "ADD $build_file_path ./llamastack-build.yaml"

# Show the generated Dockerfile; the newlines keep the status line separate
# from the file's first line.
printf "Dockerfile created successfully in %s\n\n" "$TEMP_DIR/Dockerfile"
cat "$TEMP_DIR/Dockerfile"
printf "\n"
|
||||
|
||||
# Mount local source checkouts into the build when they were requested.
mounts=""
if [ -n "$LLAMA_TOOLCHAIN_DIR" ]; then
  mounts="$mounts -v $(readlink -f "$LLAMA_TOOLCHAIN_DIR"):$toolchain_mount"
fi
if [ -n "$LLAMA_MODELS_DIR" ]; then
  mounts="$mounts -v $(readlink -f "$LLAMA_MODELS_DIR"):$models_mount"
fi
set -x
# $DOCKER_OPTS and $mounts are intentionally unquoted so they split into
# separate arguments.
$DOCKER_BINARY build $DOCKER_OPTS -t "$image_name" -f "$TEMP_DIR/Dockerfile" "$REPO_DIR" $mounts
set +x

# Run the image with the same tool that built it: an image built by one
# container engine is not visible to another.
echo "You can run it with: $DOCKER_BINARY run -p 8000:8000 $image_name"

echo "Checking image builds..."
$DOCKER_BINARY run -it "$image_name" cat llamastack-build.yaml
|
40
llama_stack/core/common.sh
Executable file
40
llama_stack/core/common.sh
Executable file
|
@ -0,0 +1,40 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# Deactivate the current conda environment and remove the one named by $1.
cleanup() {
  envname="$1"

  set +x
  echo "Cleaning up..."
  conda deactivate
  # Quoted so an unusual name is passed as a single argument.
  conda env remove --name "$envname" -y
}
|
||||
|
||||
# INT handler: remove the environment recorded in ENVNAME (if any), then exit 1.
handle_int() {
  # The expansion must be quoted: an unquoted empty value collapses the test
  # to `[ -n ]`, which is always true, so cleanup would run with no name.
  if [ -n "${ENVNAME:-}" ]; then
    cleanup "$ENVNAME"
  fi
  exit 1
}
|
||||
|
||||
# EXIT handler: on a non-zero exit status, report the abort and remove the
# environment recorded in ENVNAME (if any).
handle_exit() {
  # $? must be read first, before any other command overwrites it.
  if [ $? -ne 0 ]; then
    echo -e "\033[1;31mABORTING.\033[0m"
    # Quoted so an empty ENVNAME is really treated as "nothing to clean up".
    if [ -n "${ENVNAME:-}" ]; then
      cleanup "$ENVNAME"
    fi
  fi
}
|
||||
|
||||
# Install the INT and EXIT traps defined above, load conda's bash shell hook
# into the current shell, and deactivate the currently active environment.
setup_cleanup_handlers() {
  trap handle_int INT
  trap handle_exit EXIT

  # Errors from the hook command are discarded; an empty result makes the eval a no-op.
  __conda_setup="$('conda' 'shell.bash' 'hook' 2>/dev/null)"
  eval "$__conda_setup"

  conda deactivate
}
|
91
llama_stack/core/configure.py
Normal file
91
llama_stack/core/configure.py
Normal file
|
@ -0,0 +1,91 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from llama_stack.core.datatypes import * # noqa: F403
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.common.prompt_for_config import prompt_for_config
|
||||
from llama_stack.core.distribution import api_providers, stack_apis
|
||||
from llama_stack.core.dynamic import instantiate_class_type
|
||||
|
||||
|
||||
# These are hacks so we can re-use the `prompt_for_config` utility
|
||||
# This needs a bunch of work to be made very user friendly.
|
||||
class ReqApis(BaseModel):
    # Single-field wrapper so the list of APIs can be collected through
    # `prompt_for_config`, which works on BaseModel classes.
    apis_to_serve: List[str]
|
||||
|
||||
|
||||
def make_routing_entry_type(config_class: Any):
    """Build a model class pairing a ``routing_key`` with a provider config.

    Args:
        config_class: Type used to annotate the ``config`` field.

    Returns:
        A new ``BaseModel`` subclass with fields ``routing_key: str`` and
        ``config: config_class``.
    """

    class BaseModelWithConfig(BaseModel):
        routing_key: str
        config: config_class

    return BaseModelWithConfig
|
||||
|
||||
|
||||
# TODO: make sure we can deal with existing configuration values correctly
|
||||
# instead of just overwriting them
|
||||
def configure_api_providers(
    config: StackRunConfig, spec: DistributionSpec
) -> StackRunConfig:
    """Interactively fill in ``config.apis_to_serve`` and ``config.provider_map``.

    The user first confirms or edits the list of APIs to serve. Each of those
    APIs, followed by ``telemetry``, is then configured by prompting for the
    config class of the provider named in ``spec.providers``. An API with a
    list of providers gets a routing table: one ``ProviderRoutingEntry`` per
    provider.

    Args:
        config: Run configuration to update; it is mutated in place.
        spec: Distribution spec naming the provider(s) for each API.

    Returns:
        The same ``config`` object, updated.

    Raises:
        ValueError: If a requested API is not a known stack API, or the
            distribution spec names no provider for it.
    """
    cprint("Configuring APIs to serve...", "white", attrs=["bold"])
    # prompt_for_config parses list fields with json.loads, so ask for JSON
    # rather than a comma-separated list.
    print('Enter a JSON-encoded list of APIs to serve (e.g. ["inference", "safety"]):')

    apis = config.apis_to_serve or list(spec.providers.keys())
    apis = [a for a in apis if a != "telemetry"]
    req_apis = ReqApis(
        apis_to_serve=apis,
    )
    req_apis = prompt_for_config(ReqApis, req_apis)
    config.apis_to_serve = req_apis.apis_to_serve
    print("")

    known_apis = [v.value for v in stack_apis()]
    all_providers = api_providers()

    # Telemetry is always configured, exactly once, even if the user typed it.
    apis_to_serve = [a for a in req_apis.apis_to_serve if a != "telemetry"]
    apis_to_serve.append("telemetry")
    for api_str in apis_to_serve:
        if api_str not in known_apis:
            raise ValueError(f"Unknown API `{api_str}`")
        if api_str not in spec.providers:
            raise ValueError(
                f"No provider specified for API `{api_str}` in the distribution spec"
            )

        cprint(f"Configuring API `{api_str}`...\n", "white", attrs=["bold"])
        api = Api(api_str)
        if isinstance(spec.providers[api_str], list):
            print(
                "You have specified multiple providers for this API. We will configure a routing table now. For each provider, provide a routing key followed by provider configuration.\n"
            )
            routing_entries = []
            for p in spec.providers[api_str]:
                print(f"Configuring provider `{p}`...")
                provider_spec = all_providers[api][p]
                config_type = instantiate_class_type(provider_spec.config_class)

                # Wrap the provider config so the routing key is prompted too.
                wrapper_type = make_routing_entry_type(config_type)
                rt_entry = prompt_for_config(wrapper_type, None)

                # TODO: we need to validate the routing keys
                routing_entries.append(
                    ProviderRoutingEntry(
                        provider_id=p,
                        routing_key=rt_entry.routing_key,
                        config=rt_entry.config.dict(),
                    )
                )
            config.provider_map[api_str] = routing_entries
        else:
            provider_spec = all_providers[api][spec.providers[api_str]]
            config_type = instantiate_class_type(provider_spec.config_class)
            cfg = prompt_for_config(config_type, None)
            config.provider_map[api_str] = GenericProviderConfig(
                provider_id=spec.providers[api_str],
                config=cfg.dict(),
            )

    return config
|
31
llama_stack/core/configure_container.sh
Executable file
31
llama_stack/core/configure_container.sh
Executable file
|
@ -0,0 +1,31 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
set -euo pipefail

# Report the failing line number and exit; installed as the ERR trap below.
error_handler() {
  echo "Error occurred in script at line: ${1}" >&2
  exit 1
}

trap 'error_handler ${LINENO}' ERR

if [ $# -lt 2 ]; then
  echo "Usage: $0 <docker image> <host build dir>" >&2
  exit 1
fi

docker_image="$1"
host_build_dir="$2"
container_build_dir="/app/builds"

# Container engine: podman by default, overridable through DOCKER_BINARY.
DOCKER_BINARY=${DOCKER_BINARY:-podman}

set -x
# Arguments are quoted so a host path containing spaces stays one argument.
"$DOCKER_BINARY" run -it \
  -v "$host_build_dir:$container_build_dir" \
  "$docker_image" \
  llama stack configure ./llamastack-build.yaml --output-dir "$container_build_dir"
|
250
llama_stack/core/datatypes.py
Normal file
250
llama_stack/core/datatypes.py
Normal file
|
@ -0,0 +1,250 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from llama_models.schema_utils import json_schema_type
|
||||
|
||||
from pydantic import BaseModel, Field, validator
|
||||
|
||||
|
||||
@json_schema_type
class Api(Enum):
    # API surfaces of the stack; each member's value is the API's name as a
    # string, identical to the member name.
    inference = "inference"
    safety = "safety"
    agentic_system = "agentic_system"
    memory = "memory"
    telemetry = "telemetry"
|
||||
|
||||
|
||||
@json_schema_type
class ApiEndpoint(BaseModel):
    # Describes one endpoint of an API.
    # NOTE(review): presumably `route` is the URL path, `method` the HTTP verb
    # and `name` the handler name — confirm against the code that builds these.
    route: str
    method: str
    name: str
|
||||
|
||||
|
||||
@json_schema_type
class ProviderSpec(BaseModel):
    # Base description of a provider: the API it implements, its identifier,
    # the config class it is configured with, and the APIs it depends on.
    api: Api
    provider_id: str
    # Required; holds a dotted class path, not the class itself.
    config_class: str = Field(
        ...,
        description="Fully-qualified classname of the config for this provider",
    )
    # Defaults to an empty list (a fresh one per instance).
    api_dependencies: List[Api] = Field(
        default_factory=list,
        description="Higher-level API surfaces may depend on other providers to provide their functionality",
    )
|
||||
|
||||
|
||||
@json_schema_type
class RouterProviderSpec(ProviderSpec):
    # Provider spec for a router wrapping several inner provider specs.
    # Overrides the inherited fields with fixed defaults: the id is "router"
    # and there is no config class.
    provider_id: str = "router"
    config_class: str = ""

    docker_image: Optional[str] = None

    # The provider specs this router wraps (required).
    inner_specs: List[ProviderSpec]
    module: str = Field(
        ...,
        description="""
Fully-qualified name of the module to import. The module is expected to have:

- `get_router_impl(config, provider_specs, deps)`: returns the router implementation
""",
    )

    @property
    def pip_packages(self) -> List[str]:
        """Always raises ``AssertionError``; not meant to be read on a router spec."""
        raise AssertionError("Should not be called on RouterProviderSpec")
|
||||
|
||||
|
||||
class GenericProviderConfig(BaseModel):
    # A provider's configuration in untyped form: the provider's id plus its
    # config as a plain dictionary.
    provider_id: str
    config: Dict[str, Any]
|
||||
|
||||
|
||||
@json_schema_type
class AdapterSpec(BaseModel):
    # Describes an adapter: its id, the module implementing it, the pip
    # packages it needs, and optionally its config class.
    adapter_id: str = Field(
        ...,
        description="Unique identifier for this adapter",
    )
    module: str = Field(
        ...,
        description="""
Fully-qualified name of the module to import. The module is expected to have:

- `get_adapter_impl(config, deps)`: returns the adapter implementation
""",
    )
    # Defaults to an empty list (a fresh one per instance).
    pip_packages: List[str] = Field(
        default_factory=list,
        description="The pip dependencies needed for this implementation",
    )
    # Optional; defaults to None.
    config_class: Optional[str] = Field(
        default=None,
        description="Fully-qualified classname of the config for this provider",
    )
|
||||
|
||||
|
||||
@json_schema_type
class InlineProviderSpec(ProviderSpec):
    # Provider spec that carries its own implementation module, pip
    # dependencies and an optional docker image.
    # Defaults to an empty list (a fresh one per instance).
    pip_packages: List[str] = Field(
        default_factory=list,
        description="The pip dependencies needed for this implementation",
    )
    # Optional; defaults to None.
    docker_image: Optional[str] = Field(
        default=None,
        description="""
The docker image to use for this implementation. If one is provided, pip_packages will be ignored.
If a provider depends on other providers, the dependencies MUST NOT specify a docker image.
""",
    )
    # Required dotted module path.
    module: str = Field(
        ...,
        description="""
Fully-qualified name of the module to import. The module is expected to have:

- `get_provider_impl(config, deps)`: returns the local implementation
""",
    )
|
||||
|
||||
|
||||
class RemoteProviderConfig(BaseModel):
    # Configuration for a remote provider: just the URL it is reached at.
    url: str = Field(..., description="The URL for the provider")

    @validator("url")
    @classmethod
    def validate_url(cls, url: str) -> str:
        """Require the URL to start with ``http`` and strip trailing slashes.

        Raises:
            ValueError: If ``url`` does not start with ``"http"``.
        """
        # A prefix check only: it accepts "https://..." as well.
        if not url.startswith("http"):
            raise ValueError(f"URL must start with http: {url}")
        return url.rstrip("/")
|
||||
|
||||
|
||||
def remote_provider_id(adapter_id: str) -> str:
    """Return the provider id for an adapter: ``adapter_id`` prefixed with ``remote::``."""
    return "remote::{}".format(adapter_id)
|
||||
|
||||
|
||||
@json_schema_type
class RemoteProviderSpec(ProviderSpec):
    # Provider spec for a remote provider, optionally reached through an adapter.
    # `docker_image`, `module` and `pip_packages` are read-only properties
    # derived from the adapter rather than stored fields.
    adapter: Optional[AdapterSpec] = Field(
        default=None,
        description="""
If some code is needed to convert the remote responses into Llama Stack compatible
API responses, specify the adapter here. If not specified, it indicates the remote
as being "Llama Stack compatible"
""",
    )

    @property
    def docker_image(self) -> Optional[str]:
        """Always ``None`` for a remote provider spec."""
        return None

    @property
    def module(self) -> str:
        """The adapter's module if an adapter is set, else this API's client module."""
        if self.adapter:
            return self.adapter.module
        return f"llama_stack.{self.api.value}.client"

    @property
    def pip_packages(self) -> List[str]:
        """The adapter's pip packages if an adapter is set, else an empty list."""
        if self.adapter:
            return self.adapter.pip_packages
        return []
|
||||
|
||||
|
||||
# Can avoid this by using Pydantic computed_field
|
||||
def remote_provider_spec(
    api: Api, adapter: Optional[AdapterSpec] = None
) -> RemoteProviderSpec:
    """Build the ``RemoteProviderSpec`` for ``api``, optionally through ``adapter``.

    Without an adapter the provider id is ``"remote"`` and the config class is
    ``RemoteProviderConfig``. With an adapter the id comes from
    ``remote_provider_id(adapter.adapter_id)`` and the adapter's own config
    class is used when it declares one.
    """
    config_class = "llama_stack.core.datatypes.RemoteProviderConfig"
    provider_id = "remote"

    if adapter:
        provider_id = remote_provider_id(adapter.adapter_id)
        if adapter.config_class:
            config_class = adapter.config_class

    return RemoteProviderSpec(
        api=api,
        provider_id=provider_id,
        config_class=config_class,
        adapter=adapter,
    )
|
||||
|
||||
|
||||
@json_schema_type
class DistributionSpec(BaseModel):
    # Describes a distribution: a description, an optional docker image, and
    # the provider(s) chosen for each API.
    description: Optional[str] = Field(
        default="",
        description="Description of the distribution",
    )
    docker_image: Optional[str] = None
    # Maps an API name to a single provider name or to a list of provider names.
    providers: Dict[str, Union[str, List[str]]] = Field(
        default_factory=dict,
        description="""
Provider Types for each of the APIs provided by this distribution. If you
select multiple providers, you should provide an appropriate 'routing_map'
in the runtime configuration to help route to the correct provider.""",
    )
|
||||
|
||||
|
||||
@json_schema_type
class ProviderRoutingEntry(GenericProviderConfig):
    # One row of a routing table: a generic provider config plus the key that
    # selects it.
    routing_key: str
|
||||
|
||||
|
||||
# A provider-map value is either a single provider config or a routing table
# (a list of ProviderRoutingEntry).
ProviderMapEntry = Union[GenericProviderConfig, List[ProviderRoutingEntry]]
|
||||
|
||||
|
||||
@json_schema_type
class StackRunConfig(BaseModel):
    # Runtime configuration of a built stack: which image/environment it runs
    # in, which APIs are served and how each API maps to its provider(s).
    built_at: datetime

    image_name: str = Field(
        ...,
        description="""
Reference to the distribution this package refers to. For unregistered (adhoc) packages,
this could be just a hash
""",
    )
    docker_image: Optional[str] = Field(
        default=None,
        description="Reference to the docker image if this package refers to a container",
    )
    conda_env: Optional[str] = Field(
        default=None,
        description="Reference to the conda environment if this package refers to a conda environment",
    )
    # Defaults to an empty list so the field can really be left unspecified, as
    # its description promises; the server then falls back to every API in
    # provider_map.
    apis_to_serve: List[str] = Field(
        default_factory=list,
        description="""
The list of APIs to serve. If not specified, all APIs specified in the provider_map will be served""",
    )
    provider_map: Dict[str, ProviderMapEntry] = Field(
        description="""
Provider configurations for each of the APIs provided by this package.

Given an API, you can specify a single provider or a "routing table". Each entry in the routing
table has a (routing_key, provider_config) tuple. How the key is interpreted is API-specific.

As examples:
- the "inference" API interprets the routing_key as a "model"
- the "memory" API interprets the routing_key as the type of a "memory bank"

The key may support wild-cards also. The routing_key is used to route to the correct provider.""",
    )
|
||||
|
||||
|
||||
@json_schema_type
class BuildConfig(BaseModel):
    # Input to a build: a name, the distribution spec to build and the kind of
    # image to produce.
    name: str
    distribution_spec: DistributionSpec = Field(
        description="The distribution spec to build including API providers. "
    )
    # The accepted values are the ImageType enum values ("docker" / "conda");
    # the build step compares this field against ImageType.docker.value.
    image_type: str = Field(
        default="conda",
        description="Type of package to build (conda | docker)",
    )
|
76
llama_stack/core/distribution.py
Normal file
76
llama_stack/core/distribution.py
Normal file
|
@ -0,0 +1,76 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import importlib
|
||||
import inspect
|
||||
from typing import Dict, List
|
||||
|
||||
from llama_stack.agentic_system.api import AgenticSystem
|
||||
from llama_stack.inference.api import Inference
|
||||
from llama_stack.memory.api import Memory
|
||||
from llama_stack.safety.api import Safety
|
||||
from llama_stack.telemetry.api import Telemetry
|
||||
|
||||
from .datatypes import Api, ApiEndpoint, ProviderSpec, remote_provider_spec
|
||||
|
||||
# These are the dependencies needed by the distribution server.
# `llama-toolchain` is automatically installed by the installation script.
# NOTE(review): this commit renames llama_toolchain to llama_stack; confirm the
# package name above is still current.
SERVER_DEPENDENCIES = [
    "fastapi",
    "uvicorn",
]
|
||||
|
||||
|
||||
def stack_apis() -> List[Api]:
    """Return every member of the Api enum, in definition order."""
    return list(Api)
|
||||
|
||||
|
||||
def api_endpoints() -> Dict[Api, List[ApiEndpoint]]:
    """Collect the web endpoints declared on each API protocol.

    Every function member of a protocol that carries a ``__webmethod__``
    attribute becomes one ApiEndpoint. The HTTP verb is "get" or "delete" when
    the webmethod says GET or DELETE, and "post" otherwise.
    """
    protocol_by_api = {
        Api.inference: Inference,
        Api.safety: Safety,
        Api.agentic_system: AgenticSystem,
        Api.memory: Memory,
        Api.telemetry: Telemetry,
    }
    # Anything not listed here is served as POST.
    verb_for = {"GET": "get", "DELETE": "delete"}

    endpoints_by_api = {}
    for api, protocol in protocol_by_api.items():
        members = inspect.getmembers(protocol, predicate=inspect.isfunction)
        endpoints_by_api[api] = [
            ApiEndpoint(
                route=fn.__webmethod__.route,
                method=verb_for.get(fn.__webmethod__.method, "post"),
                name=name,
            )
            for name, fn in members
            if hasattr(fn, "__webmethod__")
        ]

    return endpoints_by_api
|
||||
|
||||
|
||||
def api_providers() -> Dict[Api, Dict[str, ProviderSpec]]:
    """Return, for each API, its provider specs keyed by provider id.

    Each API always gets a generic "remote" entry; the rest come from
    ``available_providers()`` in the API's ``providers`` module, which may
    override the "remote" key.
    """
    registry = {}
    for api in stack_apis():
        providers_module = importlib.import_module(
            f"llama_stack.{api.name.lower()}.providers"
        )
        by_id = {"remote": remote_provider_spec(api)}
        by_id.update(
            {spec.provider_id: spec for spec in providers_module.available_providers()}
        )
        registry[api] = by_id

    return registry
|
66
llama_stack/core/dynamic.py
Normal file
66
llama_stack/core/dynamic.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import importlib
|
||||
from typing import Any, Dict
|
||||
|
||||
from llama_stack.core.datatypes import * # noqa: F403
|
||||
|
||||
|
||||
def instantiate_class_type(fully_qualified_name):
    """Resolve a dotted "package.module.Name" string to the attribute it names.

    Despite the name, nothing is instantiated: the attribute (typically a
    class) is returned as-is.
    """
    # Split on the last dot only: everything before it is the module path.
    parts = fully_qualified_name.rsplit(".", 1)
    module_name, class_name = parts
    return getattr(importlib.import_module(module_name), class_name)
|
||||
|
||||
|
||||
# returns a class implementing the protocol corresponding to the Api
async def instantiate_provider(
    provider_spec: ProviderSpec,
    deps: Dict[str, Any],
    provider_config: ProviderMapEntry,
):
    """Import the provider's module and build its implementation.

    The factory looked up on the module depends on the spec kind:
    ``get_adapter_impl`` / ``get_client_impl`` for remote specs (with / without
    an adapter), ``get_router_impl`` for router specs and ``get_provider_impl``
    otherwise. The returned impl is tagged with ``__provider_spec__`` and
    ``__provider_config__`` (None for routers).
    """
    module = importlib.import_module(provider_spec.module)

    is_remote = isinstance(provider_spec, RemoteProviderSpec)
    if not is_remote and isinstance(provider_spec, RouterProviderSpec):
        factory_name = "get_router_impl"

        assert isinstance(provider_config, list)
        spec_by_id = {spec.provider_id: spec for spec in provider_spec.inner_specs}
        routed_impls = []
        # Build each routed provider in turn; a router has no config of its own.
        for entry in provider_config:
            inner = await instantiate_provider(
                spec_by_id[entry.provider_id],
                deps,
                entry,
            )
            routed_impls.append((entry.routing_key, inner))

        config = None
        factory_args = [routed_impls, deps]
    else:
        if is_remote:
            factory_name = (
                "get_adapter_impl" if provider_spec.adapter else "get_client_impl"
            )
        else:
            factory_name = "get_provider_impl"

        assert isinstance(provider_config, GenericProviderConfig)
        config_cls = instantiate_class_type(provider_spec.config_class)
        config = config_cls(**provider_config.config)
        factory_args = [config, deps]

    factory = getattr(module, factory_name)
    impl = await factory(*factory_args)
    impl.__provider_spec__ = provider_spec
    impl.__provider_config__ = config
    return impl
|
96
llama_stack/core/package.py
Normal file
96
llama_stack/core/package.py
Normal file
|
@ -0,0 +1,96 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
import pkg_resources
|
||||
from pydantic import BaseModel
|
||||
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.common.exec import run_with_pty
|
||||
|
||||
from llama_stack.core.datatypes import * # noqa: F403
|
||||
from pathlib import Path
|
||||
|
||||
from llama_stack.core.distribution import api_providers, SERVER_DEPENDENCIES
|
||||
|
||||
|
||||
class ImageType(Enum):
    # Kinds of image a build can produce; the values are the strings compared
    # against BuildConfig.image_type.
    docker = "docker"
    conda = "conda"
|
||||
|
||||
|
||||
class Dependencies(BaseModel):
    # Everything a build needs to install: pip packages plus an optional
    # docker image.
    pip_packages: List[str]
    docker_image: Optional[str] = None
|
||||
|
||||
|
||||
class ApiInput(BaseModel):
    # Pairs an API with the name of a provider for it.
    api: Api
    provider: str
|
||||
|
||||
|
||||
def build_image(build_config: BuildConfig, build_file_path: Path):
    """Build a docker or conda image for the given build config.

    Starts from the server dependencies, adds the pip packages of every
    provider named in the distribution spec, then runs the matching build
    script. A non-zero script exit code is reported in red; nothing is
    returned.

    Raises:
        ValueError: if a provider is unknown for its API, or if a provider
            spec declares a docker image of its own.
    """
    dist_spec = build_config.distribution_spec
    package_deps = Dependencies(
        docker_image=dist_spec.docker_image or "python:3.10-slim",
        pip_packages=SERVER_DEPENDENCIES,
    )

    # extend package dependencies based on providers spec
    all_providers = api_providers()
    for api_str, provider_or_providers in dist_spec.providers.items():
        providers_for_api = all_providers[Api(api_str)]

        # A single provider name is treated as a one-element list.
        if isinstance(provider_or_providers, list):
            provider_names = provider_or_providers
        else:
            provider_names = [provider_or_providers]

        for provider in provider_names:
            if provider not in providers_for_api:
                raise ValueError(
                    f"Provider `{provider}` is not available for API `{api_str}`"
                )

            spec = providers_for_api[provider]
            package_deps.pip_packages.extend(spec.pip_packages)
            if spec.docker_image:
                raise ValueError("A stack's dependencies cannot have a docker image")

    building_docker = build_config.image_type == ImageType.docker.value
    if building_docker:
        script = pkg_resources.resource_filename(
            "llama_stack", "core/build_container.sh"
        )
        args = [
            script,
            build_config.name,
            package_deps.docker_image,
            str(build_file_path),
            " ".join(package_deps.pip_packages),
        ]
    else:
        script = pkg_resources.resource_filename(
            "llama_stack", "core/build_conda_env.sh"
        )
        args = [
            script,
            build_config.name,
            " ".join(package_deps.pip_packages),
        ]

    return_code = run_with_pty(args)
    if return_code != 0:
        cprint(
            f"Failed to build target {build_config.name} with return code {return_code}",
            color="red",
        )
|
392
llama_stack/core/server.py
Normal file
392
llama_stack/core/server.py
Normal file
|
@ -0,0 +1,392 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import asyncio
|
||||
import inspect
|
||||
import json
|
||||
import signal
|
||||
import traceback
|
||||
|
||||
from collections.abc import (
|
||||
AsyncGenerator as AsyncGeneratorABC,
|
||||
AsyncIterator as AsyncIteratorABC,
|
||||
)
|
||||
from contextlib import asynccontextmanager
|
||||
from ssl import SSLError
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncGenerator,
|
||||
AsyncIterator,
|
||||
Dict,
|
||||
get_type_hints,
|
||||
List,
|
||||
Optional,
|
||||
Set,
|
||||
)
|
||||
|
||||
import fire
|
||||
import httpx
|
||||
import yaml
|
||||
|
||||
from fastapi import Body, FastAPI, HTTPException, Request, Response
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from fastapi.routing import APIRoute
|
||||
from pydantic import BaseModel, ValidationError
|
||||
from termcolor import cprint
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from llama_stack.telemetry.tracing import (
|
||||
end_trace,
|
||||
setup_logger,
|
||||
SpanStatus,
|
||||
start_trace,
|
||||
)
|
||||
from llama_stack.core.datatypes import * # noqa: F403
|
||||
|
||||
from .distribution import api_endpoints, api_providers
|
||||
from .dynamic import instantiate_provider
|
||||
|
||||
|
||||
def is_async_iterator_type(typ):
    """Return True when `typ` denotes an async iterator or async generator.

    Handles three shapes of input:
    - generic aliases such as ``AsyncIterator[int]``, judged by ``__origin__``;
    - bare classes such as ``collections.abc.AsyncIterator``, judged by
      ``issubclass`` (the original instance check could never match a class);
    - anything else falls back to an ``isinstance`` check, as before.
    """
    async_types = (AsyncIterator, AsyncGenerator, AsyncIteratorABC, AsyncGeneratorABC)

    if hasattr(typ, "__origin__"):
        origin = typ.__origin__
        # Origins such as typing.Union are not classes and can never match.
        return isinstance(origin, type) and issubclass(origin, async_types)

    if isinstance(typ, type):
        return issubclass(typ, async_types)

    return isinstance(typ, async_types)
|
||||
|
||||
|
||||
def create_sse_event(data: Any) -> str:
    """Serialize `data` as one server-sent event (``data: <json>`` plus a blank line).

    Pydantic models are serialized with their own ``json()``; everything else
    goes through ``json.dumps``.
    """
    payload = data.json() if isinstance(data, BaseModel) else json.dumps(data)
    return "data: " + payload + "\n\n"
|
||||
|
||||
|
||||
async def global_exception_handler(request: Request, exc: Exception):
    """Print the traceback and answer with the translated error as JSON."""
    traceback.print_exception(exc)

    translated = translate_exception(exc)
    status = translated.status_code
    body = {"error": {"detail": translated.detail}}
    return JSONResponse(status_code=status, content=body)
|
||||
|
||||
|
||||
def translate_exception(exc: Exception) -> HTTPException:
    """Map an arbitrary exception to the HTTPException sent to the client.

    Validation errors (pydantic ``ValidationError`` or FastAPI
    ``RequestValidationError``) become a 400 whose detail lists each error's
    location, message and type. Everything else becomes an opaque 500.

    The result is always a real HTTPException, so callers can rely on
    ``status_code`` and ``detail`` being present.
    """
    if isinstance(exc, (ValidationError, RequestValidationError)):
        # Keep only the plain-data parts of each error so the detail stays
        # JSON-serializable.
        return HTTPException(
            status_code=400,
            detail={
                "errors": [
                    {
                        "loc": list(error["loc"]),
                        "msg": error["msg"],
                        "type": error["type"],
                    }
                    for error in exc.errors()
                ]
            },
        )

    # Add more custom exception translations here
    return HTTPException(status_code=500, detail="Internal server error")
|
||||
|
||||
|
||||
async def passthrough(
    request: Request,
    downstream_url: str,
    downstream_headers: Optional[Dict[str, str]] = None,
):
    """Forward `request` to `downstream_url` and stream the reply back.

    The incoming headers (minus ``host``) are forwarded, overlaid with
    `downstream_headers`. Transport failures are turned into 5xx responses
    instead of being raised. The trace is closed when this function returns,
    i.e. before the streamed body has been fully sent.
    """
    # Starlette's Request exposes the path through `url.path`; it has no
    # `path` attribute of its own.
    await start_trace(request.url.path, {"downstream_url": downstream_url})

    headers = dict(request.headers)
    headers.pop("host", None)
    headers.update(downstream_headers or {})

    content = await request.body()

    client = httpx.AsyncClient()
    erred = False
    try:
        req = client.build_request(
            method=request.method,
            url=downstream_url,
            headers=headers,
            content=content,
            params=request.query_params,
        )
        response = await client.send(req, stream=True)

        async def stream_response():
            # Close the downstream response and client even when the caller
            # disconnects mid-stream or iteration fails.
            try:
                async for chunk in response.aiter_raw(chunk_size=64):
                    yield chunk
            finally:
                await response.aclose()
                await client.aclose()

        return StreamingResponse(
            stream_response(),
            status_code=response.status_code,
            headers=dict(response.headers),
            media_type=response.headers.get("content-type"),
        )

    except httpx.ReadTimeout:
        erred = True
        return Response(content="Downstream server timed out", status_code=504)
    except httpx.NetworkError as e:
        erred = True
        return Response(content=f"Network error: {str(e)}", status_code=502)
    except httpx.TooManyRedirects:
        erred = True
        return Response(content="Too many redirects", status_code=502)
    except SSLError as e:
        erred = True
        return Response(content=f"SSL error: {str(e)}", status_code=502)
    except httpx.HTTPStatusError as e:
        erred = True
        return Response(content=str(e), status_code=e.response.status_code)
    except Exception as e:
        erred = True
        return Response(content=f"Unexpected error: {str(e)}", status_code=500)
    finally:
        if erred:
            # No streaming generator will run to close the client on this path.
            await client.aclose()
        await end_trace(SpanStatus.OK if not erred else SpanStatus.ERROR)
|
||||
|
||||
|
||||
def handle_sigint(*args, **kwargs):
    """Signal handler: cancel every task on the event loop, then stop the loop."""
    print("SIGINT or CTRL-C detected. Exiting gracefully...")
    event_loop = asyncio.get_event_loop()
    pending = asyncio.all_tasks(event_loop)
    for pending_task in pending:
        pending_task.cancel()
    event_loop.stop()
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Lifespan context manager that only logs: one message before yielding
    # and one after. `app` is not used.
    print("Starting up")
    yield
    print("Shutting down")
|
||||
|
||||
|
||||
def create_dynamic_passthrough(
    downstream_url: str, downstream_headers: Optional[Dict[str, str]] = None
):
    """Return an endpoint coroutine that proxies each request to `downstream_url`.

    The URL and extra headers are captured in the closure and handed to
    `passthrough` on every call.
    """

    async def endpoint(request: Request):
        return await passthrough(request, downstream_url, downstream_headers)

    return endpoint
|
||||
|
||||
|
||||
def create_dynamic_typed_route(func: Any, method: str):
    """Wrap an implementation method `func` as a FastAPI endpoint.

    If the return annotation of `func` is an async iterator/generator type, the
    endpoint streams each yielded item as a server-sent event; otherwise it
    returns the result directly (awaiting it when `func` is a coroutine
    function). Both variants are bracketed by start_trace / end_trace.

    The endpoint's signature is copied from `func`; for "post" every parameter
    is additionally marked as an embedded body field.
    """
    hints = get_type_hints(func)
    response_model = hints.get("return")

    # NOTE: I think it is better to just add a method within each Api
    # "Protocol" / adapter-impl to tell what sort of a response this request
    # is going to produce. /chat_completion can produce a streaming or
    # non-streaming response depending on if request.stream is True / False.
    is_streaming = is_async_iterator_type(response_model)

    if is_streaming:

        async def endpoint(**kwargs):
            await start_trace(func.__name__)

            async def sse_generator(event_gen):
                try:
                    async for item in event_gen:
                        yield create_sse_event(item)
                        await asyncio.sleep(0.01)
                except asyncio.CancelledError:
                    # Cancellation is swallowed here after closing the
                    # underlying generator.
                    print("Generator cancelled")
                    await event_gen.aclose()
                except Exception as e:
                    # The stream has already started, so the failure is
                    # reported as a final SSE error event.
                    traceback.print_exception(e)
                    yield create_sse_event(
                        {
                            "error": {
                                "message": str(translate_exception(e)),
                            },
                        }
                    )
                finally:
                    await end_trace()

            return StreamingResponse(
                sse_generator(func(**kwargs)), media_type="text/event-stream"
            )

    else:

        async def endpoint(**kwargs):
            await start_trace(func.__name__)
            try:
                return (
                    await func(**kwargs)
                    if asyncio.iscoroutinefunction(func)
                    else func(**kwargs)
                )
            except Exception as e:
                traceback.print_exception(e)
                raise translate_exception(e) from e
            finally:
                await end_trace()

    sig = inspect.signature(func)
    if method == "post":
        # make sure every parameter is annotated with Body() so FASTAPI doesn't
        # do anything too intelligent and ask for some parameters in the query
        # and some in the body
        endpoint.__signature__ = sig.replace(
            parameters=[
                param.replace(
                    annotation=Annotated[param.annotation, Body(..., embed=True)]
                )
                for param in sig.parameters.values()
            ]
        )
    else:
        endpoint.__signature__ = sig

    return endpoint
|
||||
|
||||
|
||||
def topological_sort(providers: List[ProviderSpec]) -> List[ProviderSpec]:
    """Order provider specs so that each one comes after its API dependencies.

    Depth-first post-order over ``api_dependencies``. Specs are indexed by
    their ``api``, so at most one spec per API appears in the result.
    """
    spec_by_api = {spec.api: spec for spec in providers}
    seen = set()
    ordered_apis = []

    def visit(spec):
        # Mark before descending so an API is never visited twice.
        seen.add(spec.api)
        for dependency in spec.api_dependencies:
            if dependency not in seen:
                visit(spec_by_api[dependency])
        # Post-order: dependencies are already in the list.
        ordered_apis.append(spec.api)

    for spec in providers:
        if spec.api not in seen:
            visit(spec)

    return [spec_by_api[api] for api in ordered_apis]
|
||||
|
||||
|
||||
def snake_to_camel(snake_str):
    """Convert ``snake_case`` to ``CamelCase`` by capitalizing each ``_``-separated word."""
    words = snake_str.split("_")
    return "".join(map(str.capitalize, words))
|
||||
|
||||
|
||||
async def resolve_impls(
    provider_map: Dict[str, ProviderMapEntry],
) -> tuple[Dict[Api, Any], Dict[Api, ProviderSpec]]:
    """
    Does two things:
    - flatmaps, sorts and resolves the providers in dependency order
    - for each API, produces either a (local, passthrough or router) implementation

    Returns a pair ``(impls, specs)``: the instantiated implementation and the
    provider spec for each configured API.

    Raises:
        ValueError: if a configured provider id is not registered for its API.
    """
    all_providers = api_providers()

    specs = {}
    for api_str, item in provider_map.items():
        api = Api(api_str)
        providers = all_providers[api]

        if isinstance(item, GenericProviderConfig):
            if item.provider_id not in providers:
                # The original message referenced an undefined `provider_id`
                # and raised NameError instead of this ValueError.
                raise ValueError(
                    f"Unknown provider `{item.provider_id}` is not available for API `{api}`"
                )
            specs[api] = providers[item.provider_id]
        else:
            # A list is a routing table: collect the spec of every routed
            # provider and wrap them in a router spec.
            assert isinstance(item, list)
            inner_specs = []
            for rt_entry in item:
                if rt_entry.provider_id not in providers:
                    raise ValueError(
                        f"Unknown provider `{rt_entry.provider_id}` is not available for API `{api}`"
                    )
                inner_specs.append(providers[rt_entry.provider_id])

            specs[api] = RouterProviderSpec(
                api=api,
                module=f"llama_stack.{api.value.lower()}.router",
                api_dependencies=[],
                inner_specs=inner_specs,
            )

    sorted_specs = topological_sort(list(specs.values()))

    impls = {}
    for spec in sorted_specs:
        api = spec.api

        # Dependencies were instantiated earlier thanks to the sort order.
        deps = {dep_api: impls[dep_api] for dep_api in spec.api_dependencies}
        impl = await instantiate_provider(spec, deps, provider_map[api.value])
        impls[api] = impl

    return impls, specs
|
||||
|
||||
|
||||
def main(yaml_config: str, port: int = 5000, disable_ipv6: bool = False):
    """Load a stack run config, wire up all API routes and serve them with uvicorn.

    Args:
        yaml_config: path to a YAML file that parses into a StackRunConfig.
        port: TCP port to listen on.
        disable_ipv6: listen on "0.0.0.0" instead of "::".
    """
    with open(yaml_config, "r") as fp:
        config = StackRunConfig(**yaml.safe_load(fp))

    app = FastAPI()

    impls, specs = asyncio.run(resolve_impls(config.provider_map))
    if Api.telemetry in impls:
        setup_logger(impls[Api.telemetry])

    all_endpoints = api_endpoints()

    # With no explicit list, every API in the provider map is served.
    apis_to_serve = config.apis_to_serve or list(config.provider_map.keys())
    for api_str in apis_to_serve:
        api = Api(api_str)
        endpoints = all_endpoints[api]
        impl = impls[api]

        provider_spec = specs[api]
        if (
            isinstance(provider_spec, RemoteProviderSpec)
            and provider_spec.adapter is None
        ):
            # Remote provider without an adapter: proxy each route to the
            # provider URL unchanged.
            for endpoint in endpoints:
                url = impl.__provider_config__.url.rstrip("/") + endpoint.route
                getattr(app, endpoint.method)(endpoint.route)(
                    create_dynamic_passthrough(url)
                )
        else:
            # Otherwise bind each route to the matching method of the impl.
            for endpoint in endpoints:
                if not hasattr(impl, endpoint.name):
                    # ideally this should be a typing violation already
                    raise ValueError(
                        f"Could not find method {endpoint.name} on {impl}!!"
                    )

                impl_method = getattr(impl, endpoint.name)
                getattr(app, endpoint.method)(endpoint.route, response_model=None)(
                    create_dynamic_typed_route(impl_method, endpoint.method)
                )

    for route in app.routes:
        if isinstance(route, APIRoute):
            cprint(
                f"Serving {next(iter(route.methods))} {route.path}",
                "white",
                attrs=["bold"],
            )

    app.exception_handler(RequestValidationError)(global_exception_handler)
    app.exception_handler(Exception)(global_exception_handler)
    signal.signal(signal.SIGINT, handle_sigint)

    import uvicorn

    # FYI this does not do hot-reloads
    listen_host = "::" if not disable_ipv6 else "0.0.0.0"
    print(f"Listening on {listen_host}:{port}")
    uvicorn.run(app, host=listen_host, port=port)


# Script entry point: `fire` exposes main's parameters as command-line flags.
if __name__ == "__main__":
    fire.Fire(main)
|
42
llama_stack/core/start_conda_env.sh
Executable file
42
llama_stack/core/start_conda_env.sh
Executable file
|
@ -0,0 +1,42 @@
|
|||
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Starts the Llama Stack server inside the conda environment
# "llamastack-<build_name>". Any arguments after <port> are passed through to
# the server.

set -euo pipefail

RED='\033[0;31m'
NC='\033[0m' # No Color

error_handler() {
  echo "Error occurred in script at line: ${1}" >&2
  exit 1
}

trap 'error_handler ${LINENO}' ERR

if [ $# -lt 3 ]; then
  echo "Usage: $0 <build_name> <yaml_config> <port> <script_args...>"
  exit 1
fi

build_name="$1"
env_name="llamastack-$build_name"
shift

yaml_config="$1"
shift

port="$1"
shift

eval "$(conda shell.bash hook)"
conda deactivate && conda activate "$env_name"

# Quoted so a conda prefix containing spaces is not word-split.
"$CONDA_PREFIX/bin/python" \
  -m llama_stack.core.server \
  --yaml_config "$yaml_config" \
  --port "$port" "$@"
|
43
llama_stack/core/start_container.sh
Executable file
43
llama_stack/core/start_container.sh
Executable file
|
@ -0,0 +1,43 @@
|
|||
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Runs the Llama Stack server from the image "llamastack-<build_name>" with
# podman, publishing <port> and mounting <yaml_config> at /app/config.yaml.
# Any arguments after <port> are passed through to the server.

set -euo pipefail

RED='\033[0;31m'
NC='\033[0m' # No Color

error_handler() {
  echo "Error occurred in script at line: ${1}" >&2
  exit 1
}

trap 'error_handler ${LINENO}' ERR

if [ $# -lt 3 ]; then
  echo "Usage: $0 <build_name> <yaml_config> <port> <other_args...>"
  exit 1
fi

build_name="$1"
docker_image="llamastack-$build_name"
shift

yaml_config="$1"
shift

port="$1"
shift

set -x
# Expansions are quoted so unusual build names or ports are not word-split.
podman run -it \
  -p "$port:$port" \
  -v "$yaml_config:/app/config.yaml" \
  "$docker_image" \
  python -m llama_stack.core.server \
  --yaml_config /app/config.yaml \
  --port "$port" "$@"
|
7
llama_stack/dataset/api/__init__.py
Normal file
7
llama_stack/dataset/api/__init__.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .api import * # noqa: F401 F403
|
63
llama_stack/dataset/api/api.py
Normal file
63
llama_stack/dataset/api/api.py
Normal file
|
@ -0,0 +1,63 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Optional, Protocol
|
||||
|
||||
from llama_models.llama3.api.datatypes import URL
|
||||
|
||||
from llama_models.schema_utils import json_schema_type, webmethod
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
@json_schema_type
class TrainEvalDatasetColumnType(Enum):
    # Kinds of column a train/eval dataset may declare.
    dialog = "dialog"
    text = "text"
    media = "media"
    number = "number"
    json = "json"
|
||||
|
||||
|
||||
@json_schema_type
class TrainEvalDataset(BaseModel):
    """Dataset to be used for training or evaluating language models."""

    # TODO(ashwin): figure out if we need to add an enum for a "dataset type"

    # Maps a column name to its declared column type.
    columns: Dict[str, TrainEvalDatasetColumnType]
    # URL of the dataset content.
    content_url: URL
    # Optional free-form metadata.
    metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
@json_schema_type
class CreateDatasetRequest(BaseModel):
    """Request to create a dataset."""

    # Identifier for the dataset being created.
    uuid: str
    dataset: TrainEvalDataset
|
||||
|
||||
|
||||
class Datasets(Protocol):
    # Protocol for the datasets API: create, fetch and delete datasets by uuid.
    # Each method is exposed as a web route through @webmethod.

    @webmethod(route="/datasets/create")
    def create_dataset(
        self,
        uuid: str,
        dataset: TrainEvalDataset,
    ) -> None: ...

    @webmethod(route="/datasets/get")
    def get_dataset(
        self,
        dataset_uuid: str,
    ) -> TrainEvalDataset: ...

    @webmethod(route="/datasets/delete")
    def delete_dataset(
        self,
        dataset_uuid: str,
    ) -> None: ...
|
7
llama_stack/evaluations/api/__init__.py
Normal file
7
llama_stack/evaluations/api/__init__.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .api import * # noqa: F401 F403
|
122
llama_stack/evaluations/api/api.py
Normal file
122
llama_stack/evaluations/api/api.py
Normal file
|
@ -0,0 +1,122 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from enum import Enum
|
||||
from typing import List, Protocol
|
||||
|
||||
from llama_models.schema_utils import webmethod
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||
from llama_stack.dataset.api import * # noqa: F403
|
||||
from llama_stack.common.training_types import * # noqa: F403
|
||||
|
||||
|
||||
class TextGenerationMetric(Enum):
    # Metrics that can be requested for a text generation evaluation.
    perplexity = "perplexity"
    rouge = "rouge"
    bleu = "bleu"
|
||||
|
||||
|
||||
class QuestionAnsweringMetric(Enum):
    # Metrics that can be requested for a question answering evaluation.
    em = "em"
    f1 = "f1"
|
||||
|
||||
|
||||
class SummarizationMetric(Enum):
    # Metrics that can be requested for a summarization evaluation.
    rouge = "rouge"
    bleu = "bleu"
|
||||
|
||||
|
||||
class EvaluationJob(BaseModel):
    # Handle for an evaluation job, identified by its uuid.
    job_uuid: str
|
||||
|
||||
|
||||
class EvaluationJobLogStream(BaseModel):
    # Return type of the job log stream endpoint; carries only the job uuid.
    job_uuid: str
|
||||
|
||||
|
||||
class EvaluateTaskRequestCommon(BaseModel):
    # Fields shared by all evaluation task requests.
    job_uuid: str
    dataset: TrainEvalDataset

    checkpoint: Checkpoint

    # generation params
    sampling_params: SamplingParams = SamplingParams()
|
||||
|
||||
|
||||
@json_schema_type
class EvaluateTextGenerationRequest(EvaluateTaskRequestCommon):
    """Request to evaluate text generation."""

    # Metrics to compute for this job.
    metrics: List[TextGenerationMetric]
|
||||
|
||||
|
||||
@json_schema_type
class EvaluateQuestionAnsweringRequest(EvaluateTaskRequestCommon):
    """Request to evaluate question answering."""

    # Metrics to compute for this job.
    metrics: List[QuestionAnsweringMetric]
|
||||
|
||||
|
||||
@json_schema_type
class EvaluateSummarizationRequest(EvaluateTaskRequestCommon):
    """Request to evaluate summarization."""

    # Metrics to compute for this job.
    metrics: List[SummarizationMetric]
|
||||
|
||||
|
||||
class EvaluationJobStatusResponse(BaseModel):
    # Response of the job status endpoint; currently carries only the job uuid.
    job_uuid: str
|
||||
|
||||
|
||||
@json_schema_type
class EvaluationJobArtifactsResponse(BaseModel):
    """Artifacts of an evaluation job."""

    job_uuid: str
|
||||
|
||||
|
||||
class Evaluations(Protocol):
    # Protocol for the evaluations API: start evaluation jobs for the three
    # task kinds, then list, inspect, cancel and fetch artifacts of jobs.
    # Each method is exposed as a web route through @webmethod.

    @webmethod(route="/evaluate/text_generation/")
    def evaluate_text_generation(
        self,
        metrics: List[TextGenerationMetric],
    ) -> EvaluationJob: ...

    @webmethod(route="/evaluate/question_answering/")
    def evaluate_question_answering(
        self,
        metrics: List[QuestionAnsweringMetric],
    ) -> EvaluationJob: ...

    @webmethod(route="/evaluate/summarization/")
    def evaluate_summarization(
        self,
        metrics: List[SummarizationMetric],
    ) -> EvaluationJob: ...

    @webmethod(route="/evaluate/jobs")
    def get_evaluation_jobs(self) -> List[EvaluationJob]: ...

    @webmethod(route="/evaluate/job/status")
    def get_evaluation_job_status(
        self, job_uuid: str
    ) -> EvaluationJobStatusResponse: ...

    # sends SSE stream of logs
    @webmethod(route="/evaluate/job/logs")
    def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ...

    @webmethod(route="/evaluate/job/cancel")
    def cancel_evaluation_job(self, job_uuid: str) -> None: ...

    @webmethod(route="/evaluate/job/artifacts")
    def get_evaluation_job_artifacts(
        self, job_uuid: str
    ) -> EvaluationJobArtifactsResponse: ...
|
5
llama_stack/inference/__init__.py
Normal file
5
llama_stack/inference/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
5
llama_stack/inference/adapters/__init__.py
Normal file
5
llama_stack/inference/adapters/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
18
llama_stack/inference/adapters/fireworks/__init__.py
Normal file
18
llama_stack/inference/adapters/fireworks/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .config import FireworksImplConfig
|
||||
|
||||
|
||||
async def get_adapter_impl(config: FireworksImplConfig, _deps):
    """Build and initialize the Fireworks inference adapter.

    Args:
        config: Fireworks settings; must be a ``FireworksImplConfig``.
        _deps: Not used by this function.

    Returns:
        The ``FireworksInferenceAdapter`` after ``initialize()`` has been awaited.
    """
    # The adapter module is imported on first call rather than at module import.
    from .fireworks import FireworksInferenceAdapter

    config_ok = isinstance(config, FireworksImplConfig)
    assert config_ok, f"Unexpected config type: {type(config)}"

    adapter = FireworksInferenceAdapter(config)
    await adapter.initialize()
    return adapter
|
20
llama_stack/inference/adapters/fireworks/config.py
Normal file
20
llama_stack/inference/adapters/fireworks/config.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from llama_models.schema_utils import json_schema_type
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# Settings for the Fireworks adapter. Both fields have defaults, so the model
# can be constructed without arguments; `api_key` then is an empty string.
@json_schema_type
|
||||
class FireworksImplConfig(BaseModel):
|
||||
url: str = Field(
|
||||
default="https://api.fireworks.ai/inference",
|
||||
description="The URL for the Fireworks server",
|
||||
)
|
||||
api_key: str = Field(
|
||||
default="",
|
||||
description="The Fireworks.ai API Key",
|
||||
)
|
244
llama_stack/inference/adapters/fireworks/fireworks.py
Normal file
244
llama_stack/inference/adapters/fireworks/fireworks.py
Normal file
|
@ -0,0 +1,244 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from fireworks.client import Fireworks
|
||||
from llama_models.llama3.api.chat_format import ChatFormat
|
||||
|
||||
from llama_models.llama3.api.datatypes import Message, StopReason
|
||||
from llama_models.llama3.api.tokenizer import Tokenizer
|
||||
from llama_models.sku_list import resolve_model
|
||||
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
from llama_stack.inference.prepare_messages import prepare_messages
|
||||
|
||||
from .config import FireworksImplConfig
|
||||
|
||||
# Maps the model descriptor produced by
# `model.descriptor(shorten_default_variant=True)` to the model name that is
# sent to Fireworks. Descriptors missing from this table are rejected.
FIREWORKS_SUPPORTED_MODELS = {
|
||||
"Meta-Llama3.1-8B-Instruct": "fireworks/llama-v3p1-8b-instruct",
|
||||
"Meta-Llama3.1-70B-Instruct": "fireworks/llama-v3p1-70b-instruct",
|
||||
"Meta-Llama3.1-405B-Instruct": "fireworks/llama-v3p1-405b-instruct",
|
||||
}
|
||||
|
||||
|
||||
class FireworksInferenceAdapter(Inference):
    """Inference provider that forwards chat completions to Fireworks.

    Messages are prepared with ``prepare_messages``, converted to role/content
    dicts and sent through the Fireworks client. Replies are decoded with the
    llama3 ``ChatFormat`` so tool calls can be extracted from the text.
    """

    def __init__(self, config: FireworksImplConfig) -> None:
        self.config = config
        self.formatter = ChatFormat(Tokenizer.get_instance())

    @property
    def client(self) -> Fireworks:
        """Return a Fireworks client built from the configured API key.

        A new client object is constructed on every access.
        """
        return Fireworks(api_key=self.config.api_key)

    async def initialize(self) -> None:
        """Nothing to set up for this adapter."""
        return None

    async def shutdown(self) -> None:
        """Nothing to tear down for this adapter."""
        return None

    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
        """Plain completion is not implemented; always raises NotImplementedError."""
        raise NotImplementedError()

    def _messages_to_fireworks_messages(self, messages: list[Message]) -> list:
        """Convert messages to ``{"role", "content"}`` dicts.

        The ``ipython`` role is renamed to ``tool``; other roles pass through.
        """
        return [
            {
                "role": "tool" if msg.role == "ipython" else msg.role,
                "content": msg.content,
            }
            for msg in messages
        ]

    def resolve_fireworks_model(self, model_name: str) -> str:
        """Translate a model name into its Fireworks model identifier.

        Raises:
            AssertionError: if the name does not resolve to a descriptor listed
                in ``FIREWORKS_SUPPORTED_MODELS``.
        """
        sku = resolve_model(model_name)
        supported = (
            sku is not None
            and sku.descriptor(shorten_default_variant=True)
            in FIREWORKS_SUPPORTED_MODELS
        )
        assert (
            supported
        ), f"Unsupported model: {model_name}, use one of the supported models: {','.join(FIREWORKS_SUPPORTED_MODELS.keys())}"

        descriptor = sku.descriptor(shorten_default_variant=True)
        return FIREWORKS_SUPPORTED_MODELS.get(descriptor)

    def get_fireworks_chat_options(self, request: ChatCompletionRequest) -> dict:
        """Collect the truthy sampling parameters to pass along as keyword options."""
        params = request.sampling_params
        if params is None:
            return {}

        chosen = {}
        for name in {"temperature", "top_p", "top_k", "max_tokens"}:
            value = getattr(params, name)
            # Falsy values (0, 0.0, None) are left out of the options.
            if value:
                chosen[name] = value
        return chosen

    async def chat_completion(
        self,
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        tools: Optional[List[ToolDefinition]] = list(),
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncGenerator:
        """Run a chat completion against Fireworks.

        Without streaming, a single ``ChatCompletionResponse`` is yielded. With
        streaming, the generator yields a ``start`` chunk, ``progress`` chunks
        (plain text or ``ToolCallDelta``), the parsed tool calls, and finally a
        ``complete`` chunk.
        """
        request = ChatCompletionRequest(
            model=model,
            messages=messages,
            sampling_params=sampling_params,
            tools=tools,
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            stream=stream,
            logprobs=logprobs,
        )

        messages = prepare_messages(request)

        # accumulate sampling params and other options to pass to fireworks
        options = self.get_fireworks_chat_options(request)
        fireworks_model = self.resolve_fireworks_model(request.model)

        if not request.stream:
            reply = await self.client.chat.completions.acreate(
                model=fireworks_model,
                messages=self._messages_to_fireworks_messages(messages),
                stream=False,
                **options,
            )
            stop_reason = None
            if reply.choices[0].finish_reason:
                if reply.choices[0].finish_reason == "stop":
                    stop_reason = StopReason.end_of_turn
                elif reply.choices[0].finish_reason == "length":
                    stop_reason = StopReason.out_of_tokens

            completion_message = self.formatter.decode_assistant_message_from_content(
                reply.choices[0].message.content, stop_reason
            )

            yield ChatCompletionResponse(
                completion_message=completion_message,
                logprobs=None,
            )
            return

        yield ChatCompletionResponseStreamChunk(
            event=ChatCompletionResponseEvent(
                event_type=ChatCompletionResponseEventType.start,
                delta="",
            )
        )

        buffer = ""
        ipython = False
        stop_reason = None

        async for chunk in self.client.chat.completions.acreate(
            model=fireworks_model,
            messages=self._messages_to_fireworks_messages(messages),
            stream=True,
            **options,
        ):
            if chunk.choices[0].finish_reason:
                # A stop reason already taken from an end marker is kept.
                if stop_reason is None:
                    if chunk.choices[0].finish_reason == "stop":
                        stop_reason = StopReason.end_of_turn
                    elif chunk.choices[0].finish_reason == "length":
                        stop_reason = StopReason.out_of_tokens
                break

            text = chunk.choices[0].delta.content
            if text is None:
                continue

            # check if it's a tool call ( aka starts with <|python_tag|> )
            if not ipython and text.startswith("<|python_tag|>"):
                ipython = True
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.progress,
                        delta=ToolCallDelta(
                            content="",
                            parse_status=ToolCallParseStatus.started,
                        ),
                    )
                )
                buffer += text
                continue

            if ipython:
                # End markers set the stop reason and are not forwarded.
                if text == "<|eot_id|>":
                    stop_reason = StopReason.end_of_turn
                    continue
                elif text == "<|eom_id|>":
                    stop_reason = StopReason.end_of_message
                    continue

                buffer += text
                delta = ToolCallDelta(
                    content=text,
                    parse_status=ToolCallParseStatus.in_progress,
                )
            else:
                buffer += text
                delta = text

            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.progress,
                    delta=delta,
                    stop_reason=stop_reason,
                )
            )

        # parse tool calls and report errors
        message = self.formatter.decode_assistant_message_from_content(
            buffer, stop_reason
        )
        no_tool_calls = len(message.tool_calls) == 0
        if ipython and no_tool_calls:
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.progress,
                    delta=ToolCallDelta(
                        content="",
                        parse_status=ToolCallParseStatus.failure,
                    ),
                    stop_reason=stop_reason,
                )
            )

        for tool_call in message.tool_calls:
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.progress,
                    delta=ToolCallDelta(
                        content=tool_call,
                        parse_status=ToolCallParseStatus.success,
                    ),
                    stop_reason=stop_reason,
                )
            )

        yield ChatCompletionResponseStreamChunk(
            event=ChatCompletionResponseEvent(
                event_type=ChatCompletionResponseEventType.complete,
                delta="",
                stop_reason=stop_reason,
            )
        )
|
15
llama_stack/inference/adapters/ollama/__init__.py
Normal file
15
llama_stack/inference/adapters/ollama/__init__.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from llama_stack.core.datatypes import RemoteProviderConfig
|
||||
|
||||
|
||||
async def get_adapter_impl(config: RemoteProviderConfig, _deps):
    """Build and initialize the Ollama inference adapter.

    Args:
        config: Remote provider settings; only ``config.url`` is read.
        _deps: Not used by this function.

    Returns:
        The ``OllamaInferenceAdapter`` after ``initialize()`` has been awaited.
    """
    # The adapter module is imported on first call rather than at module import.
    from .ollama import OllamaInferenceAdapter

    adapter = OllamaInferenceAdapter(config.url)
    await adapter.initialize()
    return adapter
|
260
llama_stack/inference/adapters/ollama/ollama.py
Normal file
260
llama_stack/inference/adapters/ollama/ollama.py
Normal file
|
@ -0,0 +1,260 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import httpx
|
||||
|
||||
from llama_models.llama3.api.chat_format import ChatFormat
|
||||
from llama_models.llama3.api.datatypes import Message, StopReason
|
||||
from llama_models.llama3.api.tokenizer import Tokenizer
|
||||
from llama_models.sku_list import resolve_model
|
||||
from ollama import AsyncClient
|
||||
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
from llama_stack.inference.prepare_messages import prepare_messages
|
||||
|
||||
# TODO: Eventually this will move to the llama cli model list command
|
||||
# mapping of Model SKUs to ollama models
# Keys are the descriptors checked in `resolve_ollama_model`; values are the
# model names handed to the ollama client.
|
||||
OLLAMA_SUPPORTED_SKUS = {
|
||||
# "Meta-Llama3.1-8B-Instruct": "llama3.1",
|
||||
"Meta-Llama3.1-8B-Instruct": "llama3.1:8b-instruct-fp16",
|
||||
"Meta-Llama3.1-70B-Instruct": "llama3.1:70b-instruct-fp16",
|
||||
}
|
||||
|
||||
|
||||
class OllamaInferenceAdapter(Inference):
    """Inference provider that forwards chat completions to an Ollama server.

    Messages are prepared with ``prepare_messages``, converted to role/content
    dicts and sent through ``ollama.AsyncClient``. Replies are decoded with the
    llama3 ``ChatFormat`` so tool calls can be extracted from the text.
    """

    def __init__(self, url: str) -> None:
        self.url = url
        tokenizer = Tokenizer.get_instance()
        self.formatter = ChatFormat(tokenizer)

    @property
    def client(self) -> AsyncClient:
        """Return an ``AsyncClient`` for the configured URL (new object per access)."""
        return AsyncClient(host=self.url)

    async def initialize(self) -> None:
        """Check that the Ollama server is reachable.

        Raises:
            RuntimeError: if the connection attempt fails.
        """
        try:
            await self.client.ps()
        except httpx.ConnectError as e:
            raise RuntimeError(
                "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
            ) from e

    async def shutdown(self) -> None:
        """Nothing to tear down for this adapter."""
        pass

    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
        """Plain completion is not implemented; always raises NotImplementedError."""
        raise NotImplementedError()

    def _messages_to_ollama_messages(self, messages: list[Message]) -> list:
        """Convert messages to ``{"role", "content"}`` dicts.

        The ``ipython`` role is renamed to ``tool``; other roles pass through.
        """
        ollama_messages = []
        for message in messages:
            if message.role == "ipython":
                role = "tool"
            else:
                role = message.role
            ollama_messages.append({"role": role, "content": message.content})

        return ollama_messages

    def resolve_ollama_model(self, model_name: str) -> str:
        """Translate a model name into the model name used with Ollama.

        Raises:
            AssertionError: if the name does not resolve to a descriptor listed
                in ``OLLAMA_SUPPORTED_SKUS``.
        """
        model = resolve_model(model_name)
        assert (
            model is not None
            and model.descriptor(shorten_default_variant=True) in OLLAMA_SUPPORTED_SKUS
        ), f"Unsupported model: {model_name}, use one of the supported models: {','.join(OLLAMA_SUPPORTED_SKUS.keys())}"

        return OLLAMA_SUPPORTED_SKUS.get(model.descriptor(shorten_default_variant=True))

    def get_ollama_chat_options(self, request: ChatCompletionRequest) -> dict:
        """Collect sampling parameters to send as the Ollama ``options`` dict.

        Falsy values are omitted. ``repetition_penalty`` is forwarded under the
        key ``repeat_penalty`` when it is set and differs from 1.0.
        """
        options = {}
        if request.sampling_params is not None:
            for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
                if getattr(request.sampling_params, attr):
                    options[attr] = getattr(request.sampling_params, attr)
            if (
                request.sampling_params.repetition_penalty is not None
                and request.sampling_params.repetition_penalty != 1.0
            ):
                options["repeat_penalty"] = request.sampling_params.repetition_penalty

        return options

    async def chat_completion(
        self,
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        tools: Optional[List[ToolDefinition]] = list(),
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncGenerator:
        """Run a chat completion against Ollama, pulling the model if needed.

        Without streaming, a single ``ChatCompletionResponse`` is yielded. With
        streaming, the generator yields a ``start`` chunk, ``progress`` chunks
        (plain text or ``ToolCallDelta``), the parsed tool calls, and finally a
        ``complete`` chunk.

        Raises:
            AssertionError: if the model is unsupported or the pull does not
                report ``"success"``.
        """
        request = ChatCompletionRequest(
            model=model,
            messages=messages,
            sampling_params=sampling_params,
            tools=tools,
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            stream=stream,
            logprobs=logprobs,
        )

        messages = prepare_messages(request)
        # accumulate sampling params and other options to pass to ollama
        options = self.get_ollama_chat_options(request)
        ollama_model = self.resolve_ollama_model(request.model)

        # Pull the model unless `ps()` already lists it.
        res = await self.client.ps()
        need_model_pull = not any(ollama_model == r["model"] for r in res["models"])

        if need_model_pull:
            print(f"Pulling model: {ollama_model}")
            status = await self.client.pull(ollama_model)
            # The message names `ollama_model`; this class has no `self.model`.
            assert (
                status["status"] == "success"
            ), f"Failed to pull model {ollama_model} in ollama"

        if not request.stream:
            r = await self.client.chat(
                model=ollama_model,
                messages=self._messages_to_ollama_messages(messages),
                stream=False,
                options=options,
            )
            stop_reason = None
            if r["done"]:
                if r["done_reason"] == "stop":
                    stop_reason = StopReason.end_of_turn
                elif r["done_reason"] == "length":
                    stop_reason = StopReason.out_of_tokens

            completion_message = self.formatter.decode_assistant_message_from_content(
                r["message"]["content"], stop_reason
            )
            yield ChatCompletionResponse(
                completion_message=completion_message,
                logprobs=None,
            )
        else:
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.start,
                    delta="",
                )
            )
            # Named `chunks` so the `stream` argument is not rebound.
            chunks = await self.client.chat(
                model=ollama_model,
                messages=self._messages_to_ollama_messages(messages),
                stream=True,
                options=options,
            )

            buffer = ""
            ipython = False
            stop_reason = None

            async for chunk in chunks:
                if chunk["done"]:
                    # A stop reason already taken from an end marker is kept.
                    if stop_reason is None and chunk["done_reason"] == "stop":
                        stop_reason = StopReason.end_of_turn
                    elif stop_reason is None and chunk["done_reason"] == "length":
                        stop_reason = StopReason.out_of_tokens
                    break

                text = chunk["message"]["content"]

                # check if it's a tool call ( aka starts with <|python_tag|> )
                if not ipython and text.startswith("<|python_tag|>"):
                    ipython = True
                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=ToolCallDelta(
                                content="",
                                parse_status=ToolCallParseStatus.started,
                            ),
                        )
                    )
                    buffer += text
                    continue

                if ipython:
                    # End markers set the stop reason and are not forwarded.
                    if text == "<|eot_id|>":
                        stop_reason = StopReason.end_of_turn
                        continue
                    elif text == "<|eom_id|>":
                        stop_reason = StopReason.end_of_message
                        continue

                    buffer += text
                    delta = ToolCallDelta(
                        content=text,
                        parse_status=ToolCallParseStatus.in_progress,
                    )

                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=delta,
                            stop_reason=stop_reason,
                        )
                    )
                else:
                    buffer += text
                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=text,
                            stop_reason=stop_reason,
                        )
                    )

            # parse tool calls and report errors
            message = self.formatter.decode_assistant_message_from_content(
                buffer, stop_reason
            )
            parsed_tool_calls = len(message.tool_calls) > 0
            if ipython and not parsed_tool_calls:
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.progress,
                        delta=ToolCallDelta(
                            content="",
                            parse_status=ToolCallParseStatus.failure,
                        ),
                        stop_reason=stop_reason,
                    )
                )

            for tool_call in message.tool_calls:
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.progress,
                        delta=ToolCallDelta(
                            content=tool_call,
                            parse_status=ToolCallParseStatus.success,
                        ),
                        stop_reason=stop_reason,
                    )
                )

            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.complete,
                    delta="",
                    stop_reason=stop_reason,
                )
            )
|
24
llama_stack/inference/adapters/tgi/__init__.py
Normal file
24
llama_stack/inference/adapters/tgi/__init__.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .config import TGIImplConfig
|
||||
from .tgi import InferenceEndpointAdapter, TGIAdapter
|
||||
|
||||
|
||||
async def get_adapter_impl(config: TGIImplConfig, _deps):
    """Build and initialize the adapter matching the TGI configuration.

    A configured ``url`` selects ``TGIAdapter``; otherwise a configured
    Hugging Face endpoint name selects ``InferenceEndpointAdapter``.

    Args:
        config: TGI settings; must be a ``TGIImplConfig``.
        _deps: Not used by this function.

    Raises:
        ValueError: if neither a URL nor an endpoint name is configured.
    """
    config_ok = isinstance(config, TGIImplConfig)
    assert config_ok, f"Unexpected config type: {type(config)}"

    adapter_cls = None
    if config.url is not None:
        adapter_cls = TGIAdapter
    elif config.is_inference_endpoint():
        adapter_cls = InferenceEndpointAdapter

    if adapter_cls is None:
        raise ValueError(
            "Invalid configuration. Specify either an URL or HF Inference Endpoint details (namespace and endpoint name)."
        )

    adapter = adapter_cls(config)
    await adapter.initialize()
    return adapter
|
29
llama_stack/inference/adapters/tgi/config.py
Normal file
29
llama_stack/inference/adapters/tgi/config.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from llama_models.schema_utils import json_schema_type
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# Settings for the TGI adapters. Every field is optional and defaults to None.
@json_schema_type
|
||||
class TGIImplConfig(BaseModel):
|
||||
url: Optional[str] = Field(
|
||||
default=None,
|
||||
description="The URL for the local TGI endpoint (e.g., http://localhost:8080)",
|
||||
)
|
||||
api_token: Optional[str] = Field(
|
||||
default=None,
|
||||
description="The HF token for Hugging Face Inference Endpoints (will default to locally saved token if not provided)",
|
||||
)
|
||||
hf_endpoint_name: Optional[str] = Field(
|
||||
default=None,
|
||||
description="The name of the Hugging Face Inference Endpoint : can be either in the format of '{namespace}/{endpoint_name}' (namespace can be the username or organization name) or just '{endpoint_name}' if logged into the same account as the namespace",
|
||||
)
|
||||
|
||||
# True when an endpoint name has been configured (hf_endpoint_name is not None).
def is_inference_endpoint(self) -> bool:
|
||||
return self.hf_endpoint_name is not None
|
295
llama_stack/inference/adapters/tgi/tgi.py
Normal file
295
llama_stack/inference/adapters/tgi/tgi.py
Normal file
|
@ -0,0 +1,295 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
|
||||
from typing import Any, AsyncGenerator, Dict
|
||||
|
||||
import requests
|
||||
|
||||
from huggingface_hub import HfApi, InferenceClient
|
||||
from llama_models.llama3.api.chat_format import ChatFormat
|
||||
from llama_models.llama3.api.datatypes import StopReason
|
||||
from llama_models.llama3.api.tokenizer import Tokenizer
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
from llama_stack.inference.prepare_messages import prepare_messages
|
||||
|
||||
from .config import TGIImplConfig
|
||||
|
||||
# Maps supported model names to Hugging Face repository ids. `initialize`
# matches the served `model_id` against the values to find the model name.
HF_SUPPORTED_MODELS = {
|
||||
"Meta-Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"Meta-Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"Meta-Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
||||
}
|
||||
|
||||
|
||||
class TGIAdapter(Inference):
    """Inference provider that talks to a text-generation-inference server.

    The dialog is encoded to a prompt with the llama3 ``ChatFormat`` and sent
    through ``InferenceClient.text_generation``; the reply is decoded back into
    llama-stack response objects, including tool calls.
    """

    def __init__(self, config: TGIImplConfig) -> None:
        self.config = config
        self.tokenizer = Tokenizer.get_instance()
        self.formatter = ChatFormat(self.tokenizer)

    @property
    def client(self) -> InferenceClient:
        """Return an ``InferenceClient`` for the configured URL (new object per access)."""
        return InferenceClient(model=self.config.url, token=self.config.api_token)

    def _get_endpoint_info(self) -> Dict[str, Any]:
        """Return the server's endpoint info plus ``inference_url`` (the configured URL)."""
        return {
            **self.client.get_endpoint_info(),
            "inference_url": self.config.url,
        }

    async def initialize(self) -> None:
        """Read endpoint info and record the served model and token limit.

        Sets ``self.max_tokens``, ``self.model_name`` and ``self.inference_url``.

        Raises:
            RuntimeError: on any failure, including missing fields or a served
                model that is not in ``HF_SUPPORTED_MODELS``. The traceback is
                printed before the error is re-raised.
        """
        try:
            info = self._get_endpoint_info()
            if "model_id" not in info:
                raise RuntimeError("Missing model_id in model info")
            if "max_total_tokens" not in info:
                raise RuntimeError("Missing max_total_tokens in model info")
            self.max_tokens = info["max_total_tokens"]

            model_id = info["model_id"]
            # Reverse lookup: find the model name whose repository id is served.
            model_name = next(
                (
                    name
                    for name, hf_id in HF_SUPPORTED_MODELS.items()
                    if hf_id == model_id
                ),
                None,
            )
            if model_name is None:
                raise RuntimeError(
                    f"TGI is serving model: {model_id}, use one of the supported models: {', '.join(HF_SUPPORTED_MODELS.values())}"
                )
            self.model_name = model_name
            self.inference_url = info["inference_url"]
        except Exception as e:
            import traceback

            traceback.print_exc()
            raise RuntimeError(f"Error initializing TGIAdapter: {e}") from e

    async def shutdown(self) -> None:
        """Nothing to tear down for this adapter."""
        pass

    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
        """Plain completion is not implemented; always raises NotImplementedError."""
        raise NotImplementedError()

    def get_chat_options(self, request: ChatCompletionRequest) -> dict:
        """Collect the truthy sampling parameters for ``text_generation``.

        ``max_tokens`` is deliberately not included: ``chat_completion`` passes
        the generation limit as ``max_new_tokens``, and ``text_generation`` has
        no ``max_tokens`` parameter, so forwarding it would raise ``TypeError``.
        """
        options = {}
        if request.sampling_params is not None:
            for attr in ("temperature", "top_p", "top_k"):
                value = getattr(request.sampling_params, attr)
                # Falsy values (0, 0.0, None) are left out of the options.
                if value:
                    options[attr] = value

        return options

    async def chat_completion(
        self,
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        tools: Optional[List[ToolDefinition]] = list(),
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncGenerator:
        """Run a chat completion against the TGI endpoint.

        Without streaming, a single ``ChatCompletionResponse`` is yielded. With
        streaming, the generator yields a ``start`` chunk, ``progress`` chunks
        (plain text or ``ToolCallDelta``), the parsed tool calls, and finally a
        ``complete`` chunk.

        Raises:
            AssertionError: if ``model`` is not the model found by ``initialize``.
        """
        request = ChatCompletionRequest(
            model=model,
            messages=messages,
            sampling_params=sampling_params,
            tools=tools,
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            stream=stream,
            logprobs=logprobs,
        )

        messages = prepare_messages(request)
        model_input = self.formatter.encode_dialog_prompt(messages)
        prompt = self.tokenizer.decode(model_input.tokens)

        input_tokens = len(model_input.tokens)
        # Use the requested limit when set, but never more than what is left of
        # the endpoint's total budget after the prompt (minus one token).
        max_new_tokens = min(
            request.sampling_params.max_tokens or (self.max_tokens - input_tokens),
            self.max_tokens - input_tokens - 1,
        )

        print(f"Calculated max_new_tokens: {max_new_tokens}")

        assert (
            request.model == self.model_name
        ), f"Model mismatch, expected {self.model_name}, got {request.model}"

        options = self.get_chat_options(request)
        if not request.stream:
            response = self.client.text_generation(
                prompt=prompt,
                stream=False,
                details=True,
                max_new_tokens=max_new_tokens,
                stop_sequences=["<|eom_id|>", "<|eot_id|>"],
                **options,
            )
            stop_reason = None
            if response.details.finish_reason:
                if response.details.finish_reason == "stop":
                    stop_reason = StopReason.end_of_turn
                elif response.details.finish_reason == "length":
                    stop_reason = StopReason.out_of_tokens

            completion_message = self.formatter.decode_assistant_message_from_content(
                response.generated_text,
                stop_reason,
            )
            yield ChatCompletionResponse(
                completion_message=completion_message,
                logprobs=None,
            )

        else:
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.start,
                    delta="",
                )
            )
            buffer = ""
            ipython = False
            stop_reason = None
            tokens = []

            for response in self.client.text_generation(
                prompt=prompt,
                stream=True,
                details=True,
                max_new_tokens=max_new_tokens,
                stop_sequences=["<|eom_id|>", "<|eot_id|>"],
                **options,
            ):
                token_result = response.token

                buffer += token_result.text
                tokens.append(token_result.id)

                # A leading <|python_tag|> marks the reply as a tool call.
                if not ipython and buffer.startswith("<|python_tag|>"):
                    ipython = True
                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=ToolCallDelta(
                                content="",
                                parse_status=ToolCallParseStatus.started,
                            ),
                        )
                    )
                    buffer = buffer[len("<|python_tag|>") :]
                    continue

                # End markers set the stop reason and carry no text.
                if token_result.text == "<|eot_id|>":
                    stop_reason = StopReason.end_of_turn
                    text = ""
                elif token_result.text == "<|eom_id|>":
                    stop_reason = StopReason.end_of_message
                    text = ""
                else:
                    text = token_result.text

                if ipython:
                    delta = ToolCallDelta(
                        content=text,
                        parse_status=ToolCallParseStatus.in_progress,
                    )
                else:
                    delta = text

                # Once a stop reason is known no further progress is emitted.
                if stop_reason is None:
                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=delta,
                            stop_reason=stop_reason,
                        )
                    )

            # No end marker seen: report the stream as having run out of tokens.
            if stop_reason is None:
                stop_reason = StopReason.out_of_tokens

            # parse tool calls and report errors
            message = self.formatter.decode_assistant_message(tokens, stop_reason)
            parsed_tool_calls = len(message.tool_calls) > 0
            if ipython and not parsed_tool_calls:
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.progress,
                        delta=ToolCallDelta(
                            content="",
                            parse_status=ToolCallParseStatus.failure,
                        ),
                        stop_reason=stop_reason,
                    )
                )

            for tool_call in message.tool_calls:
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.progress,
                        delta=ToolCallDelta(
                            content=tool_call,
                            parse_status=ToolCallParseStatus.success,
                        ),
                        stop_reason=stop_reason,
                    )
                )

            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.complete,
                    delta="",
                    stop_reason=stop_reason,
                )
            )
|
||||
|
||||
|
||||
class InferenceEndpointAdapter(TGIAdapter):
    """TGI adapter for a Hugging Face Inference Endpoint.

    ``config.url`` is replaced with the endpoint's management API URL, which is
    queried during ``initialize`` for the inference URL, model repository and
    token limit. Generation requests then go to the inference URL.
    """

    # Seconds to wait for the endpoint-info request before giving up, so that
    # an unresponsive API cannot block initialization forever.
    _ENDPOINT_INFO_TIMEOUT_S = 30

    def __init__(self, config: TGIImplConfig) -> None:
        super().__init__(config)
        self.config.url = self._construct_endpoint_url()

    def _construct_endpoint_url(self) -> str:
        """Build the management API URL from ``config.hf_endpoint_name``.

        A bare endpoint name is prefixed with the namespace from ``get_namespace``.

        Raises:
            AssertionError: if the name contains more than one ``/``.
        """
        hf_endpoint_name = self.config.hf_endpoint_name
        assert hf_endpoint_name.count("/") <= 1, (
            "Endpoint name must be in the format of 'namespace/endpoint_name' "
            "or 'endpoint_name'"
        )
        if "/" not in hf_endpoint_name:
            hf_namespace: str = self.get_namespace()
            endpoint_path = f"{hf_namespace}/{hf_endpoint_name}"
        else:
            endpoint_path = hf_endpoint_name
        return f"https://api.endpoints.huggingface.cloud/v2/endpoint/{endpoint_path}"

    def get_namespace(self) -> str:
        """Return the account name reported by ``whoami``.

        The configured ``api_token`` is passed to ``HfApi`` so the namespace
        belongs to the same account used for the endpoint requests. With no
        token configured, ``HfApi`` is constructed as before.
        """
        return HfApi(token=self.config.api_token).whoami()["name"]

    @property
    def client(self) -> InferenceClient:
        """Return an ``InferenceClient`` for ``self.inference_url``.

        ``inference_url`` is assigned in ``initialize``, so this property can
        only be used after initialization.
        """
        return InferenceClient(model=self.inference_url, token=self.config.api_token)

    def _get_endpoint_info(self) -> Dict[str, Any]:
        """Fetch endpoint metadata from the management API.

        Returns:
            A dict with ``inference_url``, ``model_id`` and ``max_total_tokens``.

        Raises:
            requests.HTTPError: if the API answers with an error status.
            requests.Timeout: if no response arrives within the timeout.
        """
        headers = {
            "accept": "application/json",
            "authorization": f"Bearer {self.config.api_token}",
        }
        response = requests.get(
            self.config.url,
            headers=headers,
            timeout=self._ENDPOINT_INFO_TIMEOUT_S,
        )
        response.raise_for_status()
        endpoint_info = response.json()
        return {
            "inference_url": endpoint_info["status"]["url"],
            "model_id": endpoint_info["model"]["repository"],
            "max_total_tokens": int(
                endpoint_info["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"]
            ),
        }

    async def initialize(self) -> None:
        """Initialize using the parent implementation."""
        await super().initialize()
|
18
llama_stack/inference/adapters/together/__init__.py
Normal file
18
llama_stack/inference/adapters/together/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .config import TogetherImplConfig
|
||||
|
||||
|
||||
async def get_adapter_impl(config: TogetherImplConfig, _deps):
    """Build a Together inference adapter for ``config`` and initialize it.

    ``_deps`` is accepted but not used here.
    """
    # The adapter module is imported only when an adapter is requested.
    from .together import TogetherInferenceAdapter

    assert isinstance(
        config, TogetherImplConfig
    ), f"Unexpected config type: {type(config)}"

    adapter = TogetherInferenceAdapter(config)
    await adapter.initialize()
    return adapter
|
20
llama_stack/inference/adapters/together/config.py
Normal file
20
llama_stack/inference/adapters/together/config.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from llama_models.schema_utils import json_schema_type
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# Connection settings for the Together AI inference adapter.
@json_schema_type
class TogetherImplConfig(BaseModel):
    # Base URL of the Together AI API.
    url: str = Field(
        default="https://api.together.xyz/v1",
        description="The URL for the Together AI server",
    )
    # API key handed to the Together client; empty by default.
    api_key: str = Field(
        default="",
        description="The Together AI API Key",
    )
|
251
llama_stack/inference/adapters/together/together.py
Normal file
251
llama_stack/inference/adapters/together/together.py
Normal file
|
@ -0,0 +1,251 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from llama_models.llama3.api.chat_format import ChatFormat
|
||||
|
||||
from llama_models.llama3.api.datatypes import Message, StopReason
|
||||
from llama_models.llama3.api.tokenizer import Tokenizer
|
||||
from llama_models.sku_list import resolve_model
|
||||
from together import Together
|
||||
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
from llama_stack.inference.prepare_messages import prepare_messages
|
||||
|
||||
from .config import TogetherImplConfig
|
||||
|
||||
# Maps a model descriptor (as produced by
# `descriptor(shorten_default_variant=True)`) to the model name used on
# Together. Only descriptors listed here are accepted by the adapter.
TOGETHER_SUPPORTED_MODELS = {
    "Meta-Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    "Meta-Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    "Meta-Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
}
|
||||
|
||||
|
||||
class TogetherInferenceAdapter(Inference):
    """Inference implementation that forwards chat completions to Together."""

    def __init__(self, config: TogetherImplConfig) -> None:
        self.config = config
        self.formatter = ChatFormat(Tokenizer.get_instance())

    @property
    def client(self) -> Together:
        # A new client object is built on every access.
        return Together(api_key=self.config.api_key)

    async def initialize(self) -> None:
        return

    async def shutdown(self) -> None:
        pass

    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
        raise NotImplementedError()

    def _messages_to_together_messages(self, messages: list[Message]) -> list:
        """Convert messages to role/content dicts, renaming role ``ipython`` to ``tool``."""
        return [
            {
                "role": "tool" if message.role == "ipython" else message.role,
                "content": message.content,
            }
            for message in messages
        ]

    def resolve_together_model(self, model_name: str) -> str:
        """Return the Together model name for ``model_name``.

        Asserts that the name resolves to a model whose shortened descriptor
        is a key of ``TOGETHER_SUPPORTED_MODELS``.
        """
        resolved = resolve_model(model_name)
        is_supported = (
            resolved is not None
            and resolved.descriptor(shorten_default_variant=True)
            in TOGETHER_SUPPORTED_MODELS
        )
        assert (
            is_supported
        ), f"Unsupported model: {model_name}, use one of the supported models: {','.join(TOGETHER_SUPPORTED_MODELS.keys())}"

        return TOGETHER_SUPPORTED_MODELS.get(
            resolved.descriptor(shorten_default_variant=True)
        )

    def get_together_chat_options(self, request: ChatCompletionRequest) -> dict:
        """Collect the truthy sampling parameters of ``request`` as keyword options.

        Parameters whose value is falsy (``None``, ``0``, ``0.0``) are left out.
        """
        sampling_params = request.sampling_params
        if sampling_params is None:
            return {}

        candidates = (
            (name, getattr(sampling_params, name))
            for name in ("temperature", "top_p", "top_k", "max_tokens")
        )
        return {name: value for name, value in candidates if value}

    @staticmethod
    def _stop_reason_for_finish_reason(finish_reason):
        """Map a Together ``finish_reason`` to a ``StopReason``; ``None`` if unmapped."""
        if finish_reason == "stop" or finish_reason == "eos":
            return StopReason.end_of_turn
        if finish_reason == "length":
            return StopReason.out_of_tokens
        return None

    async def chat_completion(
        self,
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        tools: Optional[List[ToolDefinition]] = list(),
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncGenerator:
        """Run a chat completion on Together and yield the result.

        Without streaming a single ``ChatCompletionResponse`` is yielded.
        With streaming, ``ChatCompletionResponseStreamChunk`` objects are
        yielded: a ``start`` event, ``progress`` events for text and tool-call
        deltas, and a final ``complete`` event.
        """
        # wrapper request to make it easier to pass around (internal only, not exposed to API)
        request = ChatCompletionRequest(
            model=model,
            messages=messages,
            sampling_params=sampling_params,
            tools=tools,
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            stream=stream,
            logprobs=logprobs,
        )

        # accumulate sampling params and other options to pass to together
        options = self.get_together_chat_options(request)
        together_model = self.resolve_together_model(request.model)
        messages = prepare_messages(request)

        if not request.stream:
            # TODO: might need to add back an async here
            r = self.client.chat.completions.create(
                model=together_model,
                messages=self._messages_to_together_messages(messages),
                stream=False,
                **options,
            )
            choice = r.choices[0]
            stop_reason = None
            if choice.finish_reason:
                stop_reason = self._stop_reason_for_finish_reason(
                    choice.finish_reason
                )

            completion_message = self.formatter.decode_assistant_message_from_content(
                choice.message.content, stop_reason
            )
            yield ChatCompletionResponse(
                completion_message=completion_message,
                logprobs=None,
            )
        else:
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.start,
                    delta="",
                )
            )

            buffer = ""
            ipython = False
            stop_reason = None

            for chunk in self.client.chat.completions.create(
                model=together_model,
                messages=self._messages_to_together_messages(messages),
                stream=True,
                **options,
            ):
                finish_reason = chunk.choices[0].finish_reason
                if finish_reason:
                    # A stop reason already derived from an end-of-turn /
                    # end-of-message marker is not overwritten.
                    if stop_reason is None:
                        stop_reason = self._stop_reason_for_finish_reason(
                            finish_reason
                        )
                    break

                text = chunk.choices[0].delta.content
                if text is None:
                    continue

                # check if it's a tool call (i.e. starts with <|python_tag|>)
                if not ipython and text.startswith("<|python_tag|>"):
                    ipython = True
                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=ToolCallDelta(
                                content="",
                                parse_status=ToolCallParseStatus.started,
                            ),
                        )
                    )
                    buffer += text
                    continue

                if ipython:
                    # The end markers only set the stop reason; they are
                    # neither buffered nor emitted.
                    if text == "<|eot_id|>":
                        stop_reason = StopReason.end_of_turn
                        continue
                    if text == "<|eom_id|>":
                        stop_reason = StopReason.end_of_message
                        continue

                buffer += text
                if ipython:
                    delta = ToolCallDelta(
                        content=text,
                        parse_status=ToolCallParseStatus.in_progress,
                    )
                else:
                    delta = text

                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.progress,
                        delta=delta,
                        stop_reason=stop_reason,
                    )
                )

            # parse tool calls and report errors
            message = self.formatter.decode_assistant_message_from_content(
                buffer, stop_reason
            )
            parsed_tool_calls = len(message.tool_calls) > 0
            if ipython and not parsed_tool_calls:
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.progress,
                        delta=ToolCallDelta(
                            content="",
                            parse_status=ToolCallParseStatus.failure,
                        ),
                        stop_reason=stop_reason,
                    )
                )

            for tool_call in message.tool_calls:
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.progress,
                        delta=ToolCallDelta(
                            content=tool_call,
                            parse_status=ToolCallParseStatus.success,
                        ),
                        stop_reason=stop_reason,
                    )
                )

            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.complete,
                    delta="",
                    stop_reason=stop_reason,
                )
            )
|
7
llama_stack/inference/api/__init__.py
Normal file
7
llama_stack/inference/api/__init__.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .api import * # noqa: F401 F403
|
205
llama_stack/inference/api/api.py
Normal file
205
llama_stack/inference/api/api.py
Normal file
|
@ -0,0 +1,205 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from typing import List, Literal, Optional, Protocol, Union
|
||||
|
||||
from llama_models.schema_utils import json_schema_type, webmethod
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||
|
||||
|
||||
# Options for requesting token log-probabilities.
class LogProbConfig(BaseModel):
    # NOTE(review): presumably how many top candidates to report per token;
    # the consumer of this field is not visible here -- confirm.
    top_k: Optional[int] = 0
|
||||
|
||||
|
||||
# Closed set of quantization kinds; the string values serve as the `type`
# tag of the matching *QuantizationConfig models.
@json_schema_type
class QuantizationType(Enum):
    bf16 = "bf16"
    fp8 = "fp8"
|
||||
|
||||
|
||||
# Quantization config variant tagged with type "fp8".
@json_schema_type
class Fp8QuantizationConfig(BaseModel):
    # Fixed tag value identifying this variant.
    type: Literal[QuantizationType.fp8.value] = QuantizationType.fp8.value
|
||||
|
||||
|
||||
# Quantization config variant tagged with type "bf16".
@json_schema_type
class Bf16QuantizationConfig(BaseModel):
    # Fixed tag value identifying this variant.
    type: Literal[QuantizationType.bf16.value] = QuantizationType.bf16.value
|
||||
|
||||
|
||||
# Tagged union of the quantization configs; the `type` field selects which
# concrete model is used.
QuantizationConfig = Annotated[
    Union[Bf16QuantizationConfig, Fp8QuantizationConfig],
    Field(discriminator="type"),
]
|
||||
|
||||
|
||||
# Kinds of events carried by a streamed chat completion.
@json_schema_type
class ChatCompletionResponseEventType(Enum):
    start = "start"
    complete = "complete"
    progress = "progress"
|
||||
|
||||
|
||||
# Parse state reported with a tool-call delta while a tool call is streamed.
@json_schema_type
class ToolCallParseStatus(Enum):
    started = "started"
    in_progress = "in_progress"
    failure = "failure"
    success = "success"
|
||||
|
||||
|
||||
# Incremental tool-call payload of a streamed chat completion.
@json_schema_type
class ToolCallDelta(BaseModel):
    # Either raw text or a complete ToolCall object.
    content: Union[str, ToolCall]
    parse_status: ToolCallParseStatus
|
||||
|
||||
|
||||
@json_schema_type
class ChatCompletionResponseEvent(BaseModel):
    """Chat completion response event."""

    event_type: ChatCompletionResponseEventType
    # Plain text, or a ToolCallDelta for tool-call content.
    delta: Union[str, ToolCallDelta]
    logprobs: Optional[List[TokenLogProbs]] = None
    stop_reason: Optional[StopReason] = None
|
||||
|
||||
|
||||
# Parameters of a single completion call, bundled into one model.
@json_schema_type
class CompletionRequest(BaseModel):
    model: str
    content: InterleavedTextMedia
    sampling_params: Optional[SamplingParams] = SamplingParams()

    stream: Optional[bool] = False
    logprobs: Optional[LogProbConfig] = None
|
||||
|
||||
|
||||
@json_schema_type
class CompletionResponse(BaseModel):
    """Completion response."""

    completion_message: CompletionMessage
    logprobs: Optional[List[TokenLogProbs]] = None
|
||||
|
||||
|
||||
@json_schema_type
class CompletionResponseStreamChunk(BaseModel):
    """streamed completion response."""

    delta: str
    stop_reason: Optional[StopReason] = None
    logprobs: Optional[List[TokenLogProbs]] = None
|
||||
|
||||
|
||||
# Parameters of a batched completion call: one model, a list of contents.
@json_schema_type
class BatchCompletionRequest(BaseModel):
    model: str
    content_batch: List[InterleavedTextMedia]
    sampling_params: Optional[SamplingParams] = SamplingParams()
    logprobs: Optional[LogProbConfig] = None
|
||||
|
||||
|
||||
@json_schema_type
class BatchCompletionResponse(BaseModel):
    """Batch completion response."""

    completion_message_batch: List[CompletionMessage]
|
||||
|
||||
|
||||
# Parameters of a single chat-completion call, bundled into one model.
@json_schema_type
class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Message]
    sampling_params: Optional[SamplingParams] = SamplingParams()

    # zero-shot tool definitions as input to the model
    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
    tool_prompt_format: Optional[ToolPromptFormat] = Field(
        default=ToolPromptFormat.json
    )

    stream: Optional[bool] = False
    logprobs: Optional[LogProbConfig] = None
|
||||
|
||||
|
||||
@json_schema_type
class ChatCompletionResponseStreamChunk(BaseModel):
    """SSE-stream of these events."""

    event: ChatCompletionResponseEvent
|
||||
|
||||
|
||||
@json_schema_type
class ChatCompletionResponse(BaseModel):
    """Chat completion response."""

    completion_message: CompletionMessage
    logprobs: Optional[List[TokenLogProbs]] = None
|
||||
|
||||
|
||||
# Parameters of a batched chat-completion call: one model, a list of
# conversations.
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
    model: str
    messages_batch: List[List[Message]]
    sampling_params: Optional[SamplingParams] = SamplingParams()

    # zero-shot tool definitions as input to the model
    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
    tool_prompt_format: Optional[ToolPromptFormat] = Field(
        default=ToolPromptFormat.json
    )
    logprobs: Optional[LogProbConfig] = None
|
||||
|
||||
|
||||
# Result of a batched chat completion: a list of completion messages.
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
    completion_message_batch: List[CompletionMessage]
|
||||
|
||||
|
||||
# Result of an embeddings call: a list of float vectors.
@json_schema_type
class EmbeddingsResponse(BaseModel):
    embeddings: List[List[float]]
|
||||
|
||||
|
||||
# Structural interface of an inference provider. Each method is registered
# under an HTTP route through `webmethod`.
class Inference(Protocol):
    @webmethod(route="/inference/completion")
    async def completion(
        self,
        model: str,
        content: InterleavedTextMedia,
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]: ...

    @webmethod(route="/inference/chat_completion")
    async def chat_completion(
        self,
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        # zero-shot tool definitions as input to the model
        # The default used to be the builtin type `list` itself rather than a
        # list value; `None` is a valid Optional default and means "no tools".
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ...

    @webmethod(route="/inference/embeddings")
    async def embeddings(
        self,
        model: str,
        contents: List[InterleavedTextMedia],
    ) -> EmbeddingsResponse: ...
|
106
llama_stack/inference/client.py
Normal file
106
llama_stack/inference/client.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, AsyncGenerator
|
||||
|
||||
import fire
|
||||
import httpx
|
||||
from pydantic import BaseModel
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.core.datatypes import RemoteProviderConfig
|
||||
|
||||
from .api import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
ChatCompletionResponseStreamChunk,
|
||||
CompletionRequest,
|
||||
Inference,
|
||||
UserMessage,
|
||||
)
|
||||
from .event_logger import EventLogger
|
||||
|
||||
|
||||
async def get_client_impl(config: RemoteProviderConfig, _deps: Any) -> Inference:
    """Return an ``InferenceClient`` that talks to ``config.url``.

    ``_deps`` is accepted but not used here.
    """
    remote_url = config.url
    return InferenceClient(remote_url)
|
||||
|
||||
|
||||
def encodable_dict(d: BaseModel):
    """Return ``d`` as a structure made only of JSON-compatible values.

    The model is serialized to JSON and parsed back, so the result contains
    only what ``json.loads`` produces.
    """
    # `BaseModel.json()` is the deprecated pydantic v1 spelling;
    # `model_dump_json()` is its pydantic v2 replacement.
    return json.loads(d.model_dump_json())
|
||||
|
||||
|
||||
class InferenceClient(Inference):
    """HTTP client for the chat-completion route of a remote inference server."""

    def __init__(self, base_url: str):
        self.base_url = base_url

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass

    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
        raise NotImplementedError()

    async def chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
        """POST ``request`` to the server and yield the parsed responses.

        Only lines starting with ``data:`` are parsed. When ``request.stream``
        is set each payload becomes a ``ChatCompletionResponseStreamChunk``,
        otherwise a ``ChatCompletionResponse``. A non-200 status, payloads
        containing "error" (streaming only) and payloads that fail to parse
        are printed and skipped rather than raised.
        """
        async with httpx.AsyncClient() as client:
            async with client.stream(
                "POST",
                f"{self.base_url}/inference/chat_completion",
                json=encodable_dict(request),
                headers={"Content-Type": "application/json"},
                timeout=20,
            ) as response:
                if response.status_code != 200:
                    content = await response.aread()
                    cprint(
                        f"Error: HTTP {response.status_code} {content.decode()}", "red"
                    )
                    return

                async for line in response.aiter_lines():
                    if not line.startswith("data:"):
                        continue

                    # The space after "data:" is optional. Strip the field
                    # name first and then at most one leading space, so that
                    # "data:{...}" does not lose its first payload character.
                    data = line[len("data:") :]
                    if data.startswith(" "):
                        data = data[1:]

                    try:
                        if request.stream:
                            if "error" in data:
                                cprint(data, "red")
                                continue

                            yield ChatCompletionResponseStreamChunk(
                                **json.loads(data)
                            )
                        else:
                            yield ChatCompletionResponse(**json.loads(data))
                    except Exception as e:
                        # Best effort: report the bad payload and keep reading.
                        print(data)
                        print(f"Error with parsing or validation: {e}")
|
||||
|
||||
|
||||
async def run_main(host: str, port: int, stream: bool):
    """Send one fixed user message to the server at ``host:port`` and print the reply."""
    client = InferenceClient(f"http://{host}:{port}")

    message = UserMessage(content="hello world, troll me in two-paragraphs about 42")
    cprint(f"User>{message.content}", "green")

    request = ChatCompletionRequest(
        model="Meta-Llama3.1-8B-Instruct",
        messages=[message],
        stream=stream,
    )
    iterator = client.chat_completion(request)
    async for log in EventLogger().log(iterator):
        log.print()
|
||||
|
||||
|
||||
def main(host: str, port: int, stream: bool = True):
    """Synchronous entry point: run ``run_main`` to completion on a new event loop."""
    coroutine = run_main(host, port, stream)
    asyncio.run(coroutine)
|
||||
|
||||
|
||||
# When run as a script, expose `main` as a command line via python-fire.
if __name__ == "__main__":
    fire.Fire(main)
|
43
llama_stack/inference/event_logger.py
Normal file
43
llama_stack/inference/event_logger.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.inference.api import (
|
||||
ChatCompletionResponseEventType,
|
||||
ChatCompletionResponseStreamChunk,
|
||||
)
|
||||
|
||||
|
||||
class LogEvent:
    """A piece of console output together with its color and line terminator."""

    def __init__(
        self,
        content: str = "",
        end: str = "\n",
        color="white",
    ):
        self.content = content
        self.color = color
        # An explicit None falls back to a newline.
        self.end = end if end is not None else "\n"

    def print(self, flush=True):
        """Write the content through ``cprint`` with the stored color and terminator."""
        cprint(f"{self.content}", color=self.color, end=self.end, flush=flush)
|
||||
|
||||
|
||||
class EventLogger:
    """Turns chat-completion results into printable ``LogEvent`` objects."""

    async def log(self, event_generator):
        """Yield ``LogEvent`` objects for every item of ``event_generator``.

        Stream chunks map to one event per ``start`` / ``progress`` /
        ``complete``; any other item is treated as a full response and yields
        a prefix event followed by its message content.
        """
        async for chunk in event_generator:
            if not isinstance(chunk, ChatCompletionResponseStreamChunk):
                yield LogEvent("Assistant> ", color="cyan", end="")
                yield LogEvent(chunk.completion_message.content, color="yellow")
                continue

            event = chunk.event
            if event.event_type == ChatCompletionResponseEventType.start:
                yield LogEvent("Assistant> ", color="cyan", end="")
            elif event.event_type == ChatCompletionResponseEventType.progress:
                yield LogEvent(event.delta, color="yellow", end="")
            elif event.event_type == ChatCompletionResponseEventType.complete:
                yield LogEvent("")
|
19
llama_stack/inference/meta_reference/__init__.py
Normal file
19
llama_stack/inference/meta_reference/__init__.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .config import MetaReferenceImplConfig # noqa
|
||||
|
||||
|
||||
async def get_provider_impl(config: MetaReferenceImplConfig, _deps):
    """Build the meta-reference inference implementation for ``config`` and initialize it.

    ``_deps`` is accepted but not used here.
    """
    # The implementation module is imported only when a provider is requested.
    from .inference import MetaReferenceInferenceImpl

    assert isinstance(
        config, MetaReferenceImplConfig
    ), f"Unexpected config type: {type(config)}"

    provider = MetaReferenceInferenceImpl(config)
    await provider.initialize()
    return provider
|
57
llama_stack/inference/meta_reference/config.py
Normal file
57
llama_stack/inference/meta_reference/config.py
Normal file
|
@ -0,0 +1,57 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from llama_models.datatypes import ModelFamily
|
||||
|
||||
from llama_models.schema_utils import json_schema_type
|
||||
from llama_models.sku_list import all_registered_models, resolve_model
|
||||
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
from llama_stack.inference.api import QuantizationConfig
|
||||
|
||||
|
||||
# Settings of the meta-reference inference implementation.
@json_schema_type
class MetaReferenceImplConfig(BaseModel):
    model: str = Field(
        default="Meta-Llama3.1-8B-Instruct",
        description="Model descriptor from `llama model list`",
    )
    quantization: Optional[QuantizationConfig] = None
    torch_seed: Optional[int] = None
    # Required: there is no default.
    max_seq_len: int
    max_batch_size: int = 1

    @field_validator("model")
    @classmethod
    def validate_model(cls, model: str) -> str:
        # Accept only descriptors of registered models in the llama3_1 family.
        permitted_models = [
            registered.descriptor()
            for registered in all_registered_models()
            if registered.model_family == ModelFamily.llama3_1
        ]
        if model in permitted_models:
            return model

        model_list = "\n\t".join(permitted_models)
        raise ValueError(
            f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]"
        )

    @property
    def model_parallel_size(self) -> int:
        # HUGE HACK ALERT: this will be fixed when we move inference configuration
        # to ModelsRegistry and we can explicitly ask for `model_parallel_size`
        # as configuration there
        resolved = resolve_model(self.model)
        assert resolved is not None
        descriptor = resolved.descriptor().lower()

        # 70B and 405B descriptors get 8; everything else gets 1.
        if "-70b" in descriptor or "-405b" in descriptor:
            return 8
        return 1
|
327
llama_stack/inference/meta_reference/generation.py
Normal file
327
llama_stack/inference/meta_reference/generation.py
Normal file
|
@ -0,0 +1,327 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Generator, List, Optional
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from fairscale.nn.model_parallel.initialize import (
|
||||
get_model_parallel_rank,
|
||||
initialize_model_parallel,
|
||||
model_parallel_is_initialized,
|
||||
)
|
||||
from llama_models.llama3.api.args import ModelArgs
|
||||
from llama_models.llama3.api.chat_format import ChatFormat, ModelInput
|
||||
from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
|
||||
from llama_models.llama3.api.tokenizer import Tokenizer
|
||||
from llama_models.llama3.reference_impl.model import Transformer
|
||||
from llama_models.sku_list import resolve_model
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.common.model_utils import model_local_dir
|
||||
from llama_stack.inference.api import QuantizationType
|
||||
|
||||
from .config import MetaReferenceImplConfig
|
||||
|
||||
|
||||
def model_checkpoint_dir(model) -> str:
    """Return the directory holding the checkpoint files of ``model``.

    Starts from ``model_local_dir(model.descriptor())``; when that directory
    has no ``consolidated.00.pth`` its ``original`` subdirectory is used.

    Raises:
        AssertionError: the selected directory does not exist.
    """
    checkpoint_dir = Path(model_local_dir(model.descriptor()))
    if not (checkpoint_dir / "consolidated.00.pth").exists():
        checkpoint_dir = checkpoint_dir / "original"

    # The two sentences are separate literals; the trailing space after the
    # first keeps them from running together in the rendered message.
    assert checkpoint_dir.exists(), (
        f"Could not find checkpoint dir: {checkpoint_dir}. "
        f"Please download model using `llama download {model.descriptor()}`"
    )
    return str(checkpoint_dir)
|
||||
|
||||
|
||||
@dataclass
class TokenResult:
    """A token id with its text and, optionally, log-probabilities.

    NOTE(review): presumably one instance per generated token; the producer
    is not visible here -- confirm.
    """

    token: int
    text: str
    logprobs: Optional[List[float]] = None
|
||||
|
||||
|
||||
class Llama:
|
||||
@staticmethod
def build(config: MetaReferenceImplConfig):
    """
    Create a ``Llama`` from the checkpoint of the model named in ``config``.

    Initializes the "nccl" process group and model parallelism when they are
    not initialized yet, selects the CUDA device given by ``LOCAL_RANK``, then
    loads this rank's ``*.pth`` shard, ``params.json`` and ``tokenizer.model``
    from the model's checkpoint directory.

    Raises:
        ImportError: FP8 quantization is configured but fbgemm-gpu is missing.
    """
    resolved_model = resolve_model(config.model)

    # Truthy only when a quantization config is present and tagged "fp8".
    use_fp8 = (
        config.quantization
        and config.quantization.type == QuantizationType.fp8.value
    )
    if use_fp8:
        from .quantization.loader import is_fbgemm_available

        if not is_fbgemm_available():
            raise ImportError("fbgemm-gpu is required for FP8 quantization")

    if not torch.distributed.is_initialized():
        torch.distributed.init_process_group("nccl")

    model_parallel_size = config.model_parallel_size
    if not model_parallel_is_initialized():
        initialize_model_parallel(model_parallel_size)

    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    if config.torch_seed is not None:
        torch.manual_seed(config.torch_seed)

    # Only local rank 0 keeps writing to stdout.
    if local_rank > 0:
        sys.stdout = open(os.devnull, "w")

    started_at = time.time()
    ckpt_dir = model_checkpoint_dir(resolved_model)

    # One shard per model-parallel rank; sorted so rank i picks shard i.
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
    assert model_parallel_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
    ckpt_path = checkpoints[get_model_parallel_rank()]
    state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)

    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.load(f)
    # Some params files nest the arguments under a "model" key.
    if "model" in params:
        params = params["model"]

    model_args: ModelArgs = ModelArgs(
        max_seq_len=config.max_seq_len,
        max_batch_size=config.max_batch_size,
        **params,
    )

    tokenizer = Tokenizer(model_path=os.path.join(ckpt_dir, "tokenizer.model"))
    assert (
        model_args.vocab_size == tokenizer.n_words
    ), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"

    if use_fp8:
        from .quantization.loader import convert_to_quantized_model

        # load on CPU in bf16 so that fp8 conversion does not find an
        # unexpected (fp32, e.g.) datatype
        torch.set_default_tensor_type(torch.BFloat16Tensor)
        transformer = Transformer(model_args)
        transformer.load_state_dict(state_dict, strict=False)
        transformer = convert_to_quantized_model(transformer, config)
    else:
        default_tensor_type = (
            torch.cuda.BFloat16Tensor
            if torch.cuda.is_bf16_supported()
            else torch.cuda.HalfTensor
        )
        torch.set_default_tensor_type(default_tensor_type)
        transformer = Transformer(model_args)
        transformer.load_state_dict(state_dict, strict=False)

    print(f"Loaded in {time.time() - started_at:.2f} seconds")
    return Llama(transformer, tokenizer, model_args)
|
||||
|
||||
def __init__(self, model: Transformer, tokenizer: Tokenizer, args: ModelArgs):
    """Store the model, tokenizer and model args, and build a ``ChatFormat`` from the tokenizer."""
    self.args = args
    self.model = model
    self.tokenizer = tokenizer
    self.formatter = ChatFormat(tokenizer)
|
||||
|
||||
@torch.inference_mode()
def generate(
    self,
    model_input: ModelInput,
    max_gen_len: int,
    temperature: float = 0.6,
    top_p: float = 0.9,
    logprobs: bool = False,
    echo: bool = False,
    include_stop_token: bool = False,
) -> Generator:
    """Generate tokens one at a time for a single prompt.

    Args:
        model_input: Carries the prompt token ids in `model_input.tokens`.
        max_gen_len: Maximum number of new tokens; the total length is also
            capped by the model's `max_seq_len`.
        temperature: Sampling temperature; values <= 0 select greedy argmax.
        top_p: Nucleus threshold passed to `sample_top_p` when sampling.
        logprobs: When true, each yielded result carries token log-probs.
        echo: Accepted for interface compatibility; not read in this body.
        include_stop_token: Accepted for interface compatibility; not read
            in this body.

    Yields:
        One `TokenResult` per decoding step (the stop token is yielded too,
        after which generation ends).

    Note:
        If the prompt already fills the sequence budget, a message is
        printed and the generator finishes without yielding anything.
    """
    params = self.model.params

    # cprint("Input to model -> " + self.tokenizer.decode(model_input.tokens), "red")
    prompt_tokens = [model_input.tokens]

    bsz = 1
    assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)

    min_prompt_len = min(len(t) for t in prompt_tokens)
    max_prompt_len = max(len(t) for t in prompt_tokens)

    if max_prompt_len >= params.max_seq_len:
        cprint(
            f"Out of token budget {max_prompt_len} vs {params.max_seq_len}", "red"
        )
        return

    total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
    pad_id = self.tokenizer.pad_id
    tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
    for k, t in enumerate(prompt_tokens):
        tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
    if logprobs:
        token_logprobs = torch.zeros_like(tokens, dtype=torch.float)

    prev_pos = 0
    eos_reached = torch.tensor([False] * bsz, device="cuda")
    input_text_mask = tokens != pad_id
    if min_prompt_len == total_len:
        # TODO(ashwin): unify this branch with the one below and sort out the
        # multimodal case
        logits = self.model.forward(tokens, prev_pos)
        token_logprobs = -F.cross_entropy(
            input=logits.transpose(1, 2),
            target=tokens,
            reduction="none",
            ignore_index=pad_id,
        )

    # Pin the stop-token tensor to the same device as `tokens` instead of
    # relying on the process-wide default tensor type; torch.isin below needs
    # both operands on one device. list() also accepts a set of ids.
    stop_tokens = torch.tensor(list(self.tokenizer.stop_tokens), device="cuda")

    for cur_pos in range(min_prompt_len, total_len):
        # Only the not-yet-processed slice is fed; prev_pos is passed along
        # as the starting position.
        logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)

        if temperature > 0:
            probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
            next_token = sample_top_p(probs, top_p)
        else:
            next_token = torch.argmax(logits[:, -1], dim=-1)

        next_token = next_token.reshape(-1)
        # only replace token if prompt has already been generated
        next_token = torch.where(
            input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
        )
        tokens[:, cur_pos] = next_token

        if logprobs:
            token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
                input=logits.transpose(1, 2),
                target=tokens[:, prev_pos + 1 : cur_pos + 1],
                reduction="none",
                ignore_index=pad_id,
            )
        # A stop token only counts when it was generated, not when it is
        # part of the prompt.
        eos_reached |= (~input_text_mask[:, cur_pos]) & (
            torch.isin(next_token, stop_tokens)
        )
        yield TokenResult(
            token=next_token[0].item(),
            text=self.tokenizer.decode(next_token.tolist()),
            logprobs=(
                token_logprobs[:, prev_pos + 1 : cur_pos + 1][0].tolist()
                if logprobs
                else None
            ),
        )

        prev_pos = cur_pos
        # Single device-side reduction instead of iterating the tensor in Python.
        if eos_reached.all():
            break
|
||||
|
||||
def text_completion(
    self,
    prompt: str,
    temperature: float = 0.6,
    top_p: float = 0.9,
    max_gen_len: Optional[int] = None,
    logprobs: bool = False,
    echo: bool = False,
) -> Generator:
    """Stream a completion for a raw text prompt.

    Args:
        prompt: Text to continue; encoded with a BOS token and no EOS token.
        temperature: Sampling temperature forwarded to `generate`.
        top_p: Nucleus threshold forwarded to `generate`.
        max_gen_len: Generation budget. None, 0, or anything at or above the
            model's `max_seq_len` is replaced by `max_seq_len - 1`.
        logprobs: Forwarded to `generate`.
        echo: Forwarded to `generate`.

    Yields:
        Whatever `generate` yields for the encoded prompt.
    """
    seq_limit = self.model.params.max_seq_len
    if not max_gen_len or max_gen_len >= seq_limit:
        max_gen_len = seq_limit - 1

    encoded_prompt = self.tokenizer.encode(prompt, bos=True, eos=False)
    request = ModelInput(tokens=encoded_prompt)

    yield from self.generate(
        model_input=request,
        max_gen_len=max_gen_len,
        temperature=temperature,
        top_p=top_p,
        logprobs=logprobs,
        echo=echo,
    )
|
||||
|
||||
def chat_completion(
    self,
    messages: List[Message],
    temperature: float = 0.6,
    top_p: float = 0.9,
    max_gen_len: Optional[int] = None,
    logprobs: bool = False,
    tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
) -> Generator:
    """Stream a completion for a chat dialog.

    Args:
        messages: Dialog to encode with the chat formatter.
        temperature: Sampling temperature forwarded to `generate`.
        top_p: Nucleus threshold forwarded to `generate`.
        max_gen_len: Generation budget. None, 0, or anything at or above the
            model's `max_seq_len` is replaced by `max_seq_len - 1`.
        logprobs: Forwarded to `generate`.
        tool_prompt_format: Passed to the formatter when encoding the dialog.

    Yields:
        Whatever `generate` yields for the encoded dialog.
    """
    seq_limit = self.model.params.max_seq_len
    if not max_gen_len or max_gen_len >= seq_limit:
        max_gen_len = seq_limit - 1

    dialog_input = self.formatter.encode_dialog_prompt(
        messages,
        tool_prompt_format,
    )

    yield from self.generate(
        model_input=dialog_input,
        max_gen_len=max_gen_len,
        temperature=temperature,
        top_p=top_p,
        logprobs=logprobs,
        include_stop_token=True,
    )
|
||||
|
||||
|
||||
def sample_top_p(probs, p):
    """
    Draw one token per row using top-p (nucleus) sampling.

    Args:
        probs (torch.Tensor): Probability distribution over the last dimension.
        p (float): Cumulative-probability threshold of the nucleus.

    Returns:
        torch.Tensor: Sampled indices into the last dimension of `probs`,
        with a trailing dimension of size 1.

    Note:
        Entries are visited in descending probability. An entry is dropped
        when the mass of the entries ranked before it already exceeds `p`;
        the survivors are renormalized before sampling. The input tensor is
        not modified.
    """
    ranked, ranked_idx = probs.sort(dim=-1, descending=True)
    # Mass accumulated strictly before each ranked entry.
    mass_before = ranked.cumsum(dim=-1) - ranked
    nucleus = ranked.masked_fill(mass_before > p, 0.0)
    nucleus = nucleus / nucleus.sum(dim=-1, keepdim=True)
    pick = torch.multinomial(nucleus, num_samples=1)
    # Translate positions in the ranked order back to vocabulary indices.
    return ranked_idx.gather(-1, pick)
|
215
llama_stack/inference/meta_reference/inference.py
Normal file
215
llama_stack/inference/meta_reference/inference.py
Normal file
|
@ -0,0 +1,215 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import asyncio
|
||||
|
||||
from typing import AsyncIterator, Union
|
||||
|
||||
from llama_models.llama3.api.datatypes import StopReason
|
||||
from llama_models.sku_list import resolve_model
|
||||
|
||||
from llama_stack.inference.api import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
ChatCompletionResponseEvent,
|
||||
ChatCompletionResponseEventType,
|
||||
ChatCompletionResponseStreamChunk,
|
||||
Inference,
|
||||
ToolCallDelta,
|
||||
ToolCallParseStatus,
|
||||
)
|
||||
from llama_stack.inference.prepare_messages import prepare_messages
|
||||
|
||||
from .config import MetaReferenceImplConfig
|
||||
from .model_parallel import LlamaModelParallelGenerator
|
||||
|
||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
|
||||
# there's a single model parallel process running serving the model. for now,
|
||||
# we don't support multiple concurrent requests to this process.
|
||||
SEMAPHORE = asyncio.Semaphore(1)
|
||||
|
||||
|
||||
class MetaReferenceInferenceImpl(Inference):
    """Inference implementation that drives a `LlamaModelParallelGenerator`.

    The configured model is resolved once at construction time; every request
    must name that same model. Only one request may be in flight at a time
    (guarded by the module-level SEMAPHORE).
    """

    def __init__(self, config: MetaReferenceImplConfig) -> None:
        """Resolve `config.model`.

        Raises:
            RuntimeError: If the model name cannot be resolved.
        """
        self.config = config
        model = resolve_model(config.model)
        if model is None:
            raise RuntimeError(f"Unknown model: {config.model}, Run `llama model list`")
        self.model = model
        # verify that the checkpoint actually is for this model lol

    async def initialize(self) -> None:
        """Create the model-parallel generator and start it."""
        self.generator = LlamaModelParallelGenerator(self.config)
        self.generator.start()

    async def shutdown(self) -> None:
        """Stop the model-parallel generator."""
        self.generator.stop()

    # hm, when stream=False, we should not be doing SSE :/ which is what the
    # top-level server is going to do. make the typing more specific here
    async def chat_completion(
        self,
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        tools: Optional[List[ToolDefinition]] = list(),
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncIterator[
        Union[ChatCompletionResponseStreamChunk, ChatCompletionResponse]
    ]:
        """Run one chat completion.

        With `stream=True` this yields a start chunk, progress chunks (plain
        text deltas or tool-call deltas), and a final complete chunk. With
        `stream=False` it yields exactly one `ChatCompletionResponse`.

        Raises:
            RuntimeError: If the requested model is unknown, differs from the
                configured model, or another request is already running.
        """
        # wrapper request to make it easier to pass around (internal only, not exposed to API)
        request = ChatCompletionRequest(
            model=model,
            messages=messages,
            sampling_params=sampling_params,
            tools=tools,
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            stream=stream,
            logprobs=logprobs,
        )

        messages = prepare_messages(request)
        model = resolve_model(request.model)
        if model is None:
            raise RuntimeError(
                f"Unknown model: {request.model}, Run `llama model list`"
            )
        elif model.descriptor() != self.model.descriptor():
            raise RuntimeError(
                f"Model mismatch: {request.model} != {self.model.descriptor()}"
            )

        if SEMAPHORE.locked():
            raise RuntimeError("Only one concurrent request is supported")

        async with SEMAPHORE:
            if request.stream:
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.start,
                        delta="",
                    )
                )

            tokens = []
            # Named apart from the `logprobs` parameter to avoid shadowing it.
            token_logprobs = []

            stop_reason = None

            buffer = ""
            ipython = False

            for token_result in self.generator.chat_completion(
                messages=messages,
                temperature=request.sampling_params.temperature,
                top_p=request.sampling_params.top_p,
                max_gen_len=request.sampling_params.max_tokens,
                logprobs=request.logprobs,
                tool_prompt_format=request.tool_prompt_format,
            ):
                buffer += token_result.text
                tokens.append(token_result.token)

                # Detect stop tokens before any mode-specific `continue` so
                # that non-streaming responses also report the real stop
                # reason instead of always falling back to out_of_tokens.
                if token_result.text == "<|eot_id|>":
                    stop_reason = StopReason.end_of_turn
                    text = ""
                elif token_result.text == "<|eom_id|>":
                    stop_reason = StopReason.end_of_message
                    text = ""
                else:
                    text = token_result.text

                if not request.stream:
                    # Non-streaming callers receive a single response at the
                    # end, so no intermediate chunk is emitted here.
                    if request.logprobs:
                        # The generator's results expose `logprobs` (plural).
                        # NOTE(review): confirm the element shape the response
                        # model expects for its `logprobs` field.
                        token_logprobs.append(token_result.logprobs)
                    continue

                if not ipython and buffer.startswith("<|python_tag|>"):
                    ipython = True
                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=ToolCallDelta(
                                content="",
                                parse_status=ToolCallParseStatus.started,
                            ),
                        )
                    )
                    buffer = buffer[len("<|python_tag|>") :]
                    continue

                if ipython:
                    delta = ToolCallDelta(
                        content=text,
                        parse_status=ToolCallParseStatus.in_progress,
                    )
                else:
                    delta = text

                # The stop token itself is not forwarded as a progress delta.
                if stop_reason is None:
                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=delta,
                            stop_reason=stop_reason,
                        )
                    )

            if stop_reason is None:
                stop_reason = StopReason.out_of_tokens

            # TODO(ashwin): parse tool calls separately here and report errors?
            # if someone breaks the iteration before coming here we are toast
            message = self.generator.formatter.decode_assistant_message(
                tokens, stop_reason
            )
            if request.stream:
                parsed_tool_calls = len(message.tool_calls) > 0
                if ipython and not parsed_tool_calls:
                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=ToolCallDelta(
                                content="",
                                parse_status=ToolCallParseStatus.failure,
                            ),
                            stop_reason=stop_reason,
                        )
                    )

                for tool_call in message.tool_calls:
                    yield ChatCompletionResponseStreamChunk(
                        event=ChatCompletionResponseEvent(
                            event_type=ChatCompletionResponseEventType.progress,
                            delta=ToolCallDelta(
                                content=tool_call,
                                parse_status=ToolCallParseStatus.success,
                            ),
                            stop_reason=stop_reason,
                        )
                    )

                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=ChatCompletionResponseEventType.complete,
                        delta="",
                        stop_reason=stop_reason,
                    )
                )

                # TODO(ashwin): what else do we need to send out here when everything finishes?
            else:
                yield ChatCompletionResponse(
                    completion_message=message,
                    logprobs=token_logprobs if request.logprobs else None,
                )
|
110
llama_stack/inference/meta_reference/model_parallel.py
Normal file
110
llama_stack/inference/meta_reference/model_parallel.py
Normal file
|
@ -0,0 +1,110 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import os
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
from functools import partial
|
||||
from typing import Generator, List, Optional
|
||||
|
||||
from llama_models.llama3.api.chat_format import ChatFormat
|
||||
from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
|
||||
from llama_models.llama3.api.tokenizer import Tokenizer
|
||||
from llama_models.sku_list import resolve_model
|
||||
|
||||
from .config import MetaReferenceImplConfig
|
||||
from .generation import Llama, model_checkpoint_dir
|
||||
from .parallel_utils import ModelParallelProcessGroup
|
||||
|
||||
|
||||
# Request payload handed to the model-parallel workers; ModelRunner reads each
# field and forwards it positionally to the model's chat_completion.
@dataclass
class InferenceArgs:
    messages: List[Message]
    temperature: float
    top_p: float
    # May be None: LlamaModelParallelGenerator.chat_completion defaults it to None.
    max_gen_len: Optional[int]
    logprobs: bool
    tool_prompt_format: ToolPromptFormat
|
||||
|
||||
|
||||
class ModelRunner:
    """Callable adapter that turns a task object into a `chat_completion` call."""

    def __init__(self, llama):
        self.llama = llama

    # the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
    def __call__(self, task: InferenceArgs):
        """Forward the task's fields, positionally and in order, to the model."""
        positional = (
            task.messages,
            task.temperature,
            task.top_p,
            task.max_gen_len,
            task.logprobs,
            task.tool_prompt_format,
        )
        return self.llama.chat_completion(*positional)
|
||||
|
||||
|
||||
def init_model_cb(config: MetaReferenceImplConfig):
    """Build a `Llama` from `config` and wrap it in a `ModelRunner`."""
    return ModelRunner(Llama.build(config))
|
||||
|
||||
|
||||
class LlamaModelParallelGenerator:
    """
    Front-end for a group of model-parallel worker processes.

    This abstraction exists so that
    - model parallel code can run without launching the CLIs via torchrun
    - model parallel code can also be used within a notebook context.

    The object is a context manager: entering creates and starts the process
    group, exiting stops it. `start()` and `stop()` perform the same two steps
    for callers that do not use a `with` block.
    """

    def __init__(self, config: MetaReferenceImplConfig):
        self.config = config
        self.model = resolve_model(self.config.model)
        # this is a hack because Agent's loop uses this to tokenize and check if input is too long
        # while the tool-use loop is going
        tokenizer_file = os.path.join(
            model_checkpoint_dir(self.model), "tokenizer.model"
        )
        self.formatter = ChatFormat(Tokenizer(tokenizer_file))

    def start(self):
        """Start the worker group (same as entering the context)."""
        self.__enter__()

    def stop(self):
        """Stop the worker group (same as leaving the context without an error)."""
        self.__exit__(None, None, None)

    def __enter__(self):
        # The model-building callback is bound to this object's config.
        build_model = partial(init_model_cb, self.config)
        self.group = ModelParallelProcessGroup(
            self.config.model_parallel_size,
            init_model_cb=build_model,
        )
        self.group.start()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.group.stop()

    def chat_completion(
        self,
        messages: List[Message],
        temperature: float = 0.6,
        top_p: float = 0.9,
        max_gen_len: Optional[int] = None,
        logprobs: bool = False,
        tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
    ) -> Generator:
        """Send one chat request to the worker group and relay its results.

        The messages are deep-copied into the request object before sending.
        """
        yield from self.group.run_inference(
            InferenceArgs(
                messages=deepcopy(messages),
                temperature=temperature,
                top_p=top_p,
                max_gen_len=max_gen_len,
                logprobs=logprobs,
                tool_prompt_format=tool_prompt_format,
            )
        )
|
265
llama_stack/inference/meta_reference/parallel_utils.py
Normal file
265
llama_stack/inference/meta_reference/parallel_utils.py
Normal file
|
@ -0,0 +1,265 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
import pickle
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from typing import Callable, Generator
|
||||
|
||||
import torch
|
||||
|
||||
import zmq
|
||||
|
||||
from fairscale.nn.model_parallel.initialize import (
|
||||
get_model_parallel_group,
|
||||
get_model_parallel_rank,
|
||||
get_model_parallel_src_rank,
|
||||
)
|
||||
|
||||
from torch.distributed.launcher.api import elastic_launch, LaunchConfig
|
||||
|
||||
|
||||
_END_SENTINEL = "__end_sentinel__"
|
||||
_CANCEL_SENTINEL = "__cancel_sentinel__"
|
||||
|
||||
|
||||
def mp_rank_0() -> bool:
    """Return True when this process has model-parallel rank 0."""
    rank = get_model_parallel_rank()
    return rank == 0
|
||||
|
||||
|
||||
def retrieve_requests(reply_socket_url: str):
    """Coroutine that pulls tasks from the reply socket and relays results.

    Only model-parallel rank 0 talks to the ZMQ socket; every task and every
    cancellation update is shared with the other ranks through
    `torch.distributed.broadcast_object_list`, so all ranks must run this
    generator in lockstep.

    Protocol with the driver of this generator:
        - each `yield task` hands out one task;
        - the driver `send()`s back an iterable of results, which are
          forwarded to the client one by one followed by `_END_SENTINEL`;
        - sending back `None` ends the loop.

    Args:
        reply_socket_url: Endpoint rank 0 connects its ROUTER socket to.
    """
    if mp_rank_0():
        context = zmq.Context()
        reply_socket = context.socket(zmq.ROUTER)
        reply_socket.connect(reply_socket_url)

        # Handshake: poll until the first message arrives, then acknowledge it.
        while True:
            client_id, obj = maybe_get_work(reply_socket)
            if obj is None:
                time.sleep(0.01)
                continue

            reply_socket.send_multipart([client_id, pickle.dumps("YES READY")])
            break

    # Replies go to the client id most recently bound in this scope.
    def send_obj(obj):
        reply_socket.send_multipart([client_id, pickle.dumps(obj)])

    while True:
        tasks = [None]
        if mp_rank_0():
            client_id, task = maybe_get_work(reply_socket)
            # there is still an unknown unclean GeneratorExit happening resulting in a
            # cancel sentinel getting queued _after_ we have finished sending everything :/
            # kind of a hack this is :/
            if task != _CANCEL_SENTINEL:
                tasks = [task]

        # Share rank 0's task (or None when idle) with every rank.
        torch.distributed.broadcast_object_list(
            tasks,
            src=get_model_parallel_src_rank(),
            group=get_model_parallel_group(),
        )

        task = tasks[0]
        if task is None:
            time.sleep(0.1)
        else:
            try:
                out = yield task
                if out is None:
                    break

                for obj in out:
                    updates = [None]
                    if mp_rank_0():
                        # Check for a cancellation before forwarding each result.
                        _, update = maybe_get_work(reply_socket)
                        if update == _CANCEL_SENTINEL:
                            updates = [update]
                        else:
                            # only send the update if it's not cancelled otherwise the object sits in the socket
                            # and gets pulled in the next request lol
                            send_obj(obj)

                    # All ranks learn about a cancellation together.
                    torch.distributed.broadcast_object_list(
                        updates,
                        src=get_model_parallel_src_rank(),
                        group=get_model_parallel_group(),
                    )
                    if updates[0] == _CANCEL_SENTINEL:
                        print("quitting generation loop because request was cancelled")
                        break

                if mp_rank_0():
                    send_obj(_END_SENTINEL)
            except Exception as e:
                # Report the failure to the client and keep serving requests.
                print(f"[debug] got exception {e}")
                import traceback

                traceback.print_exc()
                if mp_rank_0():
                    send_obj(e)

    if mp_rank_0():
        send_obj("DONE")
|
||||
|
||||
|
||||
def maybe_get_work(sock: zmq.Socket):
    """Try a non-blocking receive of one `[client_id, pickled object]` message.

    Returns:
        `(client_id, unpickled object)` when a message was waiting, otherwise
        `(None, None)`.

    Raises:
        zmq.ZMQError: For any socket error other than EAGAIN.
    """
    sender, work = None, None
    try:
        sender, raw = sock.recv_multipart(zmq.NOBLOCK)
        work = pickle.loads(raw)
    except zmq.ZMQError as err:
        # EAGAIN only means nothing is queued right now.
        if err.errno != zmq.EAGAIN:
            raise err

    return sender, work
|
||||
|
||||
|
||||
def worker_process_entrypoint(
    reply_socket_url: str,
    init_model_cb: Callable,
) -> None:
    """Entry point of one worker: build the model, then serve tasks until told to stop.

    Args:
        reply_socket_url: Endpoint handed to `retrieve_requests`.
        init_model_cb: Zero-argument callable returning the model; the model
            is itself called with each task.
    """
    model = init_model_cb()
    torch.distributed.barrier()
    time.sleep(1)

    # run the requests co-routine which retrieves requests from the socket
    # and sends responses (we provide) back to the caller
    requests = retrieve_requests(reply_socket_url)
    pending = None
    serving = True
    while serving:
        try:
            task = requests.send(pending)
            if isinstance(task, str) and task == _END_SENTINEL:
                serving = False
            else:
                pending = model(task)
        except StopIteration:
            serving = False

    print("[debug] worker process done")
|
||||
|
||||
|
||||
def launch_dist_group(
    reply_socket_url: str,
    model_parallel_size: int,
    init_model_cb: Callable,
    **kwargs,
) -> None:
    """Launch `model_parallel_size` worker processes on this node.

    Each worker runs `worker_process_entrypoint(reply_socket_url, init_model_cb)`.
    The rendezvous endpoint is a path inside a temporary directory that lives
    for the duration of the launch call.

    Args:
        reply_socket_url: Endpoint passed through to every worker.
        model_parallel_size: Number of processes to start.
        init_model_cb: Model-building callback passed through to every worker.
        **kwargs: Accepted for interface compatibility; not read here.
    """
    # The previously computed `id` / `dist_url` locals were never used (and
    # `id` shadowed the builtin), so they are gone.
    with tempfile.TemporaryDirectory() as tmpdir:
        # TODO: track workers and if they terminate, tell parent process about it so cleanup can happen
        launch_config = LaunchConfig(
            max_nodes=1,
            min_nodes=1,
            nproc_per_node=model_parallel_size,
            start_method="fork",
            rdzv_backend="c10d",
            rdzv_endpoint=os.path.join(tmpdir, "rdzv"),
            rdzv_configs={"store_type": "file", "timeout": 90},
            max_restarts=0,
            monitor_interval=1,
            run_id=str(uuid.uuid4()),
        )
        elastic_launch(launch_config, entrypoint=worker_process_entrypoint)(
            reply_socket_url,
            init_model_cb,
        )
|
||||
|
||||
|
||||
def start_model_parallel_process(
    model_parallel_size: int,
    init_model_cb: Callable,
    **kwargs,
):
    """Fork the launcher process and wait for the workers to report readiness.

    Returns:
        `(request_socket, process)`: the bound DEALER socket used to talk to
        the workers, and the forked launcher process.
    """
    zmq_context = zmq.Context()
    request_socket = zmq_context.socket(zmq.DEALER)

    # Binding the request socket to a random port
    request_socket.bind("tcp://127.0.0.1:0")
    bound_endpoint = request_socket.getsockopt_string(zmq.LAST_ENDPOINT)

    launcher_args = (bound_endpoint, model_parallel_size, init_model_cb)
    process = multiprocessing.get_context("fork").Process(
        target=launch_dist_group,
        args=launcher_args,
        kwargs=kwargs,
    )
    process.start()

    # wait until the model is loaded; rank 0 will send a message to indicate it's ready
    request_socket.send_pyobj("READY?")
    response = request_socket.recv_pyobj()
    print(f"Finished model load {response}")

    return request_socket, process
|
||||
|
||||
|
||||
class ModelParallelProcessGroup:
    """Handle on the launcher process and the socket used to reach its workers."""

    def __init__(
        self,
        model_parallel_size: int,
        init_model_cb: Callable,
        **kwargs,
    ):
        self.model_parallel_size = model_parallel_size
        self.init_model_cb = init_model_cb
        self.started = self.running = False

    def start(self):
        """Launch the workers; may only be called while not started."""
        assert not self.started, "process group already started"
        socket_and_process = start_model_parallel_process(
            self.model_parallel_size,
            self.init_model_cb,
        )
        self.request_socket, self.process = socket_and_process
        self.started = True

    def stop(self):
        """Ask a live launcher to finish and wait for it to exit."""
        assert self.started, "process group not started"
        if self.process.is_alive():
            self.request_socket.send_pyobj(_END_SENTINEL, zmq.NOBLOCK)
            self.process.join()
        self.started = False

    def run_inference(self, request) -> Generator:
        """Send `request` to the workers and yield their results as they arrive.

        An exception object received from the workers is re-raised here. If
        the consumer closes this generator early, a cancel sentinel is sent
        and the remaining replies are drained up to the end sentinel.
        """
        assert not self.running, "inference already running"

        self.running = True
        receive = self.request_socket.recv_pyobj
        self.request_socket.send_pyobj(request)
        try:
            # iter(callable, sentinel) stops once a reply equals the end sentinel.
            for reply in iter(receive, _END_SENTINEL):
                if isinstance(reply, Exception):
                    print(f"[debug] got exception {reply}")
                    raise reply

                yield reply
        except GeneratorExit:
            self.request_socket.send_pyobj(_CANCEL_SENTINEL)
            for _ in iter(receive, _END_SENTINEL):
                pass
        finally:
            self.running = False
|
84
llama_stack/inference/prepare_messages.py
Normal file
84
llama_stack/inference/prepare_messages.py
Normal file
|
@ -0,0 +1,84 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||
from llama_stack.inference.api import * # noqa: F403
|
||||
from llama_models.llama3.prompt_templates import (
|
||||
BuiltinToolGenerator,
|
||||
FunctionTagCustomToolGenerator,
|
||||
JsonCustomToolGenerator,
|
||||
SystemDefaultGenerator,
|
||||
)
|
||||
|
||||
|
||||
def prepare_messages(request: ChatCompletionRequest) -> List[Message]:
    """Build the message list that is actually sent to the model.

    The result starts with one synthesized system message (tool prompt, if
    tools were given, then the default system prompt, then the text of any
    system message supplied by the caller). If custom tools (those whose
    `tool_name` is a `str`) are present, a user message describing them
    follows. The caller's remaining messages come last.

    `request.messages` is left untouched.

    Raises:
        AssertionError: If `tool_choice` is not `ToolChoice.auto`, or more
            than one leading system message is supplied.
        ValueError: If custom tools are present and `tool_prompt_format` is
            not supported.
    """
    assert request.tool_choice == ToolChoice.auto, "Only `ToolChoice.auto` supported"

    # Work on a copy so that removing the system message does not mutate the
    # caller's request.
    existing_messages = list(request.messages)
    # `tools` is optional on the request; normalize None to an empty list.
    tools = request.tools or []

    existing_system_message = None
    if existing_messages and existing_messages[0].role == Role.system.value:
        existing_system_message = existing_messages.pop(0)

    # Guarded so that an empty (or system-only) message list does not raise
    # IndexError.
    assert (
        not existing_messages or existing_messages[0].role != Role.system.value
    ), "Should only have 1 system message"

    messages = []

    default_gen = SystemDefaultGenerator()
    default_template = default_gen.gen()

    sys_content = ""

    if tools:
        tool_gen = BuiltinToolGenerator()
        tool_template = tool_gen.gen(tools)

        sys_content += tool_template.render()
        sys_content += "\n"

    sys_content += default_template.render()

    if existing_system_message is not None:
        # TODO: this fn is needed in many places
        def _process(c):
            # Non-text content is represented by a placeholder.
            if isinstance(c, str):
                return c
            else:
                return "<media>"

        sys_content += "\n"

        if isinstance(existing_system_message.content, str):
            sys_content += _process(existing_system_message.content)
        elif isinstance(existing_system_message.content, list):
            sys_content += "\n".join(
                [_process(c) for c in existing_system_message.content]
            )

    messages.append(SystemMessage(content=sys_content))

    custom_tools = [t for t in tools if isinstance(t.tool_name, str)]
    if custom_tools:
        if request.tool_prompt_format == ToolPromptFormat.json:
            tool_gen = JsonCustomToolGenerator()
        elif request.tool_prompt_format == ToolPromptFormat.function_tag:
            tool_gen = FunctionTagCustomToolGenerator()
        else:
            raise ValueError(
                f"Non supported ToolPromptFormat {request.tool_prompt_format}"
            )

        custom_template = tool_gen.gen(custom_tools)
        messages.append(UserMessage(content=custom_template.render()))

    # Add back existing messages from the request
    messages += existing_messages

    return messages
|
69
llama_stack/inference/providers.py
Normal file
69
llama_stack/inference/providers.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import List
|
||||
|
||||
from llama_stack.core.datatypes import * # noqa: F403
|
||||
|
||||
|
||||
def available_providers() -> List[ProviderSpec]:
    """List the inference provider specs: one inline provider, then four remote adapters."""
    meta_reference = InlineProviderSpec(
        api=Api.inference,
        provider_id="meta-reference",
        pip_packages=[
            "accelerate",
            "blobfile",
            "codeshield",
            "fairscale",
            "fbgemm-gpu==0.8.0",
            "torch",
            "transformers",
            "zmq",
        ],
        module="llama_stack.inference.meta_reference",
        config_class="llama_stack.inference.meta_reference.MetaReferenceImplConfig",
    )

    # Keyword arguments of each remote adapter, in listing order. The ollama
    # adapter is declared without a config_class.
    adapter_kwargs = [
        dict(
            adapter_id="ollama",
            pip_packages=["ollama"],
            module="llama_stack.inference.adapters.ollama",
        ),
        dict(
            adapter_id="tgi",
            pip_packages=["huggingface_hub"],
            module="llama_stack.inference.adapters.tgi",
            config_class="llama_stack.inference.adapters.tgi.TGIImplConfig",
        ),
        dict(
            adapter_id="fireworks",
            pip_packages=[
                "fireworks-ai",
            ],
            module="llama_stack.inference.adapters.fireworks",
            config_class="llama_stack.inference.adapters.fireworks.FireworksImplConfig",
        ),
        dict(
            adapter_id="together",
            pip_packages=[
                "together",
            ],
            module="llama_stack.inference.adapters.together",
            config_class="llama_stack.inference.adapters.together.TogetherImplConfig",
        ),
    ]

    providers = [meta_reference]
    for spec_kwargs in adapter_kwargs:
        providers.append(
            remote_provider_spec(
                api=Api.inference,
                adapter=AdapterSpec(**spec_kwargs),
            )
        )
    return providers
|
184
llama_stack/inference/quantization/fp8_impls.py
Normal file
184
llama_stack/inference/quantization/fp8_impls.py
Normal file
|
@ -0,0 +1,184 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
|
||||
|
||||
import collections
|
||||
from typing import Optional, Type
|
||||
|
||||
try:
|
||||
import fbgemm_gpu.experimental.gen_ai # noqa: F401
|
||||
|
||||
print("Using efficient FP8 operators in FBGEMM.")
|
||||
except ImportError:
|
||||
print("No efficient FP8 operators. Please install FBGEMM in fp8_requirements.txt.")
|
||||
raise
|
||||
|
||||
import torch
|
||||
from torch import nn, Tensor
|
||||
|
||||
|
||||
class Fp8ScaledWeights:
    """Base for FP8 weight containers that report themselves as `nn.Parameter`."""

    @property
    def grad_fn(self) -> None:
        """Always reports None."""
        return None

    # TODO: Ugly trick so torch allows us to replace parameters
    # with our custom Fp8Weights instance. Do this properly.
    @property
    def __class__(self) -> Type[nn.parameter.Parameter]:
        """Report `nn.Parameter` as the class of every instance."""
        return nn.Parameter
|
||||
|
||||
|
||||
# pyre-fixme[4]: Attribute annotation cannot be `Any`.
# pyre-fixme[2]: Parameter annotation cannot be `Any`.
# Named tuple of (weight, scale, shape, activation_scale_ub) that also inherits
# the `nn.Parameter` masquerading of Fp8ScaledWeights.
class Fp8RowwiseWeights(
    Fp8ScaledWeights,
    collections.namedtuple(
        "Fp8RowwiseWeights",
        ["weight", "scale", "shape", "activation_scale_ub"],
    ),
):
    pass
|
||||
|
||||
|
||||
def ffn_swiglu(
    x: Tensor,
    w1: Fp8RowwiseWeights,
    w3: Fp8RowwiseWeights,
    w2: Fp8RowwiseWeights,
    num_tokens: Optional[Tensor] = None,
    is_memory_bounded: bool = False,
) -> Tensor:
    """Apply a SwiGLU feed-forward: ``(silu(x @ w1.T) * (x @ w3.T)) @ w2.T``.

    When all three weights are ``Fp8ScaledWeights`` the call is forwarded to
    ``ffn_swiglu_fp8_dynamic`` using ``w1.activation_scale_ub``; otherwise the
    weights must be plain tensors and the matmuls run directly.

    Args:
        x: input that unpacks into three dimensions (B, T, D).
        w1, w3: two-dimensional weights whose second dimension equals D.
        w2: weight used for the final projection.
        num_tokens: forwarded to the FP8 path only.
        is_memory_bounded: forwarded to the FP8 path only.

    Returns:
        Tensor viewed as (B, T, D).
    """
    use_fp8 = all(isinstance(weight, Fp8ScaledWeights) for weight in (w1, w3, w2))
    if use_fp8:
        return ffn_swiglu_fp8_dynamic(
            x, w1, w3, w2, w1.activation_scale_ub, num_tokens, is_memory_bounded
        )

    batch, seq_len, dim = x.shape
    _hidden_dim, w1_in_dim = w1.shape
    assert w1_in_dim == dim

    assert isinstance(w1, Tensor)
    assert isinstance(w3, Tensor)
    # Collapse batch and sequence so each token is one row of the matmul.
    tokens = x.view(batch * seq_len, dim)
    gate = tokens @ w1.T
    up = tokens @ w3.T
    hidden = torch.nn.functional.silu(gate) * up
    # Release the intermediates before the final projection.
    del gate, up
    assert isinstance(w2, Tensor)
    return (hidden @ w2.T).view(batch, seq_len, dim)
|
||||
|
||||
|
||||
@torch.inference_mode()
def quantize_fp8(
    w: Tensor,
    fp8_activation_scale_ub: float,
    output_device: Optional[torch.device] = None,
) -> Fp8RowwiseWeights:
    """Quantize [n, k] weight tensor.

    Args:
        w (Tensor): [n, k] input high precision tensor to quantize.
        fp8_activation_scale_ub (float): Upper bound for activation max.
        output_device (Optional[torch.device]): Device on which the returned
            tensors should live. When ``None`` the previous behaviour is kept:
            the activation bound is created on ``"cuda"`` and the quantized
            weight and scale stay wherever the FBGEMM kernel produced them.

    Returns:
        Fp8RowwiseWeights holding the quantized weight, its row-wise scale,
        the weight shape and the activation upper bound as a 1-element tensor.
    """
    # Previously `output_device` was accepted but silently ignored.
    scale_ub_device = output_device if output_device is not None else "cuda"
    activation_scale_ub = torch.tensor(
        [fp8_activation_scale_ub],
        dtype=torch.float,
        device=scale_ub_device,
    )
    wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
    del w
    if output_device is not None:
        # `.to` returns the same tensor when it already lives on the target.
        wq = wq.to(output_device)
        w_scale = w_scale.to(output_device)
    return Fp8RowwiseWeights(
        weight=wq,
        scale=w_scale,
        shape=wq.shape,
        activation_scale_ub=activation_scale_ub,
    )
|
||||
|
||||
|
||||
@torch.inference_mode()
def load_fp8(
    w: Tensor,
    w_scale: Tensor,
    fp8_activation_scale_ub: float,
) -> Fp8RowwiseWeights:
    """Wrap an already-quantized [n, k] weight and its scale as FP8 weights.

    Args:
        w (Tensor): [n, k] input FP8; cast to ``torch.float8_e4m3fn`` and
            moved to CUDA.
        w_scale (Tensor): scale tensor paired with ``w``; moved to CUDA.
        fp8_activation_scale_ub (float): Upper bound for activation max.

    Returns:
        Fp8RowwiseWeights whose tensors all live on the CUDA device.
    """
    cuda_weight = w.to(torch.float8_e4m3fn).to(device="cuda")
    cuda_scale = w_scale.to(device="cuda")
    scale_ub = torch.tensor(
        [fp8_activation_scale_ub], dtype=torch.float, device="cuda"
    )
    return Fp8RowwiseWeights(
        weight=cuda_weight,
        scale=cuda_scale,
        shape=w.shape,
        activation_scale_ub=scale_ub,
    )
|
||||
|
||||
|
||||
def fc_fp8_dynamic(
    x: Tensor,
    w: Fp8RowwiseWeights,
    activation_scale_ub: Optional[Tensor] = None,
    num_tokens: Optional[Tensor] = None,
    is_memory_bounded: bool = False,
) -> Tensor:
    """
    Single w8a8 fc layer with dynamic row-wise scaling.

    Args:
        x: activations; quantized per row before the matmul.
        w: row-wise FP8 weights (quantized weight plus scale).
        activation_scale_ub: optional upper bound passed to the activation
            quantizer.
        num_tokens: optional tensor passed to the activation quantizer.
        is_memory_bounded: accepted for signature parity with the callers;
            not used by this implementation.

    Returns:
        Result of ``torch.ops.fbgemm.f8f8bf16_rowwise``.

    Raises:
        TypeError: if ``w`` is not an ``Fp8RowwiseWeights``.
    """
    # Fail loudly: the original fell through to an unbound local here.
    if not isinstance(w, Fp8RowwiseWeights):
        raise TypeError(
            f"fc_fp8_dynamic expects Fp8RowwiseWeights, got {type(w).__name__}"
        )

    xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
        x, num_tokens, activation_scale_ub
    )
    y = torch.ops.fbgemm.f8f8bf16_rowwise(
        xq, w.weight, x_scale, w.scale, use_fast_accum=True
    )
    del xq
    return y
|
||||
|
||||
|
||||
def ffn_swiglu_fp8_dynamic(
    x: Tensor,
    w1: Fp8RowwiseWeights,
    w3: Fp8RowwiseWeights,
    w2: Fp8RowwiseWeights,
    activation_scale_ub: Optional[Tensor] = None,
    num_tokens: Optional[Tensor] = None,
    is_memory_bounded: bool = False,
) -> Tensor:
    """SwiGLU feed-forward built from three ``fc_fp8_dynamic`` layers.

    Computes ``fc(silu(fc(x, w1)) * fc(x, w3), w2)`` on the input flattened to
    (B * T, D) and views the result back as (B, T, D). The same
    ``activation_scale_ub``, ``num_tokens`` and ``is_memory_bounded`` values
    are handed to every layer.
    """
    batch, seq_len, dim = x.shape
    assert w1.shape[0] == w3.shape[0]

    def _fc(inputs: Tensor, weights: Fp8RowwiseWeights) -> Tensor:
        # Every projection shares the same quantization settings.
        return fc_fp8_dynamic(
            inputs, weights, activation_scale_ub, num_tokens, is_memory_bounded
        )

    gate = _fc(x.view(batch * seq_len, dim), w1)
    up = _fc(x.view(batch * seq_len, dim), w3)
    hidden = torch.nn.functional.silu(gate) * up
    # Drop the intermediates before the last projection.
    del gate, up

    projected = _fc(hidden, w2)

    return projected.view(batch, seq_len, dim)
|
105
llama_stack/inference/quantization/loader.py
Normal file
105
llama_stack/inference/quantization/loader.py
Normal file
|
@ -0,0 +1,105 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region
|
||||
from llama_models.llama3.api.model import Transformer, TransformerBlock
|
||||
from llama_stack.inference.api import QuantizationType
|
||||
|
||||
from llama_stack.inference.api.config import (
|
||||
CheckpointQuantizationFormat,
|
||||
MetaReferenceImplConfig,
|
||||
)
|
||||
|
||||
from termcolor import cprint
|
||||
from torch import Tensor
|
||||
|
||||
|
||||
def is_fbgemm_available() -> bool:
    """Return True when ``fbgemm_gpu.experimental.gen_ai`` can be imported."""
    try:
        import fbgemm_gpu.experimental.gen_ai  # noqa: F401
    except ImportError:
        return False
    else:
        return True
|
||||
|
||||
|
||||
def swiglu_wrapper(
    self,
    x: Tensor,
):
    """Replacement ``forward`` for a feed-forward module.

    Runs ``ffn_swiglu`` on the ``weight`` attributes of ``self.w1``,
    ``self.w3`` and ``self.w2`` and passes the result through
    ``reduce_from_model_parallel_region``.
    """
    # Imported lazily so this module can load without the FP8 kernels.
    from .fp8_impls import ffn_swiglu

    weights = (self.w1.weight, self.w3.weight, self.w2.weight)
    local_out = ffn_swiglu(x, *weights)
    return reduce_from_model_parallel_region(local_out)
|
||||
|
||||
|
||||
def convert_to_quantized_model(
    model: Transformer,
    config: MetaReferenceImplConfig,
    fp8_activation_scale_ub: Optional[float] = 1200.0,
) -> Transformer:
    """Swap the feed-forward weights of ``model`` for FP8 versions in place.

    For a bf16 quantization config the model is returned untouched. For fp8,
    every ``TransformerBlock`` except the first and the last gets its
    feed-forward ``forward`` replaced by ``swiglu_wrapper`` and its
    ``w1``/``w3``/``w2`` weights replaced by FP8 weights. These are either
    loaded together with pre-computed scales (checkpoint format
    ``fp8_mixed``) or quantized on the fly from the existing weights.
    All remaining parameters are then moved to CUDA.

    Args:
        model: the model to convert; modified in place and returned.
        config: provides ``quantization.type`` and
            ``checkpoint_config.checkpoint``.
        fp8_activation_scale_ub: upper bound for activation max, forwarded to
            the FP8 helpers.

    Raises:
        ValueError: if the quantization type is neither bf16 nor fp8.
    """
    if config.quantization.type == QuantizationType.bf16.value:
        return model

    elif config.quantization.type != QuantizationType.fp8.value:
        raise ValueError("Only FP8 quantization is supported")

    # `get_model_parallel_rank` was used below without ever being imported,
    # which made the fp8_mixed path fail with a NameError.
    from fairscale.nn.model_parallel.initialize import get_model_parallel_rank

    # Deferred import: fp8_impls raises ImportError when FBGEMM is missing,
    # which must not break the bf16 path above.
    from .fp8_impls import Fp8ScaledWeights, load_fp8, quantize_fp8

    checkpoint = config.checkpoint_config.checkpoint
    # Move weights to GPU with quantization
    if checkpoint.quantization_format == CheckpointQuantizationFormat.fp8_mixed.value:
        cprint("Loading fp8 scales...", "yellow")
        mp_rank = get_model_parallel_rank()
        fp8_scales_path = os.path.join(
            checkpoint.checkpoint_dir, f"fp8_scales_{mp_rank}.pt"
        )
        assert os.path.isfile(
            fp8_scales_path
        ), f"fp8_scales_path not found for rank {mp_rank}"
        fp8_scales = torch.load(fp8_scales_path, weights_only=True)

        for block in model.layers:
            if isinstance(block, TransformerBlock):
                # The first and last blocks keep their original weights.
                if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
                    continue

                block.feed_forward.forward = swiglu_wrapper.__get__(block.feed_forward)
                for key in ("w1", "w3", "w2"):
                    param = getattr(block.feed_forward, key)
                    param.weight = load_fp8(
                        param.weight,
                        fp8_scales[f"{block.layer_id}_feed_forward.{key}_{mp_rank}"],
                        fp8_activation_scale_ub,
                    )
    else:
        cprint("Quantizing fp8 weights from bf16...", "yellow")
        for block in model.layers:
            if isinstance(block, TransformerBlock):
                # The first and last blocks keep their original weights.
                if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
                    continue
                block.feed_forward.forward = swiglu_wrapper.__get__(block.feed_forward)
                for key in ("w1", "w3", "w2"):
                    param = getattr(block.feed_forward, key)
                    param.weight = quantize_fp8(
                        param.weight,
                        fp8_activation_scale_ub,
                        output_device=torch.device("cuda"),
                    )

    # FP8 containers were already placed by the helpers; move everything else.
    for _, parameter in model.named_parameters():
        if not isinstance(parameter, Fp8ScaledWeights):
            parameter.data = parameter.to(device="cuda")
    return model
|
30
llama_stack/inference/quantization/scripts/build_conda.sh
Normal file
30
llama_stack/inference/quantization/scripts/build_conda.sh
Normal file
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Usage: build_conda.sh <env-name>
# (Re)creates a conda environment with Python 3.10, a PyTorch nightly build
# and the packages listed in fp8_requirements.txt.
if [[ $# -ne 1 ]]; then
  echo "Error: Please provide the name of CONDA environment you wish to create" >&2
  exit 1
fi

ENV_NAME="$1"

set -eu
eval "$(conda shell.bash hook)"

echo "Will build env (or overwrite) named '$ENV_NAME'"

set -x

run_build() {
  # Set up the conda environment. The name is quoted everywhere so that it is
  # never word-split or glob-expanded.
  yes | conda remove --name "$ENV_NAME" --all
  yes | conda create -n "$ENV_NAME" python=3.10
  conda activate "$ENV_NAME"

  # PT nightly
  pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121

  # install the FP8 dependencies
  # NOTE(review): the path is relative to the current working directory --
  # confirm where fp8_requirements.txt lives.
  pip install -r fp8_requirements.txt
}

run_build
|
|
@ -0,0 +1,161 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import fire
|
||||
|
||||
import torch
|
||||
from fairscale.nn.model_parallel.initialize import (
|
||||
get_model_parallel_rank,
|
||||
initialize_model_parallel,
|
||||
model_parallel_is_initialized,
|
||||
)
|
||||
from fp8.fp8_impls import FfnQuantizeMode, quantize_fp8
|
||||
|
||||
from llama.model import ModelArgs, Transformer, TransformerBlock
|
||||
from llama.tokenizer import Tokenizer
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
|
||||
def main(
    ckpt_dir: str,
    tokenizer_path: str,
    quantized_ckpt_dir: str,
    max_seq_len: Optional[int] = 512,
    max_batch_size: Optional[int] = 4,
    model_parallel_size: Optional[int] = None,
    ffn_quantize_mode: Optional[FfnQuantizeMode] = FfnQuantizeMode.FP8_ROWWISE,
    fp8_activation_scale_ub: Optional[float] = 1200.0,
    seed: int = 1,
):
    """Quantize the feed-forward weights of a checkpoint shard to FP8.

    Each process loads the ``*.pth`` shard matching its model-parallel rank,
    quantizes ``w1``/``w3``/``w2`` of every ``TransformerBlock`` except the
    first and the last, and writes two files into ``quantized_ckpt_dir``:
    ``fp8_scales_<rank>.pt`` (the scales) and ``consolidated.<rank>.pth``
    (the model state dict).

    Args:
        ckpt_dir: directory holding ``*.pth`` shards, ``params.json`` and
            ``tokenizer.model``.
        tokenizer_path: tokenizer file used to validate the vocabulary size.
        quantized_ckpt_dir: output directory; created if missing.
        max_seq_len: forwarded to ``ModelArgs``.
        max_batch_size: forwarded to ``ModelArgs``.
        model_parallel_size: defaults to the ``WORLD_SIZE`` environment
            variable (or 1) when model parallelism is initialized here.
        ffn_quantize_mode: forwarded to ``quantize_fp8``.
        fp8_activation_scale_ub: forwarded to ``quantize_fp8``.
        seed: torch RNG seed; must be identical in all processes.
    """
    # Validate before first use: this check originally ran only after the
    # directory had already been created and written to.
    assert (
        quantized_ckpt_dir is not None
    ), "QUantized checkpoint directory should not be None"

    if not os.path.exists(quantized_ckpt_dir):
        # exist_ok: several ranks may pass the existence check at once.
        os.makedirs(quantized_ckpt_dir, exist_ok=True)
        shutil.copy(
            os.path.join(ckpt_dir, "params.json"),
            os.path.join(quantized_ckpt_dir, "params.json"),
        )
        shutil.copy(
            os.path.join(ckpt_dir, "tokenizer.model"),
            os.path.join(quantized_ckpt_dir, "tokenizer.model"),
        )

    if not torch.distributed.is_initialized():
        torch.distributed.init_process_group("nccl")
    if not model_parallel_is_initialized():
        if model_parallel_size is None:
            model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
        initialize_model_parallel(model_parallel_size)

    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(seed)

    if local_rank > 0:
        # Silence every process except local rank 0 for the rest of the run.
        sys.stdout = open(os.devnull, "w")

    mp_rank = get_model_parallel_rank()

    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
    assert model_parallel_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
    ckpt_path = checkpoints[mp_rank]
    checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.load(f)

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
        **params,
    )
    tokenizer = Tokenizer(model_path=tokenizer_path)
    assert (
        model_args.vocab_size == tokenizer.n_words
    ), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"

    # load on CPU in bf16 so that fp8 conversion does not find an unexpected (fp32, e.g.) datatype
    torch.set_default_tensor_type(torch.BFloat16Tensor)

    model = Transformer(model_args)
    model.load_state_dict(checkpoint, strict=False)

    if torch.cuda.is_bf16_supported():
        torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
    else:
        torch.set_default_tensor_type(torch.cuda.HalfTensor)

    print(ckpt_path)
    fp8_scales = {}
    for block in model.layers:
        if isinstance(block, TransformerBlock):
            # The first and last blocks are left unquantized.
            if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
                continue

            # Same treatment for all three projections, in the original order.
            for key in ("w1", "w3", "w2"):
                linear = getattr(block.feed_forward, key)
                # NOTE(review): a quantize mode is passed positionally here,
                # while quantize_fp8 in
                # llama_stack/inference/quantization/fp8_impls.py takes
                # (w, fp8_activation_scale_ub, output_device) -- confirm the
                # imported fp8.fp8_impls still accepts this call shape.
                fp8_weight = quantize_fp8(
                    linear.weight,
                    fp8_activation_scale_ub,
                    ffn_quantize_mode,
                    output_device=torch.device("cpu"),
                )
                with torch.inference_mode():
                    linear.weight = Parameter(fp8_weight.weight)
                fp8_scales[
                    f"{block.layer_id}_feed_forward.{key}_{mp_rank}"
                ] = fp8_weight.scale

    fp8_scales_path = os.path.join(
        quantized_ckpt_dir, f"fp8_scales_{mp_rank}.pt"
    )
    torch.save(fp8_scales, fp8_scales_path)

    ckpt_path = os.path.join(
        quantized_ckpt_dir,
        "consolidated.{:02d}.pth".format(mp_rank),
    )
    torch.save(model.state_dict(), ckpt_path)


if __name__ == "__main__":
    fire.Fire(main)
|
31
llama_stack/inference/quantization/scripts/run_quantize_checkpoint.sh
Executable file
31
llama_stack/inference/quantization/scripts/run_quantize_checkpoint.sh
Executable file
|
@ -0,0 +1,31 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
set -euo pipefail
set -x

# Fail with a readable message; `set -u` alone would only report an unbound
# positional parameter.
if [[ $# -lt 7 ]]; then
  echo "Usage: $0 <master_host> <run_id> <ckpt_dir> <quant_ckpt_dir> <tokenizer_path> <nnodes> <nproc>" >&2
  exit 1
fi

# Quoted so a repository path containing spaces survives word splitting.
cd "$(git rev-parse --show-toplevel)"

MASTER_HOST=$1
RUN_ID=$2
CKPT_DIR=$3
QUANT_CKPT_DIR=$4
TOKENIZER_PATH=$5
NNODES=$6
NPROC=$7

echo "$MASTER_HOST, $RUN_ID, $CKPT_DIR, $QUANT_CKPT_DIR"

# NOTE(review): quantize_checkpoint.py is resolved relative to the repository
# root after the cd above -- confirm the script actually lives there.
NCCL_NET=Socket NCCL_SOCKET_IFNAME=eth TIKTOKEN_CACHE_DIR="" \
  torchrun \
  --nnodes="$NNODES" --nproc_per_node="$NPROC" \
  --rdzv_id="$RUN_ID" \
  --rdzv_conf='timeout=120' \
  --rdzv_backend=c10d \
  --rdzv_endpoint="${MASTER_HOST}:29502" \
  quantize_checkpoint.py "$CKPT_DIR" "$TOKENIZER_PATH" "$QUANT_CKPT_DIR"
|
76
llama_stack/inference/quantization/test_fp8.py
Normal file
76
llama_stack/inference/quantization/test_fp8.py
Normal file
|
@ -0,0 +1,76 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
|
||||
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from fp8_impls import ffn_swiglu_fp8_dynamic, FfnQuantizeMode, quantize_fp8
|
||||
from hypothesis import given, settings, strategies as st
|
||||
from torch import Tensor
|
||||
|
||||
|
||||
# Skipped unless CUDA is available and the current device reports a compute
# capability major version of at least 9.
@unittest.skipIf(
    not torch.cuda.is_available()
    or torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9,
    "Skip when H100 is not available",
)
class FP8Tests(unittest.TestCase):
    """Compares the FP8 SwiGLU feed-forward against a bf16 reference."""

    @settings(deadline=None)
    @given(
        D=st.sampled_from([4096, 8192]),
        HD_L=st.sampled_from([1280, 2560]),
        B=st.sampled_from([1, 2]),
        T=st.sampled_from([2048, 4096]),
        UB=st.sampled_from([1000, 10000]),
    )
    def test_fp8_ffn(
        self,
        D: int,  # noqa
        HD_L: int,
        B: int,
        T: int,
        UB: float,
    ) -> None:
        """Check ``ffn_swiglu_fp8_dynamic`` against a fake-quantized reference.

        The reference runs the same SwiGLU in bf16 on inputs and weights that
        were quantized and then de-quantized (weight * scale), so both sides
        see the same rounding of their operands.
        """
        x = torch.randn(size=(B, T, D), dtype=torch.bfloat16, device="cuda") * 0.1
        w1 = torch.randn(size=(HD_L, D), dtype=torch.bfloat16, device="cuda") * 0.01
        w3 = torch.randn(size=(HD_L, D), dtype=torch.bfloat16, device="cuda") * 0.01
        w2 = torch.randn(size=(D, HD_L), dtype=torch.bfloat16, device="cuda") * 0.1

        # NOTE(review): `mode=` and FfnQuantizeMode are not part of the
        # quantize_fp8 defined in
        # llama_stack/inference/quantization/fp8_impls.py (its parameters are
        # w, fp8_activation_scale_ub, output_device) -- confirm which
        # fp8_impls module this test is meant to import.
        x_q = quantize_fp8(x, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
        w1_q = quantize_fp8(w1, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
        w3_q = quantize_fp8(w3, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
        w2_q = quantize_fp8(w2, UB, mode=FfnQuantizeMode.FP8_ROWWISE)

        def ref_ffn(x: Tensor, w1: Tensor, w3: Tensor, w2: Tensor) -> Tensor:
            # Plain SwiGLU: (silu(x @ w1.T) * (x @ w3.T)) @ w2.T, in bf16.
            (B, T, D) = x.shape  # noqa: N806
            (HD_L, D_) = w1.shape  # noqa: N806
            assert D_ == D

            x1 = x.view(B * T, D) @ w1.T
            x2 = x.view(B * T, D) @ w3.T

            z = torch.nn.functional.silu(x1) * x2
            return (z @ w2.T).view(B, T, D).to(torch.bfloat16)

        # Must run before x and the weights are overwritten below: the FP8
        # path takes the original x and the quantized weights.
        v = ffn_swiglu_fp8_dynamic(x, w1_q, w3_q, w2_q)

        # Fake quant
        x = x_q.weight.bfloat16() * x_q.scale.unsqueeze(-1)
        w1 = w1_q.weight.bfloat16() * w1_q.scale.unsqueeze(-1)
        w3 = w3_q.weight.bfloat16() * w3_q.scale.unsqueeze(-1)
        w2 = w2_q.weight.bfloat16() * w2_q.scale.unsqueeze(-1)

        v_ref = ref_ffn(x, w1, w3, w2)

        torch.testing.assert_close(v_ref, v, atol=4.0e-3, rtol=4.0e-3)


if __name__ == "__main__":
    unittest.main()
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue