mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-27 18:50:41 +00:00
Added support for structured output in the API and added a reference implementation for meta-reference. A few notes: * Two formats are specified in the API: JSON schema and EBNF-based grammar. * The implementation only supports JSON for now. We use lm-format-enforcer to provide the implementation right now, but this may change, especially because BNF grammars aren't supported by that library. Fireworks has support for structured output, and Together has limited support for it too. Subsequent PRs will add these changes. We would like all our inference providers to provide structured output for Llama models, since it is an extremely important and highly sought-after need among developers.
151 lines
5.6 KiB
Python
151 lines
5.6 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
# All rights reserved.
|
|
#
|
|
# This source code is licensed under the terms described in the LICENSE file in
|
|
# the root directory of this source tree.
|
|
|
|
from typing import List
|
|
|
|
from llama_stack.distribution.datatypes import * # noqa: F403
|
|
|
|
|
|
# Pip packages shared by the meta-reference inference provider specs below.
# Kept as a plain list because the quantized variant extends it with `+ [...]`.
META_REFERENCE_DEPS = [
    "accelerate",
    "blobfile",
    "fairscale",
    "torch",
    "torchvision",
    "transformers",
    "zmq",
    "lm-format-enforcer",
]
|
|
|
|
|
|
def available_providers() -> List[ProviderSpec]:
    """Return the provider specs registered for ``Api.inference``.

    The returned list keeps a fixed order: the two meta-reference
    ``InlineProviderSpec`` entries (regular and quantized), the specs built
    with ``remote_provider_spec`` (sample, ollama, tgi, hf::serverless,
    hf::endpoint, fireworks, together, bedrock, databricks), and finally the
    ``InlineProviderSpec`` for vllm.
    """
    # --- meta-reference (both variants point at the same module) ----------
    meta_reference = InlineProviderSpec(
        api=Api.inference,
        provider_type="meta-reference",
        module="llama_stack.providers.impls.meta_reference.inference",
        config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceInferenceConfig",
        pip_packages=META_REFERENCE_DEPS,
    )
    # The quantized variant needs fbgemm-gpu on top of the base dependencies.
    quantized_deps = META_REFERENCE_DEPS + [
        "fbgemm-gpu==0.8.0",
    ]
    meta_reference_quantized = InlineProviderSpec(
        api=Api.inference,
        provider_type="meta-reference-quantized",
        module="llama_stack.providers.impls.meta_reference.inference",
        config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceQuantizedInferenceConfig",
        pip_packages=quantized_deps,
    )

    # --- remote adapters --------------------------------------------------
    sample_adapter = AdapterSpec(
        adapter_type="sample",
        module="llama_stack.providers.adapters.inference.sample",
        config_class="llama_stack.providers.adapters.inference.sample.SampleConfig",
        pip_packages=[],
    )
    sample = remote_provider_spec(api=Api.inference, adapter=sample_adapter)

    ollama_adapter = AdapterSpec(
        adapter_type="ollama",
        module="llama_stack.providers.adapters.inference.ollama",
        config_class="llama_stack.providers.adapters.inference.ollama.OllamaImplConfig",
        pip_packages=["ollama", "aiohttp"],
    )
    ollama = remote_provider_spec(api=Api.inference, adapter=ollama_adapter)

    # NOTE(review): the remote vLLM adapter is disabled in the original
    # source; the commented-out spec is kept here for reference.
    # remote_provider_spec(
    #     api=Api.inference,
    #     adapter=AdapterSpec(
    #         adapter_type="vllm",
    #         pip_packages=["openai"],
    #         module="llama_stack.providers.adapters.inference.vllm",
    #         config_class="llama_stack.providers.adapters.inference.vllm.VLLMImplConfig",
    #     ),
    # ),

    # tgi, hf::serverless and hf::endpoint share one module and one set of
    # pip packages; only the adapter type and the config class differ.
    tgi_variants = (
        (
            "tgi",
            "llama_stack.providers.adapters.inference.tgi.TGIImplConfig",
        ),
        (
            "hf::serverless",
            "llama_stack.providers.adapters.inference.tgi.InferenceAPIImplConfig",
        ),
        (
            "hf::endpoint",
            "llama_stack.providers.adapters.inference.tgi.InferenceEndpointImplConfig",
        ),
    )
    tgi_family = [
        remote_provider_spec(
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type=variant_type,
                pip_packages=["huggingface_hub", "aiohttp"],
                module="llama_stack.providers.adapters.inference.tgi",
                config_class=variant_config,
            ),
        )
        for variant_type, variant_config in tgi_variants
    ]

    fireworks_adapter = AdapterSpec(
        adapter_type="fireworks",
        module="llama_stack.providers.adapters.inference.fireworks",
        config_class="llama_stack.providers.adapters.inference.fireworks.FireworksImplConfig",
        pip_packages=[
            "fireworks-ai",
        ],
    )
    fireworks = remote_provider_spec(api=Api.inference, adapter=fireworks_adapter)

    # NOTE(review): the validator path below lives under the *safety*
    # adapters package, not the inference one -- confirm this is intended.
    together_adapter = AdapterSpec(
        adapter_type="together",
        module="llama_stack.providers.adapters.inference.together",
        config_class="llama_stack.providers.adapters.inference.together.TogetherImplConfig",
        provider_data_validator="llama_stack.providers.adapters.safety.together.TogetherProviderDataValidator",
        pip_packages=[
            "together",
        ],
    )
    together = remote_provider_spec(api=Api.inference, adapter=together_adapter)

    bedrock_adapter = AdapterSpec(
        adapter_type="bedrock",
        module="llama_stack.providers.adapters.inference.bedrock",
        config_class="llama_stack.providers.adapters.inference.bedrock.BedrockConfig",
        pip_packages=["boto3"],
    )
    bedrock = remote_provider_spec(api=Api.inference, adapter=bedrock_adapter)

    databricks_adapter = AdapterSpec(
        adapter_type="databricks",
        module="llama_stack.providers.adapters.inference.databricks",
        config_class="llama_stack.providers.adapters.inference.databricks.DatabricksImplConfig",
        pip_packages=[
            "openai",
        ],
    )
    databricks = remote_provider_spec(api=Api.inference, adapter=databricks_adapter)

    # --- inline vllm ------------------------------------------------------
    vllm = InlineProviderSpec(
        api=Api.inference,
        provider_type="vllm",
        module="llama_stack.providers.impls.vllm",
        config_class="llama_stack.providers.impls.vllm.VLLMConfig",
        pip_packages=[
            "vllm",
        ],
    )

    # Assemble in the same order the specs are declared above.
    return [
        meta_reference,
        meta_reference_quantized,
        sample,
        ollama,
        *tgi_family,
        fireworks,
        together,
        bedrock,
        databricks,
        vllm,
    ]
|