llama-stack-mirror/llama_stack/providers/registry/inference.py
Xi Yan 23210e8679
llama stack distributions / templates / docker refactor (#266)
* docker compose ollama

* comment

* update compose file

* readme for distributions

* readme

* move distribution folders

* move distribution/templates to distributions/

* rename

* kill distribution/templates

* readme

* readme

* build/developer cookbook/new api provider

* developer cookbook

* readme

* readme

* [bugfix] fix case for agent when memory bank registered without specifying provider_id (#264)

* fix case where memory bank is registered without provider_id

* memory test

* agents unit test

* Add an option to not use elastic agents for meta-reference inference (#269)

* Allow overridding checkpoint_dir via config

* Small rename

* Make all methods `async def` again; add completion() for meta-reference (#270)

PR #201 had made several changes while trying to fix issues with getting the stream=False branches of inference and agents API working. As part of this, it made a change which was slightly gratuitous. Namely, making chat_completion() and brethren "def" instead of "async def".

The rationale was that this allowed the user (within llama-stack) of this to use it as:

```
async for chunk in api.chat_completion(params)
```

However, it causes unnecessary confusion for several folks. Given that clients (e.g., llama-stack-apps) anyway use the SDK methods (which are completely isolated) this choice was not ideal. Let's revert back so the call now looks like:

```
async for chunk in await api.chat_completion(params)
```

Bonus: Added a completion() implementation for the meta-reference provider. Technically should have been another PR :)

* Improve an important error message

* update ollama for llama-guard3

* Add vLLM inference provider for OpenAI compatible vLLM server (#178)

This PR adds vLLM inference provider for OpenAI compatible vLLM server.

* Create .readthedocs.yaml

Trying out readthedocs

* Update event_logger.py (#275)

spelling error

* vllm

* build templates

* delete templates

* tmp add back build to avoid merge conflicts

* vllm

* vllm

---------

Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Co-authored-by: Ashwin Bharambe <ashwin@meta.com>
Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
Co-authored-by: raghotham <rsm@meta.com>
Co-authored-by: nehal-a2z <nehal@coderabbit.ai>
2024-10-21 11:17:53 -07:00

151 lines
5.7 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from llama_stack.distribution.datatypes import * # noqa: F403
def available_providers() -> List[ProviderSpec]:
return [
InlineProviderSpec(
api=Api.inference,
provider_type="meta-reference",
pip_packages=[
"accelerate",
"blobfile",
"fairscale",
"torch",
"torchvision",
"transformers",
"zmq",
],
module="llama_stack.providers.impls.meta_reference.inference",
config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceInferenceConfig",
),
InlineProviderSpec(
api=Api.inference,
provider_type="meta-reference-quantized",
pip_packages=[
"accelerate",
"blobfile",
"fairscale",
"fbgemm-gpu==0.8.0",
"torch",
"torchvision",
"transformers",
"zmq",
],
module="llama_stack.providers.impls.meta_reference.inference",
config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceQuantizedInferenceConfig",
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="sample",
pip_packages=[],
module="llama_stack.providers.adapters.inference.sample",
config_class="llama_stack.providers.adapters.inference.sample.SampleConfig",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="ollama",
pip_packages=["ollama"],
config_class="llama_stack.providers.adapters.inference.ollama.OllamaImplConfig",
module="llama_stack.providers.adapters.inference.ollama",
),
),
# remote_provider_spec(
# api=Api.inference,
# adapter=AdapterSpec(
# adapter_type="vllm",
# pip_packages=["openai"],
# module="llama_stack.providers.adapters.inference.vllm",
# config_class="llama_stack.providers.adapters.inference.vllm.VLLMImplConfig",
# ),
# ),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="tgi",
pip_packages=["huggingface_hub", "aiohttp"],
module="llama_stack.providers.adapters.inference.tgi",
config_class="llama_stack.providers.adapters.inference.tgi.TGIImplConfig",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="hf::serverless",
pip_packages=["huggingface_hub", "aiohttp"],
module="llama_stack.providers.adapters.inference.tgi",
config_class="llama_stack.providers.adapters.inference.tgi.InferenceAPIImplConfig",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="hf::endpoint",
pip_packages=["huggingface_hub", "aiohttp"],
module="llama_stack.providers.adapters.inference.tgi",
config_class="llama_stack.providers.adapters.inference.tgi.InferenceEndpointImplConfig",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="fireworks",
pip_packages=[
"fireworks-ai",
],
module="llama_stack.providers.adapters.inference.fireworks",
config_class="llama_stack.providers.adapters.inference.fireworks.FireworksImplConfig",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="together",
pip_packages=[
"together",
],
module="llama_stack.providers.adapters.inference.together",
config_class="llama_stack.providers.adapters.inference.together.TogetherImplConfig",
provider_data_validator="llama_stack.providers.adapters.safety.together.TogetherProviderDataValidator",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="bedrock",
pip_packages=["boto3"],
module="llama_stack.providers.adapters.inference.bedrock",
config_class="llama_stack.providers.adapters.inference.bedrock.BedrockConfig",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="databricks",
pip_packages=[
"openai",
],
module="llama_stack.providers.adapters.inference.databricks",
config_class="llama_stack.providers.adapters.inference.databricks.DatabricksImplConfig",
),
),
InlineProviderSpec(
api=Api.inference,
provider_type="vllm",
pip_packages=[
"vllm",
],
module="llama_stack.providers.impls.vllm",
config_class="llama_stack.providers.impls.vllm.VLLMConfig",
),
]