Merge branch 'main' into litellm_support_lakera_config_thresholds

This commit is contained in:
Krish Dholakia 2024-08-06 22:47:13 -07:00 committed by GitHub
commit c82fc0cac2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
250 changed files with 17468 additions and 19307 deletions


@ -48,7 +48,7 @@ jobs:
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
pip install openai==1.34.0
pip install prisma
pip install prisma==0.11.0
pip install "detect_secrets==1.5.0"
pip install "httpx==0.24.1"
pip install fastapi
@ -208,6 +208,8 @@ jobs:
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e MISTRAL_API_KEY=$MISTRAL_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e GROQ_API_KEY=$GROQ_API_KEY \
-e COHERE_API_KEY=$COHERE_API_KEY \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
-e AUTO_INFER_REGION=True \
@ -404,7 +406,7 @@ jobs:
circleci step halt
fi
- run:
name: Trigger Github Action for new Docker Container
name: Trigger Github Action for new Docker Container + Trigger Stable Release Testing
command: |
echo "Install TOML package."
python3 -m pip install toml
@ -415,7 +417,8 @@ jobs:
-H "Authorization: Bearer $GITHUB_TOKEN" \
"https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_deploy.yml/dispatches" \
-d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}\", \"commit_hash\":\"$CIRCLE_SHA1\"}}"
echo "triggering stable release server for version ${VERSION} and commit ${CIRCLE_SHA1}"
curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}"
workflows:
version: 2
build_and_test:


@ -21,6 +21,14 @@ env:
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
# print commit hash, tag, and release type
print:
runs-on: ubuntu-latest
steps:
- run: |
echo "Commit hash: ${{ github.event.inputs.commit_hash }}"
echo "Tag: ${{ github.event.inputs.tag }}"
echo "Release type: ${{ github.event.inputs.release_type }}"
docker-hub-deploy:
if: github.repository == 'BerriAI/litellm'
runs-on: ubuntu-latest

Dockerfile.custom_ui (new file, 41 lines added)

@ -0,0 +1,41 @@
# Use the provided base image
FROM ghcr.io/berriai/litellm:litellm_fwd_server_root_path-dev
# Set the working directory to /app
WORKDIR /app
# Install Node.js and npm (adjust version as needed)
RUN apt-get update && apt-get install -y nodejs npm
# Copy the UI source into the container
COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard
# Set an environment variable for UI_BASE_PATH
# This can be overridden at build time
# set UI_BASE_PATH to "<your server root path>/ui"
ENV UI_BASE_PATH="/prod/ui"
# Build the UI with the specified UI_BASE_PATH
WORKDIR /app/ui/litellm-dashboard
RUN npm install
RUN UI_BASE_PATH=$UI_BASE_PATH npm run build
# Create the destination directory
RUN mkdir -p /app/litellm/proxy/_experimental/out
# Move the built files to the appropriate location
# Assuming the build output is in ./out directory
RUN rm -rf /app/litellm/proxy/_experimental/out/* && \
mv ./out/* /app/litellm/proxy/_experimental/out/
# Switch back to the main app directory
WORKDIR /app
# Make sure your entrypoint.sh is executable
RUN chmod +x entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
# Override the CMD instruction with your desired command and arguments
CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]


@ -11,7 +11,7 @@
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
<br>
</p>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">LiteLLM Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
<h4 align="center">
<a href="https://pypi.org/project/litellm/" target="_blank">
<img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
@ -35,7 +35,7 @@ LiteLLM manages:
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Set Budgets & Rate limits per project, api key, model [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
- Set Budgets & Rate limits per project, api key, model [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)
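As a minimal sketch of the consistent-output point above (the model name and key are placeholders, not taken from the README):
```python
# Minimal sketch: one completion call through the LiteLLM SDK.
# Assumes a valid OPENAI_API_KEY; swap the model string for any supported provider.
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello from LiteLLM"}],
)

# Regardless of provider, the text is always at the same path:
print(response["choices"][0]["message"]["content"])
```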
@ -166,6 +166,10 @@ $ litellm --model huggingface/bigcode/starcoder
### Step 2: Make ChatCompletions Request to Proxy
> [!IMPORTANT]
> 💡 [Use LiteLLM Proxy with Langchain (Python, JS), OpenAI SDK (Python, JS) Anthropic SDK, Mistral SDK, LlamaIndex, Instructor, Curl](https://docs.litellm.ai/docs/proxy/user_keys)
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000") # set proxy to base_url
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "this is a test request, write a short poem"}])
print(response)
```


@ -0,0 +1,565 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Migrating to LiteLLM Proxy from OpenAI/Azure OpenAI\n",
"\n",
"Covers:\n",
"\n",
"* /chat/completion\n",
"* /embedding\n",
"\n",
"\n",
"These are **selected examples**. LiteLLM Proxy is **OpenAI-Compatible**, it works with any project that calls OpenAI. Just change the `base_url`, `api_key` and `model`.\n",
"\n",
"For more examples, [go here](https://docs.litellm.ai/docs/proxy/user_keys)\n",
"\n",
"To pass provider-specific args, [go here](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)\n",
"\n",
"To drop unsupported params (E.g. frequency_penalty for bedrock with librechat), [go here](https://docs.litellm.ai/docs/completion/drop_params#openai-proxy-usage)\n"
],
"metadata": {
"id": "kccfk0mHZ4Ad"
}
},
{
"cell_type": "markdown",
"source": [
"## /chat/completion\n",
"\n"
],
"metadata": {
"id": "nmSClzCPaGH6"
}
},
{
"cell_type": "markdown",
"source": [
"### OpenAI Python SDK"
],
"metadata": {
"id": "_vqcjwOVaKpO"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "x1e_Ok3KZzeP"
},
"outputs": [],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=\"anything\",\n",
" base_url=\"http://0.0.0.0:4000\"\n",
")\n",
"\n",
"# request sent to model set on litellm proxy, `litellm --model`\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"this is a test request, write a short poem\"\n",
" }\n",
" ],\n",
" extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params\n",
" \"metadata\": { # 👈 use for logging additional params (e.g. to langfuse)\n",
" \"generation_name\": \"ishaan-generation-openai-client\",\n",
" \"generation_id\": \"openai-client-gen-id22\",\n",
" \"trace_id\": \"openai-client-trace-id22\",\n",
" \"trace_user_id\": \"openai-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"source": [
"## Function Calling"
],
"metadata": {
"id": "AqkyKk9Scxgj"
}
},
{
"cell_type": "code",
"source": [
"from openai import OpenAI\n",
"client = OpenAI(\n",
" api_key=\"sk-1234\", # [OPTIONAL] set if you set one on proxy, else set \"\"\n",
" base_url=\"http://0.0.0.0:4000\",\n",
")\n",
"\n",
"tools = [\n",
" {\n",
" \"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_current_weather\",\n",
" \"description\": \"Get the current weather in a given location\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"location\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
" },\n",
" \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
" },\n",
" \"required\": [\"location\"],\n",
" },\n",
" }\n",
" }\n",
"]\n",
"messages = [{\"role\": \"user\", \"content\": \"What's the weather like in Boston today?\"}]\n",
"completion = client.chat.completions.create(\n",
" model=\"gpt-4o\", # use 'model_name' from config.yaml\n",
" messages=messages,\n",
" tools=tools,\n",
" tool_choice=\"auto\"\n",
")\n",
"\n",
"print(completion)\n"
],
"metadata": {
"id": "wDg10VqLczE1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Azure OpenAI Python SDK"
],
"metadata": {
"id": "YYoxLloSaNWW"
}
},
{
"cell_type": "code",
"source": [
"import openai\n",
"client = openai.AzureOpenAI(\n",
" api_key=\"anything\",\n",
" base_url=\"http://0.0.0.0:4000\"\n",
")\n",
"\n",
"# request sent to model set on litellm proxy, `litellm --model`\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"this is a test request, write a short poem\"\n",
" }\n",
" ],\n",
" extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params\n",
" \"metadata\": { # 👈 use for logging additional params (e.g. to langfuse)\n",
" \"generation_name\": \"ishaan-generation-openai-client\",\n",
" \"generation_id\": \"openai-client-gen-id22\",\n",
" \"trace_id\": \"openai-client-trace-id22\",\n",
" \"trace_user_id\": \"openai-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"print(response)"
],
"metadata": {
"id": "yA1XcgowaSRy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain Python"
],
"metadata": {
"id": "yl9qhDvnaTpL"
}
},
{
"cell_type": "code",
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts.chat import (\n",
" ChatPromptTemplate,\n",
" HumanMessagePromptTemplate,\n",
" SystemMessagePromptTemplate,\n",
")\n",
"from langchain.schema import HumanMessage, SystemMessage\n",
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"anything\"\n",
"\n",
"chat = ChatOpenAI(\n",
" openai_api_base=\"http://0.0.0.0:4000\",\n",
" model = \"gpt-3.5-turbo\",\n",
" temperature=0.1,\n",
" extra_body={\n",
" \"metadata\": {\n",
" \"generation_name\": \"ishaan-generation-langchain-client\",\n",
" \"generation_id\": \"langchain-client-gen-id22\",\n",
" \"trace_id\": \"langchain-client-trace-id22\",\n",
" \"trace_user_id\": \"langchain-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"messages = [\n",
" SystemMessage(\n",
" content=\"You are a helpful assistant that im using to make a test request to.\"\n",
" ),\n",
" HumanMessage(\n",
" content=\"test from litellm. tell me why it's amazing in 1 sentence\"\n",
" ),\n",
"]\n",
"response = chat(messages)\n",
"\n",
"print(response)"
],
"metadata": {
"id": "5MUZgSquaW5t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Curl"
],
"metadata": {
"id": "B9eMgnULbRaz"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"```\n",
"curl -X POST 'http://0.0.0.0:4000/chat/completions' \\\n",
" -H 'Content-Type: application/json' \\\n",
" -d '{\n",
" \"model\": \"gpt-3.5-turbo\",\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what llm are you\"\n",
" }\n",
" ],\n",
" \"metadata\": {\n",
" \"generation_name\": \"ishaan-test-generation\",\n",
" \"generation_id\": \"gen-id22\",\n",
" \"trace_id\": \"trace-id22\",\n",
" \"trace_user_id\": \"user-id2\"\n",
" }\n",
"}'\n",
"```\n",
"\n"
],
"metadata": {
"id": "VWCCk5PFcmhS"
}
},
{
"cell_type": "markdown",
"source": [
"### LlamaIndex"
],
"metadata": {
"id": "drBAm2e1b6xe"
}
},
{
"cell_type": "code",
"source": [
"import os, dotenv\n",
"\n",
"from llama_index.llms import AzureOpenAI\n",
"from llama_index.embeddings import AzureOpenAIEmbedding\n",
"from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
"\n",
"llm = AzureOpenAI(\n",
" engine=\"azure-gpt-3.5\", # model_name on litellm proxy\n",
" temperature=0.0,\n",
" azure_endpoint=\"http://0.0.0.0:4000\", # litellm proxy endpoint\n",
" api_key=\"sk-1234\", # litellm proxy API Key\n",
" api_version=\"2023-07-01-preview\",\n",
")\n",
"\n",
"embed_model = AzureOpenAIEmbedding(\n",
" deployment_name=\"azure-embedding-model\",\n",
" azure_endpoint=\"http://0.0.0.0:4000\",\n",
" api_key=\"sk-1234\",\n",
" api_version=\"2023-07-01-preview\",\n",
")\n",
"\n",
"\n",
"documents = SimpleDirectoryReader(\"llama_index_data\").load_data()\n",
"service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)\n",
"index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n",
"\n",
"query_engine = index.as_query_engine()\n",
"response = query_engine.query(\"What did the author do growing up?\")\n",
"print(response)\n"
],
"metadata": {
"id": "d0bZcv8fb9mL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain JS"
],
"metadata": {
"id": "xypvNdHnb-Yy"
}
},
{
"cell_type": "code",
"source": [
"import { ChatOpenAI } from \"@langchain/openai\";\n",
"\n",
"\n",
"const model = new ChatOpenAI({\n",
" modelName: \"gpt-4\",\n",
" openAIApiKey: \"sk-1234\",\n",
" modelKwargs: {\"metadata\": \"hello world\"} // 👈 PASS Additional params here\n",
"}, {\n",
" basePath: \"http://0.0.0.0:4000\",\n",
"});\n",
"\n",
"const message = await model.invoke(\"Hi there!\");\n",
"\n",
"console.log(message);\n"
],
"metadata": {
"id": "R55mK2vCcBN2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### OpenAI JS"
],
"metadata": {
"id": "nC4bLifCcCiW"
}
},
{
"cell_type": "code",
"source": [
"const { OpenAI } = require('openai');\n",
"\n",
"const openai = new OpenAI({\n",
" apiKey: \"sk-1234\", // This is the default and can be omitted\n",
" baseURL: \"http://0.0.0.0:4000\"\n",
"});\n",
"\n",
"async function main() {\n",
" const chatCompletion = await openai.chat.completions.create({\n",
" messages: [{ role: 'user', content: 'Say this is a test' }],\n",
" model: 'gpt-3.5-turbo',\n",
" }, {\"metadata\": {\n",
" \"generation_name\": \"ishaan-generation-openaijs-client\",\n",
" \"generation_id\": \"openaijs-client-gen-id22\",\n",
" \"trace_id\": \"openaijs-client-trace-id22\",\n",
" \"trace_user_id\": \"openaijs-client-user-id2\"\n",
" }});\n",
"}\n",
"\n",
"main();\n"
],
"metadata": {
"id": "MICH8kIMcFpg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Anthropic SDK"
],
"metadata": {
"id": "D1Q07pEAcGTb"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"from anthropic import Anthropic\n",
"\n",
"client = Anthropic(\n",
" base_url=\"http://localhost:4000\", # proxy endpoint\n",
" api_key=\"sk-s4xN1IiLTCytwtZFJaYQrA\", # litellm proxy virtual key\n",
")\n",
"\n",
"message = client.messages.create(\n",
" max_tokens=1024,\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"Hello, Claude\",\n",
" }\n",
" ],\n",
" model=\"claude-3-opus-20240229\",\n",
")\n",
"print(message.content)"
],
"metadata": {
"id": "qBjFcAvgcI3t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## /embeddings"
],
"metadata": {
"id": "dFAR4AJGcONI"
}
},
{
"cell_type": "markdown",
"source": [
"### OpenAI Python SDK"
],
"metadata": {
"id": "lgNoM281cRzR"
}
},
{
"cell_type": "code",
"source": [
"import openai\n",
"from openai import OpenAI\n",
"\n",
"# set base_url to your proxy server\n",
"# set api_key to send to proxy server\n",
"client = OpenAI(api_key=\"<proxy-api-key>\", base_url=\"http://0.0.0.0:4000\")\n",
"\n",
"response = client.embeddings.create(\n",
" input=[\"hello from litellm\"],\n",
" model=\"text-embedding-ada-002\"\n",
")\n",
"\n",
"print(response)\n"
],
"metadata": {
"id": "NY3DJhPfcQhA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain Embeddings"
],
"metadata": {
"id": "hmbg-DW6cUZs"
}
},
{
"cell_type": "code",
"source": [
"from langchain.embeddings import OpenAIEmbeddings\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"sagemaker-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"SAGEMAKER EMBEDDINGS\")\n",
"print(query_result[:5])\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"bedrock-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"BEDROCK EMBEDDINGS\")\n",
"print(query_result[:5])\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"bedrock-titan-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"TITAN EMBEDDINGS\")\n",
"print(query_result[:5])"
],
"metadata": {
"id": "lX2S8Nl1cWVP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Curl Request"
],
"metadata": {
"id": "oqGbWBCQcYfd"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"```curl\n",
"curl -X POST 'http://0.0.0.0:4000/embeddings' \\\n",
" -H 'Content-Type: application/json' \\\n",
" -d ' {\n",
" \"model\": \"text-embedding-ada-002\",\n",
" \"input\": [\"write a litellm poem\"]\n",
" }'\n",
"```\n",
"\n"
],
"metadata": {
"id": "7rkIMV9LcdwQ"
}
}
]
}


@ -1,10 +1,10 @@
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -21,13 +21,13 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -49,7 +49,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -61,7 +61,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -70,7 +70,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -79,7 +79,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -109,7 +109,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -128,7 +128,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -148,7 +148,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -162,7 +162,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -174,7 +174,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -184,7 +184,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -193,19 +193,19 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -214,7 +214,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -234,7 +234,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -244,7 +244,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -253,7 +253,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -267,31 +267,31 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -305,7 +305,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -330,7 +330,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -339,7 +339,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -360,7 +360,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -369,7 +369,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -378,7 +378,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -388,7 +388,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -409,7 +409,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -422,13 +422,13 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -438,7 +438,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -462,7 +462,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -482,7 +482,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -492,7 +492,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -516,7 +516,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -529,7 +529,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -546,13 +546,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -580,13 +580,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -624,7 +624,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -638,13 +638,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -660,7 +660,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -681,7 +681,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -691,31 +691,31 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -771,7 +771,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -780,7 +780,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -800,7 +800,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -820,7 +820,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -830,7 +830,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -840,7 +840,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -850,7 +850,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -862,13 +862,13 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -877,7 +877,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -898,7 +898,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -919,7 +919,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -936,19 +936,19 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -961,25 +961,25 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -993,7 +993,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10


@ -20,7 +20,7 @@ Call all LLM APIs using the OpenAI format.
Response ID: 52dbbd49-eedb-4c11-8382-3ca7deb1af35 Url: /queue/response/52dbbd49-eedb-4c11-8382-3ca7deb1af35
Time: 3.50 seconds
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -35,7 +35,7 @@ Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. C
Response ID: ae1e2b71-d711-456d-8df0-13ce0709eb04 Url: /queue/response/ae1e2b71-d711-456d-8df0-13ce0709eb04
Time: 5.60 seconds
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10


@ -1,4 +1,4 @@
What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 100+ LLMs Huggingface/Bedrock/TogetherAI/etc. in the OpenAI ChatCompletions & Completions format


@ -18,13 +18,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.1
version: 0.2.2
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: v1.41.8
appVersion: v1.42.7
dependencies:
- name: "postgresql"


@ -1,5 +1,9 @@
# Helm Chart for LiteLLM
> [!IMPORTANT]
> This is community maintained. Please open an issue if you run into a bug.
> We recommend using [Docker or Kubernetes for production deployments](https://docs.litellm.ai/docs/proxy/prod)
## Prerequisites
- Kubernetes 1.21+


@ -1,23 +1,73 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Batches API
# [BETA] Batches API
Covers Batches, Files
## Quick Start
Work through the batch flow end-to-end:
- Create File for Batch Completion
- Create Batch Request
- List Batches
- Retrieve the Specific Batch and File Content
<Tabs>
<TabItem value="proxy" label="LiteLLM PROXY Server">
```bash
$ export OPENAI_API_KEY="sk-..."
$ litellm
# RUNNING on http://0.0.0.0:4000
```
**Create File for Batch Completion**
```shell
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
**Create Batch Request**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"input_file_id": "file-abc123",
"endpoint": "/v1/chat/completions",
"completion_window": "24h"
}'
```
**Retrieve the Specific Batch**
```bash
curl http://localhost:4000/v1/batches/batch_abc123 \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```
**List Batches**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```
</TabItem>
<TabItem value="sdk" label="SDK">
**Create File for Batch Completion**
@ -77,48 +127,15 @@ file_content = await litellm.afile_content(
print("file content = ", file_content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**List Batches**
```bash
$ export OPENAI_API_KEY="sk-..."
$ litellm
# RUNNING on http://0.0.0.0:4000
```
**Create File for Batch Completion**
```shell
curl https://api.openai.com/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
**Create Batch Request**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"input_file_id": "file-abc123",
"endpoint": "/v1/chat/completions",
"completion_window": "24h"
}'
```
**Retrieve the Specific Batch**
```bash
curl http://localhost:4000/v1/batches/batch_abc123 \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```python
list_batches_response = litellm.list_batches(custom_llm_provider="openai", limit=2)
print("list_batches_response=", list_batches_response)
```
</TabItem>
</Tabs>
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/batch)


@ -7,14 +7,14 @@ Don't want to get crazy bills because either while you're calling LLM APIs **or*
:::info
If you want a server to manage user keys, budgets, etc. use our [OpenAI Proxy Server](./proxy/virtual_keys.md)
If you want a server to manage user keys, budgets, etc. use our [LiteLLM Proxy Server](./proxy/virtual_keys.md)
:::
LiteLLM exposes:
* `litellm.max_budget`: a global variable you can use to set the max budget (in USD) across all your litellm calls. If this budget is exceeded, it will raise a BudgetExceededError
* `BudgetManager`: A class to help set budgets per user. BudgetManager creates a dictionary to manage the user budgets, where the key is user and the object is their current cost + model-specific costs.
* `OpenAI Proxy Server`: A server to call 100+ LLMs with an openai-compatible endpoint. Manages user budgets, spend tracking, load balancing etc.
* `LiteLLM Proxy Server`: A server to call 100+ LLMs with an openai-compatible endpoint. Manages user budgets, spend tracking, load balancing etc.
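A rough sketch of the two SDK-level pieces above, `litellm.max_budget` and `BudgetManager` (method names follow the BudgetManager docs and the dollar amounts are placeholders, so treat this as illustrative):
```python
# Rough sketch of the budgeting hooks listed above (numbers are placeholders).
import litellm
from litellm import BudgetManager, completion

# Global cap across all litellm calls: exceeding it raises BudgetExceededError.
litellm.max_budget = 0.05  # USD

# Per-user budgets via BudgetManager.
budget_manager = BudgetManager(project_name="demo_project")
user = "user-1234"
budget_manager.create_budget(total_budget=1.00, user=user)  # $1 budget for this user

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
budget_manager.update_cost(completion_obj=response, user=user)  # record spend
print("current spend for user:", budget_manager.get_current_cost(user=user))
```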
## quick start


@ -48,19 +48,20 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea
|Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ |
|Cohere| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
|Huggingface| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|Openrouter| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ | | | | |
|Openrouter| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ | | | | |
|AI21| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
|VertexAI| ✅ | ✅ | | ✅ | ✅ | | | | | | | | | ✅ | ✅ | | |
|Bedrock| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ (for anthropic) | |
|Bedrock| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ (model dependent) | |
|Sagemaker| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|TogetherAI| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ |
|AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | |
|Petals| ✅ | ✅ | | ✅ | ✅ | | | | | |
|Ollama| ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | | | | ✅ | | |
|Ollama| ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | | | | ✅ | | |✅| | | | | | |
|Databricks| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|ClarifAI| ✅ | ✅ | |✅ | ✅ | | | | | | | | | | |
|Github| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ |✅ (model dependent)|✅ (model dependent)| | |
:::note
By default, LiteLLM raises an exception if the openai param being passed in isn't supported.
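As an illustrative sketch of the two escape hatches mentioned here, `litellm.get_supported_openai_params()` and the documented `litellm.drop_params` toggle (the provider/param pairing below is just an example):
```python
# Illustrative sketch: inspect what a provider accepts, or drop unsupported
# OpenAI params instead of raising an exception.
import litellm
from litellm import completion, get_supported_openai_params

params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
print("bedrock anthropic supports:", params)

litellm.drop_params = True  # drop any OpenAI param the target provider doesn't accept

response = completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",  # assumes AWS creds are configured
    messages=[{"role": "user", "content": "hi"}],
    frequency_penalty=0.2,  # not supported by bedrock/anthropic; dropped instead of raising
)
```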


@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# JSON Mode
# Structured Outputs (JSON Mode)
## Quick Start
@ -61,8 +61,180 @@ params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_prov
assert "response_format" in params
```
## Pass in 'json_schema'
To use Structured Outputs, simply specify
```
response_format: { "type": "json_schema", "json_schema": … , "strict": true }
```
Works for OpenAI models
:::info
Support for passing in a pydantic object to litellm sdk will be [coming soon](https://github.com/BerriAI/litellm/issues/5074#issuecomment-2272355842)
:::
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
# add to env var
os.environ["OPENAI_API_KEY"] = ""
messages = [{"role": "user", "content": "List 5 cookie recipes"}]
resp = completion(
model="gpt-4o-2024-08-06",
messages=messages,
response_format={
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": False
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": False
},
"strict": True
},
}
)
print("Received={}".format(resp))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add openai model to config.yaml
```yaml
model_list:
- model_name: "gpt-4o"
litellm_params:
model: "gpt-4o-2024-08-06"
```
2. Start proxy with config.yaml
```bash
litellm --config /path/to/config.yaml
```
3. Call with OpenAI SDK / Curl!
Just replace the `base_url` in the OpenAI SDK to call the proxy with `json_schema` for OpenAI models.
**OpenAI SDK**
```python
from pydantic import BaseModel
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
completion = client.beta.chat.completions.parse(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a helpful math tutor. Guide the user through the solution step by step."},
{"role": "user", "content": "how can I solve 8x + 7 = -23"}
],
response_format=MathReasoning,
)
math_reasoning = completion.choices[0].message.parsed
```
**Curl**
```bash
curl -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4o",
"messages": [
{
"role": "system",
"content": "You are a helpful math tutor. Guide the user through the solution step by step."
},
{
"role": "user",
"content": "how can I solve 8x + 7 = -23"
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": false
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": false
},
"strict": true
}
}
}'
```
</TabItem>
</Tabs>
## Validate JSON Schema
:::info
Support for doing this in the openai 'json_schema' format will be [coming soon](https://github.com/BerriAI/litellm/issues/5074#issuecomment-2272355842)
:::
For VertexAI models, LiteLLM supports passing the `response_schema` and validating the JSON output.
This works across Gemini (`vertex_ai_beta/`) + Anthropic (`vertex_ai/`) models.
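As an illustrative sketch of schema validation on Vertex AI (the `response_schema`/`enforce_validation` keys and the Gemini model string are assumptions to verify against the current LiteLLM docs):
```python
# Illustrative sketch: ask a Vertex AI Gemini model for JSON and have LiteLLM
# validate it against a schema. Assumes Vertex AI credentials are configured.
from litellm import completion

response_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {"recipe_name": {"type": "string"}},
        "required": ["recipe_name"],
    },
}

resp = completion(
    model="vertex_ai_beta/gemini-1.5-pro",
    messages=[{"role": "user", "content": "List 5 cookie recipes"}],
    response_format={
        "type": "json_object",
        "response_schema": response_schema,
        "enforce_validation": True,  # raise if the output doesn't match the schema
    },
)

print(resp.choices[0].message.content)
```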


@ -270,7 +270,7 @@ response = embedding(
| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` |
## HuggingFace Embedding Models
LiteLLM supports all Feature-Extraction Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction
LiteLLM supports all Feature-Extraction + Sentence Similarity Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction
### Usage
```python
@ -282,6 +282,25 @@ response = embedding(
input=["good morning from litellm"]
)
```
### Usage - Set input_type
LiteLLM infers the input type (feature-extraction or sentence-similarity) by making a GET request to the API base.
Override this by setting `input_type` yourself.
```python
from litellm import embedding
import os
os.environ['HUGGINGFACE_API_KEY'] = ""
response = embedding(
model='huggingface/microsoft/codebert-base',
input=["good morning from litellm", "you are a good bot"],
api_base = "https://p69xlsj6rpno5drq.us-east-1.aws.endpoints.huggingface.cloud",
input_type="sentence-similarity"
)
```
### Usage - Custom API Base
```python
from litellm import embedding


@ -29,8 +29,12 @@ This covers:
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ Set Max Request / File Size on Requests
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests)
- **Spend Tracking**
- **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./proxy/team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./proxy/team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
- **Spend Tracking & Data Exports**
- ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags)
- ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
- ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics**
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)

View file

@ -0,0 +1,313 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [Beta] Fine-tuning API
:::info
This is an Enterprise only endpoint [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Supported Providers
- Azure OpenAI
- OpenAI
- Vertex AI
Add `finetune_settings` and `files_settings` to your litellm config.yaml to use the fine-tuning endpoints.
## Example config.yaml for `finetune_settings` and `files_settings`
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
# For /fine_tuning/jobs endpoints
finetune_settings:
- custom_llm_provider: azure
api_base: https://exampleopenaiendpoint-production.up.railway.app
api_key: os.environ/AZURE_API_KEY
api_version: "2023-03-15-preview"
- custom_llm_provider: openai
api_key: os.environ/OPENAI_API_KEY
- custom_llm_provider: "vertex_ai"
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json"
# for /files endpoints
files_settings:
- custom_llm_provider: azure
api_base: https://exampleopenaiendpoint-production.up.railway.app
api_key: fake-key
api_version: "2023-03-15-preview"
- custom_llm_provider: openai
api_key: os.environ/OPENAI_API_KEY
```
## Create File for fine-tuning
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
from openai import AsyncOpenAI

client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") # base_url is your litellm proxy url
file_name = "openai_batch_completions.jsonl"
response = await client.files.create(
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
file=open(file_name, "rb"),
purpose="fine-tune",
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
    -F purpose="fine-tune" \
-F custom_llm_provider="azure"\
-F file="@mydata.jsonl"
```
</TabItem>
</Tabs>
## Create fine-tuning job
<Tabs>
<TabItem value="azure" label="Azure OpenAI">
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
ft_job = await client.fine_tuning.jobs.create(
model="gpt-35-turbo-1106", # Azure OpenAI model you want to fine-tune
training_file="file-abc123", # file_id from create file response
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"custom_llm_provider": "azure",
"model": "gpt-35-turbo-1106",
"training_file": "file-abc123"
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="Vertex" label="VertexAI">
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
ft_job = await client.fine_tuning.jobs.create(
model="gemini-1.0-pro-002", # Vertex model you want to fine-tune
training_file="gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl", # file_id from create file response
extra_body={"custom_llm_provider": "vertex_ai"}, # tell litellm proxy which provider to use
)
```
</TabItem>
<TabItem value="curl" label="curl (Unified API)">
```shell
curl http://localhost:4000/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"custom_llm_provider": "vertex_ai",
"model": "gemini-1.0-pro-002",
"training_file": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}'
```
</TabItem>
<TabItem value="curl-vtx" label="curl (VertexAI API)">
:::info
Use this to create Fine tuning Jobs in [the Vertex AI API Format](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/tuning#create-tuning)
:::
```shell
curl http://localhost:4000/v1/projects/tuningJobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"baseModel": "gemini-1.0-pro-002",
"supervisedTuningSpec" : {
"training_dataset_uri": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
### Request Body
<Tabs>
<TabItem value="params" label="Supported Params">
* `model`
**Type:** string
**Required:** Yes
The name of the model to fine-tune
* `custom_llm_provider`
**Type:** `Literal["azure", "openai", "vertex_ai"]`
**Required:** Yes
The provider to route the fine-tuning job to. Select one of the [**supported providers**](#supported-providers)
* `training_file`
**Type:** string
**Required:** Yes
The ID of an uploaded file that contains training data.
- See **upload file** for how to upload a file.
- Your dataset must be formatted as a JSONL file.
* `hyperparameters`
**Type:** object
**Required:** No
The hyperparameters used for the fine-tuning job.
> #### Supported `hyperparameters`
> #### batch_size
**Type:** string or integer
**Required:** No
Number of examples in each batch. A larger batch size means that model parameters are updated less frequently, but with lower variance.
> #### learning_rate_multiplier
**Type:** string or number
**Required:** No
Scaling factor for the learning rate. A smaller learning rate may be useful to avoid overfitting.
> #### n_epochs
**Type:** string or integer
**Required:** No
The number of epochs to train the model for. An epoch refers to one full cycle through the training dataset.
* `suffix`
**Type:** string or null
**Required:** No
**Default:** null
A string of up to 18 characters that will be added to your fine-tuned model name.
Example: A `suffix` of "custom-model-name" would produce a model name like `ft:gpt-4o-mini:openai:custom-model-name:7p4lURel`.
* `validation_file`
**Type:** string or null
**Required:** No
The ID of an uploaded file that contains validation data.
- If provided, this data is used to generate validation metrics periodically during fine-tuning.
* `integrations`
**Type:** array or null
**Required:** No
A list of integrations to enable for your fine-tuning job.
* `seed`
**Type:** integer or null
**Required:** No
The seed controls the reproducibility of the job. Passing in the same seed and job parameters should produce the same results, but may differ in rare cases. If a seed is not specified, one will be generated for you.
</TabItem>
<TabItem value="example" label="Example Request Body">
```json
{
"model": "gpt-4o-mini",
"training_file": "file-abcde12345",
"hyperparameters": {
"batch_size": 4,
"learning_rate_multiplier": 0.1,
"n_epochs": 3
},
"suffix": "custom-model-v1",
"validation_file": "file-fghij67890",
"seed": 42
}
```
</TabItem>
</Tabs>
## Cancel fine-tuning job
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
# cancel specific fine tuning job
cancel_ft_job = await client.fine_tuning.jobs.cancel(
fine_tuning_job_id="123", # fine tuning job id
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
)
print("response from cancel ft job={}".format(cancel_ft_job))
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl -X POST http://localhost:4000/v1/fine_tuning/jobs/ftjob-abc123/cancel \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{"custom_llm_provider": "azure"}'
```
</TabItem>
</Tabs>
## List fine-tuning jobs
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
list_ft_jobs = await client.fine_tuning.jobs.list(
extra_query={"custom_llm_provider": "azure"} # tell litellm proxy which provider to use
)
print("list of ft jobs={}".format(list_ft_jobs))
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl -X GET 'http://localhost:4000/v1/fine_tuning/jobs?custom_llm_provider=azure' \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
</TabItem>
</Tabs>
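Putting these endpoints together, here is a rough end-to-end sketch against a running proxy (the proxy URL, key, and file name are placeholders; it simply chains the upload, create, and list calls shown above):

```python
import asyncio
from openai import AsyncOpenAI

# placeholders - point this at your running LiteLLM proxy
client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

async def main():
    # 1. upload the training file (routed to Azure via custom_llm_provider)
    file_obj = await client.files.create(
        file=open("openai_batch_completions.jsonl", "rb"),
        purpose="fine-tune",
        extra_body={"custom_llm_provider": "azure"},
    )

    # 2. create the fine-tuning job with the returned file id
    ft_job = await client.fine_tuning.jobs.create(
        model="gpt-35-turbo-1106",
        training_file=file_obj.id,
        extra_body={"custom_llm_provider": "azure"},
    )
    print("created fine-tuning job: {}".format(ft_job))

    # 3. list fine-tuning jobs for that provider
    ft_jobs = await client.fine_tuning.jobs.list(
        extra_query={"custom_llm_provider": "azure"}
    )
    print("list of ft jobs: {}".format(ft_jobs))

asyncio.run(main())
```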
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/fine-tuning)

View file

@ -10,14 +10,40 @@ https://github.com/BerriAI/litellm
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
- Track spend & set budgets per project [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## How to use LiteLLM
You can use litellm through either:
1. [OpenAI proxy Server](#openai-proxy) - Server to call 100+ LLMs, load balance, cost tracking across projects
1. [LiteLLM Proxy Server](#openai-proxy) - Server to call 100+ LLMs, load balance, cost tracking across projects
2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking
## LiteLLM Python SDK
### When to use LiteLLM Proxy Server
:::tip
Use LiteLLM Proxy Server if you want a **central service to access multiple LLMs**
Typically used by Gen AI Enablement / ML Platform Teams
:::
- LiteLLM Proxy gives you a unified interface to access multiple LLMs (100+ LLMs)
- Track LLM Usage and setup guardrails
- Customize Logging, Guardrails, Caching per project
### When to use LiteLLM Python SDK
:::tip
Use LiteLLM Python SDK if you want to use LiteLLM in your **python code**
Typically used by developers building LLM projects
:::
- LiteLLM SDK gives you a unified interface to access multiple LLMs (100+ LLMs)
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
### Basic usage

View file

@ -0,0 +1,127 @@
import Image from '@theme/IdealImage';
# 🪣 Google Cloud Storage Buckets - Logging LLM Input/Output
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
  callbacks: ["gcs_bucket"] # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
    ]
}
'
```
## Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
### Fields Logged on GCS Buckets
Example payload of a `/chat/completion` request logged on GCS
```json
{
"request_kwargs": {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "This is a test"
}
],
"optional_params": {
"temperature": 0.7,
"max_tokens": 10,
"user": "ishaan-2",
"extra_body": {}
}
},
"response_obj": {
"id": "chatcmpl-bd836a8c-89bc-4abd-bee5-e3f1ebfdb541",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1722868456,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
},
"start_time": "2024-08-05 07:34:16",
"end_time": "2024-08-05 07:34:16"
}
```
## Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai

View file

@ -82,6 +82,47 @@ model_list:
```bash
litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="config-all" label="config - default all Anthropic Model">
Use this if you want to make requests to `claude-3-haiku-20240307`,`claude-3-opus-20240229`,`claude-2.1` without defining them on the config.yaml
#### Required env variables
```
ANTHROPIC_API_KEY=sk-ant****
```
```yaml
model_list:
- model_name: "*"
litellm_params:
model: "*"
```
```bash
litellm --config /path/to/config.yaml
```
Example Request for this config.yaml
**Ensure you use `anthropic/` prefix to route the request to Anthropic API**
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "anthropic/claude-3-haiku-20240307",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="cli" label="cli">

View file

@ -66,8 +66,15 @@ response = litellm.completion(
## Azure OpenAI Chat Completion Models
:::tip
**We support ALL Azure models, just set `model=azure/<your deployment name>` as a prefix when sending litellm requests**
:::
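For example, a minimal sketch, assuming a deployment named `my-gpt-4o-deployment` and the standard Azure env vars set for your resource:

```python
import os
from litellm import completion

os.environ["AZURE_API_KEY"] = ""      # your Azure OpenAI key
os.environ["AZURE_API_BASE"] = ""     # e.g. https://<your-resource>.openai.azure.com/
os.environ["AZURE_API_VERSION"] = ""  # e.g. 2024-02-01

# "my-gpt-4o-deployment" is a placeholder - use your own deployment name
response = completion(
    model="azure/my-gpt-4o-deployment",
    messages=[{"role": "user", "content": "Hello from litellm"}],
)
print(response.choices[0].message.content)
```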
| Model Name | Function Call |
|------------------|----------------------------------------|
| gpt-4o-mini | `completion('azure/<your deployment name>', messages)` |
| gpt-4o | `completion('azure/<your deployment name>', messages)` |
| gpt-4 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0314 | `completion('azure/<your deployment name>', messages)` |

View file

@ -360,6 +360,71 @@ resp = litellm.completion(
print(f"\nResponse: {resp}")
```
## Usage - Bedrock Guardrails
Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html)
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
from litellm import completion
# set env
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="anthropic.claude-v2",
messages=[
{
"content": "where do i buy coffee from? ",
"role": "user",
}
],
max_tokens=10,
guardrailConfig={
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy Server">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="anthropic.claude-v2", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
extra_body={
"guardrailConfig": {
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
},
}
)
print(response)
```
</TabItem>
</Tabs>
## Usage - "Assistant Pre-fill"
If you're using Anthropic's Claude with Bedrock, you can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
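A minimal sketch of the pattern (model ID and AWS credentials are placeholders; the trailing `assistant` message nudges Claude to continue from `{`):

```python
import os
from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

messages = [
    {"role": "user", "content": "How do you say 'Hello' in German? Return only JSON."},
    # last message is from the assistant - Claude continues from here
    {"role": "assistant", "content": "{"},
]

response = completion(model="bedrock/anthropic.claude-v2", messages=messages)
print(response.choices[0].message.content)
```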

View file

@ -1,7 +1,6 @@
# Custom API Server (Custom Format)
LiteLLM allows you to call your custom endpoint in the OpenAI ChatCompletion format
Call your custom torch-serve / internal LLM APIs via LiteLLM
:::info

View file

@ -5,6 +5,11 @@ import TabItem from '@theme/TabItem';
LiteLLM supports all models on Databricks
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
## Usage
@ -185,8 +190,17 @@ response = litellm.embedding(
## Supported Databricks Chat Completion Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
| Model Name | Command |
|----------------------------|------------------------------------------------------------------|
| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` |
| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` |
| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` |
| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` |
| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` |
@ -196,6 +210,13 @@ response = litellm.embedding(
## Supported Databricks Embedding Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
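For example, a minimal embedding sketch (the `DATABRICKS_API_KEY` / `DATABRICKS_API_BASE` env var names follow LiteLLM's usual provider conventions and are assumptions here):

```python
import os
from litellm import embedding

os.environ["DATABRICKS_API_KEY"] = ""   # assumed env var name
os.environ["DATABRICKS_API_BASE"] = ""  # assumed env var name

response = embedding(
    model="databricks/databricks-bge-large-en",
    input=["good morning from litellm"],
)
print(response)
```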
| Model Name | Command |
|----------------------------|------------------------------------------------------------------|
| databricks-bge-large-en | `embedding(model='databricks/databricks-bge-large-en', input=input)` |

View file

@ -1,3 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Gemini - Google AI Studio
## Pre-requisites
@ -17,6 +21,335 @@ response = completion(
)
```
## Supported OpenAI Params
- temperature
- top_p
- max_tokens
- stream
- tools
- tool_choice
- response_format
- n
- stop
[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122)
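For instance, a quick sketch passing a few of these params through `completion` (the values are arbitrary):

```python
import os
from litellm import completion

os.environ["GEMINI_API_KEY"] = ""

response = completion(
    model="gemini/gemini-1.5-pro",
    messages=[{"role": "user", "content": "Write a haiku about the sea."}],
    temperature=0.2,
    top_p=0.9,
    max_tokens=100,
    stop=["\n\n"],
)
print(response.choices[0].message.content)
```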
## Passing Gemini Specific Params
### Response schema
LiteLLM supports sending `response_schema` as a param for Gemini-1.5-Pro on Google AI Studio.
**Response Schema**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response_schema = {
"type": "array",
"items": {
"type": "object",
"properties": {
"recipe_name": {
"type": "string",
},
},
"required": ["recipe_name"],
},
}
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    response_format={"type": "json_object", "response_schema": response_schema} # 👈 KEY CHANGE
)

print(json.loads(response.choices[0].message.content))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
  "model": "gemini-pro",
  "messages": [
    {"role": "user", "content": "List 5 popular cookie recipes."}
  ],
  "response_format": {
    "type": "json_object",
    "response_schema": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "recipe_name": {"type": "string"}
        },
        "required": ["recipe_name"]
      }
    }
  }
}'
```
</TabItem>
</Tabs>
**Validate Schema**
To validate the response_schema, set `enforce_validation: true`.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion, JSONSchemaValidationError
try:
completion(
model="gemini/gemini-1.5-pro",
messages=messages,
response_format={
"type": "json_object",
"response_schema": response_schema,
"enforce_validation": true # 👈 KEY CHANGE
}
)
except JSONSchemaValidationError as e:
print("Raw Response: {}".format(e.raw_response))
raise e
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
  "model": "gemini-pro",
  "messages": [
    {"role": "user", "content": "List 5 popular cookie recipes."}
  ],
  "response_format": {
    "type": "json_object",
    "response_schema": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "recipe_name": {"type": "string"}
        },
        "required": ["recipe_name"]
      }
    },
    "enforce_validation": true
  }
}'
```
</TabItem>
</Tabs>
LiteLLM will validate the response against the schema, and raise a `JSONSchemaValidationError` if the response does not match the schema.
JSONSchemaValidationError inherits from `openai.APIError`
Access the raw response with `e.raw_response`
### GenerationConfig Params
To pass additional GenerationConfig params - e.g. `topK`, just pass it in the request body of the call, and LiteLLM will pass it straight through as a key-value pair in the request body.
[**See Gemini GenerationConfigParams**](https://ai.google.dev/api/generate-content#v1beta.GenerationConfig)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    topK=1 # 👈 KEY CHANGE
)

print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"topK": 1 # 👈 KEY CHANGE
}
'
```
</TabItem>
</Tabs>
## Specifying Safety Settings
In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:
@ -91,6 +424,72 @@ assert isinstance(
```
## JSON Mode
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    response_format={"type": "json_object"} # 👈 KEY CHANGE
)

print(json.loads(response.choices[0].message.content))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"response_format": {"type": "json_object"}
}
'
```
</TabItem>
</Tabs>
# Gemini-Pro-Vision
LiteLLM Supports the following image types passed in `url`
- Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
@ -141,8 +540,13 @@ print(content)
```
## Chat Models
:::tip
**We support ALL Gemini models, just set `model=gemini/<any-model-on-gemini>` as a prefix when sending litellm requests**
:::
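As a quick streaming sketch (any `gemini/` model from the table below works the same way):

```python
import os
from litellm import completion

os.environ["GEMINI_API_KEY"] = ""

response = completion(
    model="gemini/gemini-pro",
    messages=[{"role": "user", "content": "Tell me a short story."}],
    stream=True,  # `stream` is one of the supported OpenAI params listed above
)
for chunk in response:
    print(chunk)
```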
| Model Name | Function Call | Required OS Variables |
|-----------------------|--------------------------------------------------------|--------------------------------|
| gemini-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-latest | `completion('gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro | `completion(model='gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-latest | `completion(model='gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion(model='gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |

View file

@ -0,0 +1,261 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🆕 Github
https://github.com/marketplace/models
:::tip
**We support ALL Github models, just set `model=github/<any-model-on-github>` as a prefix when sending litellm requests**
:::
## API Key
```python
# env variable
os.environ['GITHUB_API_KEY']
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['GITHUB_API_KEY'] = ""
response = completion(
model="github/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['GITHUB_API_KEY'] = ""
response = completion(
model="github/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Usage with LiteLLM Proxy
### 1. Set Github Models on config.yaml
```yaml
model_list:
- model_name: github-llama3-8b-8192 # Model Alias to use for requests
litellm_params:
model: github/llama3-8b-8192
api_key: "os.environ/GITHUB_API_KEY" # ensure you have `GITHUB_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
Make request to litellm proxy
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "github-llama3-8b-8192",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="github-llama3-8b-8192", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "github-llama3-8b-8192",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Github Models Supported!
We support ALL Github models, just set `github/` as a prefix when sending completion requests
| Model Name | Usage |
|--------------------|---------------------------------------------------------|
| llama-3.1-8b-instant | `completion(model="github/llama-3.1-8b-instant", messages)` |
| llama-3.1-70b-versatile | `completion(model="github/llama-3.1-70b-versatile", messages)` |
| llama-3.1-405b-reasoning | `completion(model="github/llama-3.1-405b-reasoning", messages)` |
| llama3-8b-8192 | `completion(model="github/llama3-8b-8192", messages)` |
| llama3-70b-8192 | `completion(model="github/llama3-70b-8192", messages)` |
| llama2-70b-4096 | `completion(model="github/llama2-70b-4096", messages)` |
| mixtral-8x7b-32768 | `completion(model="github/mixtral-8x7b-32768", messages)` |
| gemma-7b-it | `completion(model="github/gemma-7b-it", messages)` |
## Github - Tool / Function Calling Example
```python
# Example dummy function hard coded to return the current weather
import json
def get_current_weather(location, unit="fahrenheit"):
"""Get the current weather in a given location"""
if "tokyo" in location.lower():
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
elif "san francisco" in location.lower():
return json.dumps(
{"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
)
elif "paris" in location.lower():
return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
else:
return json.dumps({"location": location, "temperature": "unknown"})
# Step 1: send the conversation and available functions to the model
messages = [
{
"role": "system",
"content": "You are a function calling LLM that uses the data extracted from get_current_weather to answer questions about the weather in San Francisco.",
},
{
"role": "user",
"content": "What's the weather like in San Francisco?",
},
]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
response = litellm.completion(
model="github/llama3-8b-8192",
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls
# Step 2: check if the model wanted to call a function
if tool_calls:
# Step 3: call the function
# Note: the JSON response may not always be valid; be sure to handle errors
available_functions = {
"get_current_weather": get_current_weather,
}
messages.append(
response_message
) # extend conversation with assistant's reply
print("Response message\n", response_message)
# Step 4: send the info for each function call and function response to the model
for tool_call in tool_calls:
function_name = tool_call.function.name
function_to_call = available_functions[function_name]
function_args = json.loads(tool_call.function.arguments)
function_response = function_to_call(
location=function_args.get("location"),
unit=function_args.get("unit"),
)
messages.append(
{
"tool_call_id": tool_call.id,
"role": "tool",
"name": function_name,
"content": function_response,
}
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model="github/llama3-8b-8192", messages=messages
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Ollama
LiteLLM supports all models from [Ollama](https://github.com/ollama/ollama)
@ -84,6 +87,120 @@ response = completion(
)
```
## Example Usage - Tool Calling
To use ollama tool calling, pass `tools=[{..}]` to `litellm.completion()`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import litellm
## [OPTIONAL] REGISTER MODEL - not all ollama models support function calling, litellm defaults to json mode tool calls if native tool calling not supported.
# litellm.register_model(model_cost={
# "ollama_chat/llama3.1": {
# "supports_function_calling": true
# },
# })
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
}
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="ollama_chat/llama3.1",
messages=messages,
tools=tools
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "llama3.1"
litellm_params:
model: "ollama_chat/llama3.1"
model_info:
supports_function_calling: true
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "llama3.1",
"messages": [
{
"role": "user",
"content": "What'\''s the weather like in Boston today?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto",
"stream": true
}'
```
</TabItem>
</Tabs>
## Using ollama `api/chat`
In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`

View file

@ -166,6 +166,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` |
| gpt-4o-mini-2024-07-18 | `response = completion(model="gpt-4o-mini-2024-07-18", messages=messages)` |
| gpt-4o | `response = completion(model="gpt-4o", messages=messages)` |
| gpt-4o-2024-08-06 | `response = completion(model="gpt-4o-2024-08-06", messages=messages)` |
| gpt-4o-2024-05-13 | `response = completion(model="gpt-4o-2024-05-13", messages=messages)` |
| gpt-4-turbo | `response = completion(model="gpt-4-turbo", messages=messages)` |
| gpt-4-turbo-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |

View file

@ -775,7 +775,6 @@ vertex_ai_location = "your-vertex-location" # can also set this as os.environ["V
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
temperature=0.7,
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
@ -828,6 +827,178 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem>
</Tabs>
## Mistral API
[**Supported OpenAI Params**](https://github.com/BerriAI/litellm/blob/e0f3cd580cb85066f7d36241a03c30aa50a8a31d/litellm/llms/openai.py#L137)
| Model Name | Function Call |
|------------------|--------------------------------------|
| mistral-large@latest | `completion('vertex_ai/mistral-large@latest', messages)` |
| mistral-large@2407 | `completion('vertex_ai/mistral-large@2407', messages)` |
| mistral-nemo@latest | `completion('vertex_ai/mistral-nemo@latest', messages)` |
| codestral@latest | `completion('vertex_ai/codestral@latest', messages)` |
| codestral@2405 | `completion('vertex_ai/codestral@2405', messages)` |
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
model = "mistral-large@2407"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: vertex-mistral
litellm_params:
model: vertex_ai/mistral-large@2407
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: vertex-mistral
litellm_params:
model: vertex_ai/mistral-large@2407
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "vertex-mistral", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
    ]
}'
```
</TabItem>
</Tabs>
### Usage - Codestral FIM
Call Codestral on VertexAI via the OpenAI [`/v1/completions`](https://platform.openai.com/docs/api-reference/completions/create) endpoint for FIM tasks.
Note: You can also call Codestral via `/chat/completion`.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import text_completion
import os
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
# OR run `!gcloud auth print-access-token` in your terminal
model = "codestral@2405"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = text_completion(
model="vertex_ai/" + model,
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
suffix="return True", # optional
temperature=0, # optional
top_p=1, # optional
max_tokens=10, # optional
min_tokens=10, # optional
seed=10, # optional
stop=["return"], # optional
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: vertex-codestral
litellm_params:
model: vertex_ai/codestral@2405
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: vertex-codestral
litellm_params:
model: vertex_ai/codestral@2405
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl -X POST 'http://0.0.0.0:4000/completions' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"model": "vertex-codestral", # 👈 the 'model_name' in config
"prompt": "def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
"suffix":"return True", # optional
"temperature":0, # optional
"top_p":1, # optional
"max_tokens":10, # optional
"min_tokens":10, # optional
"seed":10, # optional
"stop":["return"], # optional
}'
```
</TabItem>
</Tabs>
## Model Garden
| Model Name | Function Call |
|------------------|--------------------------------------|

View file

@ -0,0 +1,191 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🪣 Logging GCS, s3 Buckets
LiteLLM Supports Logging to the following Cloud Buckets
- (Enterprise) ✨ [Google Cloud Storage Buckets](#logging-proxy-inputoutput-to-google-cloud-storage-buckets)
- (Free OSS) [Amazon s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
## Logging Proxy Input/Output to Google Cloud Storage Buckets
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
  callbacks: ["gcs_bucket"] # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
    ]
}
'
```
### Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
### Fields Logged on GCS Buckets
Example payload of a `/chat/completion` request logged on GCS
```json
{
"request_kwargs": {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "This is a test"
}
],
"optional_params": {
"temperature": 0.7,
"max_tokens": 10,
"user": "ishaan-2",
"extra_body": {}
}
},
"response_obj": {
"id": "chatcmpl-bd836a8c-89bc-4abd-bee5-e3f1ebfdb541",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1722868456,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
},
"start_time": "2024-08-05 07:34:16",
"end_time": "2024-08-05 07:34:16"
}
```
### Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to the s3 Bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket

View file

@ -260,6 +260,21 @@ curl --location 'http://0.0.0.0:4000/cache/ping' -H "Authorization: Bearer sk-1
```
## Advanced
### Control Call Types Caching is on for - (`/chat/completion`, `/embeddings`, etc.)
By default, caching is on for all call types. You can control which call types caching is on for by setting `supported_call_types` in `cache_params`
**Cache will only be on for the call types specified in `supported_call_types`**
```yaml
litellm_settings:
cache: True
cache_params:
type: redis
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
```
### Set Cache Params on config.yaml
```yaml
model_list:
@ -280,7 +295,8 @@ litellm_settings:
password: "your_password" # The password for the Redis cache. Required if type is "redis".
# Optional configurations
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
```
### Turn on / off caching per request.
@ -625,11 +641,8 @@ cache_params:
# List of litellm call types to cache for
# Options: "completion", "acompletion", "embedding", "aembedding"
supported_call_types:
- completion
- acompletion
- embedding
- aembedding
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
# Redis cache parameters
host: localhost # Redis server hostname or IP address

View file

@ -60,6 +60,13 @@ model_list:
model_info:
version: 2
# Use this if you want to make requests to `claude-3-haiku-20240307`,`claude-3-opus-20240229`,`claude-2.1` without defining them on the config.yaml
# Default models
# Works for ALL Providers and needs the default provider credentials in .env
- model_name: "*"
litellm_params:
model: "*"
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
drop_params: True
success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env
@ -288,7 +295,7 @@ Dynamically call any model from any given provider without the need to predefine
model_list:
- model_name: "*" # all requests where model not in your config go to this deployment
litellm_params:
model: "openai/*" # passes our validation check that a real provider is given
model: "*" # passes our validation check that a real provider is given
```
2. Start LiteLLM proxy

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';
# Custom Pricing - Sagemaker, etc.
# Custom LLM Pricing - Sagemaker, Azure, etc
Use this to register custom pricing for models.
@ -16,39 +16,9 @@ LiteLLM already has pricing for any model in our [model cost map](https://github
:::
## Quick Start
## Cost Per Second (e.g. Sagemaker)
Register custom pricing for sagemaker completion model.
For cost per second pricing, you **just** need to register `input_cost_per_second`.
```python
# !pip install boto3
import os
from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
def test_completion_sagemaker():
try:
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_second=0.000420,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
```
### Usage with OpenAI Proxy Server
### Usage with LiteLLM Proxy Server
**Step 1: Add pricing to config.yaml**
```yaml
@ -75,38 +45,7 @@ litellm /path/to/config.yaml
## Cost Per Token (e.g. Azure)
```python
import os
from litellm import completion, completion_cost
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_azure_model():
try:
print("testing azure custom pricing")
# azure call
response = completion(
model = "azure/<your_deployment_name>",
            messages = [{ "content": "Hello, how are you?","role": "user"}],
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
### Usage with OpenAI Proxy Server
### Usage with LiteLLM Proxy Server
```yaml
model_list:

View file

@ -35,6 +35,22 @@ $ litellm --detailed_debug
os.environ["LITELLM_LOG"] = "DEBUG"
```
### Debug Logs
Run the proxy with `--detailed_debug` to view detailed debug logs
```shell
litellm --config /path/to/config.yaml --detailed_debug
```
When making requests, you should see the POST request LiteLLM sends to the LLM in the terminal output
```shell
POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Bearer sk-qnWGUIW9****************************************' \
-d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "this is a test request, write a short poem"}]}'
```
## JSON LOGS
Set `JSON_LOGS="True"` in your env:

View file

@ -246,7 +246,7 @@ helm install lite-helm ./litellm-helm
kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
```
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.
</TabItem>
@ -254,6 +254,15 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
**That's it! That's the quick start to deploy litellm**
## Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl
:::info
💡 Go here 👉 [to make your first LLM API Request](user_keys)
LiteLLM is compatible with several SDKs - including the OpenAI SDK, Anthropic SDK, Mistral SDK, LlamaIndex, Langchain (JS, Python)
:::
## Options to deploy LiteLLM
| Docs | When to Use |
@ -292,7 +301,7 @@ docker run \
--config /app/config.yaml --detailed_debug
```
Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
Your LiteLLM Proxy Server is now running on `http://0.0.0.0:4000`.
</TabItem>
<TabItem value="kubernetes-deploy" label="Kubernetes">
@ -390,7 +399,7 @@ kubectl apply -f /path/to/service.yaml
kubectl port-forward service/litellm-service 4000:4000
```
Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
Your LiteLLM Proxy Server is now running on `http://0.0.0.0:4000`.
</TabItem>
@ -432,7 +441,7 @@ kubectl \
4000:4000
```
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.
If you need to set your litellm proxy config.yaml, you can find this in [values.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm-helm/values.yaml)
@ -477,7 +486,7 @@ helm install lite-helm ./litellm-helm
kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
```
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.
</TabItem>
</Tabs>
@ -549,6 +558,39 @@ docker run --name litellm-proxy \
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
```
## LiteLLM without Internet Connection
By default `prisma generate` downloads [prisma's engine binaries](https://www.prisma.io/docs/orm/reference/environment-variables-reference#custom-engine-file-locations). This can cause errors when running without an internet connection.
Use this dockerfile to build an image which pre-generates the prisma binaries.
```Dockerfile
# Use the provided base image
FROM ghcr.io/berriai/litellm:main-latest
# Set the working directory to /app
WORKDIR /app
### [👇 KEY STEP] ###
# Install Prisma CLI and generate Prisma client
RUN pip install prisma
RUN prisma generate
### FIN ####
# Expose the necessary port
EXPOSE 4000
# Override the CMD instruction with your desired command and arguments
# WARNING: FOR PROD DO NOT USE `--detailed_debug` it slows down response times, instead use the following CMD
# CMD ["--port", "4000", "--config", "config.yaml"]
# Define the command to run your app
ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]
```
## Advanced Deployment Settings
### 1. Customization of the server root path (custom Proxy base url)
@ -563,24 +605,87 @@ In a Kubernetes deployment, it's possible to utilize a shared DNS to host multip
Customize the root path to eliminate the need for employing multiple DNS configurations during deployment.
Step 1.
👉 Set `SERVER_ROOT_PATH` in your .env and this will be set as your server root path
```
export SERVER_ROOT_PATH="/api/v1"
```
**Step 1. Run Proxy with `SERVER_ROOT_PATH` set in your env **
**Step 2** (If you want the Proxy Admin UI to work with your root path you need to use this dockerfile)
- Use the dockerfile below (it uses litellm as a base image)
- 👉 Set `UI_BASE_PATH=$SERVER_ROOT_PATH/ui` in the Dockerfile, example `UI_BASE_PATH=/api/v1/ui`
Dockerfile
```shell
docker run --name litellm-proxy \
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e SERVER_ROOT_PATH="/api/v1" \
-p 4000:4000 \
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
# Use the provided base image
FROM ghcr.io/berriai/litellm:main-latest
# Set the working directory to /app
WORKDIR /app
# Install Node.js and npm (adjust version as needed)
RUN apt-get update && apt-get install -y nodejs npm
# Copy the UI source into the container
COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard
# Set an environment variable for UI_BASE_PATH
# This can be overridden at build time
# set UI_BASE_PATH to "<your server root path>/ui"
# 👇👇 Enter your UI_BASE_PATH here
ENV UI_BASE_PATH="/api/v1/ui"
# Build the UI with the specified UI_BASE_PATH
WORKDIR /app/ui/litellm-dashboard
RUN npm install
RUN UI_BASE_PATH=$UI_BASE_PATH npm run build
# Create the destination directory
RUN mkdir -p /app/litellm/proxy/_experimental/out
# Move the built files to the appropriate location
# Assuming the build output is in ./out directory
RUN rm -rf /app/litellm/proxy/_experimental/out/* && \
mv ./out/* /app/litellm/proxy/_experimental/out/
# Switch back to the main app directory
WORKDIR /app
# Make sure your entrypoint.sh is executable
RUN chmod +x entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
# Override the CMD instruction with your desired command and arguments
# only use --detailed_debug for debugging
CMD ["--port", "4000", "--config", "config.yaml"]
```
**Step 3** build this Dockerfile
```shell
docker build -f Dockerfile -t litellm-prod-build . --progress=plain
```
**Step 4. Run Proxy with `SERVER_ROOT_PATH` set in your env **
```shell
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-p 4000:4000 \
-e LITELLM_LOG="DEBUG"\
-e SERVER_ROOT_PATH="/api/v1"\
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e LITELLM_MASTER_KEY="sk-1234"\
litellm-prod-build \
--config /app/config.yaml
```
After running the proxy you can access it on `http://0.0.0.0:4000/api/v1/` (since we set `SERVER_ROOT_PATH="/api/v1"`)
**Step 2. Verify Running on correct path**
**Step 5. Verify Running on correct path**
<Image img={require('../../img/custom_root_path.png')} />
@ -785,3 +890,31 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
Your LiteLLM container should be running now on the defined port e.g. `4000`.
### IAM-based Auth for RDS DB
1. Set AWS env var
```bash
export AWS_WEB_IDENTITY_TOKEN='/path/to/token'
export AWS_ROLE_NAME='arn:aws:iam::123456789012:role/MyRole'
export AWS_SESSION_NAME='MySession'
```
[**See all Auth options**](https://github.com/BerriAI/litellm/blob/089a4f279ad61b7b3e213d8039fb9b75204a7abc/litellm/proxy/auth/rds_iam_token.py#L165)
2. Add RDS credentials to env
```bash
export DATABASE_USER="db-user"
export DATABASE_PORT="5432"
export DATABASE_HOST="database-1-instance-1.cs1ksmwz2xt3.us-west-2.rds.amazonaws.com"
export DATABASE_NAME="database-1-instance-1"
```
3. Run proxy with iam+rds
```bash
litellm --config /path/to/config.yaml --iam_token_db_auth
```

View file

@ -21,10 +21,14 @@ Features:
- ✅ IP address-based access control lists
- ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ Set Max Request / File Size on Requests
- ✅ [Set Max Request Size / File Size on Requests](#set-max-request--response-size-on-litellm-proxy)
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests)
- **Enterprise Spend Tracking Features**
- **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
- **Spend Tracking & Data Exports**
- ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags)
- ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
- ✅ [`/spend/report` API endpoint](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics**
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
@ -1288,3 +1292,52 @@ How it works?
**Note:** Setting an environment variable within a Python script using os.environ will not make that variable accessible via SSH sessions or any other new processes that are started independently of the Python script. Environment variables set this way only affect the current process and its child processes.
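For example, a minimal sketch of this behaviour (hypothetical variable name):
```python
import os
import subprocess

# Visible to this process and to any child processes it spawns ...
os.environ["MY_LLM_SETTING"] = "value"
subprocess.run(
    ["python3", "-c", "import os; print(os.environ.get('MY_LLM_SETTING'))"]
)  # prints "value"

# ... but NOT to a separate SSH session or any independently started process,
# which will see MY_LLM_SETTING as unset.
```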
## Set Max Request / Response Size on LiteLLM Proxy
Use this if you want to set a maximum request / response size for your proxy server. If a request exceeds the configured size, it gets rejected and a Slack alert is triggered.
#### Usage
**Step 1.** Set `max_request_size_mb` and `max_response_size_mb`
For this example we set a very low limit on `max_request_size_mb` and expect the request to get rejected
:::info
In production we recommend setting a `max_request_size_mb` / `max_response_size_mb` around `32 MB`
:::
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
# Security controls
max_request_size_mb: 0.000000001 # 👈 Key Change - Max Request Size in MB. Set this very low for testing
max_response_size_mb: 100 # 👈 Key Change - Max Response Size in MB
```
**Step 2.** Test it with `/chat/completions` request
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
**Expected Response from request**
We expect this to fail since the request size is over `max_request_size_mb`
```shell
{"error":{"message":"Request size is too large. Request size is 0.0001125335693359375 MB. Max size is 1e-09 MB","type":"bad_request_error","param":"content-length","code":400}}
```

View file

@ -8,7 +8,6 @@ Log Proxy input, output, and exceptions using:
- Langsmith
- DataDog
- DynamoDB
- s3 Bucket
- etc.
import Image from '@theme/IdealImage';
@ -714,6 +713,23 @@ Search for Trace=`80e1afed08e019fc1110464cfa66635c` on your OTEL Collector
<Image img={require('../../img/otel_parent.png')} />
### Forwarding `Traceparent HTTP Header` to LLM APIs
Use this if you want to forward the traceparent headers to your self-hosted LLMs, like vLLM
Set `forward_traceparent_to_llm_provider: True` in your `config.yaml`. This will forward the `traceparent` header to your LLM API
:::warning
Only use this for self-hosted LLMs; it can cause Bedrock and Vertex AI calls to fail
:::
```yaml
litellm_settings:
forward_traceparent_to_llm_provider: True
```
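With this enabled, a client request carrying a W3C `traceparent` header would be passed through to the upstream API. Sketch below — the model name and span id are placeholders, and the trace id is reused from the OTEL example above:
```shell
curl http://0.0.0.0:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -H "traceparent: 00-80e1afed08e019fc1110464cfa66635c-7a085853722dc6d2-01" \
  -d '{"model": "my-vllm-model", "messages": [{"role": "user", "content": "hi"}]}'
```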
## Custom Callback Class [Async]
Use this when you want to run custom callbacks in `python`
@ -1362,66 +1378,6 @@ Expected output on Datadog
<Image img={require('../../img/dd_small1.png')} />
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to the s3 Bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is the AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket
## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set

View file

@ -35,6 +35,7 @@ general_settings:
Authorization: "bearer os.environ/COHERE_API_KEY" # (Optional) Auth Header to forward to your Endpoint
content-type: application/json # (Optional) Extra Headers to pass to this endpoint
accept: application/json
forward_headers: True # (Optional) Forward all headers from the incoming request to the target endpoint
```
**Step 2** Start Proxy Server in detailed_debug mode
@ -220,6 +221,7 @@ general_settings:
* `LANGFUSE_PUBLIC_KEY` *string*: Your Langfuse account public key - only set this when forwarding to Langfuse.
* `LANGFUSE_SECRET_KEY` *string*: Your Langfuse account secret key - only set this when forwarding to Langfuse.
* `<your-custom-header>` *string*: Pass any custom header key/value pair
* `forward_headers` *Optional(boolean)*: If true, all headers from the incoming request will be forwarded to the target endpoint. Default is `False`.
## Custom Chat Endpoints (Anthropic/Bedrock/Vertex)

View file

@ -84,6 +84,20 @@ Set `export LITELLM_MODE="PRODUCTION"`
This disables the load_dotenv() functionality, which would otherwise automatically load your environment credentials from the local `.env`.
## 5. Set LiteLLM Salt Key
If you plan on using the DB, set a salt key for encrypting/decrypting variables in the DB.
Do not change this after adding a model. It is used to encrypt / decrypt your LLM API Key credentials.
We recommend using the https://1password.com/password-generator/ password generator to get a random hash for your litellm salt key.
```bash
export LITELLM_SALT_KEY="sk-1234"
```
[**See Code**](https://github.com/BerriAI/litellm/blob/036a6821d588bd36d170713dcf5a72791a694178/litellm/proxy/common_utils/encrypt_decrypt_utils.py#L15)
## Extras
### Expected Performance in Production

View file

@ -13,7 +13,7 @@ LiteLLM Supports the following methods for detecting prompt injection attacks
Use this if you want to reject /chat, /completions, /embeddings calls that have prompt injection attacks
LiteLLM uses [LakerAI API](https://platform.lakera.ai/) to detect if a request has a prompt injection attack
LiteLLM uses [LakeraAI API](https://platform.lakera.ai/) to detect if a request has a prompt injection attack
### Usage

View file

@ -255,6 +255,12 @@ litellm --config your_config.yaml
## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
:::info
LiteLLM is compatible with several SDKs - including the OpenAI SDK, Anthropic SDK, Mistral SDK, LlamaIndex, and Langchain (JS, Python)
[More examples here](user_keys)
:::
<Tabs>
<TabItem value="Curl" label="Curl Request">
@ -382,6 +388,34 @@ print(response)
```
</TabItem>
<TabItem value="anthropic-py" label="Anthropic Python SDK">
```python
import os
from anthropic import Anthropic
client = Anthropic(
base_url="http://localhost:4000", # proxy endpoint
api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
```
</TabItem>
</Tabs>
[**More Info**](./configs.md)
@ -396,165 +430,6 @@ print(response)
- POST `/key/generate` - generate a key to access the proxy
## Using with OpenAI compatible projects
Set `base_url` to the LiteLLM Proxy server
<Tabs>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="librechat" label="LibreChat">
#### Start the LiteLLM proxy
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
```shell
git clone https://github.com/danny-avila/LibreChat.git
```
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key).
```env
OPENAI_API_KEY=sk-1234
```
#### 4. Run LibreChat:
```shell
docker compose up
```
</TabItem>
<TabItem value="continue-dev" label="ContinueDev">
Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart).
In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model.
```python
default=OpenAI(
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:4000" # your proxy server url
),
```
Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial.
</TabItem>
<TabItem value="aider" label="Aider">
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
```shell
pip install pyautogen
```
```python
from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
]
response = oai.Completion.create(config_list=config_list, prompt="Hi")
print(response) # works fine
llm_config={
"config_list": config_list,
}
assistant = AssistantAgent("assistant", llm_config=llm_config)
user_proxy = UserProxyAgent("user_proxy")
user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list)
```
Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial.
</TabItem>
<TabItem value="guidance" label="guidance">
A guidance language for controlling large language models.
https://github.com/guidance-ai/guidance
**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it.
**Fix**: Start your proxy using the `--drop_params` flag
```shell
litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params
```
```python
import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
You are a helpful and terse assistant.
{{~/system}}
{{#user~}}
I want a response to the following question:
{{query}}
Name 3 world-class experts (past or present) who would be great at answering this?
Don't answer the question yet.
{{~/user}}
{{#assistant~}}
{{gen 'expert_names' temperature=0 max_tokens=300}}
{{~/assistant}}
''', llm=gpt4)
result = experts(query='How can I be more productive?')
print(result)
```
</TabItem>
</Tabs>
## Debugging Proxy
Events that occur during normal operation

View file

@ -50,7 +50,7 @@ Detailed information about [routing strategies can be found here](../routing)
$ litellm --config /path/to/config.yaml
```
### Test - Load Balancing
### Test - Simple Call
Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo
@ -138,6 +138,27 @@ print(response)
</Tabs>
### Test - Loadbalancing
In this request, the following will occur:
1. A rate limit exception will be raised
2. LiteLLM proxy will retry the request on the model group (default is 3).
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "Hi there!"}
],
"mock_testing_rate_limit_error": true
}'
```
[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535)
### Test - Client Side Fallbacks
In this request the following will occur:
1. The request to `model="zephyr-beta"` will fail

View file

@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 👥📊 Team Based Logging
# 👥📊 [BETA] Team Based Logging
Allow each team to use their own Langfuse Project / custom callbacks
@ -11,7 +11,14 @@ Allow each team to use their own Langfuse Project / custom callbacks
Team 1 -> Logs to Langfuse Project 1
Team 2 -> Logs to Langfuse Project 2
Team 3 -> Disabled Logging (for GDPR compliance)
```
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Set Callbacks Per Team

View file

@ -1,7 +1,43 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl
# 💡 Migrating from OpenAI (Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl)
LiteLLM Proxy is **OpenAI-Compatible**, and supports:
* /chat/completions
* /embeddings
* /completions
* /image/generations
* /moderations
* /audio/transcriptions
* /audio/speech
* [Assistants API endpoints](https://docs.litellm.ai/docs/assistants)
* [Batches API endpoints](https://docs.litellm.ai/docs/batches)
* [Fine-Tuning API endpoints](https://docs.litellm.ai/docs/fine_tuning)
LiteLLM Proxy is **Azure OpenAI-compatible**:
* /chat/completions
* /completions
* /embeddings
LiteLLM Proxy is **Anthropic-compatible**:
* /messages
LiteLLM Proxy is **Vertex AI compatible**:
- [Supports ALL Vertex Endpoints](../vertex_ai)
This doc covers:
* /chat/completion
* /embedding
These are **selected examples**. LiteLLM Proxy is **OpenAI-Compatible**, it works with any project that calls OpenAI. Just change the `base_url`, `api_key` and `model`.
To pass provider-specific args, [go here](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)
To drop unsupported params (E.g. frequency_penalty for bedrock with librechat), [go here](https://docs.litellm.ai/docs/completion/drop_params#openai-proxy-usage)
:::info
@ -234,6 +270,54 @@ main();
```
</TabItem>
<TabItem value="anthropic-py" label="Anthropic Python SDK">
```python
import os
from anthropic import Anthropic
client = Anthropic(
base_url="http://localhost:4000", # proxy endpoint
api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
```
</TabItem>
<TabItem value="mistral-py" label="Mistral Python SDK">
```python
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
client = MistralClient(api_key="sk-1234", endpoint="http://0.0.0.0:4000")
chat_response = client.chat(
model="mistral-small-latest",
messages=[
{"role": "user", "content": "this is a test request, write a short poem"}
],
)
print(chat_response.choices[0].message.content)
```
</TabItem>
<TabItem value="instructor" label="Instructor">
```python
@ -566,6 +650,166 @@ curl --location 'http://0.0.0.0:4000/moderations' \
```
## Using with OpenAI compatible projects
Set `base_url` to the LiteLLM Proxy server
<Tabs>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="librechat" label="LibreChat">
#### Start the LiteLLM proxy
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
```shell
git clone https://github.com/danny-avila/LibreChat.git
```
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key).
```env
OPENAI_API_KEY=sk-1234
```
#### 4. Run LibreChat:
```shell
docker compose up
```
</TabItem>
<TabItem value="continue-dev" label="ContinueDev">
Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart).
In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model.
```python
default=OpenAI(
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:4000" # your proxy server url
),
```
Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial.
</TabItem>
<TabItem value="aider" label="Aider">
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
```shell
pip install pyautogen
```
```python
from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
]
response = oai.Completion.create(config_list=config_list, prompt="Hi")
print(response) # works fine
llm_config={
"config_list": config_list,
}
assistant = AssistantAgent("assistant", llm_config=llm_config)
user_proxy = UserProxyAgent("user_proxy")
user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list)
```
Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial.
</TabItem>
<TabItem value="guidance" label="guidance">
A guidance language for controlling large language models.
https://github.com/guidance-ai/guidance
**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it.
**Fix**: Start your proxy using the `--drop_params` flag
```shell
litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params
```
```python
import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
You are a helpful and terse assistant.
{{~/system}}
{{#user~}}
I want a response to the following question:
{{query}}
Name 3 world-class experts (past or present) who would be great at answering this?
Don't answer the question yet.
{{~/user}}
{{#assistant~}}
{{gen 'expert_names' temperature=0 max_tokens=300}}
{{~/assistant}}
''', llm=gpt4)
result = experts(query='How can I be more productive?')
print(result)
```
</TabItem>
</Tabs>
## Advanced
### (BETA) Batch Completions - pass multiple models

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [OLD PROXY 👉 [NEW proxy here](./simple_proxy)] Local OpenAI Proxy Server
# [OLD PROXY 👉 [NEW proxy here](./simple_proxy)] Local LiteLLM Proxy Server
A fast, and lightweight OpenAI-compatible server to call 100+ LLM APIs.

View file

@ -14,7 +14,7 @@ In production, litellm supports using Redis as a way to track cooldown server an
:::info
If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./proxy/load_balancing.md)
If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./proxy/load_balancing.md)
:::
@ -1637,7 +1637,7 @@ response = router.completion(
## Deploy Router
If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)
If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)
## Init Params for the litellm.Router

View file

@ -147,6 +147,9 @@ model_list:
mock_response: "hello world!"
api_key: my-good-key
litellm_settings:
request_timeout: 600 # 👈 Will keep retrying until timeout occurs
router_settings:
redis_host: os.environ/REDIS_HOST
redis_password: os.environ/REDIS_PASSWORD

View file

@ -0,0 +1,65 @@
# Custom Pricing - SageMaker, Azure, etc
Register custom pricing for a sagemaker completion model.
For cost per second pricing, you **just** need to register `input_cost_per_second`.
```python
# !pip install boto3
import os

from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
def test_completion_sagemaker():
try:
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_second=0.000420,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
```
## Cost Per Token (e.g. Azure)
```python
import os

from litellm import completion, completion_cost
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_azure_model():
try:
print("testing azure custom pricing")
# azure call
response = completion(
model = "azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}],
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
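On the proxy, the same pricing keys can generally be registered per model in `config.yaml` under `litellm_params`. A sketch — model names and cost values below are placeholders, adjust to your deployment:
```yaml
model_list:
  - model_name: azure-model
    litellm_params:
      model: azure/<your_deployment_name>
      api_base: os.environ/AZURE_API_BASE
      api_key: os.environ/AZURE_API_KEY
      api_version: os.environ/AZURE_API_VERSION
      input_cost_per_token: 0.000042   # 👈 placeholder - your negotiated rate
      output_cost_per_token: 0.000084  # 👈 placeholder
  - model_name: sagemaker-completion-model
    litellm_params:
      model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
      input_cost_per_second: 0.000420  # 👈 cost-per-second pricing
```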

View file

@ -61,7 +61,7 @@ litellm --config /path/to/config.yaml
```
## Azure Key Vault
<!--
### Quick Start
```python
@ -88,9 +88,9 @@ import litellm
litellm.secret_manager = client
litellm.get_secret("your-test-key")
```
``` -->
### Usage with OpenAI Proxy Server
### Usage with LiteLLM Proxy Server
1. Install Proxy dependencies
```bash
@ -129,7 +129,7 @@ litellm --config /path/to/config.yaml
Use encrypted keys from Google KMS on the proxy
### Usage with OpenAI Proxy Server
### Usage with LiteLLM Proxy Server
## Step 1. Add keys to env
```
@ -160,29 +160,6 @@ $ litellm --test
[Quick Test Proxy](./proxy/quick_start#using-litellm-proxy---curl-request-openai-package-langchain-langchain-js)
## Infisical Secret Manager
Integrates with [Infisical's Secret Manager](https://infisical.com/) for secure storage and retrieval of API keys and sensitive data.
### Usage
liteLLM manages reading in your LLM API secrets/env variables from Infisical for you
```python
import litellm
from infisical import InfisicalClient
litellm.secret_manager = InfisicalClient(token="your-token")
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What's the weather like today?"},
]
response = litellm.completion(model="gpt-3.5-turbo", messages=messages)
print(response)
```
<!--
## .env Files
If no secret manager client is specified, Litellm automatically uses the `.env` file to manage sensitive data.
If no secret manager client is specified, Litellm automatically uses the `.env` file to manage sensitive data. -->

View file

@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💥 OpenAI Proxy Server
# 💥 LiteLLM Proxy Server
LiteLLM Server manages:

View file

@ -0,0 +1,93 @@
# [BETA] Vertex AI Endpoints
## Supported API Endpoints
- Gemini API
- Embeddings API
- Imagen API
- Code Completion API
- Batch prediction API
- Tuning API
- CountTokens API
## Quick Start Usage
#### 1. Set `default_vertex_config` on your `config.yaml`
Add the following credentials to your litellm config.yaml to use the Vertex AI endpoints.
```yaml
default_vertex_config:
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
#### 2. Start litellm proxy
```shell
litellm --config /path/to/config.yaml
```
#### 3. Test it
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:countTokens \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"content": "gm"}]}'
```
## Usage Examples
### Gemini API (Generate Content)
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:generateContent \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
```
### Embeddings API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:predict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"content": "gm"}]}'
```
### Imagen API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/imagen-3.0-generate-001:predict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"prompt": "make an otter"}], "parameters": {"sampleCount": 1}}'
```
### Count Tokens API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:countTokens \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
```
### Tuning API
Create Fine Tuning Job
```shell
curl http://localhost:4000/vertex-ai/tuningJobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"baseModel": "gemini-1.0-pro-002",
"supervisedTuningSpec" : {
"training_dataset_uri": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}
}'
```

View file

@ -28,6 +28,24 @@ const config = {
},
plugins: [
[
require.resolve("@getcanary/docusaurus-pagefind"),
{
indexOnly: true,
styles: {
"--canary-color-primary-c": 0.1,
"--canary-color-primary-h": 270,
},
pagefind: {
ranking: {
pageLength: 0.9,
termFrequency: 1.0,
termSimilarity: 1.0,
termSaturation: 1.5,
}
}
},
],
[
'@docusaurus/plugin-ideal-image',
{
@ -117,6 +135,11 @@ const config = {
label: '🚀 Hosted',
to: "docs/hosted"
},
{
href: 'https://models.litellm.ai/',
label: '💸 LLM Model Cost Map',
position: 'right',
},
{
href: 'https://github.com/BerriAI/litellm',
label: 'GitHub',

Binary file not shown.

After

Width:  |  Height:  |  Size: 301 KiB

File diff suppressed because it is too large Load diff

View file

@ -18,13 +18,14 @@
"@docusaurus/plugin-google-gtag": "^2.4.1",
"@docusaurus/plugin-ideal-image": "^2.4.1",
"@docusaurus/preset-classic": "2.4.1",
"@getcanary/docusaurus-pagefind": "^0.0.12",
"@getcanary/web": "^0.0.55",
"@mdx-js/react": "^1.6.22",
"clsx": "^1.2.1",
"docusaurus": "^1.14.7",
"docusaurus-lunr-search": "^2.4.1",
"prism-react-renderer": "^1.3.5",
"react": "^18.1.0",
"react-dom": "^18.1.0",
"react": "^17.0.2",
"react-dom": "^17.0.2",
"sharp": "^0.32.6",
"uuid": "^9.0.1"
},

View file

@ -20,11 +20,11 @@ const sidebars = {
{ type: "doc", id: "index" }, // NEW
{
type: "category",
label: "💥 OpenAI Proxy Server",
label: "💥 LiteLLM Proxy Server",
link: {
type: "generated-index",
title: "💥 OpenAI Proxy Server",
description: `Proxy Server to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`,
title: "💥 LiteLLM Proxy Server",
description: `OpenAI Proxy Server to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`,
slug: "/simple_proxy",
},
items: [
@ -42,12 +42,21 @@ const sidebars = {
"proxy/configs",
"proxy/reliability",
"proxy/cost_tracking",
"proxy/custom_pricing",
"proxy/self_serve",
"proxy/virtual_keys",
{
type: "category",
label: "🪢 Logging",
items: ["proxy/logging", "proxy/streaming_logging"],
items: ["proxy/logging", "proxy/bucket", "proxy/streaming_logging"],
},
{
type: "category",
label: "Secret Manager - storing LLM API Keys",
items: [
"secret",
"oidc"
]
},
"proxy/team_logging",
"proxy/guardrails",
@ -83,49 +92,7 @@ const sidebars = {
},
{
type: "category",
label: "Completion()",
link: {
type: "generated-index",
title: "Completion()",
description: "Details on the completion() function",
slug: "/completion",
},
items: [
"completion/input",
"completion/provider_specific_params",
"completion/json_mode",
"completion/drop_params",
"completion/prompt_formatting",
"completion/output",
"exception_mapping",
"completion/stream",
"completion/message_trimming",
"completion/function_call",
"completion/vision",
"completion/model_alias",
"completion/batching",
"completion/mock_requests",
"completion/reliable_completions",
],
},
{
type: "category",
label: "Embedding(), Image Generation(), Assistants(), Moderation(), Audio Transcriptions(), TTS(), Batches()",
items: [
"embedding/supported_embedding",
"embedding/async_embedding",
"embedding/moderation",
"image_generation",
"audio_transcription",
"text_to_speech",
"assistants",
"batches",
"anthropic_completion"
],
},
{
type: "category",
label: "Supported Models & Providers",
label: "💯 Supported Models & Providers",
link: {
type: "generated-index",
title: "Providers",
@ -160,6 +127,7 @@ const sidebars = {
"providers/perplexity",
"providers/friendliai",
"providers/groq",
"providers/github",
"providers/deepseek",
"providers/fireworks_ai",
"providers/clarifai",
@ -181,20 +149,68 @@ const sidebars = {
],
},
"proxy/custom_pricing",
"routing",
"scheduler",
"set_keys",
"budget_manager",
{
type: "category",
label: "Secret Manager",
label: "Chat Completions (litellm.completion)",
link: {
type: "generated-index",
title: "Chat Completions",
description: "Details on the completion() function",
slug: "/completion",
},
items: [
"secret",
"oidc"
]
"completion/input",
"completion/provider_specific_params",
"completion/json_mode",
"completion/drop_params",
"completion/prompt_formatting",
"completion/output",
"exception_mapping",
"completion/stream",
"completion/message_trimming",
"completion/function_call",
"completion/vision",
"completion/model_alias",
"completion/batching",
"completion/mock_requests",
"completion/reliable_completions",
],
},
{
type: "category",
label: "Supported Endpoints - /images, /audio/speech, /assistants etc",
items: [
"embedding/supported_embedding",
"embedding/async_embedding",
"embedding/moderation",
"image_generation",
"audio_transcription",
"text_to_speech",
"assistants",
"batches",
"fine_tuning",
"anthropic_completion",
"vertex_ai"
],
},
{
type: "category",
label: "🚅 LiteLLM Python SDK",
items: [
"routing",
"scheduler",
"set_keys",
"completion/token_usage",
"sdk_custom_pricing",
"budget_manager",
"caching/all_caches",
{
type: "category",
label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"],
},
],
},
"completion/token_usage",
"load_test",
{
type: "category",
@ -202,6 +218,7 @@ const sidebars = {
items: [
"observability/langfuse_integration",
"observability/logfire_integration",
"observability/gcs_bucket_integration",
"observability/langsmith_integration",
"observability/arize_integration",
"debugging/local_debugging",
@ -224,14 +241,12 @@ const sidebars = {
`observability/telemetry`,
],
},
"caching/all_caches",
{
type: "category",
label: "Tutorials",
items: [
'tutorials/azure_openai',
'tutorials/instructor',
'tutorials/oobabooga',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
@ -243,11 +258,6 @@ const sidebars = {
"tutorials/model_fallbacks",
],
},
{
type: "category",
label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"],
},
{
type: "category",
label: "Extras",

View file

@ -10,7 +10,7 @@ https://github.com/BerriAI/litellm
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
- Track spend & set budgets per project [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## Basic usage

View file

@ -31,3 +31,47 @@ response = asyncio.run(test_get_response())
print(response)
```
## Streaming Token Usage
Supported across all providers. Works the same as openai.
`stream_options={"include_usage": True}`
If set, an additional chunk will be streamed before the `data: [DONE]` message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
### SDK
```python
from litellm import completion
import os
os.environ["OPENAI_API_KEY"] = ""
messages = [{"role": "user", "content": "Hey, how's it going?"}]

response = completion(model="gpt-3.5-turbo", messages=messages, stream=True, stream_options={"include_usage": True})
for chunk in response:
print(chunk['choices'][0]['delta'])
```
### PROXY
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $OPENAI_API_KEY" \
-d '{
"model": "gpt-4o",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Hello!"
}
],
"stream": true,
"stream_options": {"include_usage": true}
}'
```
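With `"stream_options": {"include_usage": true}`, the final chunk before `data: [DONE]` carries the aggregate token counts and an empty `choices` array, roughly of this shape (illustrative values):
```shell
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","model":"gpt-4o","choices":[],"usage":{"prompt_tokens":20,"completion_tokens":15,"total_tokens":35}}

data: [DONE]
```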

View file

@ -0,0 +1,95 @@
import React from "react";
import SearchBar from "@theme-original/SearchBar";
import useDocusaurusContext from "@docusaurus/useDocusaurusContext";
import { usePluginData } from "@docusaurus/useGlobalData";
export default function SearchBarWrapper(props) {
const { siteConfig } = useDocusaurusContext();
const { options } = usePluginData("docusaurus-plugin-pagefind-canary");
const [path, setPath] = React.useState("");
const [loaded, setLoaded] = React.useState(false);
React.useEffect(() => {
setPath(`${siteConfig.baseUrl}pagefind/pagefind.js`);
}, [siteConfig]);
React.useEffect(() => {
Promise.all([
import("@getcanary/web/components/canary-root"),
import("@getcanary/web/components/canary-provider-pagefind"),
import("@getcanary/web/components/canary-modal"),
import("@getcanary/web/components/canary-trigger-logo"),
import("@getcanary/web/components/canary-content"),
import("@getcanary/web/components/canary-search"),
import("@getcanary/web/components/canary-search-input"),
import("@getcanary/web/components/canary-search-results-group"),
import("@getcanary/web/components/canary-footer"),
import("@getcanary/web/components/canary-callout-calendly"),
import("@getcanary/web/components/canary-callout-discord"),
])
.then(() => setLoaded(true))
.catch(console.error);
}, []);
return (
<div
style={{
display: "flex",
flexDirection: "row",
alignItems: "center",
gap: "6px",
}}
>
{!loaded || !path ? (
<button
style={{
fontSize: "2rem",
backgroundColor: "transparent",
border: "none",
outline: "none",
padding: "0",
marginRight: "6px",
}}
>
🐤
</button>
) : (
<canary-root framework="docusaurus">
<canary-provider-pagefind
options={JSON.stringify({ ...options, path })}
>
<canary-modal>
<canary-trigger-logo slot="trigger"></canary-trigger-logo>
<canary-content slot="content">
<canary-search slot="search">
<canary-search-input slot="input"></canary-search-input>
<canary-search-results-group
slot="results"
groups="SDK:*;Proxy:/docs/(simple_proxy|proxy/.*)"
></canary-search-results-group>
<canary-callout-discord
slot="callout"
message="👋 Looking for help?"
url="https://discord.com/invite/wuPM9dRgDw"
keywords="discord,help,support,community"
></canary-callout-discord>
<canary-callout-calendly
slot="callout"
message="🚅 Interested in enterprise features?"
keywords="sso,enterprise,security,audit"
url="https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat"
></canary-callout-calendly>
</canary-search>
<canary-footer slot="footer"></canary-footer>
</canary-content>
</canary-modal>
</canary-provider-pagefind>
</canary-root>
)}
<SearchBar {...props} />
</div>
);
}

File diff suppressed because it is too large Load diff

View file

@ -138,11 +138,24 @@ class _ENTERPRISE_lakeraAI_Moderation(CustomLogger):
return
text = ""
if "messages" in data and isinstance(data["messages"], list):
enabled_roles = litellm.guardrail_name_config_map[
"prompt_injection"
].enabled_roles
prompt_injection_obj: Optional[GuardrailItem] = (
litellm.guardrail_name_config_map.get("prompt_injection")
)
if prompt_injection_obj is not None:
enabled_roles = prompt_injection_obj.enabled_roles
else:
enabled_roles = None
if enabled_roles is None:
enabled_roles = default_roles
stringified_roles: List[str] = []
if enabled_roles is not None: # convert to list of str
for role in enabled_roles:
if isinstance(role, Role):
stringified_roles.append(role.value)
elif isinstance(role, str):
stringified_roles.append(role)
lakera_input_dict: Dict = {
role: None for role in INPUT_POSITIONING_MAP.keys()
}
@ -150,7 +163,7 @@ class _ENTERPRISE_lakeraAI_Moderation(CustomLogger):
tool_call_messages: List = []
for message in data["messages"]:
role = message.get("role")
if role in enabled_roles:
if role in stringified_roles:
if "tool_calls" in message:
tool_call_messages = [
*tool_call_messages,

View file

@ -2,8 +2,8 @@ apiVersion: v1
entries:
litellm-helm:
- apiVersion: v2
appVersion: v1.41.8
created: "2024-07-10T00:59:11.1889+08:00"
appVersion: v1.42.7
created: "2024-08-01T12:25:58.808699+08:00"
dependencies:
- condition: db.deployStandalone
name: postgresql
@ -14,31 +14,12 @@ entries:
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=18.0.0'
description: Call all LLM APIs using the OpenAI format
digest: eeff5e4e6cebb4c977cb7359c1ec6c773c66982f6aa39dbed94a674890144a43
digest: b1de8fa444a37410e223a3d1bd3cc2120f3f22204005fcb61e701c0c7db95d86
name: litellm-helm
type: application
urls:
- https://berriai.github.io/litellm/litellm-helm-0.2.1.tgz
version: 0.2.1
- apiVersion: v2
appVersion: v1.35.38
created: "2024-05-06T10:22:24.384392-07:00"
dependencies:
- condition: db.deployStandalone
name: postgresql
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=13.3.0'
- condition: redis.enabled
name: redis
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=18.0.0'
description: Call all LLM APIs using the OpenAI format
digest: 60f0cfe9e7c1087437cb35f6fb7c43c3ab2be557b6d3aec8295381eb0dfa760f
name: litellm-helm
type: application
urls:
- litellm-helm-0.2.0.tgz
version: 0.2.0
- https://berriai.github.io/litellm/litellm-helm-0.2.2.tgz
version: 0.2.2
postgresql:
- annotations:
category: Database
@ -52,7 +33,7 @@ entries:
licenses: Apache-2.0
apiVersion: v2
appVersion: 16.2.0
created: "2024-07-10T00:59:11.191731+08:00"
created: "2024-08-01T12:25:58.812033+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
@ -98,7 +79,7 @@ entries:
licenses: Apache-2.0
apiVersion: v2
appVersion: 7.2.4
created: "2024-07-10T00:59:11.195667+08:00"
created: "2024-08-01T12:25:58.816784+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
@ -124,4 +105,4 @@ entries:
urls:
- https://berriai.github.io/litellm/charts/redis-18.19.1.tgz
version: 18.19.1
generated: "2024-07-10T00:59:11.179952+08:00"
generated: "2024-08-01T12:25:58.800261+08:00"

BIN
litellm-helm-0.2.2.tgz Normal file

Binary file not shown.

View file

@ -46,6 +46,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
"galileo",
"braintrust",
"arize",
"gcs_bucket",
]
_known_custom_logger_compatible_callbacks: List = list(
get_args(_custom_logger_compatible_callbacks_literal)
@ -145,6 +146,9 @@ return_response_headers: bool = (
)
##################
logging: bool = True
enable_caching_on_provider_specific_optional_params: bool = (
False # feature-flag for caching on optional params - e.g. 'top_k'
)
caching: bool = (
False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
@ -165,6 +169,7 @@ budget_duration: Optional[str] = (
default_soft_budget: float = (
50.0 # by default all litellm proxy keys have a soft budget of 50.0
)
forward_traceparent_to_llm_provider: bool = False
_openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
_openai_completion_params = [
"functions",
@ -266,7 +271,7 @@ default_fallbacks: Optional[List] = None
fallbacks: Optional[List] = None
context_window_fallbacks: Optional[List] = None
content_policy_fallbacks: Optional[List] = None
allowed_fails: int = 0
allowed_fails: int = 3
num_retries_per_request: Optional[int] = (
None # for the request overall (incl. fallbacks + model retries)
)
@ -358,6 +363,7 @@ vertex_code_text_models: List = []
vertex_embedding_models: List = []
vertex_anthropic_models: List = []
vertex_llama3_models: List = []
vertex_mistral_models: List = []
ai21_models: List = []
nlp_cloud_models: List = []
aleph_alpha_models: List = []
@ -403,6 +409,9 @@ for key, value in model_cost.items():
elif value.get("litellm_provider") == "vertex_ai-llama_models":
key = key.replace("vertex_ai/", "")
vertex_llama3_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-mistral_models":
key = key.replace("vertex_ai/", "")
vertex_mistral_models.append(key)
elif value.get("litellm_provider") == "ai21":
ai21_models.append(key)
elif value.get("litellm_provider") == "nlp_cloud":
@ -452,6 +461,7 @@ openai_compatible_providers: List = [
"empower",
"friendliai",
"azure_ai",
"github",
]
@ -692,6 +702,7 @@ provider_list: List = [
"predibase",
"databricks",
"empower",
"github",
"custom", # custom apis
]
@ -809,9 +820,19 @@ from .utils import (
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
TextCompletionResponse,
get_provider_fields,
)
ALL_LITELLM_RESPONSE_TYPES = [
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
TextCompletionResponse,
]
from .types.utils import ImageObject
from .llms.custom_llm import CustomLLM
from .llms.huggingface_restapi import HuggingfaceConfig
@ -833,7 +854,7 @@ from .llms.petals import PetalsConfig
from .llms.vertex_httpx import VertexGeminiConfig, GoogleAIStudioGeminiConfig
from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
from .llms.vertex_ai_llama import VertexAILlama3Config
from .llms.vertex_ai_partner import VertexAILlama3Config
from .llms.sagemaker import SagemakerConfig
from .llms.ollama import OllamaConfig
from .llms.ollama_chat import OllamaChatConfig
@ -902,6 +923,7 @@ from .proxy.proxy_cli import run_server
from .router import Router
from .assistants.main import *
from .batches.main import *
from .fine_tuning.main import *
from .files.main import *
from .scheduler import *
from .cost_calculator import response_cost_calculator, cost_per_token

View file

@ -56,6 +56,7 @@ class ServiceLogging(CustomLogger):
parent_otel_span: Optional[Span] = None,
start_time: Optional[Union[datetime, float]] = None,
end_time: Optional[Union[datetime, float]] = None,
event_metadata: Optional[dict] = None,
):
"""
- For counting if the redis, postgres call is successful
@ -84,6 +85,7 @@ class ServiceLogging(CustomLogger):
parent_otel_span=parent_otel_span,
start_time=start_time,
end_time=end_time,
event_metadata=event_metadata,
)
async def async_service_failure_hook(
@ -95,6 +97,7 @@ class ServiceLogging(CustomLogger):
parent_otel_span: Optional[Span] = None,
start_time: Optional[Union[datetime, float]] = None,
end_time: Optional[Union[float, datetime]] = None,
event_metadata: Optional[dict] = None,
):
"""
- For counting if the redis, postgres call is unsuccessful
@ -125,12 +128,16 @@ class ServiceLogging(CustomLogger):
from litellm.proxy.proxy_server import open_telemetry_logger
if parent_otel_span is not None and open_telemetry_logger is not None:
if not isinstance(error, str):
error = str(error)
if open_telemetry_logger is not None:
await open_telemetry_logger.async_service_failure_hook(
payload=payload,
parent_otel_span=parent_otel_span,
start_time=start_time,
end_time=end_time,
event_metadata=event_metadata,
error=error,
)
async def async_post_call_failure_hook(

View file

@ -4,7 +4,7 @@ import json
import os
import traceback
import uuid
from typing import Literal, Optional
from typing import Any, Literal, Optional
import dotenv
import httpx
@ -13,7 +13,12 @@ from pydantic import BaseModel
import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
from litellm.types.llms.anthropic import (
AnthropicMessagesRequest,
AnthropicResponse,
ContentBlockDelta,
)
from litellm.types.utils import AdapterCompletionStreamWrapper
class AnthropicAdapter(CustomLogger):
@ -43,8 +48,150 @@ class AnthropicAdapter(CustomLogger):
response=response
)
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
return super().translate_completion_output_params_streaming()
def translate_completion_output_params_streaming(
self, completion_stream: Any
) -> AdapterCompletionStreamWrapper | None:
return AnthropicStreamWrapper(completion_stream=completion_stream)
anthropic_adapter = AnthropicAdapter()
class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
"""
- first chunk return 'message_start'
- content block must be started and stopped
- finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.
"""
sent_first_chunk: bool = False
sent_content_block_start: bool = False
sent_content_block_finish: bool = False
sent_last_message: bool = False
holding_chunk: Optional[Any] = None
def __next__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except Exception as e:
verbose_logger.error(
"Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
)
async def __anext__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
async for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopAsyncIteration

View file

@ -20,10 +20,8 @@ import httpx
import litellm
from litellm import client
from litellm.utils import supports_httpx_timeout
from ..llms.openai import OpenAIBatchesAPI, OpenAIFilesAPI
from ..types.llms.openai import (
from litellm.llms.openai import OpenAIBatchesAPI, OpenAIFilesAPI
from litellm.types.llms.openai import (
Batch,
CancelBatchRequest,
CreateBatchRequest,
@ -34,7 +32,8 @@ from ..types.llms.openai import (
HttpxBinaryResponseContent,
RetrieveBatchRequest,
)
from ..types.router import *
from litellm.types.router import GenericLiteLLMParams
from litellm.utils import supports_httpx_timeout
####### ENVIRONMENT VARIABLES ###################
openai_batches_instance = OpenAIBatchesAPI()
@ -314,17 +313,135 @@ def retrieve_batch(
raise e
def cancel_batch():
async def alist_batches(
after: Optional[str] = None,
limit: Optional[int] = None,
custom_llm_provider: Literal["openai"] = "openai",
metadata: Optional[Dict[str, str]] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Batch:
"""
Async: List your organization's batches.
"""
try:
loop = asyncio.get_event_loop()
kwargs["alist_batches"] = True
# Use a partial function to pass your keyword arguments
func = partial(
list_batches,
after,
limit,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def list_batches(
after: Optional[str] = None,
limit: Optional[int] = None,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
):
"""
Lists batches
List your organization's batches.
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("alist_batches", False) is True
response = openai_batches_instance.list_batches(
_is_async=_is_async,
after=after,
limit=limit,
api_base=api_base,
api_key=api_key,
organization=organization,
timeout=timeout,
max_retries=optional_params.max_retries,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
pass
def list_batch():
def cancel_batch():
pass
async def acancel_batch():
pass
async def alist_batch():
pass

View file

@ -10,6 +10,7 @@
import ast
import asyncio
import hashlib
import io
import json
import logging
import time
@ -21,7 +22,9 @@ from openai._models import BaseModel as OpenAIObject
import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
from litellm.types.utils import all_litellm_params
def print_verbose(print_statement):
@ -33,16 +36,6 @@ def print_verbose(print_statement):
pass
def _get_parent_otel_span_from_kwargs(kwargs: Optional[dict] = None):
try:
if kwargs is None:
return None
_metadata = kwargs.get("metadata") or {}
return _metadata.get("litellm_parent_otel_span")
except:
return None
class BaseCache:
def set_cache(self, key, value, **kwargs):
raise NotImplementedError
@ -1701,6 +1694,8 @@ class Cache:
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
]
]
] = [
@ -1710,6 +1705,8 @@ class Cache:
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
],
# s3 Bucket, boto3 configuration
s3_bucket_name: Optional[str] = None,
@ -1843,6 +1840,7 @@ class Cache:
"seed",
"tools",
"tool_choice",
"stream",
]
embedding_only_kwargs = [
"input",
@ -1856,9 +1854,9 @@ class Cache:
combined_kwargs = (
completion_kwargs + embedding_only_kwargs + transcription_only_kwargs
)
for param in combined_kwargs:
# ignore litellm params here
if param in kwargs:
litellm_param_kwargs = all_litellm_params
for param in kwargs:
if param in combined_kwargs:
# check if param == model and model_group is passed in, then override model with model_group
if param == "model":
model_group = None
@ -1888,21 +1886,33 @@ class Cache:
caching_group or model_group or kwargs[param]
) # use caching_group, if set then model_group if it exists, else use kwargs["model"]
elif param == "file":
metadata_file_name = kwargs.get("metadata", {}).get(
"file_name", None
file = kwargs.get("file")
metadata = kwargs.get("metadata", {})
litellm_params = kwargs.get("litellm_params", {})
# get checksum of file content
param_value = (
metadata.get("file_checksum")
or getattr(file, "name", None)
or metadata.get("file_name")
or litellm_params.get("file_name")
)
litellm_params_file_name = kwargs.get("litellm_params", {}).get(
"file_name", None
)
if metadata_file_name is not None:
param_value = metadata_file_name
elif litellm_params_file_name is not None:
param_value = litellm_params_file_name
else:
if kwargs[param] is None:
continue # ignore None params
param_value = kwargs[param]
cache_key += f"{str(param)}: {str(param_value)}"
elif (
param not in litellm_param_kwargs
): # check if user passed in optional param - e.g. top_k
if (
litellm.enable_caching_on_provider_specific_optional_params is True
): # feature flagged for now
if kwargs[param] is None:
continue # ignore None params
param_value = kwargs[param]
cache_key += f"{str(param)}: {str(param_value)}"
print_verbose(f"\nCreated cache key: {cache_key}")
# Use hashlib to create a sha256 hash of the cache key
hash_object = hashlib.sha256(cache_key.encode())
@ -2107,9 +2117,7 @@ class Cache:
try:
cache_list = []
for idx, i in enumerate(kwargs["input"]):
preset_cache_key = litellm.cache.get_cache_key(
*args, **{**kwargs, "input": i}
)
preset_cache_key = self.get_cache_key(*args, **{**kwargs, "input": i})
kwargs["cache_key"] = preset_cache_key
embedding_response = result.data[idx]
cache_key, cached_data, kwargs = self._add_cache_logic(
@ -2244,6 +2252,8 @@ def enable_cache(
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
]
]
] = [
@ -2253,6 +2263,8 @@ def enable_cache(
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
],
**kwargs,
):
@ -2309,6 +2321,8 @@ def update_cache(
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
]
]
] = [
@ -2318,6 +2332,8 @@ def update_cache(
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
],
**kwargs,
):
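A short configuration sketch tying the new cache options together (the model name and top_k value are illustrative; get_cache_key is called directly, so no API key or network call is needed):

import litellm
from litellm.caching import Cache

# text_completion / atext_completion are now valid supported_call_types
litellm.cache = Cache(
    type="local",
    supported_call_types=["text_completion", "atext_completion"],
)

# Feature flag from the diff above: fold provider-specific optional params
# (e.g. top_k) into the cache key instead of ignoring them
litellm.enable_caching_on_provider_specific_optional_params = True

# Identical kwargs (including top_k) now map to the same cache key
key_a = litellm.cache.get_cache_key(model="command-r", prompt="hello", top_k=10)
key_b = litellm.cache.get_cache_key(model="command-r", prompt="hello", top_k=10)
assert key_a == key_b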

View file

@ -106,7 +106,6 @@ def cost_per_token(
Returns:
tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
"""
args = locals()
if model is None:
raise Exception("Invalid arg. Model cannot be none.")
## CUSTOM PRICING ##
@ -117,6 +116,7 @@ def cost_per_token(
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
if response_cost is not None:
return response_cost[0], response_cost[1]
@ -495,9 +495,9 @@ def completion_cost(
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
total_time = getattr(completion_response, "_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} "
)
model = model or completion_response.get(
"model", None
@ -509,7 +509,7 @@ def completion_cost(
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
"custom_llm_provider", custom_llm_provider or ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
@ -659,9 +659,7 @@ def completion_cost(
call_type=call_type,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
@ -732,14 +730,21 @@ def response_cost_calculator(
)
return response_cost
except litellm.NotFoundError as e:
print_verbose(
verbose_logger.debug( # debug since it can be spammy in logs, for calls
f"Model={model} for LLM Provider={custom_llm_provider} not found in completion cost map."
)
return None
except Exception as e:
verbose_logger.warning(
"litellm.cost_calculator.py::response_cost_calculator - Returning None. Exception occurred - {}/n{}".format(
str(e), traceback.format_exc()
if litellm.suppress_debug_info: # allow cli tools to suppress this information.
verbose_logger.debug(
"litellm.cost_calculator.py::response_cost_calculator - Returning None. Exception occurred - {}/n{}".format(
str(e), traceback.format_exc()
)
)
else:
verbose_logger.warning(
"litellm.cost_calculator.py::response_cost_calculator - Returning None. Exception occurred - {}/n{}".format(
str(e), traceback.format_exc()
)
)
)
return None
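A small offline sketch of the cost helpers touched above (the token counts are made up; nothing is sent over the network):

import litellm
from litellm import cost_per_token

# Per-token pricing for a model in litellm's cost map;
# returns (prompt_cost_usd, completion_cost_usd)
prompt_usd, completion_usd = cost_per_token(
    model="gpt-3.5-turbo",
    prompt_tokens=1000,
    completion_tokens=200,
)
print(f"prompt: ${prompt_usd:.6f}, completion: ${completion_usd:.6f}")

# Silence the "model not found in cost map" warnings for CLI-style tools,
# per the suppress_debug_info branch added above
litellm.suppress_debug_info = True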

View file

@ -122,7 +122,7 @@ class BadRequestError(openai.BadRequestError): # type: ignore
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
response = response or httpx.Response(
response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
@ -199,8 +199,12 @@ class Timeout(openai.APITimeoutError): # type: ignore
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
headers: Optional[dict] = None,
):
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
request = httpx.Request(
method="POST",
url="https://api.openai.com/v1",
)
super().__init__(
request=request
) # Call the base class constructor with the parameters it needs
@ -211,6 +215,7 @@ class Timeout(openai.APITimeoutError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.headers = headers
# custom function to convert to str
def __str__(self):
@ -287,16 +292,13 @@ class RateLimitError(openai.RateLimitError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=429,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
self.response = httpx.Response(
status_code=429,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
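Since these mapped exceptions now always synthesize their own httpx.Response, callers can rely on .status_code and .response being populated. A minimal handling sketch (the model and message are placeholders and OPENAI_API_KEY is assumed to be set):

import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",  # placeholder
        messages=[{"role": "user", "content": "hi"}],
    )
except litellm.RateLimitError as e:
    # .response is always a synthesized 429 httpx.Response after this change
    print(e.status_code, e.llm_provider, e.response.status_code)
except litellm.APIConnectionError as e:
    # request now defaults to a synthesized httpx.Request when none is passed
    print(e.status_code, e.message)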
@ -334,7 +336,7 @@ class ContextWindowExceededError(BadRequestError): # type: ignore
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.response = response or httpx.Response(status_code=400, request=request)
self.response = httpx.Response(status_code=400, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
@ -377,7 +379,7 @@ class RejectedRequestError(BadRequestError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.request_data = request_data
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
response = httpx.Response(status_code=500, request=request)
response = httpx.Response(status_code=400, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
@ -419,7 +421,7 @@ class ContentPolicyViolationError(BadRequestError): # type: ignore
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.response = response or httpx.Response(status_code=500, request=request)
self.response = httpx.Response(status_code=400, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
@ -463,16 +465,13 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
@ -512,16 +511,13 @@ class InternalServerError(openai.InternalServerError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
@ -547,7 +543,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore
class APIError(openai.APIError): # type: ignore
def __init__(
self,
status_code,
status_code: int,
message,
llm_provider,
model,
@ -591,7 +587,7 @@ class APIConnectionError(openai.APIConnectionError): # type: ignore
message,
llm_provider,
model,
request: httpx.Request,
request: Optional[httpx.Request] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -601,9 +597,10 @@ class APIConnectionError(openai.APIConnectionError): # type: ignore
self.model = model
self.status_code = 500
self.litellm_debug_info = litellm_debug_info
self.request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.max_retries = max_retries
self.num_retries = num_retries
super().__init__(message=self.message, request=request)
super().__init__(message=self.message, request=self.request)
def __str__(self):
_message = self.message
@ -757,7 +754,7 @@ class MockException(openai.APIError):
# used for testing
def __init__(
self,
status_code,
status_code: int,
message,
llm_provider,
model,

View file

@ -14,7 +14,8 @@ from typing import Any, Coroutine, Dict, Literal, Optional, Union
import httpx
import litellm
from litellm import client
from litellm import client, get_secret
from litellm.llms.files_apis.azure import AzureOpenAIFilesAPI
from litellm.llms.openai import FileDeleted, FileObject, OpenAIFilesAPI
from litellm.types.llms.openai import (
Batch,
@ -28,12 +29,13 @@ from litellm.utils import supports_httpx_timeout
####### ENVIRONMENT VARIABLES ###################
openai_files_instance = OpenAIFilesAPI()
azure_files_instance = AzureOpenAIFilesAPI()
#################################################
async def afile_retrieve(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -73,7 +75,7 @@ async def afile_retrieve(
def file_retrieve(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -156,7 +158,7 @@ def file_retrieve(
# Delete file
async def afile_delete(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -196,7 +198,7 @@ async def afile_delete(
def file_delete(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -208,6 +210,22 @@ def file_delete(
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
@ -229,26 +247,6 @@ def file_delete(
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
response = openai_files_instance.delete_file(
file_id=file_id,
_is_async=_is_async,
@ -258,6 +256,38 @@ def file_delete(
max_retries=optional_params.max_retries,
organization=organization,
)
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
response = azure_files_instance.delete_file(
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
api_version=api_version,
timeout=timeout,
max_retries=optional_params.max_retries,
file_id=file_id,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
@ -278,7 +308,7 @@ def file_delete(
# List files
async def afile_list(
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
purpose: Optional[str] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
@ -318,7 +348,7 @@ async def afile_list(
def file_list(
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
purpose: Optional[str] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
@ -402,7 +432,7 @@ def file_list(
async def acreate_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -444,7 +474,7 @@ async def acreate_file(
def create_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -455,7 +485,31 @@ def create_file(
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
_is_async = kwargs.pop("acreate_file", False) is True
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_create_file_request = CreateFileRequest(
file=file,
purpose=purpose,
extra_headers=extra_headers,
extra_body=extra_body,
)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
@ -477,32 +531,6 @@ def create_file(
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_create_file_request = CreateFileRequest(
file=file,
purpose=purpose,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("acreate_file", False) is True
response = openai_files_instance.create_file(
_is_async=_is_async,
@ -513,6 +541,38 @@ def create_file(
organization=organization,
create_file_data=_create_file_request,
)
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
response = azure_files_instance.create_file(
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
api_version=api_version,
timeout=timeout,
max_retries=optional_params.max_retries,
create_file_data=_create_file_request,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
@ -533,7 +593,7 @@ def create_file(
async def afile_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -573,7 +633,7 @@ async def afile_content(
def file_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,

593
litellm/fine_tuning/main.py Normal file
View file

@ -0,0 +1,593 @@
"""
Main File for Fine Tuning API implementation
https://platform.openai.com/docs/api-reference/fine-tuning
- fine_tuning.jobs.create()
- fine_tuning.jobs.list()
- client.fine_tuning.jobs.list_events()
"""
import asyncio
import contextvars
import os
from functools import partial
from typing import Any, Coroutine, Dict, Literal, Optional, Union
import httpx
import litellm
from litellm import get_secret
from litellm._logging import verbose_logger
from litellm.llms.fine_tuning_apis.azure import AzureOpenAIFineTuningAPI
from litellm.llms.fine_tuning_apis.openai import (
FineTuningJob,
FineTuningJobCreate,
OpenAIFineTuningAPI,
)
from litellm.llms.fine_tuning_apis.vertex_ai import VertexFineTuningAPI
from litellm.types.llms.openai import Hyperparameters
from litellm.types.router import *
from litellm.utils import supports_httpx_timeout
####### ENVIRONMENT VARIABLES ###################
openai_fine_tuning_apis_instance = OpenAIFineTuningAPI()
azure_fine_tuning_apis_instance = AzureOpenAIFineTuningAPI()
vertex_fine_tuning_apis_instance = VertexFineTuningAPI()
#################################################
async def acreate_fine_tuning_job(
model: str,
training_file: str,
hyperparameters: Optional[Hyperparameters] = {}, # type: ignore
suffix: Optional[str] = None,
validation_file: Optional[str] = None,
integrations: Optional[List[str]] = None,
seed: Optional[int] = None,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FineTuningJob:
"""
Async: Creates and executes a batch from an uploaded file of request
"""
verbose_logger.debug(
"inside acreate_fine_tuning_job model=%s and kwargs=%s", model, kwargs
)
try:
loop = asyncio.get_event_loop()
kwargs["acreate_fine_tuning_job"] = True
# Use a partial function to pass your keyword arguments
func = partial(
create_fine_tuning_job,
model,
training_file,
hyperparameters,
suffix,
validation_file,
integrations,
seed,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def create_fine_tuning_job(
model: str,
training_file: str,
hyperparameters: Optional[Hyperparameters] = {}, # type: ignore
suffix: Optional[str] = None,
validation_file: Optional[str] = None,
integrations: Optional[List[str]] = None,
seed: Optional[int] = None,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[FineTuningJob, Coroutine[Any, Any, FineTuningJob]]:
"""
Creates a fine-tuning job which begins the process of creating a new model from a given dataset.
Response includes details of the enqueued job including job status and the name of the fine-tuned models once complete
"""
try:
_is_async = kwargs.pop("acreate_fine_tuning_job", False) is True
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
# OpenAI
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
create_fine_tuning_job_data = FineTuningJobCreate(
model=model,
training_file=training_file,
hyperparameters=hyperparameters,
suffix=suffix,
validation_file=validation_file,
integrations=integrations,
seed=seed,
)
create_fine_tuning_job_data_dict = create_fine_tuning_job_data.model_dump(
exclude_none=True
)
response = openai_fine_tuning_apis_instance.create_fine_tuning_job(
api_base=api_base,
api_key=api_key,
organization=organization,
create_fine_tuning_job_data=create_fine_tuning_job_data_dict,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
# Azure OpenAI
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
create_fine_tuning_job_data = FineTuningJobCreate(
model=model,
training_file=training_file,
hyperparameters=hyperparameters,
suffix=suffix,
validation_file=validation_file,
integrations=integrations,
seed=seed,
)
create_fine_tuning_job_data_dict = create_fine_tuning_job_data.model_dump(
exclude_none=True
)
response = azure_fine_tuning_apis_instance.create_fine_tuning_job(
api_base=api_base,
api_key=api_key,
api_version=api_version,
create_fine_tuning_job_data=create_fine_tuning_job_data_dict,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
elif custom_llm_provider == "vertex_ai":
api_base = optional_params.api_base or ""
vertex_ai_project = (
optional_params.vertex_project
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.vertex_location
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
vertex_credentials = optional_params.vertex_credentials or get_secret(
"VERTEXAI_CREDENTIALS"
)
create_fine_tuning_job_data = FineTuningJobCreate(
model=model,
training_file=training_file,
hyperparameters=hyperparameters,
suffix=suffix,
validation_file=validation_file,
integrations=integrations,
seed=seed,
)
response = vertex_fine_tuning_apis_instance.create_fine_tuning_job(
_is_async=_is_async,
create_fine_tuning_job_data=create_fine_tuning_job_data,
vertex_credentials=vertex_credentials,
vertex_project=vertex_ai_project,
vertex_location=vertex_ai_location,
timeout=timeout,
api_base=api_base,
kwargs=kwargs,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
verbose_logger.error("got exception in create_fine_tuning_job=%s", str(e))
raise e
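For reference, a usage sketch of the entrypoint above (the training file id is a placeholder from a prior create_file call, and OPENAI_API_KEY is assumed to be set):

from litellm.fine_tuning.main import create_fine_tuning_job

ft_job = create_fine_tuning_job(
    model="gpt-3.5-turbo-0125",
    training_file="file-abc123",  # placeholder id returned by litellm.create_file(...)
    custom_llm_provider="openai",
)
print(ft_job.id, ft_job.status)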
async def acancel_fine_tuning_job(
fine_tuning_job_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FineTuningJob:
"""
Async: Immediately cancel a fine-tune job.
"""
try:
loop = asyncio.get_event_loop()
kwargs["acancel_fine_tuning_job"] = True
# Use a partial function to pass your keyword arguments
func = partial(
cancel_fine_tuning_job,
fine_tuning_job_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def cancel_fine_tuning_job(
fine_tuning_job_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[FineTuningJob, Coroutine[Any, Any, FineTuningJob]]:
"""
Immediately cancel a fine-tune job.
Response includes details of the enqueued job including job status and the name of the fine-tuned models once complete
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("acancel_fine_tuning_job", False) is True
# OpenAI
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_fine_tuning_apis_instance.cancel_fine_tuning_job(
api_base=api_base,
api_key=api_key,
organization=organization,
fine_tuning_job_id=fine_tuning_job_id,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
# Azure OpenAI
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
response = azure_fine_tuning_apis_instance.cancel_fine_tuning_job(
api_base=api_base,
api_key=api_key,
api_version=api_version,
fine_tuning_job_id=fine_tuning_job_id,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def alist_fine_tuning_jobs(
after: Optional[str] = None,
limit: Optional[int] = None,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FineTuningJob:
"""
Async: List your organization's fine-tuning jobs
"""
try:
loop = asyncio.get_event_loop()
kwargs["alist_fine_tuning_jobs"] = True
# Use a partial function to pass your keyword arguments
func = partial(
list_fine_tuning_jobs,
after,
limit,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def list_fine_tuning_jobs(
after: Optional[str] = None,
limit: Optional[int] = None,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
):
"""
List your organization's fine-tuning jobs
Params:
- after: Optional[str] = None, Identifier for the last job from the previous pagination request.
- limit: Optional[int] = None, Number of fine-tuning jobs to retrieve. Defaults to 20
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("alist_fine_tuning_jobs", False) is True
# OpenAI
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_fine_tuning_apis_instance.list_fine_tuning_jobs(
api_base=api_base,
api_key=api_key,
organization=organization,
after=after,
limit=limit,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
# Azure OpenAI
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
response = azure_fine_tuning_apis_instance.list_fine_tuning_jobs(
api_base=api_base,
api_key=api_key,
api_version=api_version,
after=after,
limit=limit,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
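The async variants route through the wrappers above; a short sketch (the job id is a placeholder):

import asyncio
from litellm.fine_tuning.main import acancel_fine_tuning_job, alist_fine_tuning_jobs

async def main():
    jobs = await alist_fine_tuning_jobs(limit=5, custom_llm_provider="openai")
    print(jobs)
    await acancel_fine_tuning_job(
        fine_tuning_job_id="ftjob-abc123",  # placeholder
        custom_llm_provider="openai",
    )

asyncio.run(main())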

View file

@ -11,6 +11,7 @@ from typing import Literal, Optional
import dotenv
import httpx
from pydantic import BaseModel
import litellm
from litellm import verbose_logger
@ -280,22 +281,20 @@ class BraintrustLogger(CustomLogger):
) # if litellm_params['metadata'] == None
metadata = self.add_metadata_from_header(litellm_params, metadata)
clean_metadata = {}
try:
metadata = copy.deepcopy(
metadata
) # Avoid modifying the original metadata
except:
new_metadata = {}
for key, value in metadata.items():
if (
isinstance(value, list)
or isinstance(value, dict)
or isinstance(value, str)
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
metadata = new_metadata
new_metadata = {}
for key, value in metadata.items():
if (
isinstance(value, list)
or isinstance(value, dict)
or isinstance(value, str)
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = value
elif isinstance(value, BaseModel):
new_metadata[key] = value.model_dump_json()
metadata = new_metadata
tags = []
if isinstance(metadata, dict):

View file

@ -10,7 +10,7 @@ from pydantic import BaseModel
from litellm.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.llms.openai import ChatCompletionRequest
from litellm.types.utils import ModelResponse
from litellm.types.utils import AdapterCompletionStreamWrapper, ModelResponse
class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callback#callback-class
@ -76,7 +76,9 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
"""
pass
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
def translate_completion_output_params_streaming(
self, completion_stream: Any
) -> Optional[AdapterCompletionStreamWrapper]:
"""
Translates the streaming chunk, from the OpenAI format to the custom format.
"""

View file

@ -1,5 +1,5 @@
#### What this does ####
# On success + failure, log events to Supabase
# On success + failure, log events to Datadog
import dotenv, os
import requests # type: ignore
@ -9,6 +9,21 @@ import litellm, uuid
from litellm._logging import print_verbose, verbose_logger
def make_json_serializable(payload):
for key, value in payload.items():
try:
if isinstance(value, dict):
# recursively sanitize dicts
payload[key] = make_json_serializable(value.copy())
elif not isinstance(value, (str, int, float, bool, type(None))):
# everything else becomes a string
payload[key] = str(value)
except:
# non blocking if it can't cast to a str
pass
return payload
class DataDogLogger:
# Class variables or attributes
def __init__(
@ -61,7 +76,7 @@ class DataDogLogger:
id = response_obj.get("id", str(uuid.uuid4()))
usage = dict(usage)
try:
response_time = (end_time - start_time).total_seconds()
response_time = (end_time - start_time).total_seconds() * 1000
except:
response_time = None
@ -91,12 +106,12 @@ class DataDogLogger:
"id": id,
"call_type": call_type,
"cache_hit": cache_hit,
"startTime": start_time,
"endTime": end_time,
"responseTime (seconds)": response_time,
"start_time": start_time,
"end_time": end_time,
"response_time": response_time,
"model": kwargs.get("model", ""),
"user": kwargs.get("user", ""),
"modelParameters": optional_params,
"model_parameters": optional_params,
"spend": kwargs.get("response_cost", 0),
"messages": messages,
"response": response_obj,
@ -104,13 +119,7 @@ class DataDogLogger:
"metadata": clean_metadata,
}
# Ensure everything in the payload is converted to str
for key, value in payload.items():
try:
payload[key] = str(value)
except:
# non blocking if it can't cast to a str
pass
make_json_serializable(payload)
import json
payload = json.dumps(payload)
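make_json_serializable is a pure helper, so its behavior is easy to check in isolation (the import path litellm.integrations.datadog is assumed):

from datetime import datetime

from litellm.integrations.datadog import make_json_serializable

payload = {
    "model": "gpt-3.5-turbo",
    "start_time": datetime(2024, 8, 6, 12, 0, 0),       # non-primitive -> stringified
    "metadata": {"user_api_key": None, "tags": ["a"]},   # nested dict sanitized recursively
}
make_json_serializable(payload)
print(payload["start_time"])        # '2024-08-06 12:00:00'
print(payload["metadata"]["tags"])  # "['a']"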

View file

@ -0,0 +1,203 @@
import json
import os
from datetime import datetime
from typing import Any, Dict, List, Optional, TypedDict, Union
import httpx
from pydantic import BaseModel, Field
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.logging_utils import (
convert_litellm_response_object_to_dict,
)
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.proxy._types import CommonProxyErrors, SpendLogsPayload
class RequestKwargs(TypedDict):
model: Optional[str]
messages: Optional[List]
optional_params: Optional[Dict[str, Any]]
class GCSBucketPayload(TypedDict):
request_kwargs: Optional[RequestKwargs]
response_obj: Optional[Dict]
start_time: str
end_time: str
class GCSBucketLogger(CustomLogger):
def __init__(self) -> None:
from litellm.proxy.proxy_server import premium_user
if premium_user is not True:
raise ValueError(
f"GCS Bucket logging is a premium feature. Please upgrade to use it. {CommonProxyErrors.not_premium_user.value}"
)
self.async_httpx_client = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
self.path_service_account_json = os.getenv("GCS_PATH_SERVICE_ACCOUNT", None)
self.BUCKET_NAME = os.getenv("GCS_BUCKET_NAME", None)
if self.BUCKET_NAME is None:
raise ValueError(
"GCS_BUCKET_NAME is not set in the environment, but GCS Bucket is being used as a logging callback. Please set 'GCS_BUCKET_NAME' in the environment."
)
if self.path_service_account_json is None:
raise ValueError(
"GCS_PATH_SERVICE_ACCOUNT is not set in the environment, but GCS Bucket is being used as a logging callback. Please set 'GCS_PATH_SERVICE_ACCOUNT' in the environment."
)
pass
#### ASYNC ####
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
from litellm.proxy.proxy_server import premium_user
if premium_user is not True:
raise ValueError(
f"GCS Bucket logging is a premium feature. Please upgrade to use it. {CommonProxyErrors.not_premium_user.value}"
)
try:
verbose_logger.debug(
"GCS Logger: async_log_success_event logging kwargs: %s, response_obj: %s",
kwargs,
response_obj,
)
start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S")
end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S")
headers = await self.construct_request_headers()
logging_payload: GCSBucketPayload = await self.get_gcs_payload(
kwargs, response_obj, start_time_str, end_time_str
)
object_name = response_obj["id"]
response = await self.async_httpx_client.post(
headers=headers,
url=f"https://storage.googleapis.com/upload/storage/v1/b/{self.BUCKET_NAME}/o?uploadType=media&name={object_name}",
json=logging_payload,
)
if response.status_code != 200:
verbose_logger.error("GCS Bucket logging error: %s", str(response.text))
verbose_logger.debug("GCS Bucket response %s", response)
verbose_logger.debug("GCS Bucket status code %s", response.status_code)
verbose_logger.debug("GCS Bucket response.text %s", response.text)
except Exception as e:
verbose_logger.error("GCS Bucket logging error: %s", str(e))
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
pass
async def construct_request_headers(self) -> Dict[str, str]:
from litellm import vertex_chat_completion
auth_header, _ = vertex_chat_completion._get_token_and_url(
model="gcs-bucket",
vertex_credentials=self.path_service_account_json,
vertex_project=None,
vertex_location=None,
gemini_api_key=None,
stream=None,
custom_llm_provider="vertex_ai",
api_base=None,
)
verbose_logger.debug("constructed auth_header %s", auth_header)
headers = {
"Authorization": f"Bearer {auth_header}", # auth_header
"Content-Type": "application/json",
}
return headers
async def get_gcs_payload(
self, kwargs, response_obj, start_time, end_time
) -> GCSBucketPayload:
request_kwargs = RequestKwargs(
model=kwargs.get("model", None),
messages=kwargs.get("messages", None),
optional_params=kwargs.get("optional_params", None),
)
response_dict = {}
response_dict = convert_litellm_response_object_to_dict(
response_obj=response_obj
)
gcs_payload: GCSBucketPayload = GCSBucketPayload(
request_kwargs=request_kwargs,
response_obj=response_dict,
start_time=start_time,
end_time=end_time,
)
return gcs_payload
async def download_gcs_object(self, object_name):
"""
Download an object from GCS.
https://cloud.google.com/storage/docs/downloading-objects#download-object-json
"""
try:
headers = await self.construct_request_headers()
url = f"https://storage.googleapis.com/storage/v1/b/{self.BUCKET_NAME}/o/{object_name}?alt=media"
# Send the GET request to download the object
response = await self.async_httpx_client.get(url=url, headers=headers)
if response.status_code != 200:
verbose_logger.error(
"GCS object download error: %s", str(response.text)
)
return None
verbose_logger.debug(
"GCS object download response status code: %s", response.status_code
)
# Return the content of the downloaded object
return response.content
except Exception as e:
verbose_logger.error("GCS object download error: %s", str(e))
return None
async def delete_gcs_object(self, object_name):
"""
Delete an object from GCS.
"""
try:
headers = await self.construct_request_headers()
url = f"https://storage.googleapis.com/storage/v1/b/{self.BUCKET_NAME}/o/{object_name}"
# Send the DELETE request to delete the object
response = await self.async_httpx_client.delete(url=url, headers=headers)
if response.status_code not in (200, 204):
verbose_logger.error(
"GCS object delete error: %s, status code: %s",
str(response.text),
response.status_code,
)
return None
verbose_logger.debug(
"GCS object delete response status code: %s, response: %s",
response.status_code,
response.text,
)
# Return the response text from the delete call
return response.text
except Exception as e:
verbose_logger.error("GCS object download error: %s", str(e))
return None
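A hedged configuration sketch for the new logger (the bucket name, service-account path, and the "gcs_bucket" callback alias are illustrative; note that __init__ above also requires a premium/enterprise key):

import os
import litellm

# Required by GCSBucketLogger.__init__ above (placeholder values)
os.environ["GCS_BUCKET_NAME"] = "my-litellm-logs"
os.environ["GCS_PATH_SERVICE_ACCOUNT"] = "/path/to/service_account.json"

# Assumption: "gcs_bucket" is the string alias litellm resolves to this logger;
# alternatively instantiate GCSBucketLogger() and append it to litellm.callbacks
litellm.callbacks = ["gcs_bucket"]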

View file

@ -31,13 +31,36 @@ class HeliconeLogger:
prompt += f"{AI_PROMPT}"
claude_provider_request = {"model": model, "prompt": prompt}
choice = response_obj["choices"][0]
message = choice["message"]
content = []
if "tool_calls" in message and message["tool_calls"]:
for tool_call in message["tool_calls"]:
content.append({
"type": "tool_use",
"id": tool_call["id"],
"name": tool_call["function"]["name"],
"input": tool_call["function"]["arguments"]
})
elif "content" in message and message["content"]:
content = [{"type": "text", "text": message["content"]}]
claude_response_obj = {
"completion": response_obj["choices"][0]["message"]["content"],
"id": response_obj["id"],
"type": "message",
"role": "assistant",
"model": model,
"stop_reason": "stop_sequence",
"content": content,
"stop_reason": choice["finish_reason"],
"stop_sequence": None,
"usage": {
"input_tokens": response_obj["usage"]["prompt_tokens"],
"output_tokens": response_obj["usage"]["completion_tokens"]
}
}
return claude_provider_request, claude_response_obj
return claude_response_obj
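For context, wiring Helicone up stays the same on the caller side; only the Anthropic-style mapping above changes. A minimal sketch (the key is a placeholder):

import os
import litellm

os.environ["HELICONE_API_KEY"] = "sk-helicone-..."  # placeholder

# "helicone" is the success_callback alias for this logger
litellm.success_callback = ["helicone"]

response = litellm.completion(
    model="claude-3-haiku-20240307",
    messages=[{"role": "user", "content": "hi"}],
    mock_response="hello",  # mocked so the sketch runs without an Anthropic key
)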
@staticmethod
def add_metadata_from_header(litellm_params: dict, metadata: dict) -> dict:
@ -96,7 +119,7 @@ class HeliconeLogger:
response_obj = response_obj.json()
if "claude" in model:
provider_request, response_obj = self.claude_mapping(
response_obj = self.claude_mapping(
model=model, messages=messages, response_obj=response_obj
)
@ -107,7 +130,11 @@ class HeliconeLogger:
}
# Code to be executed
provider_url = self.provider_url
url = "https://api.hconeai.com/oai/v1/log"
if "claude" in model:
url = "https://api.hconeai.com/anthropic/v1/log"
provider_url = "https://api.anthropic.com/v1/messages"
headers = {
"Authorization": f"Bearer {self.key}",
"Content-Type": "application/json",
@ -124,7 +151,7 @@ class HeliconeLogger:
meta.update(metadata)
data = {
"providerRequest": {
"url": self.provider_url,
"url": provider_url,
"json": provider_request,
"meta": meta,
},

View file

@ -5,6 +5,7 @@ import os
import traceback
from packaging.version import Version
from pydantic import BaseModel
import litellm
from litellm._logging import verbose_logger
@ -144,6 +145,10 @@ class LangFuseLogger:
f"Langfuse Logging - Enters logging function for model {kwargs}"
)
# set default values for input/output for langfuse logging
input = None
output = None
litellm_params = kwargs.get("litellm_params", {})
litellm_call_id = kwargs.get("litellm_call_id", None)
metadata = (
@ -198,6 +203,11 @@ class LangFuseLogger:
):
input = prompt
output = response_obj["data"]
elif response_obj is not None and isinstance(
response_obj, litellm.TranscriptionResponse
):
input = prompt
output = response_obj["text"]
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
trace_id = None
generation_id = None
@ -322,7 +332,7 @@ class LangFuseLogger:
metadata = copy.deepcopy(
metadata
) # Avoid modifying the original metadata
except:
except Exception:
new_metadata = {}
for key, value in metadata.items():
if (
@ -333,6 +343,8 @@ class LangFuseLogger:
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
elif isinstance(value, BaseModel):
new_metadata[key] = value.model_dump()
metadata = new_metadata
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")

View file

@ -2,10 +2,6 @@
# On success + failure, log events to Logfire
import os
import dotenv
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import uuid
from enum import Enum

View file

@ -119,6 +119,7 @@ class OpenTelemetry(CustomLogger):
parent_otel_span: Optional[Span] = None,
start_time: Optional[Union[datetime, float]] = None,
end_time: Optional[Union[datetime, float]] = None,
event_metadata: Optional[dict] = None,
):
from datetime import datetime
@ -149,15 +150,26 @@ class OpenTelemetry(CustomLogger):
service_logging_span.set_attribute(
key="service", value=payload.service.value
)
if event_metadata:
for key, value in event_metadata.items():
if isinstance(value, dict):
try:
value = str(value)
except Exception:
value = "litllm logging error - could_not_json_serialize"
service_logging_span.set_attribute(key, value)
service_logging_span.set_status(Status(StatusCode.OK))
service_logging_span.end(end_time=_end_time_ns)
async def async_service_failure_hook(
self,
payload: ServiceLoggerPayload,
error: Optional[str] = "",
parent_otel_span: Optional[Span] = None,
start_time: Optional[Union[datetime, float]] = None,
end_time: Optional[Union[float, datetime]] = None,
event_metadata: Optional[dict] = None,
):
from datetime import datetime
@ -188,6 +200,17 @@ class OpenTelemetry(CustomLogger):
service_logging_span.set_attribute(
key="service", value=payload.service.value
)
if error:
service_logging_span.set_attribute(key="error", value=error)
if event_metadata:
for key, value in event_metadata.items():
if isinstance(value, dict):
try:
value = str(value)
except Exception:
value = "litllm logging error - could_not_json_serialize"
service_logging_span.set_attribute(key, value)
service_logging_span.set_status(Status(StatusCode.ERROR))
service_logging_span.end(end_time=_end_time_ns)
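For context, enabling the integration is unchanged; event_metadata only adds extra attributes to the service spans above. A minimal sketch (the "otel" callback alias and the metadata key are assumptions, and the completion is mocked so no provider key is needed):

import litellm

# Assumption: "otel" is the callback alias that activates this integration;
# with no OTEL_* env vars set, the config falls back to its default exporter
litellm.callbacks = ["otel"]

litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    mock_response="hello",
    metadata={"run_name": "otel-demo"},  # surfaced as metadata.* span attributes
)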
@ -258,15 +281,26 @@ class OpenTelemetry(CustomLogger):
def _handle_failure(self, kwargs, response_obj, start_time, end_time):
from opentelemetry.trace import Status, StatusCode
verbose_logger.debug(
"OpenTelemetry Logger: Failure HandlerLogging kwargs: %s, OTEL config settings=%s",
kwargs,
self.config,
)
_parent_context, parent_otel_span = self._get_span_context(kwargs)
# Span 1: Request sent to litellm SDK
span = self.tracer.start_span(
name=self._get_span_name(kwargs),
start_time=self._to_ns(start_time),
context=self._get_span_context(kwargs),
context=_parent_context,
)
span.set_status(Status(StatusCode.ERROR))
self.set_attributes(span, kwargs, response_obj)
span.end(end_time=self._to_ns(end_time))
if parent_otel_span is not None:
parent_otel_span.end(end_time=self._to_ns(datetime.now()))
def set_tools_attributes(self, span: Span, tools):
import json
@ -299,153 +333,165 @@ class OpenTelemetry(CustomLogger):
return isinstance(value, (str, bool, int, float))
def set_attributes(self, span: Span, kwargs, response_obj):
if self.callback_name == "arize":
from litellm.integrations.arize_ai import set_arize_ai_attributes
try:
if self.callback_name == "arize":
from litellm.integrations.arize_ai import set_arize_ai_attributes
set_arize_ai_attributes(span, kwargs, response_obj)
return
from litellm.proxy._types import SpanAttributes
set_arize_ai_attributes(span, kwargs, response_obj)
return
from litellm.proxy._types import SpanAttributes
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {}) or {}
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {}) or {}
# https://github.com/open-telemetry/semantic-conventions/blob/main/model/registry/gen-ai.yaml
# Following Conventions here: https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
#############################################
############ LLM CALL METADATA ##############
#############################################
metadata = litellm_params.get("metadata", {}) or {}

clean_metadata = redact_user_api_key_info(metadata=metadata)

for key, value in clean_metadata.items():
    if self.is_primitive(value):
        span.set_attribute("metadata.{}".format(key), value)

#############################################
########## LLM Request Attributes ###########
#############################################

# The name of the LLM a request is being made to
if kwargs.get("model"):
    span.set_attribute(
        SpanAttributes.LLM_REQUEST_MODEL, kwargs.get("model")
    )

# The Generative AI Provider: Azure, OpenAI, etc.
span.set_attribute(
    SpanAttributes.LLM_SYSTEM,
    litellm_params.get("custom_llm_provider", "Unknown"),
)

# The maximum number of tokens the LLM generates for a request.
if optional_params.get("max_tokens"):
    span.set_attribute(
        SpanAttributes.LLM_REQUEST_MAX_TOKENS,
        optional_params.get("max_tokens"),
    )

# The temperature setting for the LLM request.
if optional_params.get("temperature"):
    span.set_attribute(
        SpanAttributes.LLM_REQUEST_TEMPERATURE,
        optional_params.get("temperature"),
    )

# The top_p sampling setting for the LLM request.
if optional_params.get("top_p"):
    span.set_attribute(
        SpanAttributes.LLM_REQUEST_TOP_P, optional_params.get("top_p")
    )

span.set_attribute(
    SpanAttributes.LLM_IS_STREAMING,
    str(optional_params.get("stream", False)),
)

if optional_params.get("tools"):
    tools = optional_params["tools"]
    self.set_tools_attributes(span, tools)

if optional_params.get("user"):
    span.set_attribute(SpanAttributes.LLM_USER, optional_params.get("user"))

if kwargs.get("messages"):
    for idx, prompt in enumerate(kwargs.get("messages")):
        if prompt.get("role"):
            span.set_attribute(
                f"{SpanAttributes.LLM_PROMPTS}.{idx}.role",
                prompt.get("role"),
            )

        if prompt.get("content"):
            if not isinstance(prompt.get("content"), str):
                prompt["content"] = str(prompt.get("content"))
            span.set_attribute(
                f"{SpanAttributes.LLM_PROMPTS}.{idx}.content",
                prompt.get("content"),
            )
#############################################
########## LLM Response Attributes ##########
#############################################
if response_obj is not None:
    if response_obj.get("choices"):
        for idx, choice in enumerate(response_obj.get("choices")):
            if choice.get("finish_reason"):
                span.set_attribute(
                    f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.finish_reason",
                    choice.get("finish_reason"),
                )
            if choice.get("message"):
                if choice.get("message").get("role"):
                    span.set_attribute(
                        f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.role",
                        choice.get("message").get("role"),
                    )
                if choice.get("message").get("content"):
                    if not isinstance(
                        choice.get("message").get("content"), str
                    ):
                        choice["message"]["content"] = str(
                            choice.get("message").get("content")
                        )
                    span.set_attribute(
                        f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.content",
                        choice.get("message").get("content"),
                    )

            message = choice.get("message")
            tool_calls = message.get("tool_calls")
            if tool_calls:
                span.set_attribute(
                    f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.name",
                    tool_calls[0].get("function").get("name"),
                )
                span.set_attribute(
                    f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.arguments",
                    tool_calls[0].get("function").get("arguments"),
                )

    # The unique identifier for the completion.
    if response_obj.get("id"):
        span.set_attribute("gen_ai.response.id", response_obj.get("id"))

    # The model used to generate the response.
    if response_obj.get("model"):
        span.set_attribute(
            SpanAttributes.LLM_RESPONSE_MODEL, response_obj.get("model")
        )

    usage = response_obj.get("usage")
    if usage:
        span.set_attribute(
            SpanAttributes.LLM_USAGE_TOTAL_TOKENS,
            usage.get("total_tokens"),
        )

        # The number of tokens used in the LLM response (completion).
        span.set_attribute(
            SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
            usage.get("completion_tokens"),
        )

        # The number of tokens used in the LLM prompt.
        span.set_attribute(
            SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
            usage.get("prompt_tokens"),
        )
except Exception as e:
verbose_logger.error(
"OpenTelemetry logging error in set_attributes %s", str(e)
)
def set_raw_request_attributes(self, span: Span, kwargs, response_obj):
@ -463,7 +509,7 @@ class OpenTelemetry(CustomLogger):
#############################################
# OTEL Attributes for the RAW Request to https://docs.anthropic.com/en/api/messages
if complete_input_dict:
if complete_input_dict and isinstance(complete_input_dict, dict):
for param, val in complete_input_dict.items():
if not isinstance(val, str):
val = str(val)
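
The attribute-setting logic above only records metadata values that pass is_primitive, and the raw-request hunk stringifies anything that is not already a str before calling span.set_attribute. A minimal sketch of that filtering, using a stand-in span object rather than the real OpenTelemetry SDK (the RecordedSpan class, the sample metadata, and the "llm." attribute prefix are illustrative assumptions, not part of this commit):

# Stand-in for an OTEL span: it just records attribute calls.
class RecordedSpan:
    def __init__(self):
        self.attributes = {}

    def set_attribute(self, key, value):
        self.attributes[key] = value


def is_primitive(value):
    # Mirrors the gating above: only simple scalar types become span attributes.
    return isinstance(value, (str, int, float, bool))


span = RecordedSpan()
metadata = {
    "user_api_key_alias": "team-a-key",  # primitive -> recorded
    "headers": {"x-request-id": "abc"},  # dict -> skipped by is_primitive
}
for key, value in metadata.items():
    if is_primitive(value):
        span.set_attribute("metadata.{}".format(key), value)

# Raw request params are stringified so nested objects still fit an attribute.
complete_input_dict = {"max_tokens": 256, "messages": [{"role": "user", "content": "hi"}]}
if complete_input_dict and isinstance(complete_input_dict, dict):
    for param, val in complete_input_dict.items():
        if not isinstance(val, str):
            val = str(val)
        span.set_attribute("llm.{}".format(param), val)

print(span.attributes)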

View file

@ -1263,6 +1263,10 @@ Model Info:
if self.alerting is None or "email" not in self.alerting:
# do nothing if user does not want email alerts
verbose_proxy_logger.error(
"Error sending email alert - 'email' not in self.alerting %s",
self.alerting,
)
return False
from litellm.proxy.proxy_server import premium_user, prisma_client

View file

@ -1,5 +1,6 @@
# What is this?
## Helper utilities
from typing import List, Literal, Optional, Tuple
def map_finish_reason(
@ -54,3 +55,31 @@ def remove_index_from_tool_calls(messages, tool_calls):
tool_call.pop("index")
return
def get_litellm_metadata_from_kwargs(kwargs: dict):
"""
Helper to get litellm metadata from all litellm request kwargs
"""
return kwargs.get("litellm_params", {}).get("metadata", {})
# Helper functions used for OTEL logging
def _get_parent_otel_span_from_kwargs(kwargs: Optional[dict] = None):
try:
if kwargs is None:
return None
litellm_params = kwargs.get("litellm_params")
_metadata = kwargs.get("metadata") or {}
if "litellm_parent_otel_span" in _metadata:
return _metadata["litellm_parent_otel_span"]
elif (
litellm_params is not None
and litellm_params.get("metadata") is not None
and "litellm_parent_otel_span" in litellm_params.get("metadata", {})
):
return litellm_params["metadata"]["litellm_parent_otel_span"]
elif "litellm_parent_otel_span" in kwargs:
return kwargs["litellm_parent_otel_span"]
except:
return None
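
_get_parent_otel_span_from_kwargs checks three locations in a fixed order: kwargs["metadata"], then litellm_params["metadata"], then the top-level kwargs, and returns None on any error. A small standalone mirror of that lookup order (the sample kwargs and the fake span value below are assumptions for illustration):

from typing import Optional


def find_parent_otel_span(kwargs: Optional[dict] = None):
    # Same precedence as the helper in this diff: request metadata first,
    # then litellm_params metadata, then the top-level kwargs.
    try:
        if kwargs is None:
            return None
        litellm_params = kwargs.get("litellm_params")
        _metadata = kwargs.get("metadata") or {}
        if "litellm_parent_otel_span" in _metadata:
            return _metadata["litellm_parent_otel_span"]
        if (
            litellm_params is not None
            and litellm_params.get("metadata") is not None
            and "litellm_parent_otel_span" in litellm_params.get("metadata", {})
        ):
            return litellm_params["metadata"]["litellm_parent_otel_span"]
        if "litellm_parent_otel_span" in kwargs:
            return kwargs["litellm_parent_otel_span"]
    except Exception:
        return None


# The span is found even when it is only nested under litellm_params.
sample_kwargs = {"litellm_params": {"metadata": {"litellm_parent_otel_span": "span-123"}}}
print(find_parent_otel_span(sample_kwargs))  # -> "span-123"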

View file

@ -10,7 +10,9 @@ import sys
import time
import traceback
import uuid
from typing import Any, Callable, Dict, List, Literal, Optional
from typing import Any, Callable, Dict, List, Literal, Optional, Union
from pydantic import BaseModel
import litellm
from litellm import (
@ -59,6 +61,7 @@ from ..integrations.custom_logger import CustomLogger
from ..integrations.datadog import DataDogLogger
from ..integrations.dynamodb import DyanmoDBLogger
from ..integrations.galileo import GalileoObserve
from ..integrations.gcs_bucket import GCSBucketLogger
from ..integrations.greenscale import GreenscaleLogger
from ..integrations.helicone import HeliconeLogger
from ..integrations.lago import LagoLogger
@ -231,6 +234,9 @@ class Logging:
):
self.custom_pricing = True
if "custom_llm_provider" in self.model_call_details:
self.custom_llm_provider = self.model_call_details["custom_llm_provider"]
def _pre_call(self, input, api_key, model=None, additional_args={}):
"""
Common helper function across the sync + async pre-call function
@ -500,6 +506,44 @@ class Logging:
)
)
def _response_cost_calculator(
self,
result: Union[
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
TextCompletionResponse,
HttpxBinaryResponseContent,
],
):
"""
Calculate response cost using result + logging object variables.
used for consistent cost calculation across response headers + logging integrations.
"""
## RESPONSE COST ##
custom_pricing = use_custom_pricing_for_model(
litellm_params=self.litellm_params
)
response_cost = litellm.response_cost_calculator(
response_object=result,
model=self.model,
cache_hit=self.model_call_details.get("cache_hit", False),
custom_llm_provider=self.model_call_details.get(
"custom_llm_provider", None
),
base_model=_get_base_model_from_metadata(
model_call_details=self.model_call_details
),
call_type=self.call_type,
optional_params=self.optional_params,
custom_pricing=custom_pricing,
)
return response_cost
def _success_handler_helper_fn(
self, result=None, start_time=None, end_time=None, cache_hit=None
):
@ -529,25 +573,32 @@ class Logging:
or isinstance(result, TextCompletionResponse)
or isinstance(result, HttpxBinaryResponseContent) # tts
):
## RESPONSE COST ##
custom_pricing = use_custom_pricing_for_model(
litellm_params=self.litellm_params
)
self.model_call_details["response_cost"] = (
litellm.response_cost_calculator(
response_object=result,
model=self.model,
cache_hit=self.model_call_details.get("cache_hit", False),
custom_llm_provider=self.model_call_details.get(
"custom_llm_provider", None
),
base_model=_get_base_model_from_metadata(
model_call_details=self.model_call_details
),
call_type=self.call_type,
optional_params=self.optional_params,
custom_pricing=custom_pricing,
)
self._response_cost_calculator(result=result)
)
## HIDDEN PARAMS ##
if hasattr(result, "_hidden_params"):
# add to metadata for logging
if self.model_call_details.get("litellm_params") is not None:
self.model_call_details["litellm_params"].setdefault(
"metadata", {}
)
if (
self.model_call_details["litellm_params"]["metadata"]
is None
):
self.model_call_details["litellm_params"][
"metadata"
] = {}
self.model_call_details["litellm_params"]["metadata"][
"hidden_params"
] = result._hidden_params
else: # streaming chunks + image gen.
self.model_call_details["response_cost"] = None
@ -1220,7 +1271,9 @@ class Logging:
"""
Implementing async callbacks, to handle asyncio event loop issues when custom integrations need to use async functions.
"""
print_verbose("Logging Details LiteLLM-Async Success Call")
print_verbose(
"Logging Details LiteLLM-Async Success Call, cache_hit={}".format(cache_hit)
)
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit
)
@ -1490,6 +1543,13 @@ class Logging:
self.model_call_details["end_time"] = end_time
self.model_call_details.setdefault("original_response", None)
self.model_call_details["response_cost"] = 0
if hasattr(exception, "headers") and isinstance(exception.headers, dict):
self.model_call_details.setdefault("litellm_params", {})
metadata = (
self.model_call_details["litellm_params"].get("metadata", {}) or {}
)
metadata.update(exception.headers)
return start_time, end_time
def failure_handler(
@ -1962,6 +2022,14 @@ def _init_custom_logger_compatible_class(
_langsmith_logger = LangsmithLogger()
_in_memory_loggers.append(_langsmith_logger)
return _langsmith_logger # type: ignore
elif logging_integration == "gcs_bucket":
for callback in _in_memory_loggers:
if isinstance(callback, GCSBucketLogger):
return callback # type: ignore
_gcs_bucket_logger = GCSBucketLogger()
_in_memory_loggers.append(_gcs_bucket_logger)
return _gcs_bucket_logger # type: ignore
elif logging_integration == "arize":
if "ARIZE_SPACE_KEY" not in os.environ:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
@ -2076,6 +2144,10 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, LangsmithLogger):
return callback
elif logging_integration == "gcs_bucket":
for callback in _in_memory_loggers:
if isinstance(callback, GCSBucketLogger):
return callback
elif logging_integration == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry
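
Both _init_custom_logger_compatible_class and get_custom_logger_compatible_class handle the new "gcs_bucket" integration with the same pattern: scan _in_memory_loggers for an existing GCSBucketLogger and reuse it, otherwise construct one and append it. A generic sketch of that reuse-or-create registry (DummyLogger is an illustrative stand-in, not a litellm class):

_in_memory_loggers: list = []


class DummyLogger:
    """Stand-in for an integration class such as GCSBucketLogger."""


def get_or_create_logger(logger_cls):
    # Reuse an already-initialized instance so callbacks are not duplicated.
    for callback in _in_memory_loggers:
        if isinstance(callback, logger_cls):
            return callback
    new_logger = logger_cls()
    _in_memory_loggers.append(new_logger)
    return new_logger


first = get_or_create_logger(DummyLogger)
second = get_or_create_logger(DummyLogger)
assert first is second  # the registry hands back the same instance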

View file

@ -44,7 +44,12 @@ def cost_router(
Returns
- str, the specific google cost calc function it should route to.
"""
if custom_llm_provider == "vertex_ai" and "claude" in model:
if custom_llm_provider == "vertex_ai" and (
"claude" in model
or "llama" in model
or "mistral" in model
or "codestral" in model
):
return "cost_per_token"
elif custom_llm_provider == "gemini":
return "cost_per_token"

View file

@ -0,0 +1,22 @@
from typing import Any
import litellm
"""
Helper utils used for logging callbacks
"""
def convert_litellm_response_object_to_dict(response_obj: Any) -> dict:
"""
Convert a LiteLLM response object to a dictionary
"""
if isinstance(response_obj, dict):
return response_obj
for _type in litellm.ALL_LITELLM_RESPONSE_TYPES:
if isinstance(response_obj, _type):
return response_obj.model_dump()
# If it's not a LiteLLM type, return the object as is
return dict(response_obj)
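
The helper returns dicts unchanged, calls model_dump() on known LiteLLM response types, and falls back to dict() for anything else. A hedged usage sketch built directly on pydantic rather than litellm's own response types (FakeResponse is an assumption for illustration):

from typing import Any

from pydantic import BaseModel


class FakeResponse(BaseModel):
    id: str
    model: str


def to_dict(response_obj: Any) -> dict:
    # Same shape as convert_litellm_response_object_to_dict, with the
    # ALL_LITELLM_RESPONSE_TYPES check replaced by a plain BaseModel check.
    if isinstance(response_obj, dict):
        return response_obj
    if isinstance(response_obj, BaseModel):
        return response_obj.model_dump()
    return dict(response_obj)


print(to_dict(FakeResponse(id="resp-1", model="gpt-4o-mini")))
print(to_dict({"already": "a dict"}))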

View file

@ -5,13 +5,16 @@ import time
import types
from enum import Enum
from functools import partial
from typing import Callable, List, Optional, Union
from typing import Callable, List, Literal, Optional, Tuple, Union
import httpx # type: ignore
import requests # type: ignore
from openai.types.chat.chat_completion_chunk import Choice as OpenAIStreamingChoice
import litellm
import litellm.litellm_core_utils
import litellm.types
import litellm.types.utils
from litellm import verbose_logger
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import (
@ -33,8 +36,12 @@ from litellm.types.llms.anthropic import (
AnthropicResponseUsageBlock,
ContentBlockDelta,
ContentBlockStart,
ContentJsonBlockDelta,
ContentTextBlockDelta,
MessageBlockDelta,
MessageDelta,
MessageStartBlock,
UsageDelta,
)
from litellm.types.llms.openai import (
AllMessageValues,
@ -72,7 +79,7 @@ class AnthropicConstants(Enum):
class AnthropicError(Exception):
def __init__(self, status_code, message):
def __init__(self, status_code: int, message):
self.status_code = status_code
self.message: str = message
self.request = httpx.Request(
@ -464,7 +471,8 @@ class AnthropicConfig:
# extract usage
usage: litellm.Usage = getattr(response, "usage")
anthropic_usage = AnthropicResponseUsageBlock(
input_tokens=usage.prompt_tokens, output_tokens=usage.completion_tokens
input_tokens=usage.prompt_tokens or 0,
output_tokens=usage.completion_tokens or 0,
)
translated_obj = AnthropicResponse(
id=response.id,
@ -479,6 +487,74 @@ class AnthropicConfig:
return translated_obj
def _translate_streaming_openai_chunk_to_anthropic(
self, choices: List[OpenAIStreamingChoice]
) -> Tuple[
Literal["text_delta", "input_json_delta"],
Union[ContentTextBlockDelta, ContentJsonBlockDelta],
]:
text: str = ""
partial_json: Optional[str] = None
for choice in choices:
if choice.delta.content is not None:
text += choice.delta.content
elif choice.delta.tool_calls is not None:
partial_json = ""
for tool in choice.delta.tool_calls:
if (
tool.function is not None
and tool.function.arguments is not None
):
partial_json += tool.function.arguments
if partial_json is not None:
return "input_json_delta", ContentJsonBlockDelta(
type="input_json_delta", partial_json=partial_json
)
else:
return "text_delta", ContentTextBlockDelta(type="text_delta", text=text)
def translate_streaming_openai_response_to_anthropic(
self, response: litellm.ModelResponse
) -> Union[ContentBlockDelta, MessageBlockDelta]:
## base case - final chunk w/ finish reason
if response.choices[0].finish_reason is not None:
delta = MessageDelta(
stop_reason=self._translate_openai_finish_reason_to_anthropic(
response.choices[0].finish_reason
),
)
if getattr(response, "usage", None) is not None:
litellm_usage_chunk: Optional[litellm.Usage] = response.usage # type: ignore
elif (
hasattr(response, "_hidden_params")
and "usage" in response._hidden_params
):
litellm_usage_chunk = response._hidden_params["usage"]
else:
litellm_usage_chunk = None
if litellm_usage_chunk is not None:
usage_delta = UsageDelta(
input_tokens=litellm_usage_chunk.prompt_tokens or 0,
output_tokens=litellm_usage_chunk.completion_tokens or 0,
)
else:
usage_delta = UsageDelta(input_tokens=0, output_tokens=0)
return MessageBlockDelta(
type="message_delta", delta=delta, usage=usage_delta
)
(
type_of_content,
content_block_delta,
) = self._translate_streaming_openai_chunk_to_anthropic(
choices=response.choices # type: ignore
)
return ContentBlockDelta(
type="content_block_delta",
index=response.choices[0].index,
delta=content_block_delta,
)
# makes headers for API call
def validate_environment(api_key, user_headers, model):
@ -507,17 +583,23 @@ async def make_call(
model: str,
messages: list,
logging_obj,
timeout: Optional[Union[float, httpx.Timeout]],
):
if client is None:
client = _get_async_httpx_client() # Create a new client if none provided
try:
response = await client.post(api_base, headers=headers, data=data, stream=True)
response = await client.post(
api_base, headers=headers, data=data, stream=True, timeout=timeout
)
except httpx.HTTPStatusError as e:
raise AnthropicError(
status_code=e.response.status_code, message=await e.response.aread()
)
except Exception as e:
for exception in litellm.LITELLM_EXCEPTION_TYPES:
if isinstance(e, exception):
raise e
raise AnthropicError(status_code=500, message=str(e))
if response.status_code != 200:
@ -540,6 +622,51 @@ async def make_call(
return completion_stream
def make_sync_call(
client: Optional[HTTPHandler],
api_base: str,
headers: dict,
data: str,
model: str,
messages: list,
logging_obj,
timeout: Optional[Union[float, httpx.Timeout]],
):
if client is None:
client = HTTPHandler() # Create a new client if none provided
try:
response = client.post(
api_base, headers=headers, data=data, stream=True, timeout=timeout
)
except httpx.HTTPStatusError as e:
raise AnthropicError(
status_code=e.response.status_code, message=e.response.read()
)
except Exception as e:
for exception in litellm.LITELLM_EXCEPTION_TYPES:
if isinstance(e, exception):
raise e
raise AnthropicError(status_code=500, message=str(e))
if response.status_code != 200:
raise AnthropicError(status_code=response.status_code, message=response.read())
completion_stream = ModelResponseIterator(
streaming_response=response.iter_lines(), sync_stream=True
)
# LOGGING
logging_obj.post_call(
input=messages,
api_key="",
original_response="first stream response received",
additional_args={"complete_input_dict": data},
)
return completion_stream
class AnthropicChatCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
@ -647,6 +774,7 @@ class AnthropicChatCompletion(BaseLLM):
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
timeout: Union[float, httpx.Timeout],
encoding,
api_key,
logging_obj,
@ -659,20 +787,6 @@ class AnthropicChatCompletion(BaseLLM):
headers={},
):
data["stream"] = True
# async_handler = AsyncHTTPHandler(
# timeout=httpx.Timeout(timeout=600.0, connect=20.0)
# )
# response = await async_handler.post(
# api_base, headers=headers, json=data, stream=True
# )
# if response.status_code != 200:
# raise AnthropicError(
# status_code=response.status_code, message=response.text
# )
# completion_stream = response.aiter_lines()
streamwrapper = CustomStreamWrapper(
completion_stream=None,
@ -685,6 +799,7 @@ class AnthropicChatCompletion(BaseLLM):
model=model,
messages=messages,
logging_obj=logging_obj,
timeout=timeout,
),
model=model,
custom_llm_provider="anthropic",
@ -700,6 +815,7 @@ class AnthropicChatCompletion(BaseLLM):
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
timeout: Union[float, httpx.Timeout],
encoding,
api_key,
logging_obj,
@ -716,7 +832,9 @@ class AnthropicChatCompletion(BaseLLM):
async_handler = _get_async_httpx_client()
try:
response = await async_handler.post(api_base, headers=headers, json=data)
response = await async_handler.post(
api_base, headers=headers, json=data, timeout=timeout
)
except Exception as e:
## LOGGING
logging_obj.post_call(
@ -876,6 +994,7 @@ class AnthropicChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
)
else:
return self.acompletion_function(
@ -897,43 +1016,40 @@ class AnthropicChatCompletion(BaseLLM):
headers=headers,
client=client,
json_mode=json_mode,
timeout=timeout,
)
else:
## COMPLETION CALL
if client is None or isinstance(client, AsyncHTTPHandler):
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(timeout=timeout) # type: ignore
else:
client = client
if (
stream is True
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
print_verbose("makes anthropic streaming POST request")
data["stream"] = stream
response = requests.post(
api_base,
headers=headers,
data=json.dumps(data),
stream=stream,
)
if response.status_code != 200:
raise AnthropicError(
status_code=response.status_code, message=response.text
)
completion_stream = ModelResponseIterator(
streaming_response=response.iter_lines(), sync_stream=True
)
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
return CustomStreamWrapper(
completion_stream=None,
make_call=partial(
make_sync_call,
client=None,
api_base=api_base,
headers=headers, # type: ignore
data=json.dumps(data),
model=model,
messages=messages,
logging_obj=logging_obj,
timeout=timeout,
),
model=model,
custom_llm_provider="anthropic",
logging_obj=logging_obj,
)
return streaming_response
else:
response = client.post(api_base, headers=headers, data=json.dumps(data))
response = client.post(
api_base, headers=headers, data=json.dumps(data), timeout=timeout
)
if response.status_code != 200:
raise AnthropicError(
status_code=response.status_code, message=response.text
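
The sync streaming path now mirrors the async one: instead of issuing the POST up front, it hands CustomStreamWrapper a make_call built with functools.partial, so the request is only sent when the stream is iterated. A minimal sketch of that deferred-call pattern with stand-in types (StreamWrapper and fake_make_call are illustrative, not litellm classes):

from functools import partial


def fake_make_call(api_base, headers, data, timeout):
    # Stand-in for make_sync_call: pretend we opened a streaming response.
    print(f"POST {api_base} (timeout={timeout})")
    return iter(["chunk-1", "chunk-2"])


class StreamWrapper:
    def __init__(self, completion_stream, make_call):
        self.completion_stream = completion_stream
        self.make_call = make_call

    def __iter__(self):
        # The HTTP call happens here, on first iteration, not at construction.
        if self.completion_stream is None:
            self.completion_stream = self.make_call()
        return iter(self.completion_stream)


wrapper = StreamWrapper(
    completion_stream=None,
    make_call=partial(
        fake_make_call,
        api_base="https://api.anthropic.com/v1/messages",
        headers={},
        data="{}",
        timeout=600.0,
    ),
)
for chunk in wrapper:  # the request is issued only now
    print(chunk)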

View file

@ -474,21 +474,13 @@ class AzureChatCompletion(BaseLLM):
- call chat.completions.create by default
"""
try:
if litellm.return_response_headers is True:
raw_response = (
await azure_client.chat.completions.with_raw_response.create(
**data, timeout=timeout
)
)
raw_response = await azure_client.chat.completions.with_raw_response.create(
**data, timeout=timeout
)
headers = dict(raw_response.headers)
response = raw_response.parse()
return headers, response
else:
response = await azure_client.chat.completions.create(
**data, timeout=timeout
)
return None, response
headers = dict(raw_response.headers)
response = raw_response.parse()
return headers, response
except Exception as e:
raise e
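
After this change the Azure helper always goes through with_raw_response, so callers consistently receive a (headers, response) tuple instead of sometimes (None, response). A hedged sketch of that pattern against the AsyncAzureOpenAI client; the endpoint, key, api_version, and deployment name below are placeholders:

from openai import AsyncAzureOpenAI


async def chat_with_headers():
    client = AsyncAzureOpenAI(
        azure_endpoint="https://example-resource.openai.azure.com",
        api_key="placeholder-key",
        api_version="2024-02-01",
    )
    # with_raw_response keeps the HTTP layer visible: headers for rate-limit
    # bookkeeping, .parse() for the usual typed completion object.
    raw_response = await client.chat.completions.with_raw_response.create(
        model="my-azure-deployment",
        messages=[{"role": "user", "content": "hi"}],
    )
    headers = dict(raw_response.headers)
    response = raw_response.parse()
    return headers, response


# Run with: asyncio.run(chat_with_headers())  (requires real Azure credentials)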

View file

@ -13,6 +13,7 @@ from enum import Enum
from typing import Any, Callable, List, Optional, Union
import httpx
from openai.types.image import Image
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
@ -1413,10 +1414,10 @@ def embedding(
def image_generation(
model: str,
prompt: str,
model_response: ImageResponse,
optional_params: dict,
timeout=None,
logging_obj=None,
model_response=None,
optional_params=None,
aimg_generation=False,
):
"""
@ -1513,9 +1514,10 @@ def image_generation(
if model_response is None:
model_response = ImageResponse()
image_list: List = []
image_list: List[Image] = []
for artifact in response_body["artifacts"]:
image_dict = {"url": artifact["base64"]}
_image = Image(b64_json=artifact["base64"])
image_list.append(_image)
model_response.data = image_dict
model_response.data = image_list
return model_response
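
The image path now returns a list of openai Image objects built from b64_json instead of overwriting model_response.data with a single url dict on every loop iteration, so all artifacts survive rather than only the last one. A small sketch of the corrected accumulation (the fake response body is an assumption):

from typing import List

from openai.types.image import Image

# Pretend this came back from the Bedrock Stability response body.
response_body = {"artifacts": [{"base64": "aGVsbG8="}, {"base64": "d29ybGQ="}]}

image_list: List[Image] = []
for artifact in response_body["artifacts"]:
    # Each artifact becomes its own Image entry instead of clobbering the last.
    image_list.append(Image(b64_json=artifact["base64"]))

print(len(image_list))  # 2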

View file

@ -42,8 +42,11 @@ from litellm.types.llms.openai import (
ChatCompletionResponseMessage,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionUsageBlock,
)
from litellm.types.utils import Choices, Message
from litellm.types.utils import Choices
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import Message
from litellm.utils import (
CustomStreamWrapper,
ModelResponse,
@ -78,6 +81,7 @@ BEDROCK_CONVERSE_MODELS = [
"ai21.jamba-instruct-v1:0",
"meta.llama3-1-8b-instruct-v1:0",
"meta.llama3-1-70b-instruct-v1:0",
"meta.llama3-1-405b-instruct-v1:0",
"mistral.mistral-large-2407-v1:0",
]
@ -244,7 +248,7 @@ async def make_call(
return completion_stream
except httpx.HTTPStatusError as err:
error_code = err.response.status_code
raise BedrockError(status_code=error_code, message=str(err))
raise BedrockError(status_code=error_code, message=err.response.text)
except httpx.TimeoutException as e:
raise BedrockError(status_code=408, message="Timeout error occurred.")
except Exception as e:
@ -382,6 +386,7 @@ class BedrockLLM(BaseLLM):
aws_profile_name: Optional[str] = None,
aws_role_name: Optional[str] = None,
aws_web_identity_token: Optional[str] = None,
aws_sts_endpoint: Optional[str] = None,
):
"""
Return a boto3.Credentials object
@ -402,6 +407,7 @@ class BedrockLLM(BaseLLM):
aws_profile_name,
aws_role_name,
aws_web_identity_token,
aws_sts_endpoint,
]
# Iterate over parameters and update if needed
@ -420,6 +426,7 @@ class BedrockLLM(BaseLLM):
aws_profile_name,
aws_role_name,
aws_web_identity_token,
aws_sts_endpoint,
) = params_to_check
### CHECK STS ###
@ -431,12 +438,19 @@ class BedrockLLM(BaseLLM):
print_verbose(
f"IN Web Identity Token: {aws_web_identity_token} | Role Name: {aws_role_name} | Session Name: {aws_session_name}"
)
if aws_sts_endpoint is None:
sts_endpoint = f"https://sts.{aws_region_name}.amazonaws.com"
else:
sts_endpoint = aws_sts_endpoint
iam_creds_cache_key = json.dumps(
{
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
"aws_sts_endpoint": sts_endpoint,
}
)
@ -453,7 +467,7 @@ class BedrockLLM(BaseLLM):
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
endpoint_url=sts_endpoint,
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
@ -848,6 +862,7 @@ class BedrockLLM(BaseLLM):
"aws_bedrock_runtime_endpoint", None
) # https://bedrock-runtime.{region_name}.amazonaws.com
aws_web_identity_token = optional_params.pop("aws_web_identity_token", None)
aws_sts_endpoint = optional_params.pop("aws_sts_endpoint", None)
### SET REGION NAME ###
if aws_region_name is None:
@ -877,6 +892,7 @@ class BedrockLLM(BaseLLM):
aws_profile_name=aws_profile_name,
aws_role_name=aws_role_name,
aws_web_identity_token=aws_web_identity_token,
aws_sts_endpoint=aws_sts_endpoint,
)
### SET RUNTIME ENDPOINT ###
@ -1535,6 +1551,7 @@ class BedrockConverseLLM(BaseLLM):
aws_profile_name: Optional[str] = None,
aws_role_name: Optional[str] = None,
aws_web_identity_token: Optional[str] = None,
aws_sts_endpoint: Optional[str] = None,
):
"""
Return a boto3.Credentials object
@ -1551,6 +1568,7 @@ class BedrockConverseLLM(BaseLLM):
aws_profile_name,
aws_role_name,
aws_web_identity_token,
aws_sts_endpoint,
]
# Iterate over parameters and update if needed
@ -1569,6 +1587,7 @@ class BedrockConverseLLM(BaseLLM):
aws_profile_name,
aws_role_name,
aws_web_identity_token,
aws_sts_endpoint,
) = params_to_check
### CHECK STS ###
@ -1577,12 +1596,22 @@ class BedrockConverseLLM(BaseLLM):
and aws_role_name is not None
and aws_session_name is not None
):
print_verbose(
f"IN Web Identity Token: {aws_web_identity_token} | Role Name: {aws_role_name} | Session Name: {aws_session_name}"
)
if aws_sts_endpoint is None:
sts_endpoint = f"https://sts.{aws_region_name}.amazonaws.com"
else:
sts_endpoint = aws_sts_endpoint
iam_creds_cache_key = json.dumps(
{
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
"aws_sts_endpoint": sts_endpoint,
}
)
@ -1599,7 +1628,7 @@ class BedrockConverseLLM(BaseLLM):
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
endpoint_url=sts_endpoint,
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
@ -1814,6 +1843,7 @@ class BedrockConverseLLM(BaseLLM):
"aws_bedrock_runtime_endpoint", None
) # https://bedrock-runtime.{region_name}.amazonaws.com
aws_web_identity_token = optional_params.pop("aws_web_identity_token", None)
aws_sts_endpoint = optional_params.pop("aws_sts_endpoint", None)
### SET REGION NAME ###
if aws_region_name is None:
@ -1843,6 +1873,7 @@ class BedrockConverseLLM(BaseLLM):
aws_profile_name=aws_profile_name,
aws_role_name=aws_role_name,
aws_web_identity_token=aws_web_identity_token,
aws_sts_endpoint=aws_sts_endpoint,
)
### SET RUNTIME ENDPOINT ###
@ -1888,12 +1919,14 @@ class BedrockConverseLLM(BaseLLM):
additional_request_params = {}
supported_converse_params = AmazonConverseConfig.__annotations__.keys()
supported_tool_call_params = ["tools", "tool_choice"]
supported_guardrail_params = ["guardrailConfig"]
## TRANSFORMATION ##
# send all model-specific params in 'additional_request_params'
for k, v in inference_params.items():
if (
k not in supported_converse_params
and k not in supported_tool_call_params
and k not in supported_guardrail_params
):
additional_request_params[k] = v
additional_request_keys.append(k)
@ -1925,6 +1958,15 @@ class BedrockConverseLLM(BaseLLM):
"system": system_content_blocks,
"inferenceConfig": InferenceConfig(**inference_params),
}
# Guardrail Config
guardrail_config: Optional[GuardrailConfigBlock] = None
request_guardrails_config = inference_params.pop("guardrailConfig", None)
if request_guardrails_config is not None:
guardrail_config = GuardrailConfigBlock(**request_guardrails_config)
_data["guardrailConfig"] = guardrail_config
# Tool Config
if bedrock_tool_config is not None:
_data["toolConfig"] = bedrock_tool_config
data = json.dumps(_data)
@ -2068,13 +2110,13 @@ class AWSEventStreamDecoder:
self.model = model
self.parser = EventStreamJSONParser()
def converse_chunk_parser(self, chunk_data: dict) -> GenericStreamingChunk:
def converse_chunk_parser(self, chunk_data: dict) -> GChunk:
try:
text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None
is_finished = False
finish_reason = ""
usage: Optional[ConverseTokenUsageBlock] = None
usage: Optional[ChatCompletionUsageBlock] = None
index = int(chunk_data.get("contentBlockIndex", 0))
if "start" in chunk_data:
@ -2111,9 +2153,13 @@ class AWSEventStreamDecoder:
finish_reason = map_finish_reason(chunk_data.get("stopReason", "stop"))
is_finished = True
elif "usage" in chunk_data:
usage = ConverseTokenUsageBlock(**chunk_data["usage"]) # type: ignore
usage = ChatCompletionUsageBlock(
prompt_tokens=chunk_data.get("inputTokens", 0),
completion_tokens=chunk_data.get("outputTokens", 0),
total_tokens=chunk_data.get("totalTokens", 0),
)
response = GenericStreamingChunk(
response = GChunk(
text=text,
tool_use=tool_use,
is_finished=is_finished,
@ -2125,7 +2171,7 @@ class AWSEventStreamDecoder:
except Exception as e:
raise Exception("Received streaming error - {}".format(str(e)))
def _chunk_parser(self, chunk_data: dict) -> GenericStreamingChunk:
def _chunk_parser(self, chunk_data: dict) -> GChunk:
text = ""
is_finished = False
finish_reason = ""
@ -2168,7 +2214,7 @@ class AWSEventStreamDecoder:
elif chunk_data.get("completionReason", None):
is_finished = True
finish_reason = chunk_data["completionReason"]
return GenericStreamingChunk(
return GChunk(
text=text,
is_finished=is_finished,
finish_reason=finish_reason,
@ -2177,7 +2223,7 @@ class AWSEventStreamDecoder:
tool_use=None,
)
def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GenericStreamingChunk]:
def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GChunk]:
"""Given an iterator that yields lines, iterate over it & yield every event encountered"""
from botocore.eventstream import EventStreamBuffer
@ -2193,7 +2239,7 @@ class AWSEventStreamDecoder:
async def aiter_bytes(
self, iterator: AsyncIterator[bytes]
) -> AsyncIterator[GenericStreamingChunk]:
) -> AsyncIterator[GChunk]:
"""Given an async iterator that yields lines, iterate over it & yield every event encountered"""
from botocore.eventstream import EventStreamBuffer
@ -2233,20 +2279,16 @@ class MockResponseIterator: # for returning ai21 streaming responses
def __iter__(self):
return self
def _chunk_parser(self, chunk_data: ModelResponse) -> GenericStreamingChunk:
def _chunk_parser(self, chunk_data: ModelResponse) -> GChunk:
try:
chunk_usage: litellm.Usage = getattr(chunk_data, "usage")
processed_chunk = GenericStreamingChunk(
processed_chunk = GChunk(
text=chunk_data.choices[0].message.content or "", # type: ignore
tool_use=None,
is_finished=True,
finish_reason=chunk_data.choices[0].finish_reason, # type: ignore
usage=ConverseTokenUsageBlock(
inputTokens=chunk_usage.prompt_tokens,
outputTokens=chunk_usage.completion_tokens,
totalTokens=chunk_usage.total_tokens,
),
usage=chunk_usage, # type: ignore
index=0,
)
return processed_chunk
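
The new aws_sts_endpoint parameter lets the web-identity flow target a private or regional STS endpoint instead of the hard-coded https://sts.{region}.amazonaws.com. A hedged sketch of how that parameter feeds boto3; the role ARN, token path, and endpoint values are placeholders:

from typing import Optional

import boto3


def get_bedrock_credentials(
    aws_region_name: str,
    aws_role_name: str,
    aws_session_name: str,
    aws_web_identity_token: str,
    aws_sts_endpoint: Optional[str] = None,
):
    # Default matches the old behaviour; the new parameter simply overrides it.
    if aws_sts_endpoint is None:
        sts_endpoint = f"https://sts.{aws_region_name}.amazonaws.com"
    else:
        sts_endpoint = aws_sts_endpoint
    sts_client = boto3.client(
        "sts", region_name=aws_region_name, endpoint_url=sts_endpoint
    )
    return sts_client.assume_role_with_web_identity(
        RoleArn=aws_role_name,
        RoleSessionName=aws_session_name,
        WebIdentityToken=aws_web_identity_token,
    )


# creds = get_bedrock_credentials(
#     "us-east-1", "arn:aws:iam::123456789012:role/bedrock-role", "litellm-session",
#     open("/var/run/secrets/token").read(), aws_sts_endpoint="https://sts.us-east-1.amazonaws.com",
# )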

View file

@ -1,15 +1,20 @@
#################### OLD ########################
##### See `cohere_chat.py` for `/chat` calls ####
#################################################
import json
import os
import time
import traceback
import types
from enum import Enum
from typing import Callable, Optional
from typing import Any, Callable, Optional, Union
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import Choices, Message, ModelResponse, Usage
@ -246,14 +251,98 @@ def completion(
return model_response
def _process_embedding_response(
embeddings: list,
model_response: litellm.EmbeddingResponse,
model: str,
encoding: Any,
input: list,
) -> litellm.EmbeddingResponse:
output_data = []
for idx, embedding in enumerate(embeddings):
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
)
return model_response
async def async_embedding(
model: str,
data: dict,
input: list,
model_response: litellm.utils.EmbeddingResponse,
timeout: Union[float, httpx.Timeout],
logging_obj: LiteLLMLoggingObj,
optional_params: dict,
api_base: str,
api_key: Optional[str],
headers: dict,
encoding: Callable,
client: Optional[AsyncHTTPHandler] = None,
):
## LOGGING
logging_obj.pre_call(
input=input,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": api_base,
},
)
## COMPLETION CALL
if client is None:
client = AsyncHTTPHandler(concurrent_limit=1)
response = await client.post(api_base, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
embeddings = response.json()["embeddings"]
## PROCESS RESPONSE ##
return _process_embedding_response(
embeddings=embeddings,
model_response=model_response,
model=model,
encoding=encoding,
input=input,
)
def embedding(
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
logging_obj: LiteLLMLoggingObj,
optional_params: dict,
encoding: Any,
api_key: Optional[str] = None,
logging_obj=None,
encoding=None,
optional_params=None,
aembedding: Optional[bool] = None,
timeout: Union[float, httpx.Timeout] = httpx.Timeout(None),
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
):
headers = validate_environment(api_key)
embed_url = "https://api.cohere.ai/v1/embed"
@ -270,8 +359,26 @@ def embedding(
api_key=api_key,
additional_args={"complete_input_dict": data},
)
## ROUTING
if aembedding is True:
return async_embedding(
model=model,
data=data,
input=input,
model_response=model_response,
timeout=timeout,
logging_obj=logging_obj,
optional_params=optional_params,
api_base=embed_url,
api_key=api_key,
headers=headers,
encoding=encoding,
)
## COMPLETION CALL
response = requests.post(embed_url, headers=headers, data=json.dumps(data))
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(concurrent_limit=1)
response = client.post(embed_url, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
@ -293,23 +400,11 @@ def embedding(
if response.status_code != 200:
raise CohereError(message=response.text, status_code=response.status_code)
embeddings = response.json()["embeddings"]
output_data = []
for idx, embedding in enumerate(embeddings):
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
return _process_embedding_response(
embeddings=embeddings,
model_response=model_response,
model=model,
encoding=encoding,
input=input,
)
return model_response
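
Sync and async Cohere embedding now share _process_embedding_response, which builds the OpenAI-style data list and a prompt-token-only Usage block. A standalone sketch of that shaping; the fake embeddings and the whitespace "tokenizer" stand in for the real encoding object:

def process_embedding_response(embeddings, model, texts):
    # OpenAI-compatible layout: one {"object": "embedding", ...} entry per input.
    output_data = [
        {"object": "embedding", "index": idx, "embedding": embedding}
        for idx, embedding in enumerate(embeddings)
    ]
    # Cohere does not return usage here, so prompt tokens are estimated locally
    # and completion tokens stay at zero.
    input_tokens = sum(len(text.split()) for text in texts)  # crude stand-in tokenizer
    return {
        "object": "list",
        "data": output_data,
        "model": model,
        "usage": {
            "prompt_tokens": input_tokens,
            "completion_tokens": 0,
            "total_tokens": input_tokens,
        },
    }


print(process_embedding_response([[0.1, 0.2], [0.3, 0.4]], "embed-english-v3.0", ["hello world", "hi"]))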

View file

@ -233,8 +233,14 @@ def completion(
optional_params["tool_results"] = [most_recent_message]
elif isinstance(most_recent_message, str):
optional_params["message"] = most_recent_message
## check if chat history message is 'user' and 'tool_results' is given -> force_single_step=True, else cohere api fails
if len(chat_history) > 0 and chat_history[-1]["role"] == "USER":
optional_params["force_single_step"] = True
data = {
"model": model,
"chat_history": chat_history,
**optional_params,
}

View file

@ -80,18 +80,77 @@ class AsyncHTTPHandler:
json: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
timeout: Optional[Union[float, httpx.Timeout]] = None,
stream: bool = False,
):
try:
if timeout is None:
timeout = self.timeout
req = self.client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers # type: ignore
"POST", url, data=data, json=json, params=params, headers=headers, timeout=timeout # type: ignore
)
response = await self.client.send(req, stream=stream)
response.raise_for_status()
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=self.timeout, concurrent_limit=1)
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
try:
return await self.single_connection_post_request(
url=url,
client=new_client,
data=data,
json=json,
params=params,
headers=headers,
stream=stream,
)
finally:
await new_client.aclose()
except httpx.TimeoutException as e:
headers = {}
if hasattr(e, "response") and e.response is not None:
for key, value in e.response.headers.items():
headers["response_headers-{}".format(key)] = value
raise litellm.Timeout(
message=f"Connection timed out after {timeout} seconds.",
model="default-model-name",
llm_provider="litellm-httpx-handler",
headers=headers,
)
except httpx.HTTPStatusError as e:
setattr(e, "status_code", e.response.status_code)
if stream is True:
setattr(e, "message", await e.response.aread())
else:
setattr(e, "message", e.response.text)
raise e
except Exception as e:
raise e
async def delete(
self,
url: str,
data: Optional[Union[dict, str]] = None, # type: ignore
json: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
timeout: Optional[Union[float, httpx.Timeout]] = None,
stream: bool = False,
):
try:
if timeout is None:
timeout = self.timeout
req = self.client.build_request(
"DELETE", url, data=data, json=json, params=params, headers=headers, timeout=timeout # type: ignore
)
response = await self.client.send(req, stream=stream)
response.raise_for_status()
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
try:
return await self.single_connection_post_request(
url=url,
@ -192,13 +251,28 @@ class HTTPHandler:
params: Optional[dict] = None,
headers: Optional[dict] = None,
stream: bool = False,
timeout: Optional[Union[float, httpx.Timeout]] = None,
):
try:
req = self.client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers # type: ignore
)
response = self.client.send(req, stream=stream)
return response
if timeout is not None:
req = self.client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers, timeout=timeout # type: ignore
)
else:
req = self.client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers # type: ignore
)
response = self.client.send(req, stream=stream)
return response
except httpx.TimeoutException:
raise litellm.Timeout(
message=f"Connection timed out after {timeout} seconds.",
model="default-model-name",
llm_provider="litellm-httpx-handler",
)
except Exception as e:
raise e
def __del__(self) -> None:
try:
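
Both HTTP handlers now accept a per-request timeout that falls back to the client-level default, and a timed-out request is re-raised as a provider-agnostic Timeout error. A minimal sketch of that fallback with plain httpx and a local exception standing in for litellm.Timeout (the example URL is a placeholder):

import httpx


class ProviderTimeout(Exception):
    """Stand-in for litellm.Timeout in this sketch."""


class SimpleHandler:
    def __init__(self, timeout: float = 600.0):
        self.timeout = timeout
        self.client = httpx.Client(timeout=timeout)

    def post(self, url, json=None, timeout=None):
        # Per-call timeout wins; otherwise the handler default applies.
        if timeout is None:
            timeout = self.timeout
        try:
            req = self.client.build_request("POST", url, json=json, timeout=timeout)
            return self.client.send(req)
        except httpx.TimeoutException:
            raise ProviderTimeout(f"Connection timed out after {timeout} seconds.")


# handler = SimpleHandler()
# handler.post("https://example.com/v1/chat/completions", json={}, timeout=5.0)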

View file

@ -15,8 +15,14 @@ import requests # type: ignore
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.databricks import GenericStreamingChunk
from litellm.types.utils import ProviderField
from litellm.types.llms.openai import (
ChatCompletionDeltaChunk,
ChatCompletionResponseMessage,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionUsageBlock,
)
from litellm.types.utils import GenericStreamingChunk, ProviderField
from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage
from .base import BaseLLM
@ -114,71 +120,6 @@ class DatabricksConfig:
optional_params["stop"] = value
return optional_params
def _chunk_parser(self, chunk_data: str) -> GenericStreamingChunk:
try:
text = ""
is_finished = False
finish_reason = None
logprobs = None
usage = None
original_chunk = None # this is used for function/tool calling
chunk_data = chunk_data.replace("data:", "")
chunk_data = chunk_data.strip()
if len(chunk_data) == 0 or chunk_data == "[DONE]":
return {
"text": "",
"is_finished": is_finished,
"finish_reason": finish_reason,
}
chunk_data_dict = json.loads(chunk_data)
str_line = litellm.ModelResponse(**chunk_data_dict, stream=True)
if len(str_line.choices) > 0:
if (
str_line.choices[0].delta is not None # type: ignore
and str_line.choices[0].delta.content is not None # type: ignore
):
text = str_line.choices[0].delta.content # type: ignore
else: # function/tool calling chunk - when content is None. in this case we just return the original chunk from openai
original_chunk = str_line
if str_line.choices[0].finish_reason:
is_finished = True
finish_reason = str_line.choices[0].finish_reason
if finish_reason == "content_filter":
if hasattr(str_line.choices[0], "content_filter_result"):
error_message = json.dumps(
str_line.choices[0].content_filter_result # type: ignore
)
else:
error_message = "Azure Response={}".format(
str(dict(str_line))
)
raise litellm.AzureOpenAIError(
status_code=400, message=error_message
)
# checking for logprobs
if (
hasattr(str_line.choices[0], "logprobs")
and str_line.choices[0].logprobs is not None
):
logprobs = str_line.choices[0].logprobs
else:
logprobs = None
usage = getattr(str_line, "usage", None)
return GenericStreamingChunk(
text=text,
is_finished=is_finished,
finish_reason=finish_reason,
logprobs=logprobs,
original_chunk=original_chunk,
usage=usage,
)
except Exception as e:
raise e
class DatabricksEmbeddingConfig:
"""
@ -236,7 +177,9 @@ async def make_call(
if response.status_code != 200:
raise DatabricksError(status_code=response.status_code, message=response.text)
completion_stream = response.aiter_lines()
completion_stream = ModelResponseIterator(
streaming_response=response.aiter_lines(), sync_stream=False
)
# LOGGING
logging_obj.post_call(
input=messages,
@ -248,6 +191,38 @@ async def make_call(
return completion_stream
def make_sync_call(
client: Optional[HTTPHandler],
api_base: str,
headers: dict,
data: str,
model: str,
messages: list,
logging_obj,
):
if client is None:
client = HTTPHandler() # Create a new client if none provided
response = client.post(api_base, headers=headers, data=data, stream=True)
if response.status_code != 200:
raise DatabricksError(status_code=response.status_code, message=response.read())
completion_stream = ModelResponseIterator(
streaming_response=response.iter_lines(), sync_stream=True
)
# LOGGING
logging_obj.post_call(
input=messages,
api_key="",
original_response="first stream response received",
additional_args={"complete_input_dict": data},
)
return completion_stream
class DatabricksChatCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
@ -259,6 +234,7 @@ class DatabricksChatCompletion(BaseLLM):
api_key: Optional[str],
api_base: Optional[str],
endpoint_type: Literal["chat_completions", "embeddings"],
custom_endpoint: Optional[bool],
) -> Tuple[str, dict]:
if api_key is None:
raise DatabricksError(
@ -277,97 +253,17 @@ class DatabricksChatCompletion(BaseLLM):
"Content-Type": "application/json",
}
if endpoint_type == "chat_completions":
if endpoint_type == "chat_completions" and custom_endpoint is not True:
api_base = "{}/chat/completions".format(api_base)
elif endpoint_type == "embeddings":
elif endpoint_type == "embeddings" and custom_endpoint is not True:
api_base = "{}/embeddings".format(api_base)
return api_base, headers
def process_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
messages: List,
print_verbose,
encoding,
) -> ModelResponse:
## LOGGING
logging_obj.post_call(
input=messages,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data},
)
print_verbose(f"raw model_response: {response.text}")
## RESPONSE OBJECT
try:
completion_response = response.json()
except:
raise DatabricksError(
message=response.text, status_code=response.status_code
)
if "error" in completion_response:
raise DatabricksError(
message=str(completion_response["error"]),
status_code=response.status_code,
)
else:
text_content = ""
tool_calls = []
for content in completion_response["content"]:
if content["type"] == "text":
text_content += content["text"]
## TOOL CALLING
elif content["type"] == "tool_use":
tool_calls.append(
{
"id": content["id"],
"type": "function",
"function": {
"name": content["name"],
"arguments": json.dumps(content["input"]),
},
}
)
_message = litellm.Message(
tool_calls=tool_calls,
content=text_content or None,
)
model_response.choices[0].message = _message # type: ignore
model_response._hidden_params["original_response"] = completion_response[
"content"
] # allow user to access raw anthropic tool calling response
model_response.choices[0].finish_reason = map_finish_reason(
completion_response["stop_reason"]
)
## CALCULATING USAGE
prompt_tokens = completion_response["usage"]["input_tokens"]
completion_tokens = completion_response["usage"]["output_tokens"]
total_tokens = prompt_tokens + completion_tokens
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=total_tokens,
)
setattr(model_response, "usage", usage) # type: ignore
return model_response
async def acompletion_stream_function(
self,
model: str,
messages: list,
custom_llm_provider: str,
api_base: str,
custom_prompt_dict: dict,
model_response: ModelResponse,
@ -397,7 +293,7 @@ class DatabricksChatCompletion(BaseLLM):
logging_obj=logging_obj,
),
model=model,
custom_llm_provider="databricks",
custom_llm_provider=custom_llm_provider,
logging_obj=logging_obj,
)
return streamwrapper
@ -415,6 +311,7 @@ class DatabricksChatCompletion(BaseLLM):
logging_obj,
stream,
data: dict,
base_model: Optional[str],
optional_params: dict,
litellm_params=None,
logger_fn=None,
@ -436,20 +333,25 @@ class DatabricksChatCompletion(BaseLLM):
except httpx.HTTPStatusError as e:
raise DatabricksError(
status_code=e.response.status_code,
message=response.text if response else str(e),
message=e.response.text,
)
except httpx.TimeoutException as e:
raise DatabricksError(status_code=408, message="Timeout error occurred.")
except Exception as e:
raise DatabricksError(status_code=500, message=str(e))
return ModelResponse(**response_json)
response = ModelResponse(**response_json)
if base_model is not None:
response._hidden_params["model"] = base_model
return response
def completion(
self,
model: str,
messages: list,
api_base: str,
custom_llm_provider: str,
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
@ -464,8 +366,13 @@ class DatabricksChatCompletion(BaseLLM):
timeout: Optional[Union[float, httpx.Timeout]] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
):
custom_endpoint: Optional[bool] = optional_params.pop("custom_endpoint", None)
base_model: Optional[str] = optional_params.pop("base_model", None)
api_base, headers = self._validate_environment(
api_base=api_base, api_key=api_key, endpoint_type="chat_completions"
api_base=api_base,
api_key=api_key,
endpoint_type="chat_completions",
custom_endpoint=custom_endpoint,
)
## Load Config
config = litellm.DatabricksConfig().get_config()
@ -475,7 +382,8 @@ class DatabricksChatCompletion(BaseLLM):
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
stream = optional_params.pop("stream", None)
stream: bool = optional_params.pop("stream", None) or False
optional_params["stream"] = stream
data = {
"model": model,
@ -493,11 +401,11 @@ class DatabricksChatCompletion(BaseLLM):
"headers": headers,
},
)
if acompletion == True:
if acompletion is True:
if client is not None and isinstance(client, HTTPHandler):
client = None
if (
stream is not None and stream == True
stream is not None and stream is True
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
print_verbose("makes async anthropic streaming POST request")
data["stream"] = stream
@ -518,6 +426,7 @@ class DatabricksChatCompletion(BaseLLM):
logger_fn=logger_fn,
headers=headers,
client=client,
custom_llm_provider=custom_llm_provider,
)
else:
return self.acompletion_function(
@ -537,46 +446,32 @@ class DatabricksChatCompletion(BaseLLM):
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
base_model=base_model,
)
else:
if client is None or isinstance(client, AsyncHTTPHandler):
self.client = HTTPHandler(timeout=timeout) # type: ignore
else:
self.client = client
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(timeout=timeout) # type: ignore
## COMPLETION CALL
if (
stream is not None and stream == True
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
print_verbose("makes dbrx streaming POST request")
data["stream"] = stream
try:
response = self.client.post(
api_base, headers=headers, data=json.dumps(data), stream=stream
)
response.raise_for_status()
completion_stream = response.iter_lines()
except httpx.HTTPStatusError as e:
raise DatabricksError(
status_code=e.response.status_code, message=response.text
)
except httpx.TimeoutException as e:
raise DatabricksError(
status_code=408, message="Timeout error occurred."
)
except Exception as e:
raise DatabricksError(status_code=408, message=str(e))
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
if stream is True:
return CustomStreamWrapper(
completion_stream=None,
make_call=partial(
make_sync_call,
client=None,
api_base=api_base,
headers=headers, # type: ignore
data=json.dumps(data),
model=model,
messages=messages,
logging_obj=logging_obj,
),
model=model,
custom_llm_provider="databricks",
custom_llm_provider=custom_llm_provider,
logging_obj=logging_obj,
)
return streaming_response
else:
try:
response = self.client.post(
response = client.post(
api_base, headers=headers, data=json.dumps(data)
)
response.raise_for_status()
@ -593,7 +488,12 @@ class DatabricksChatCompletion(BaseLLM):
except Exception as e:
raise DatabricksError(status_code=500, message=str(e))
return ModelResponse(**response_json)
response = ModelResponse(**response_json)
if base_model is not None:
response._hidden_params["model"] = base_model
return response
async def aembedding(
self,
@ -667,7 +567,10 @@ class DatabricksChatCompletion(BaseLLM):
aembedding=None,
) -> EmbeddingResponse:
api_base, headers = self._validate_environment(
api_base=api_base, api_key=api_key, endpoint_type="embeddings"
api_base=api_base,
api_key=api_key,
endpoint_type="embeddings",
custom_endpoint=False,
)
model = model
data = {"model": model, "input": input, **optional_params}
@ -716,3 +619,128 @@ class DatabricksChatCompletion(BaseLLM):
)
return litellm.EmbeddingResponse(**response_json)
class ModelResponseIterator:
def __init__(self, streaming_response, sync_stream: bool):
self.streaming_response = streaming_response
def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
try:
processed_chunk = litellm.ModelResponse(**chunk, stream=True) # type: ignore
text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None
is_finished = False
finish_reason = ""
usage: Optional[ChatCompletionUsageBlock] = None
if processed_chunk.choices[0].delta.content is not None: # type: ignore
text = processed_chunk.choices[0].delta.content # type: ignore
if (
processed_chunk.choices[0].delta.tool_calls is not None # type: ignore
and len(processed_chunk.choices[0].delta.tool_calls) > 0 # type: ignore
and processed_chunk.choices[0].delta.tool_calls[0].function is not None # type: ignore
and processed_chunk.choices[0].delta.tool_calls[0].function.arguments # type: ignore
is not None
):
tool_use = ChatCompletionToolCallChunk(
id=processed_chunk.choices[0].delta.tool_calls[0].id, # type: ignore
type="function",
function=ChatCompletionToolCallFunctionChunk(
name=processed_chunk.choices[0]
.delta.tool_calls[0] # type: ignore
.function.name,
arguments=processed_chunk.choices[0]
.delta.tool_calls[0] # type: ignore
.function.arguments,
),
index=processed_chunk.choices[0].index,
)
if processed_chunk.choices[0].finish_reason is not None:
is_finished = True
finish_reason = processed_chunk.choices[0].finish_reason
if hasattr(processed_chunk, "usage"):
usage = processed_chunk.usage # type: ignore
return GenericStreamingChunk(
text=text,
tool_use=tool_use,
is_finished=is_finished,
finish_reason=finish_reason,
usage=usage,
index=0,
)
except json.JSONDecodeError:
raise ValueError(f"Failed to decode JSON from chunk: {chunk}")
# Sync iterator
def __iter__(self):
self.response_iterator = self.streaming_response
return self
def __next__(self):
try:
chunk = self.response_iterator.__next__()
except StopIteration:
raise StopIteration
except ValueError as e:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
chunk = chunk.replace("data:", "")
chunk = chunk.strip()
if len(chunk) > 0:
json_chunk = json.loads(chunk)
return self.chunk_parser(chunk=json_chunk)
else:
return GenericStreamingChunk(
text="",
is_finished=False,
finish_reason="",
usage=None,
index=0,
tool_use=None,
)
except StopIteration:
raise StopIteration
except ValueError as e:
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
# Async iterator
def __aiter__(self):
self.async_response_iterator = self.streaming_response.__aiter__()
return self
async def __anext__(self):
try:
chunk = await self.async_response_iterator.__anext__()
except StopAsyncIteration:
raise StopAsyncIteration
except ValueError as e:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
chunk = chunk.replace("data:", "")
chunk = chunk.strip()
if chunk == "[DONE]":
raise StopAsyncIteration
if len(chunk) > 0:
json_chunk = json.loads(chunk)
return self.chunk_parser(chunk=json_chunk)
else:
return GenericStreamingChunk(
text="",
is_finished=False,
finish_reason="",
usage=None,
index=0,
tool_use=None,
)
except StopAsyncIteration:
raise StopAsyncIteration
except ValueError as e:
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")

View file

@ -0,0 +1,315 @@
from typing import Any, Coroutine, Dict, List, Optional, Union
import httpx
from openai import AsyncAzureOpenAI, AzureOpenAI
from openai.types.file_deleted import FileDeleted
import litellm
from litellm._logging import verbose_logger
from litellm.llms.base import BaseLLM
from litellm.types.llms.openai import *
def get_azure_openai_client(
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
api_version: Optional[str] = None,
organization: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
_is_async: bool = False,
) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
received_args = locals()
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
if client is None:
data = {}
for k, v in received_args.items():
if k == "self" or k == "client" or k == "_is_async":
pass
elif k == "api_base" and v is not None:
data["azure_endpoint"] = v
elif v is not None:
data[k] = v
if "api_version" not in data:
data["api_version"] = litellm.AZURE_DEFAULT_API_VERSION
if _is_async is True:
openai_client = AsyncAzureOpenAI(**data)
else:
openai_client = AzureOpenAI(**data) # type: ignore
else:
openai_client = client
return openai_client
class AzureOpenAIFilesAPI(BaseLLM):
"""
AzureOpenAI methods to support files:
- create_file()
- retrieve_file()
- list_files()
- delete_file()
- file_content()
- update_file()
"""
def __init__(self) -> None:
super().__init__()
async def acreate_file(
self,
create_file_data: CreateFileRequest,
openai_client: AsyncAzureOpenAI,
) -> FileObject:
verbose_logger.debug("create_file_data=%s", create_file_data)
response = await openai_client.files.create(**create_file_data)
verbose_logger.debug("create_file_response=%s", response)
return response
def create_file(
self,
_is_async: bool,
create_file_data: CreateFileRequest,
api_base: str,
api_key: Optional[str],
api_version: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
api_version=api_version,
timeout=timeout,
max_retries=max_retries,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.acreate_file( # type: ignore
create_file_data=create_file_data, openai_client=openai_client
)
response = openai_client.files.create(**create_file_data)
return response
async def afile_content(
self,
file_content_request: FileContentRequest,
openai_client: AsyncAzureOpenAI,
) -> HttpxBinaryResponseContent:
response = await openai_client.files.content(**file_content_request)
return response
def file_content(
self,
_is_async: bool,
file_content_request: FileContentRequest,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
) -> Union[
HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent]
]:
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
api_version=api_version,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.afile_content( # type: ignore
file_content_request=file_content_request,
openai_client=openai_client,
)
response = openai_client.files.content(**file_content_request)
return response
async def aretrieve_file(
self,
file_id: str,
openai_client: AsyncAzureOpenAI,
) -> FileObject:
response = await openai_client.files.retrieve(file_id=file_id)
return response
def retrieve_file(
self,
_is_async: bool,
file_id: str,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
api_version=api_version,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.aretrieve_file( # type: ignore
file_id=file_id,
openai_client=openai_client,
)
response = openai_client.files.retrieve(file_id=file_id)
return response
async def adelete_file(
self,
file_id: str,
openai_client: AsyncAzureOpenAI,
) -> FileDeleted:
response = await openai_client.files.delete(file_id=file_id)
return response
def delete_file(
self,
_is_async: bool,
file_id: str,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str] = None,
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
api_version=api_version,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.adelete_file( # type: ignore
file_id=file_id,
openai_client=openai_client,
)
response = openai_client.files.delete(file_id=file_id)
return response
async def alist_files(
self,
openai_client: AsyncAzureOpenAI,
purpose: Optional[str] = None,
):
if isinstance(purpose, str):
response = await openai_client.files.list(purpose=purpose)
else:
response = await openai_client.files.list()
return response
def list_files(
self,
_is_async: bool,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
purpose: Optional[str] = None,
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
api_version=api_version,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.alist_files( # type: ignore
purpose=purpose,
openai_client=openai_client,
)
if isinstance(purpose, str):
response = openai_client.files.list(purpose=purpose)
else:
response = openai_client.files.list()
return response
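A usage sketch for the files class above, driven synchronously. The endpoint, key, and file name are placeholders; the import path appears to match the one referenced by the Azure fine-tuning module later in this diff, but is not confirmed here.

from litellm.llms.files_apis.azure import AzureOpenAIFilesAPI

files_api = AzureOpenAIFilesAPI()
created = files_api.create_file(
    _is_async=False,                    # sync path -> an AzureOpenAI client is built
    create_file_data={
        "file": open("training_data.jsonl", "rb"),   # placeholder file
        "purpose": "fine-tune",
    },
    api_base="https://my-resource.openai.azure.com",  # placeholder endpoint
    api_key="my-azure-key",                           # placeholder key
    api_version=None,     # falls back to litellm.AZURE_DEFAULT_API_VERSION
    timeout=600.0,
    max_retries=2,
)
print(created.id)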

View file

@ -0,0 +1,181 @@
from typing import Any, Coroutine, Optional, Union
import httpx
from openai import AsyncAzureOpenAI, AzureOpenAI
from openai.pagination import AsyncCursorPage
from openai.types.fine_tuning import FineTuningJob
from litellm._logging import verbose_logger
from litellm.llms.base import BaseLLM
from litellm.llms.files_apis.azure import get_azure_openai_client
from litellm.types.llms.openai import FineTuningJobCreate
class AzureOpenAIFineTuningAPI(BaseLLM):
"""
AzureOpenAI methods to support fine-tuning jobs
"""
def __init__(self) -> None:
super().__init__()
async def acreate_fine_tuning_job(
self,
create_fine_tuning_job_data: dict,
openai_client: AsyncAzureOpenAI,
) -> FineTuningJob:
response = await openai_client.fine_tuning.jobs.create(
**create_fine_tuning_job_data # type: ignore
)
return response
def create_fine_tuning_job(
self,
_is_async: bool,
create_fine_tuning_job_data: dict,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
api_version: Optional[str] = None,
) -> Union[FineTuningJob, Union[Coroutine[Any, Any, FineTuningJob]]]:
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
api_version=api_version,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.acreate_fine_tuning_job( # type: ignore
create_fine_tuning_job_data=create_fine_tuning_job_data,
openai_client=openai_client,
)
verbose_logger.debug(
"creating fine tuning job, args= %s", create_fine_tuning_job_data
)
response = openai_client.fine_tuning.jobs.create(**create_fine_tuning_job_data) # type: ignore
return response
async def acancel_fine_tuning_job(
self,
fine_tuning_job_id: str,
openai_client: AsyncAzureOpenAI,
) -> FineTuningJob:
response = await openai_client.fine_tuning.jobs.cancel(
fine_tuning_job_id=fine_tuning_job_id
)
return response
def cancel_fine_tuning_job(
self,
_is_async: bool,
fine_tuning_job_id: str,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str] = None,
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
api_version=api_version,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.acancel_fine_tuning_job( # type: ignore
fine_tuning_job_id=fine_tuning_job_id,
openai_client=openai_client,
)
verbose_logger.debug("canceling fine tuning job, args= %s", fine_tuning_job_id)
response = openai_client.fine_tuning.jobs.cancel(
fine_tuning_job_id=fine_tuning_job_id
)
return response
async def alist_fine_tuning_jobs(
self,
openai_client: AsyncAzureOpenAI,
after: Optional[str] = None,
limit: Optional[int] = None,
):
response = await openai_client.fine_tuning.jobs.list(after=after, limit=limit) # type: ignore
return response
def list_fine_tuning_jobs(
self,
_is_async: bool,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
api_version: Optional[str] = None,
after: Optional[str] = None,
limit: Optional[int] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
api_version=api_version,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.alist_fine_tuning_jobs( # type: ignore
after=after,
limit=limit,
openai_client=openai_client,
)
verbose_logger.debug("list fine tuning job, after= %s, limit= %s", after, limit)
response = openai_client.fine_tuning.jobs.list(after=after, limit=limit) # type: ignore
return response
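A corresponding async sketch for the Azure fine-tuning class above. The module path is assumed, and the key, endpoint, deployment name, and file id are placeholders; passing a prebuilt AsyncAzureOpenAI client makes get_azure_openai_client() return it unchanged.

import asyncio
from openai import AsyncAzureOpenAI
from litellm.llms.fine_tuning_apis.azure import AzureOpenAIFineTuningAPI  # assumed module path

async def main():
    client = AsyncAzureOpenAI(
        api_key="my-azure-key",                                   # placeholder
        azure_endpoint="https://my-resource.openai.azure.com",    # placeholder
        api_version="2024-05-01-preview",
    )
    ft_api = AzureOpenAIFineTuningAPI()
    # _is_async=True makes create_fine_tuning_job return the coroutine from acreate_fine_tuning_job
    job = await ft_api.create_fine_tuning_job(
        _is_async=True,
        create_fine_tuning_job_data={
            "model": "gpt-35-turbo",         # placeholder deployment
            "training_file": "file-abc123",  # placeholder file id
        },
        api_key=None,
        api_base=None,
        timeout=600.0,
        max_retries=2,
        client=client,
    )
    print(job.id, job.status)

asyncio.run(main())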

View file

@ -0,0 +1,199 @@
from typing import Any, Coroutine, Optional, Union
import httpx
from openai import AsyncOpenAI, OpenAI
from openai.pagination import AsyncCursorPage
from openai.types.fine_tuning import FineTuningJob
from litellm._logging import verbose_logger
from litellm.llms.base import BaseLLM
from litellm.types.llms.openai import FineTuningJobCreate
class OpenAIFineTuningAPI(BaseLLM):
"""
OpenAI methods to support fine-tuning jobs
"""
def __init__(self) -> None:
super().__init__()
def get_openai_client(
self,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
_is_async: bool = False,
) -> Optional[Union[OpenAI, AsyncOpenAI]]:
received_args = locals()
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = None
if client is None:
data = {}
for k, v in received_args.items():
if k == "self" or k == "client" or k == "_is_async":
pass
elif k == "api_base" and v is not None:
data["base_url"] = v
elif v is not None:
data[k] = v
if _is_async is True:
openai_client = AsyncOpenAI(**data)
else:
openai_client = OpenAI(**data) # type: ignore
else:
openai_client = client
return openai_client
async def acreate_fine_tuning_job(
self,
create_fine_tuning_job_data: dict,
openai_client: AsyncOpenAI,
) -> FineTuningJob:
response = await openai_client.fine_tuning.jobs.create(
**create_fine_tuning_job_data
)
return response
def create_fine_tuning_job(
self,
_is_async: bool,
create_fine_tuning_job_data: dict,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
) -> Union[FineTuningJob, Union[Coroutine[Any, Any, FineTuningJob]]]:
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.acreate_fine_tuning_job( # type: ignore
create_fine_tuning_job_data=create_fine_tuning_job_data,
openai_client=openai_client,
)
verbose_logger.debug(
"creating fine tuning job, args= %s", create_fine_tuning_job_data
)
response = openai_client.fine_tuning.jobs.create(**create_fine_tuning_job_data)
return response
async def acancel_fine_tuning_job(
self,
fine_tuning_job_id: str,
openai_client: AsyncOpenAI,
) -> FineTuningJob:
response = await openai_client.fine_tuning.jobs.cancel(
fine_tuning_job_id=fine_tuning_job_id
)
return response
def cancel_fine_tuning_job(
self,
_is_async: bool,
fine_tuning_job_id: str,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.acancel_fine_tuning_job( # type: ignore
fine_tuning_job_id=fine_tuning_job_id,
openai_client=openai_client,
)
verbose_logger.debug("canceling fine tuning job, args= %s", fine_tuning_job_id)
response = openai_client.fine_tuning.jobs.cancel(
fine_tuning_job_id=fine_tuning_job_id
)
return response
async def alist_fine_tuning_jobs(
self,
openai_client: AsyncOpenAI,
after: Optional[str] = None,
limit: Optional[int] = None,
):
response = await openai_client.fine_tuning.jobs.list(after=after, limit=limit) # type: ignore
return response
def list_fine_tuning_jobs(
self,
_is_async: bool,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
after: Optional[str] = None,
limit: Optional[int] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.alist_fine_tuning_jobs( # type: ignore
after=after,
limit=limit,
openai_client=openai_client,
)
verbose_logger.debug("list fine tuning job, after= %s, limit= %s", after, limit)
response = openai_client.fine_tuning.jobs.list(after=after, limit=limit) # type: ignore
return response
pass
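A standalone sketch (helper name hypothetical) of the kwarg mapping inside get_openai_client() above: every non-None argument except client/_is_async is forwarded to the OpenAI constructor, with api_base renamed to the SDK's base_url.

def build_openai_client_kwargs(**received_args) -> dict:
    data = {}
    for k, v in received_args.items():
        if k in ("client", "_is_async"):
            continue
        if k == "api_base" and v is not None:
            data["base_url"] = v      # the OpenAI SDK calls this base_url
        elif v is not None:
            data[k] = v               # drop None values so SDK defaults apply
    return data

print(build_openai_client_kwargs(
    api_key="sk-placeholder", api_base="https://example.com/v1",
    timeout=600.0, max_retries=None, organization=None,
))
# -> {'api_key': 'sk-placeholder', 'base_url': 'https://example.com/v1', 'timeout': 600.0}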

View file

@ -0,0 +1,298 @@
import traceback
from datetime import datetime
from typing import Any, Coroutine, Literal, Optional, Union
import httpx
from openai.types.fine_tuning.fine_tuning_job import FineTuningJob, Hyperparameters
from litellm._logging import verbose_logger
from litellm.llms.base import BaseLLM
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.vertex_httpx import VertexLLM
from litellm.types.llms.openai import FineTuningJobCreate
from litellm.types.llms.vertex_ai import (
FineTuneJobCreate,
FineTunesupervisedTuningSpec,
ResponseTuningJob,
)
class VertexFineTuningAPI(VertexLLM):
"""
Vertex AI methods to support fine-tuning jobs
"""
def __init__(self) -> None:
super().__init__()
self.async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
def convert_response_created_at(self, response: ResponseTuningJob):
try:
create_time_str = response.get("createTime", "") or ""
create_time_datetime = datetime.fromisoformat(
create_time_str.replace("Z", "+00:00")
)
# Convert to Unix timestamp (seconds since epoch)
created_at = int(create_time_datetime.timestamp())
return created_at
except Exception as e:
return 0
def convert_vertex_response_to_open_ai_response(
self, response: ResponseTuningJob
) -> FineTuningJob:
status: Literal[
"validating_files", "queued", "running", "succeeded", "failed", "cancelled"
] = "queued"
if response["state"] == "JOB_STATE_PENDING":
status = "queued"
if response["state"] == "JOB_STATE_SUCCEEDED":
status = "succeeded"
if response["state"] == "JOB_STATE_FAILED":
status = "failed"
if response["state"] == "JOB_STATE_CANCELLED":
status = "cancelled"
if response["state"] == "JOB_STATE_RUNNING":
status = "running"
created_at = self.convert_response_created_at(response)
training_uri = ""
if "supervisedTuningSpec" in response and response["supervisedTuningSpec"]:
training_uri = response["supervisedTuningSpec"]["trainingDatasetUri"] or ""
return FineTuningJob(
id=response["name"] or "",
created_at=created_at,
fine_tuned_model=response["tunedModelDisplayName"],
finished_at=None,
hyperparameters=Hyperparameters(
n_epochs=0,
),
model=response["baseModel"] or "",
object="fine_tuning.job",
organization_id="",
result_files=[],
seed=0,
status=status,
trained_tokens=None,
training_file=training_uri,
validation_file=None,
estimated_finish=None,
integrations=[],
)
def convert_openai_request_to_vertex(
self, create_fine_tuning_job_data: FineTuningJobCreate, **kwargs
) -> FineTuneJobCreate:
"""
convert request from OpenAI format to Vertex format
https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/tuning
"""
hyperparameters = create_fine_tuning_job_data.hyperparameters
supervised_tuning_spec = FineTunesupervisedTuningSpec(
training_dataset_uri=create_fine_tuning_job_data.training_file,
validation_dataset=create_fine_tuning_job_data.validation_file,
)
if hyperparameters:
if hyperparameters.n_epochs:
supervised_tuning_spec["epoch_count"] = int(hyperparameters.n_epochs)
if hyperparameters.learning_rate_multiplier:
supervised_tuning_spec["learning_rate_multiplier"] = float(
hyperparameters.learning_rate_multiplier
)
supervised_tuning_spec["adapter_size"] = kwargs.get("adapter_size")
fine_tune_job = FineTuneJobCreate(
baseModel=create_fine_tuning_job_data.model,
supervisedTuningSpec=supervised_tuning_spec,
tunedModelDisplayName=create_fine_tuning_job_data.suffix,
)
return fine_tune_job
async def acreate_fine_tuning_job(
self,
fine_tuning_url: str,
headers: dict,
request_data: FineTuneJobCreate,
):
from litellm.fine_tuning.main import FineTuningJob
try:
verbose_logger.debug(
"about to create fine tuning job: %s, request_data: %s",
fine_tuning_url,
request_data,
)
if self.async_handler is None:
raise ValueError(
"VertexAI Fine Tuning - async_handler is not initialized"
)
response = await self.async_handler.post(
headers=headers,
url=fine_tuning_url,
json=request_data, # type: ignore
)
if response.status_code != 200:
raise Exception(
f"Error creating fine tuning job. Status code: {response.status_code}. Response: {response.text}"
)
verbose_logger.debug(
"got response from creating fine tuning job: %s", response.json()
)
vertex_response = ResponseTuningJob( # type: ignore
**response.json(),
)
verbose_logger.debug("vertex_response %s", vertex_response)
open_ai_response = self.convert_vertex_response_to_open_ai_response(
vertex_response
)
return open_ai_response
except Exception as e:
verbose_logger.error("asyncerror creating fine tuning job %s", e)
trace_back_str = traceback.format_exc()
verbose_logger.error(trace_back_str)
raise e
def create_fine_tuning_job(
self,
_is_async: bool,
create_fine_tuning_job_data: FineTuningJobCreate,
vertex_project: Optional[str],
vertex_location: Optional[str],
vertex_credentials: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
**kwargs,
):
verbose_logger.debug(
"creating fine tuning job, args= %s", create_fine_tuning_job_data
)
auth_header, _ = self._get_token_and_url(
model="",
gemini_api_key=None,
vertex_credentials=vertex_credentials,
vertex_project=vertex_project,
vertex_location=vertex_location,
stream=False,
custom_llm_provider="vertex_ai_beta",
api_base=api_base,
)
headers = {
"Authorization": f"Bearer {auth_header}",
"Content-Type": "application/json",
}
fine_tune_job = self.convert_openai_request_to_vertex(
create_fine_tuning_job_data=create_fine_tuning_job_data, **kwargs
)
fine_tuning_url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/tuningJobs"
if _is_async is True:
return self.acreate_fine_tuning_job( # type: ignore
fine_tuning_url=fine_tuning_url,
headers=headers,
request_data=fine_tune_job,
)
sync_handler = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
verbose_logger.debug(
"about to create fine tuning job: %s, request_data: %s",
fine_tuning_url,
fine_tune_job,
)
response = sync_handler.post(
headers=headers,
url=fine_tuning_url,
json=fine_tune_job, # type: ignore
)
if response.status_code != 200:
raise Exception(
f"Error creating fine tuning job. Status code: {response.status_code}. Response: {response.text}"
)
verbose_logger.debug(
"got response from creating fine tuning job: %s", response.json()
)
vertex_response = ResponseTuningJob( # type: ignore
**response.json(),
)
verbose_logger.debug("vertex_response %s", vertex_response)
open_ai_response = self.convert_vertex_response_to_open_ai_response(
vertex_response
)
return open_ai_response
async def pass_through_vertex_ai_POST_request(
self,
request_data: dict,
vertex_project: str,
vertex_location: str,
vertex_credentials: str,
request_route: str,
):
auth_header, _ = self._get_token_and_url(
model="",
gemini_api_key=None,
vertex_credentials=vertex_credentials,
vertex_project=vertex_project,
vertex_location=vertex_location,
stream=False,
custom_llm_provider="vertex_ai_beta",
api_base="",
)
headers = {
"Authorization": f"Bearer {auth_header}",
"Content-Type": "application/json",
}
url = None
if request_route == "/tuningJobs":
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/tuningJobs"
elif "/tuningJobs/" in request_route and "cancel" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/tuningJobs{request_route}"
elif "generateContent" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}{request_route}"
elif "predict" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}{request_route}"
elif "/batchPredictionJobs" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}{request_route}"
elif "countTokens" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}{request_route}"
else:
raise ValueError(f"Unsupported Vertex AI request route: {request_route}")
if self.async_handler is None:
raise ValueError("VertexAI Fine Tuning - async_handler is not initialized")
response = await self.async_handler.post(
headers=headers,
url=url,
json=request_data, # type: ignore
)
if response.status_code != 200:
raise Exception(
f"Error creating fine tuning job. Status code: {response.status_code}. Response: {response.text}"
)
response_json = response.json()
return response_json
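A minimal standalone sketch (function name hypothetical, plain dicts in place of the typed request objects) of the OpenAI-to-Vertex payload mapping performed by convert_openai_request_to_vertex() above.

def openai_to_vertex_tuning_payload(openai_request: dict) -> dict:
    supervised_tuning_spec = {
        "training_dataset_uri": openai_request["training_file"],
        "validation_dataset": openai_request.get("validation_file"),
    }
    hp = openai_request.get("hyperparameters") or {}
    if hp.get("n_epochs"):
        supervised_tuning_spec["epoch_count"] = int(hp["n_epochs"])
    if hp.get("learning_rate_multiplier"):
        supervised_tuning_spec["learning_rate_multiplier"] = float(hp["learning_rate_multiplier"])
    return {
        "baseModel": openai_request["model"],
        "supervisedTuningSpec": supervised_tuning_spec,
        "tunedModelDisplayName": openai_request.get("suffix"),
    }

print(openai_to_vertex_tuning_payload({
    "model": "gemini-1.0-pro-002",
    "training_file": "gs://my-bucket/train.jsonl",   # placeholder GCS URI
    "hyperparameters": {"n_epochs": 3},
}))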

View file

@ -6,12 +6,13 @@ import os
import time
import types
from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, get_args
import httpx
import requests
import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.completion import ChatCompletionMessageToolCallParam
from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage
@ -60,6 +61,10 @@ hf_tasks = Literal[
"text-generation",
]
hf_tasks_embeddings = Literal[ # pipeline tags + hf tei endpoints - https://huggingface.github.io/text-embeddings-inference/#/
"sentence-similarity", "feature-extraction", "rerank", "embed", "similarity"
]
class HuggingfaceConfig:
"""
@ -249,6 +254,55 @@ def get_hf_task_for_model(model: str) -> Tuple[hf_tasks, str]:
return "text-generation-inference", model # default to tgi
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
def get_hf_task_embedding_for_model(
model: str, task_type: Optional[str], api_base: str
) -> Optional[str]:
if task_type is not None:
if task_type in get_args(hf_tasks_embeddings):
return task_type
else:
raise Exception(
"Invalid task_type={}. Expected one of={}".format(
task_type, hf_tasks_embeddings
)
)
http_client = HTTPHandler(concurrent_limit=1)
model_info = http_client.get(url=api_base)
model_info_dict = model_info.json()
pipeline_tag: Optional[str] = model_info_dict.get("pipeline_tag", None)
return pipeline_tag
async def async_get_hf_task_embedding_for_model(
model: str, task_type: Optional[str], api_base: str
) -> Optional[str]:
if task_type is not None:
if task_type in get_args(hf_tasks_embeddings):
return task_type
else:
raise Exception(
"Invalid task_type={}. Expected one of={}".format(
task_type, hf_tasks_embeddings
)
)
http_client = AsyncHTTPHandler(concurrent_limit=1)
model_info = await http_client.get(url=api_base)
model_info_dict = model_info.json()
pipeline_tag: Optional[str] = model_info_dict.get("pipeline_tag", None)
return pipeline_tag
class Huggingface(BaseLLM):
_client_session: Optional[httpx.Client] = None
_aclient_session: Optional[httpx.AsyncClient] = None
@ -256,7 +310,7 @@ class Huggingface(BaseLLM):
def __init__(self) -> None:
super().__init__()
def validate_environment(self, api_key, headers):
def _validate_environment(self, api_key, headers) -> dict:
default_headers = {
"content-type": "application/json",
}
@ -406,7 +460,7 @@ class Huggingface(BaseLLM):
super().completion()
exception_mapping_worked = False
try:
headers = self.validate_environment(api_key, headers)
headers = self._validate_environment(api_key, headers)
task, model = get_hf_task_for_model(model)
## VALIDATE API FORMAT
if task is None or not isinstance(task, str) or task not in hf_task_list:
@ -762,76 +816,82 @@ class Huggingface(BaseLLM):
async for transformed_chunk in streamwrapper:
yield transformed_chunk
def embedding(
self,
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
logging_obj=None,
encoding=None,
):
super().embedding()
headers = self.validate_environment(api_key, headers=None)
# print_verbose(f"{model}, {task}")
embed_url = ""
if "https" in model:
embed_url = model
elif api_base:
embed_url = api_base
elif "HF_API_BASE" in os.environ:
embed_url = os.getenv("HF_API_BASE", "")
elif "HUGGINGFACE_API_BASE" in os.environ:
embed_url = os.getenv("HUGGINGFACE_API_BASE", "")
else:
embed_url = f"https://api-inference.huggingface.co/models/{model}"
def _transform_input_on_pipeline_tag(
self, input: List, pipeline_tag: Optional[str]
) -> dict:
if pipeline_tag is None:
return {"inputs": input}
if pipeline_tag == "sentence-similarity" or pipeline_tag == "similarity":
if len(input) < 2:
raise HuggingfaceError(
status_code=400,
message="sentence-similarity requires 2+ sentences",
)
return {"inputs": {"source_sentence": input[0], "sentences": input[1:]}}
elif pipeline_tag == "rerank":
if len(input) < 2:
raise HuggingfaceError(
status_code=400,
message="reranker requires 2+ sentences",
)
return {"inputs": {"query": input[0], "texts": input[1:]}}
return {"inputs": input} # default to feature-extraction pipeline tag
async def _async_transform_input(
self, model: str, task_type: Optional[str], embed_url: str, input: List
) -> dict:
hf_task = await async_get_hf_task_embedding_for_model(
model=model, task_type=task_type, api_base=embed_url
)
data = self._transform_input_on_pipeline_tag(input=input, pipeline_tag=hf_task)
return data
def _transform_input(
self,
input: List,
model: str,
call_type: Literal["sync", "async"],
optional_params: dict,
embed_url: str,
) -> dict:
## TRANSFORMATION ##
if "sentence-transformers" in model:
if len(input) == 0:
raise HuggingfaceError(
status_code=400,
message="sentence transformers requires 2+ sentences",
)
data = {
"inputs": {
"source_sentence": input[0],
"sentences": [
"That is a happy dog",
"That is a very happy person",
"Today is a sunny day",
],
}
}
data = {"inputs": {"source_sentence": input[0], "sentences": input[1:]}}
else:
data = {"inputs": input} # type: ignore
## LOGGING
logging_obj.pre_call(
input=input,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": embed_url,
},
)
## COMPLETION CALL
response = requests.post(embed_url, headers=headers, data=json.dumps(data))
task_type = optional_params.pop("input_type", None)
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
if call_type == "sync":
hf_task = get_hf_task_embedding_for_model(
model=model, task_type=task_type, api_base=embed_url
)
elif call_type == "async":
return self._async_transform_input(
model=model, task_type=task_type, embed_url=embed_url, input=input
) # type: ignore
embeddings = response.json()
data = self._transform_input_on_pipeline_tag(
input=input, pipeline_tag=hf_task
)
if "error" in embeddings:
raise HuggingfaceError(status_code=500, message=embeddings["error"])
return data
def _process_embedding_response(
self,
embeddings: dict,
model_response: litellm.EmbeddingResponse,
model: str,
input: List,
encoding: Any,
) -> litellm.EmbeddingResponse:
output_data = []
if "similarities" in embeddings:
for idx, embedding in embeddings["similarities"]:
@ -888,3 +948,156 @@ class Huggingface(BaseLLM):
),
)
return model_response
async def aembedding(
self,
model: str,
input: list,
model_response: litellm.utils.EmbeddingResponse,
timeout: Union[float, httpx.Timeout],
logging_obj: LiteLLMLoggingObj,
optional_params: dict,
api_base: str,
api_key: Optional[str],
headers: dict,
encoding: Callable,
client: Optional[AsyncHTTPHandler] = None,
):
## TRANSFORMATION ##
data = self._transform_input(
input=input,
model=model,
call_type="sync",
optional_params=optional_params,
embed_url=api_base,
)
## LOGGING
logging_obj.pre_call(
input=input,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": api_base,
},
)
## COMPLETION CALL
if client is None:
client = AsyncHTTPHandler(concurrent_limit=1)
response = await client.post(api_base, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
embeddings = response.json()
if "error" in embeddings:
raise HuggingfaceError(status_code=500, message=embeddings["error"])
## PROCESS RESPONSE ##
return self._process_embedding_response(
embeddings=embeddings,
model_response=model_response,
model=model,
input=input,
encoding=encoding,
)
def embedding(
self,
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
logging_obj: LiteLLMLoggingObj,
encoding: Callable,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
timeout: Union[float, httpx.Timeout] = httpx.Timeout(None),
aembedding: Optional[bool] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
) -> litellm.EmbeddingResponse:
super().embedding()
headers = self._validate_environment(api_key, headers=None)
# print_verbose(f"{model}, {task}")
embed_url = ""
if "https" in model:
embed_url = model
elif api_base:
embed_url = api_base
elif "HF_API_BASE" in os.environ:
embed_url = os.getenv("HF_API_BASE", "")
elif "HUGGINGFACE_API_BASE" in os.environ:
embed_url = os.getenv("HUGGINGFACE_API_BASE", "")
else:
embed_url = f"https://api-inference.huggingface.co/models/{model}"
## ROUTING ##
if aembedding is True:
return self.aembedding(
input=input,
model_response=model_response,
timeout=timeout,
logging_obj=logging_obj,
headers=headers,
api_base=embed_url, # type: ignore
api_key=api_key,
client=client if isinstance(client, AsyncHTTPHandler) else None,
model=model,
optional_params=optional_params,
encoding=encoding,
)
## TRANSFORMATION ##
data = self._transform_input(
input=input,
model=model,
call_type="sync",
optional_params=optional_params,
embed_url=embed_url,
)
## LOGGING
logging_obj.pre_call(
input=input,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": embed_url,
},
)
## COMPLETION CALL
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(concurrent_limit=1)
response = client.post(embed_url, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
embeddings = response.json()
if "error" in embeddings:
raise HuggingfaceError(status_code=500, message=embeddings["error"])
## PROCESS RESPONSE ##
return self._process_embedding_response(
embeddings=embeddings,
model_response=model_response,
model=model,
input=input,
encoding=encoding,
)
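A standalone sketch (helper name hypothetical) of the request payload shapes produced by _transform_input_on_pipeline_tag() above for each supported embedding pipeline tag.

from typing import List, Optional

def hf_embedding_payload(inputs: List[str], pipeline_tag: Optional[str]) -> dict:
    if pipeline_tag in ("sentence-similarity", "similarity"):
        if len(inputs) < 2:
            raise ValueError("sentence-similarity requires 2+ sentences")
        return {"inputs": {"source_sentence": inputs[0], "sentences": inputs[1:]}}
    if pipeline_tag == "rerank":
        if len(inputs) < 2:
            raise ValueError("reranker requires 2+ sentences")
        return {"inputs": {"query": inputs[0], "texts": inputs[1:]}}
    return {"inputs": inputs}  # feature-extraction / embed / unknown tag

print(hf_embedding_payload(["hello world"], None))
print(hf_embedding_payload(["happy dog", "sunny day"], "sentence-similarity"))
print(hf_embedding_payload(["query text", "candidate a", "candidate b"], "rerank"))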

View file

@ -258,7 +258,7 @@ def get_ollama_response(
logging_obj=logging_obj,
)
return response
elif stream == True:
elif stream is True:
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
response = requests.post(
@ -326,7 +326,7 @@ def ollama_completion_stream(url, data, logging_obj):
try:
if response.status_code != 200:
raise OllamaError(
status_code=response.status_code, message=response.text
status_code=response.status_code, message=response.read()
)
streamwrapper = litellm.CustomStreamWrapper(

View file

@ -149,7 +149,9 @@ class OllamaChatConfig:
"response_format",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
def map_openai_params(
self, model: str, non_default_params: dict, optional_params: dict
):
for param, value in non_default_params.items():
if param == "max_tokens":
optional_params["num_predict"] = value
@ -170,16 +172,26 @@ class OllamaChatConfig:
### FUNCTION CALLING LOGIC ###
if param == "tools":
# ollama actually supports json output
optional_params["format"] = "json"
litellm.add_function_to_prompt = (
True # so that main.py adds the function call to the prompt
)
optional_params["functions_unsupported_model"] = value
## CHECK IF MODEL SUPPORTS TOOL CALLING ##
try:
model_info = litellm.get_model_info(
model=model, custom_llm_provider="ollama_chat"
)
if model_info.get("supports_function_calling") is True:
optional_params["tools"] = value
else:
raise Exception
except Exception:
optional_params["format"] = "json"
litellm.add_function_to_prompt = (
True # so that main.py adds the function call to the prompt
)
optional_params["functions_unsupported_model"] = value
if len(optional_params["functions_unsupported_model"]) == 1:
optional_params["function_name"] = optional_params[
"functions_unsupported_model"
][0]["function"]["name"]
if len(optional_params["functions_unsupported_model"]) == 1:
optional_params["function_name"] = optional_params[
"functions_unsupported_model"
][0]["function"]["name"]
if param == "functions":
# ollama actually supports json output
@ -198,11 +210,11 @@ class OllamaChatConfig:
# ollama implementation
def get_ollama_response(
model_response: litellm.ModelResponse,
messages: list,
optional_params: dict,
api_base="http://localhost:11434",
api_key: Optional[str] = None,
model="llama2",
messages=None,
optional_params=None,
logging_obj=None,
acompletion: bool = False,
encoding=None,
@ -223,6 +235,7 @@ def get_ollama_response(
stream = optional_params.pop("stream", False)
format = optional_params.pop("format", None)
function_name = optional_params.pop("function_name", None)
tools = optional_params.pop("tools", None)
for m in messages:
if "role" in m and m["role"] == "tool":
@ -236,6 +249,8 @@ def get_ollama_response(
}
if format is not None:
data["format"] = format
if tools is not None:
data["tools"] = tools
## LOGGING
logging_obj.pre_call(
input=None,
@ -278,7 +293,7 @@ def get_ollama_response(
"json": data,
}
if api_key is not None:
_request["headers"] = "Bearer {}".format(api_key)
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}
response = requests.post(**_request) # type: ignore
if response.status_code != 200:
raise OllamaError(status_code=response.status_code, message=response.text)
@ -343,7 +358,7 @@ def ollama_completion_stream(url, api_key, data, logging_obj):
"timeout": litellm.request_timeout,
}
if api_key is not None:
_request["headers"] = "Bearer {}".format(api_key)
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}
with httpx.stream(**_request) as response:
try:
if response.status_code != 200:
@ -405,7 +420,7 @@ async def ollama_async_streaming(
"timeout": litellm.request_timeout,
}
if api_key is not None:
_request["headers"] = "Bearer {}".format(api_key)
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}
async with client.stream(**_request) as response:
if response.status_code != 200:
raise OllamaError(
@ -477,7 +492,7 @@ async def ollama_acompletion(
"json": data,
}
if api_key is not None:
_request["headers"] = "Bearer {}".format(api_key)
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}
resp = await session.post(**_request)
if resp.status != 200:
@ -499,7 +514,8 @@ async def ollama_acompletion(
## RESPONSE OBJECT
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
if data.get("format", "") == "json" and function_name is not None:
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message(
content=None,
@ -519,11 +535,8 @@ async def ollama_acompletion(
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response.choices[0].message.content = response_json[ # type: ignore
"message"
][
"content"
]
_message = litellm.Message(**response_json["message"])
model_response.choices[0].message = _message # type: ignore
model_response.created = int(time.time())
model_response.model = "ollama_chat/" + data["model"]

Some files were not shown because too many files have changed in this diff.