Merge branch 'BerriAI:main' into fix/groq-custom-pricing-cost
This commit is contained in commit fcd2586909

352 changed files with 14316 additions and 6075 deletions
@@ -424,7 +424,7 @@ jobs:
          command: |
            pwd
            ls
-            python -m pytest tests/local_testing tests/router_unit_tests --cov=litellm --cov-report=xml -vv -k "router" -x -s -v --junitxml=test-results/junit.xml --durations=5
+            python -m pytest tests/local_testing tests/router_unit_tests --cov=litellm --cov-report=xml -vv -k "router" -x -v --junitxml=test-results/junit.xml --durations=5
          no_output_timeout: 120m
      - run:
          name: Rename the coverage files
@@ -701,7 +701,7 @@ jobs:
          command: |
            pwd
            ls
-            python -m pytest -vv tests/llm_translation --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
+            python -m pytest -vv tests/llm_translation --cov=litellm --cov-report=xml -x -v --junitxml=test-results/junit.xml --durations=5
          no_output_timeout: 120m
      - run:
          name: Rename the coverage files
@@ -1450,7 +1450,7 @@ jobs:
          command: |
            pwd
            ls
-            python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/mcp_tests --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
+            python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/spend_tracking_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/mcp_tests --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
          no_output_timeout: 120m

      # Store test results
@@ -1743,6 +1743,96 @@ jobs:
      # Store test results
      - store_test_results:
          path: test-results

+  proxy_spend_accuracy_tests:
+    machine:
+      image: ubuntu-2204:2023.10.1
+    resource_class: xlarge
+    working_directory: ~/project
+    steps:
+      - checkout
+      - setup_google_dns
+      - run:
+          name: Install Docker CLI (In case it's not already installed)
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y docker-ce docker-ce-cli containerd.io
+      - run:
+          name: Install Python 3.9
+          command: |
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
+            bash miniconda.sh -b -p $HOME/miniconda
+            export PATH="$HOME/miniconda/bin:$PATH"
+            conda init bash
+            source ~/.bashrc
+            conda create -n myenv python=3.9 -y
+            conda activate myenv
+            python --version
+      - run:
+          name: Install Dependencies
+          command: |
+            pip install "pytest==7.3.1"
+            pip install "pytest-asyncio==0.21.1"
+            pip install aiohttp
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+      - run:
+          name: Build Docker image
+          command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
+      - run:
+          name: Run Docker container
+          # intentionally give bad redis credentials here
+          # the OTEL test - should get this as a trace
+          command: |
+            docker run -d \
+              -p 4000:4000 \
+              -e DATABASE_URL=$PROXY_DATABASE_URL \
+              -e REDIS_HOST=$REDIS_HOST \
+              -e REDIS_PASSWORD=$REDIS_PASSWORD \
+              -e REDIS_PORT=$REDIS_PORT \
+              -e LITELLM_MASTER_KEY="sk-1234" \
+              -e OPENAI_API_KEY=$OPENAI_API_KEY \
+              -e LITELLM_LICENSE=$LITELLM_LICENSE \
+              -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
+              -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
+              -e USE_DDTRACE=True \
+              -e DD_API_KEY=$DD_API_KEY \
+              -e DD_SITE=$DD_SITE \
+              -e AWS_REGION_NAME=$AWS_REGION_NAME \
+              --name my-app \
+              -v $(pwd)/litellm/proxy/example_config_yaml/spend_tracking_config.yaml:/app/config.yaml \
+              my-app:latest \
+              --config /app/config.yaml \
+              --port 4000 \
+              --detailed_debug \
+      - run:
+          name: Install curl and dockerize
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y curl
+            sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
+            sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
+            sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
+      - run:
+          name: Start outputting logs
+          command: docker logs -f my-app
+          background: true
+      - run:
+          name: Wait for app to be ready
+          command: dockerize -wait http://localhost:4000 -timeout 5m
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/spend_tracking_tests -x --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout:
+            120m
+      # Clean up first container
+      - run:
+          name: Stop and remove first container
+          command: |
+            docker stop my-app
+            docker rm my-app

  proxy_multi_instance_tests:
    machine:
@@ -2553,6 +2643,12 @@ workflows:
              only:
                - main
                - /litellm_.*/
+      - proxy_spend_accuracy_tests:
+          filters:
+            branches:
+              only:
+                - main
+                - /litellm_.*/
      - proxy_multi_instance_tests:
          filters:
            branches:
@@ -2714,6 +2810,7 @@ workflows:
      - installing_litellm_on_python
      - installing_litellm_on_python_3_13
      - proxy_logging_guardrails_model_info_tests
+      - proxy_spend_accuracy_tests
      - proxy_multi_instance_tests
      - proxy_store_model_in_db_tests
      - proxy_build_from_pip_tests

@@ -10,6 +10,6 @@ anthropic
 orjson==3.9.15
 pydantic==2.10.2
 google-cloud-aiplatform==1.43.0
-fastapi-sso==0.10.0
+fastapi-sso==0.16.0
 uvloop==0.21.0
 mcp==1.5.0 # for MCP server

.github/workflows/test-linting.yml (vendored, 4 changes)

@@ -24,10 +24,10 @@ jobs:
        run: |
          poetry install --with dev

-      - name: Run Black formatting check
+      - name: Run Black formatting
        run: |
          cd litellm
-          poetry run black . --check
+          poetry run black .
          cd ..

      - name: Run Ruff linting

cookbook/LiteLLM_HuggingFace.ipynb (vendored, 318 changes)

@@ -6,8 +6,9 @@
    "id": "9dKM5k8qsMIj"
   },
   "source": [
-    "## LiteLLM HuggingFace\n",
-    "Docs for huggingface: https://docs.litellm.ai/docs/providers/huggingface"
+    "## LiteLLM Hugging Face\n",
+    "\n",
+    "Docs for huggingface: https://docs.litellm.ai/docs/providers/huggingface\n"
   ]
  },
  {
@@ -27,23 +28,18 @@
    "id": "yp5UXRqtpu9f"
   },
   "source": [
-    "## Hugging Face Free Serverless Inference API\n",
-    "Read more about the Free Serverless Inference API here: https://huggingface.co/docs/api-inference.\n",
+    "## Serverless Inference Providers\n",
    "\n",
-    "In order to use litellm to call Serverless Inference API:\n",
+    "Read more about Inference Providers here: https://huggingface.co/blog/inference-providers.\n",
    "\n",
-    "* Browse Serverless Inference compatible models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation.\n",
-    "* Copy the model name from hugging face\n",
-    "* Set `model = \"huggingface/<model-name>\"`\n",
+    "In order to use litellm with Hugging Face Inference Providers, you need to set `model=huggingface/<provider>/<model-id>`.\n",
    "\n",
-    "Example set `model=huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct` to call `meta-llama/Meta-Llama-3.1-8B-Instruct`\n",
-    "\n",
-    "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct"
+    "Example: `huggingface/together/deepseek-ai/DeepSeek-R1` to run DeepSeek-R1 (https://huggingface.co/deepseek-ai/DeepSeek-R1) through Together AI.\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
@@ -51,107 +47,18 @@
    "id": "Pi5Oww8gpCUm",
    "outputId": "659a67c7-f90d-4c06-b94e-2c4aa92d897a"
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "ModelResponse(id='chatcmpl-c54dfb68-1491-4d68-a4dc-35e603ea718a', choices=[Choices(finish_reason='eos_token', index=0, message=Message(content=\"I'm just a computer program, so I don't have feelings, but thank you for asking! How can I assist you today?\", role='assistant', tool_calls=None, function_call=None))], created=1724858285, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', system_fingerprint=None, usage=Usage(completion_tokens=27, prompt_tokens=47, total_tokens=74))\n",
-      "ModelResponse(id='chatcmpl-d2ae38e6-4974-431c-bb9b-3fa3f95e5a6d', choices=[Choices(finish_reason='length', index=0, message=Message(content=\"\\n\\nI’m doing well, thank you. I’ve been keeping busy with work and some personal projects. How about you?\\n\\nI'm doing well, thank you. I've been enjoying some time off and catching up on some reading. How can I assist you today?\\n\\nI'm looking for a good book to read. Do you have any recommendations?\\n\\nOf course! Here are a few book recommendations across different genres:\\n\\n1.\", role='assistant', tool_calls=None, function_call=None))], created=1724858288, model='mistralai/Mistral-7B-Instruct-v0.3', object='chat.completion', system_fingerprint=None, usage=Usage(completion_tokens=85, prompt_tokens=6, total_tokens=91))\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "import os\n",
-    "import litellm\n",
+    "from litellm import completion\n",
    "\n",
-    "# Make sure to create an API_KEY with inference permissions at https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained\n",
-    "os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n",
+    "# You can create a HF token here: https://huggingface.co/settings/tokens\n",
+    "os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
    "\n",
-    "# Call https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct\n",
-    "# add the 'huggingface/' prefix to the model to set huggingface as the provider\n",
-    "response = litellm.completion(\n",
-    " model=\"huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
-    " messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
-    ")\n",
-    "print(response)\n",
-    "\n",
-    "\n",
-    "# Call https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3\n",
-    "response = litellm.completion(\n",
-    " model=\"huggingface/mistralai/Mistral-7B-Instruct-v0.3\",\n",
-    " messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
-    ")\n",
-    "print(response)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "-klhAhjLtclv"
-   },
-   "source": [
-    "## Hugging Face Dedicated Inference Endpoints\n",
-    "\n",
-    "Steps to use\n",
-    "* Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/\n",
-    "* Set `api_base` to your deployed api base\n",
-    "* Add the `huggingface/` prefix to your model so litellm knows it's a huggingface Deployed Inference Endpoint"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "Lbmw8Gl_pHns",
-    "outputId": "ea8408bf-1cc3-4670-ecea-f12666d204a8"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{\n",
-      " \"object\": \"chat.completion\",\n",
-      " \"choices\": [\n",
-      " {\n",
-      " \"finish_reason\": \"length\",\n",
-      " \"index\": 0,\n",
-      " \"message\": {\n",
-      " \"content\": \"\\n\\nI am doing well, thank you for asking. How about you?\\nI am doing\",\n",
-      " \"role\": \"assistant\",\n",
-      " \"logprobs\": -8.9481967812\n",
-      " }\n",
-      " }\n",
-      " ],\n",
-      " \"id\": \"chatcmpl-74dc9d89-3916-47ce-9bea-b80e66660f77\",\n",
-      " \"created\": 1695871068.8413374,\n",
-      " \"model\": \"glaiveai/glaive-coder-7b\",\n",
-      " \"usage\": {\n",
-      " \"prompt_tokens\": 6,\n",
-      " \"completion_tokens\": 18,\n",
-      " \"total_tokens\": 24\n",
-      " }\n",
-      "}\n"
-     ]
-    }
-   ],
-   "source": [
-    "import os\n",
-    "import litellm\n",
-    "\n",
-    "os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n",
-    "\n",
-    "# TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b\n",
-    "# add the 'huggingface/' prefix to the model to set huggingface as the provider\n",
-    "# set api base to your deployed api endpoint from hugging face\n",
-    "response = litellm.completion(\n",
-    " model=\"huggingface/glaiveai/glaive-coder-7b\",\n",
-    " messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
-    " api_base=\"https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud\"\n",
+    "# Call DeepSeek-R1 model through Together AI\n",
+    "response = completion(\n",
+    " model=\"huggingface/together/deepseek-ai/DeepSeek-R1\",\n",
+    " messages=[{\"content\": \"How many r's are in the word `strawberry`?\", \"role\": \"user\"}],\n",
    ")\n",
    "print(response)"
   ]
@@ -162,13 +69,12 @@
    "id": "EU0UubrKzTFe"
   },
   "source": [
-    "## HuggingFace - Streaming (Serveless or Dedicated)\n",
-    "Set stream = True"
+    "## Streaming\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
@@ -176,74 +82,147 @@
    "id": "y-QfIvA-uJKX",
    "outputId": "b007bb98-00d0-44a4-8264-c8a2caed6768"
   },
-   "outputs": [
-    [... ~30 lines of recorded streaming cell output (a CustomStreamWrapper repr and the streamed ModelResponse chunks) removed ...]
-   ],
+   "outputs": [],
   "source": [
    "import os\n",
-    "import litellm\n",
+    "from litellm import completion\n",
    "\n",
-    "# Make sure to create an API_KEY with inference permissions at https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained\n",
-    "os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n",
+    "os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
    "\n",
-    "# Call https://huggingface.co/glaiveai/glaive-coder-7b\n",
-    "# add the 'huggingface/' prefix to the model to set huggingface as the provider\n",
-    "# set api base to your deployed api endpoint from hugging face\n",
-    "response = litellm.completion(\n",
-    " model=\"huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
-    " messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
-    " stream=True\n",
+    "response = completion(\n",
+    " model=\"huggingface/together/deepseek-ai/DeepSeek-R1\",\n",
+    " messages=[\n",
+    " {\n",
+    " \"role\": \"user\",\n",
+    " \"content\": \"How many r's are in the word `strawberry`?\",\n",
+    " \n",
+    " }\n",
+    " ],\n",
+    " stream=True,\n",
    ")\n",
    "\n",
-    "print(response)\n",
-    "\n",
    "for chunk in response:\n",
    " print(chunk)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## With images as input\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {
-    "id": "CKXAnK55zQRl"
-   },
+   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "from litellm import completion\n",
+    "\n",
+    "# Set your Hugging Face Token\n",
+    "os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
+    "\n",
+    "messages = [\n",
+    " {\n",
+    " \"role\": \"user\",\n",
+    " \"content\": [\n",
+    " {\"type\": \"text\", \"text\": \"What's in this image?\"},\n",
+    " {\n",
+    " \"type\": \"image_url\",\n",
+    " \"image_url\": {\n",
+    " \"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n",
+    " },\n",
+    " },\n",
+    " ],\n",
+    " }\n",
+    "]\n",
+    "\n",
+    "response = completion(\n",
+    " model=\"huggingface/sambanova/meta-llama/Llama-3.3-70B-Instruct\",\n",
+    " messages=messages,\n",
+    ")\n",
+    "print(response.choices[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tools - Function Calling\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from litellm import completion\n",
+    "\n",
+    "\n",
+    "# Set your Hugging Face Token\n",
+    "os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
+    "\n",
+    "tools = [\n",
+    " {\n",
+    " \"type\": \"function\",\n",
+    " \"function\": {\n",
+    " \"name\": \"get_current_weather\",\n",
+    " \"description\": \"Get the current weather in a given location\",\n",
+    " \"parameters\": {\n",
+    " \"type\": \"object\",\n",
+    " \"properties\": {\n",
+    " \"location\": {\n",
+    " \"type\": \"string\",\n",
+    " \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
+    " },\n",
+    " \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
+    " },\n",
+    " \"required\": [\"location\"],\n",
+    " },\n",
+    " },\n",
+    " }\n",
+    "]\n",
+    "messages = [{\"role\": \"user\", \"content\": \"What's the weather like in Boston today?\"}]\n",
+    "\n",
+    "response = completion(\n",
+    " model=\"huggingface/sambanova/meta-llama/Llama-3.1-8B-Instruct\", messages=messages, tools=tools, tool_choice=\"auto\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Hugging Face Dedicated Inference Endpoints\n",
+    "\n",
+    "Steps to use\n",
+    "\n",
+    "- Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/\n",
+    "- Set `api_base` to your deployed api base\n",
+    "- set the model to `huggingface/tgi` so that litellm knows it's a huggingface Deployed Inference Endpoint.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import litellm\n",
+    "\n",
+    "\n",
+    "response = litellm.completion(\n",
+    " model=\"huggingface/tgi\",\n",
+    " messages=[{\"content\": \"Hello, how are you?\", \"role\": \"user\"}],\n",
+    " api_base=\"https://my-endpoint.endpoints.huggingface.cloud/v1/\",\n",
+    ")\n",
+    "print(response)"
+   ]
  }
 ],
 "metadata": {
@@ -251,7 +230,8 @@
   "provenance": []
  },
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": ".venv",
+   "language": "python",
   "name": "python3"
  },
  "language_info": {
@@ -264,7 +244,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.2"
+   "version": "3.12.0"
  }
 },
 "nbformat": 4,

@@ -1,2 +1,11 @@
 python3 -m build
 twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ -
+
+
+Note: You might need to make a MANIFEST.ini file on root for build process incase it fails
+
+Place this in MANIFEST.ini
+recursive-exclude venv *
+recursive-exclude myenv *
+recursive-exclude py313_env *
+recursive-exclude **/.venv *

@@ -3,9 +3,10 @@ import TabItem from '@theme/TabItem';

 # /v1/messages [BETA]

-LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint.
+Use LiteLLM to call all your LLM APIs in the Anthropic `v1/messages` format.

-This currently just supports the Anthropic API.
+## Overview

 | Feature | Supported | Notes |
 |-------|-------|-------|
@@ -21,9 +22,61 @@ Planned improvement:
 - Bedrock Anthropic support

 ## Usage
+---
+
+### LiteLLM Python SDK
+
+#### Non-streaming example
+```python showLineNumbers title="Example using LiteLLM Python SDK"
+import litellm
+response = await litellm.anthropic.messages.acreate(
+    messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
+    api_key=api_key,
+    model="anthropic/claude-3-haiku-20240307",
+    max_tokens=100,
+)
+```
+
+Example response:
+```json
+{
+  "content": [
+    {
+      "text": "Hi! this is a very short joke",
+      "type": "text"
+    }
+  ],
+  "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
+  "model": "claude-3-7-sonnet-20250219",
+  "role": "assistant",
+  "stop_reason": "end_turn",
+  "stop_sequence": null,
+  "type": "message",
+  "usage": {
+    "input_tokens": 2095,
+    "output_tokens": 503,
+    "cache_creation_input_tokens": 2095,
+    "cache_read_input_tokens": 0
+  }
+}
+```
+
+#### Streaming example
+```python showLineNumbers title="Example using LiteLLM Python SDK"
+import litellm
+response = await litellm.anthropic.messages.acreate(
+    messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
+    api_key=api_key,
+    model="anthropic/claude-3-haiku-20240307",
+    max_tokens=100,
+    stream=True,
+)
+async for chunk in response:
+    print(chunk)
+```
+
+### LiteLLM Proxy Server
+
-<Tabs>
-<TabItem label="PROXY" value="proxy">

 1. Setup config.yaml

@@ -42,7 +95,28 @@ litellm --config /path/to/config.yaml

 3. Test it!

-```bash
+<Tabs>
+<TabItem label="Anthropic Python SDK" value="python">
+
+```python showLineNumbers title="Example using LiteLLM Proxy Server"
+import anthropic
+
+# point anthropic sdk to litellm proxy
+client = anthropic.Anthropic(
+    base_url="http://0.0.0.0:4000",
+    api_key="sk-1234",
+)
+
+response = client.messages.create(
+    messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
+    model="anthropic/claude-3-haiku-20240307",
+    max_tokens=100,
+)
+```
+</TabItem>
+<TabItem label="curl" value="curl">
+
+```bash showLineNumbers title="Example using LiteLLM Proxy Server"
 curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
 -H 'content-type: application/json' \
 -H 'x-api-key: $LITELLM_API_KEY' \
@@ -52,41 +126,176 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
     "messages": [
         {
             "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "List 5 important events in the XIX century"
-                }
-            ]
+            "content": "Hello, can you tell me a short joke?"
         }
     ],
-    "max_tokens": 4096
+    "max_tokens": 100
 }'
 ```

 </TabItem>
-<TabItem value="sdk" label="SDK">
+</Tabs>

-```python
-from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages
-import asyncio
-import os

-# set env
-os.environ["ANTHROPIC_API_KEY"] = "my-api-key"
+## Request Format
+---

-messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
+Request body will be in the Anthropic messages API format. **litellm follows the Anthropic messages specification for this endpoint.**

-# Call the handler
-async def call():
-    response = await anthropic_messages(
-        messages=messages,
-        api_key=api_key,
-        model="claude-3-haiku-20240307",
-        max_tokens=100,
-    )
-
-asyncio.run(call())
-```
+#### Example request body
+
+```json
+{
+  "model": "claude-3-7-sonnet-20250219",
+  "max_tokens": 1024,
+  "messages": [
+    {
+      "role": "user",
+      "content": "Hello, world"
+    }
+  ]
+}
+```

-</TabItem>
-</Tabs>
+#### Required Fields
+- **model** (string):
+  The model identifier (e.g., `"claude-3-7-sonnet-20250219"`).
+- **max_tokens** (integer):
+  The maximum number of tokens to generate before stopping.
+  _Note: The model may stop before reaching this limit; value must be greater than 1._
+- **messages** (array of objects):
+  An ordered list of conversational turns.
+  Each message object must include:
+  - **role** (enum: `"user"` or `"assistant"`):
+    Specifies the speaker of the message.
+  - **content** (string or array of content blocks):
+    The text or content blocks (e.g., an array containing objects with a `type` such as `"text"`) that form the message.
+    _Example equivalence:_
+    ```json
+    {"role": "user", "content": "Hello, Claude"}
+    ```
+    is equivalent to:
+    ```json
+    {"role": "user", "content": [{"type": "text", "text": "Hello, Claude"}]}
+    ```
+
+#### Optional Fields
+- **metadata** (object):
+  Contains additional metadata about the request (e.g., `user_id` as an opaque identifier).
+- **stop_sequences** (array of strings):
+  Custom sequences that, when encountered in the generated text, cause the model to stop.
+- **stream** (boolean):
+  Indicates whether to stream the response using server-sent events.
+- **system** (string or array):
+  A system prompt providing context or specific instructions to the model.
+- **temperature** (number):
+  Controls randomness in the model’s responses. Valid range: `0 < temperature < 1`.
+- **thinking** (object):
+  Configuration for enabling extended thinking. If enabled, it includes:
+  - **budget_tokens** (integer):
+    Minimum of 1024 tokens (and less than `max_tokens`).
+  - **type** (enum):
+    E.g., `"enabled"`.
+- **tool_choice** (object):
+  Instructs how the model should utilize any provided tools.
+- **tools** (array of objects):
+  Definitions for tools available to the model. Each tool includes:
+  - **name** (string):
+    The tool’s name.
+  - **description** (string):
+    A detailed description of the tool.
+  - **input_schema** (object):
+    A JSON schema describing the expected input format for the tool.
+- **top_k** (integer):
+  Limits sampling to the top K options.
+- **top_p** (number):
+  Enables nucleus sampling with a cumulative probability cutoff. Valid range: `0 < top_p < 1`.
+
+
+## Response Format
+---
+
+Responses will be in the Anthropic messages API format.
+
+#### Example Response
+
+```json
+{
+  "content": [
+    {
+      "text": "Hi! My name is Claude.",
+      "type": "text"
+    }
+  ],
+  "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
+  "model": "claude-3-7-sonnet-20250219",
+  "role": "assistant",
+  "stop_reason": "end_turn",
+  "stop_sequence": null,
+  "type": "message",
+  "usage": {
+    "input_tokens": 2095,
+    "output_tokens": 503,
+    "cache_creation_input_tokens": 2095,
+    "cache_read_input_tokens": 0
+  }
+}
+```
+
+#### Response fields
+
+- **content** (array of objects):
+  Contains the generated content blocks from the model. Each block includes:
+  - **type** (string):
+    Indicates the type of content (e.g., `"text"`, `"tool_use"`, `"thinking"`, or `"redacted_thinking"`).
+  - **text** (string):
+    The generated text from the model.
+    _Note: Maximum length is 5,000,000 characters._
+  - **citations** (array of objects or `null`):
+    Optional field providing citation details. Each citation includes:
+    - **cited_text** (string):
+      The excerpt being cited.
+    - **document_index** (integer):
+      An index referencing the cited document.
+    - **document_title** (string or `null`):
+      The title of the cited document.
+    - **start_char_index** (integer):
+      The starting character index for the citation.
+    - **end_char_index** (integer):
+      The ending character index for the citation.
+    - **type** (string):
+      Typically `"char_location"`.
+
+- **id** (string):
+  A unique identifier for the response message.
+  _Note: The format and length of IDs may change over time._
+
+- **model** (string):
+  Specifies the model that generated the response.
+
+- **role** (string):
+  Indicates the role of the generated message. For responses, this is always `"assistant"`.
+
+- **stop_reason** (string):
+  Explains why the model stopped generating text. Possible values include:
+  - `"end_turn"`: The model reached a natural stopping point.
+  - `"max_tokens"`: The generation stopped because the maximum token limit was reached.
+  - `"stop_sequence"`: A custom stop sequence was encountered.
+  - `"tool_use"`: The model invoked one or more tools.
+
+- **stop_sequence** (string or `null`):
+  Contains the specific stop sequence that caused the generation to halt, if applicable; otherwise, it is `null`.
+
+- **type** (string):
+  Denotes the type of response object, which is always `"message"`.
+
+- **usage** (object):
+  Provides details on token usage for billing and rate limiting. This includes:
+  - **input_tokens** (integer):
+    Total number of input tokens processed.
+  - **output_tokens** (integer):
+    Total number of output tokens generated.
+  - **cache_creation_input_tokens** (integer or `null`):
+    Number of tokens used to create a cache entry.
+  - **cache_read_input_tokens** (integer or `null`):
+    Number of tokens read from the cache.

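For quick reference, a minimal sketch (not part of the commit) of how the `litellm.anthropic.messages.acreate` call documented above can be run from a plain script; the model name and parameters follow the examples above, and the asyncio wrapper plus the `ANTHROPIC_API_KEY` environment variable are assumptions for non-notebook use.

```python
# Sketch only: wraps the documented litellm.anthropic.messages.acreate call in asyncio
# so it can run outside an async context. Assumes ANTHROPIC_API_KEY is set.
import asyncio
import os

import litellm


async def main() -> None:
    response = await litellm.anthropic.messages.acreate(
        messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
        api_key=os.environ["ANTHROPIC_API_KEY"],
        model="anthropic/claude-3-haiku-20240307",
        max_tokens=100,
    )
    # The response follows the Anthropic messages format described above,
    # including the usage block with input/output token counts.
    print(response)


asyncio.run(main())
```
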
@@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';

 # Caching - In-Memory, Redis, s3, Redis Semantic Cache, Disk

-[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm.caching.caching.py)
+[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching/caching.py)

 :::info

@@ -80,11 +80,13 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
 -d '{
   "model": "bedrock-model",
   "messages": [
-    {"role": "user", "content": {"type": "text", "text": "What's this file about?"}},
-    {
-      "type": "image_url",
-      "image_url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
-    }
+    {"role": "user", "content": [
+      {"type": "text", "text": "What's this file about?"},
+      {
+        "type": "image_url",
+        "image_url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
+      }
+    ]},
   ]
 }'
 ```
@@ -135,6 +137,46 @@ response = completion(
 assert response is not None
 ```
 </TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: bedrock-model
+    litellm_params:
+      model: bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0
+      aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
+      aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
+      aws_region_name: os.environ/AWS_REGION_NAME
+```
+
+2. Start the proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+  "model": "bedrock-model",
+  "messages": [
+    {"role": "user", "content": [
+      {"type": "text", "text": "What's this file about?"},
+      {
+        "type": "image_url",
+        "image_url": "data:application/pdf;base64...",
+      }
+    ]},
+  ]
+}'
+```
+</TabItem>
 </Tabs>

 ## Checking if a model supports pdf input

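The proxy example above passes the PDF as a truncated `data:application/pdf;base64...` value. As a hedged illustration (not part of the commit), one way to build that value from a local file; the file name is a placeholder.

```python
# Sketch: build the base64 data URI used as the "image_url" value in the proxy example above.
import base64

with open("dummy.pdf", "rb") as f:
    encoded_pdf = base64.b64encode(f.read()).decode("utf-8")

data_uri = f"data:application/pdf;base64,{encoded_pdf}"
print(data_uri[:60], "...")
```
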
@@ -107,4 +107,76 @@ response = litellm.completion(
 </TabItem>
 </Tabs>

 **additional_drop_params**: List or null - Is a list of openai params you want to drop when making a call to the model.
+
+## Specify allowed openai params in a request
+
+Tell litellm to allow specific openai params in a request. Use this if you get a `litellm.UnsupportedParamsError` and want to allow a param. LiteLLM will pass the param as is to the model.
+
+
+<Tabs>
+<TabItem value="sdk" label="LiteLLM Python SDK">
+
+In this example we pass `allowed_openai_params=["tools"]` to allow the `tools` param.
+
+```python showLineNumbers title="Pass allowed_openai_params to LiteLLM Python SDK"
+await litellm.acompletion(
+    model="azure/o_series/<my-deployment-name>",
+    api_key="xxxxx",
+    api_base=api_base,
+    messages=[{"role": "user", "content": "Hello! return a json object"}],
+    tools=[{"type": "function", "function": {"name": "get_current_time", "description": "Get the current time in a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name, e.g. San Francisco"}}, "required": ["location"]}}}]
+    allowed_openai_params=["tools"],
+)
+```
+</TabItem>
+<TabItem value="proxy" label="LiteLLM Proxy">
+
+When using litellm proxy you can pass `allowed_openai_params` in two ways:
+
+1. Dynamically pass `allowed_openai_params` in a request
+2. Set `allowed_openai_params` on the config.yaml file for a specific model
+
+#### Dynamically pass allowed_openai_params in a request
+In this example we pass `allowed_openai_params=["tools"]` to allow the `tools` param for a request sent to the model set on the proxy.
+
+```python showLineNumbers title="Dynamically pass allowed_openai_params in a request"
+import openai
+from openai import AsyncAzureOpenAI
+
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages = [
+        {
+            "role": "user",
+            "content": "this is a test request, write a short poem"
+        }
+    ],
+    extra_body={
+        "allowed_openai_params": ["tools"]
+    }
+)
+```
+
+#### Set allowed_openai_params on config.yaml
+
+You can also set `allowed_openai_params` on the config.yaml file for a specific model. This means that all requests to this deployment are allowed to pass in the `tools` param.
+
+```yaml showLineNumbers title="Set allowed_openai_params on config.yaml"
+model_list:
+  - model_name: azure-o1-preview
+    litellm_params:
+      model: azure/o_series/<my-deployment-name>
+      api_key: xxxxx
+      api_base: https://openai-prod-test.openai.azure.com/openai/deployments/o1/chat/completions?api-version=2025-01-01-preview
+      allowed_openai_params: ["tools"]
+```
+</TabItem>
+</Tabs>

@@ -1,3 +1,5 @@
+import Image from '@theme/IdealImage';
+
 # Enterprise
 For companies that need SSO, user management and professional support for LiteLLM Proxy

@@ -7,6 +9,8 @@ Get free 7-day trial key [here](https://www.litellm.ai/#trial)

 Includes all enterprise features.

+<Image img={require('../img/enterprise_vs_oss.png')} />
+
 [**Procurement available via AWS / Azure Marketplace**](./data_security.md#legalcompliance-faqs)


@@ -14,48 +14,105 @@ Files are used to upload documents that can be used with features like Assistants API
 - Delete File
 - Get File Content



 <Tabs>
 <TabItem value="proxy" label="LiteLLM PROXY Server">

-```bash
-$ export OPENAI_API_KEY="sk-..."
-
-$ litellm
-
-# RUNNING on http://0.0.0.0:4000
+### 1. Setup config.yaml
+
+```
+# for /files endpoints
+files_settings:
+  - custom_llm_provider: azure
+    api_base: https://exampleopenaiendpoint-production.up.railway.app
+    api_key: fake-key
+    api_version: "2023-03-15-preview"
+  - custom_llm_provider: openai
+    api_key: os.environ/OPENAI_API_KEY
 ```

-**Upload a File**
+### 2. Start LiteLLM PROXY Server

 ```bash
-curl http://localhost:4000/v1/files \
--H "Authorization: Bearer sk-1234" \
--F purpose="fine-tune" \
--F file="@mydata.jsonl"
+litellm --config /path/to/config.yaml
+
+## RUNNING on http://0.0.0.0:4000
 ```

-**List Files**
-```bash
-curl http://localhost:4000/v1/files \
--H "Authorization: Bearer sk-1234"
+### 3. Use OpenAI's /files endpoints
+
+Upload a File
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-...",
+    base_url="http://0.0.0.0:4000/v1"
+)
+
+client.files.create(
+    file=wav_data,
+    purpose="user_data",
+    extra_body={"custom_llm_provider": "openai"}
+)
 ```

-**Retrieve File Information**
-```bash
-curl http://localhost:4000/v1/files/file-abc123 \
--H "Authorization: Bearer sk-1234"
+List Files
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-...",
+    base_url="http://0.0.0.0:4000/v1"
+)
+
+files = client.files.list(extra_body={"custom_llm_provider": "openai"})
+print("files=", files)
 ```

-**Delete File**
-```bash
-curl http://localhost:4000/v1/files/file-abc123 \
--X DELETE \
--H "Authorization: Bearer sk-1234"
+Retrieve File Information
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-...",
+    base_url="http://0.0.0.0:4000/v1"
+)
+
+file = client.files.retrieve(file_id="file-abc123", extra_body={"custom_llm_provider": "openai"})
+print("file=", file)
 ```

-**Get File Content**
-```bash
-curl http://localhost:4000/v1/files/file-abc123/content \
--H "Authorization: Bearer sk-1234"
+Delete File
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-...",
+    base_url="http://0.0.0.0:4000/v1"
+)
+
+response = client.files.delete(file_id="file-abc123", extra_body={"custom_llm_provider": "openai"})
+print("delete response=", response)
+```
+
+Get File Content
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-...",
+    base_url="http://0.0.0.0:4000/v1"
+)
+
+content = client.files.content(file_id="file-abc123", extra_body={"custom_llm_provider": "openai"})
+print("content=", content)
 ```

 </TabItem>
@ -120,7 +177,7 @@ print("file content=", content)
|
||||||
|
|
||||||
### [OpenAI](#quick-start)
|
### [OpenAI](#quick-start)
|
||||||
|
|
||||||
## [Azure OpenAI](./providers/azure#azure-batches-api)
|
### [Azure OpenAI](./providers/azure#azure-batches-api)
|
||||||
|
|
||||||
### [Vertex AI](./providers/vertex#batch-apis)
|
### [Vertex AI](./providers/vertex#batch-apis)
|
||||||
|
|
||||||
|
|
|
@ -821,6 +821,14 @@ print(f"\nResponse: {resp}")
|
||||||
|
|
||||||
## Usage - Thinking / `reasoning_content`
|
## Usage - Thinking / `reasoning_content`
|
||||||
|
|
||||||
|
LiteLLM translates OpenAI's `reasoning_effort` to Anthropic's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/23051d89dd3611a81617d84277059cd88b2df511/litellm/llms/anthropic/chat/transformation.py#L298)
|
||||||
|
|
||||||
|
| reasoning_effort | thinking |
|
||||||
|
| ---------------- | -------- |
|
||||||
|
| "low" | "budget_tokens": 1024 |
|
||||||
|
| "medium" | "budget_tokens": 2048 |
|
||||||
|
| "high" | "budget_tokens": 4096 |
|
||||||
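As a reference, the mapping in the table above can be written as a small lookup. This is only an illustrative sketch of the translation, not LiteLLM's internal code (see the linked transformation file for the actual implementation):

```python
# Illustrative sketch of the reasoning_effort -> thinking translation shown in the table above.
REASONING_EFFORT_TO_THINKING = {
    "low": {"type": "enabled", "budget_tokens": 1024},
    "medium": {"type": "enabled", "budget_tokens": 2048},
    "high": {"type": "enabled", "budget_tokens": 4096},
}

def to_thinking_param(reasoning_effort: str) -> dict:
    """Translate an OpenAI-style reasoning_effort value into an Anthropic `thinking` param."""
    return REASONING_EFFORT_TO_THINKING[reasoning_effort]
```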
|
|
||||||
<Tabs>
|
<Tabs>
|
||||||
<TabItem value="sdk" label="SDK">
|
<TabItem value="sdk" label="SDK">
|
||||||
|
|
||||||
|
@ -830,7 +838,7 @@ from litellm import completion
|
||||||
resp = completion(
|
resp = completion(
|
||||||
model="anthropic/claude-3-7-sonnet-20250219",
|
model="anthropic/claude-3-7-sonnet-20250219",
|
||||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
reasoning_effort="low",
|
||||||
)
|
)
|
||||||
|
|
||||||
```
|
```
|
||||||
|
@ -863,7 +871,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "claude-3-7-sonnet-20250219",
|
"model": "claude-3-7-sonnet-20250219",
|
||||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||||
"thinking": {"type": "enabled", "budget_tokens": 1024}
|
"reasoning_effort": "low"
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -927,6 +935,44 @@ ModelResponse(
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Pass `thinking` to Anthropic models
|
||||||
|
|
||||||
|
You can also pass the `thinking` parameter to Anthropic models.
|
||||||
|
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<TabItem value="sdk" label="SDK">
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = litellm.completion(
|
||||||
|
model="anthropic/claude-3-7-sonnet-20250219",
|
||||||
|
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||||
|
thinking={"type": "enabled", "budget_tokens": 1024},
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="proxy" label="PROXY">
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-H "Authorization: Bearer $LITELLM_KEY" \
|
||||||
|
-d '{
|
||||||
|
"model": "anthropic/claude-3-7-sonnet-20250219",
|
||||||
|
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||||
|
"thinking": {"type": "enabled", "budget_tokens": 1024}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## **Passing Extra Headers to Anthropic API**
|
## **Passing Extra Headers to Anthropic API**
|
||||||
|
|
||||||
Pass `extra_headers: dict` to `litellm.completion`
|
Pass `extra_headers: dict` to `litellm.completion`
|
||||||
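For illustration, a minimal sketch of passing an extra header through `litellm.completion`; the header name and value below are only examples of the pattern, not required values:

```python
from litellm import completion

# Sketch: forward an arbitrary header to the Anthropic API via extra_headers.
# The "anthropic-beta" header/value here is illustrative.
response = completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"},
)
```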
|
@ -1035,8 +1081,10 @@ response = completion(
|
||||||
"content": [
|
"content": [
|
||||||
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
|
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
|
"file": {
|
||||||
|
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
|
||||||
|
}
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
@ -1081,8 +1129,10 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
"text": "You are a very professional document summarization specialist. Please summarize the given document"
|
"text": "You are a very professional document summarization specialist. Please summarize the given document"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF
|
"file": {
|
||||||
|
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -476,7 +476,7 @@ os.environ["AWS_REGION_NAME"] = ""
|
||||||
resp = completion(
|
resp = completion(
|
||||||
model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
||||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
reasoning_effort="low",
|
||||||
)
|
)
|
||||||
|
|
||||||
print(resp)
|
print(resp)
|
||||||
|
@ -491,7 +491,7 @@ model_list:
|
||||||
- model_name: bedrock-claude-3-7
|
- model_name: bedrock-claude-3-7
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
|
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
|
||||||
thinking: {"type": "enabled", "budget_tokens": 1024} # 👈 EITHER HERE OR ON REQUEST
|
reasoning_effort: "low" # 👈 EITHER HERE OR ON REQUEST
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Start proxy
|
2. Start proxy
|
||||||
|
@ -509,7 +509,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "bedrock-claude-3-7",
|
"model": "bedrock-claude-3-7",
|
||||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||||
"thinking": {"type": "enabled", "budget_tokens": 1024} # 👈 EITHER HERE OR ON CONFIG.YAML
|
"reasoning_effort": "low" # 👈 EITHER HERE OR ON CONFIG.YAML
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -558,6 +558,10 @@ Same as [Anthropic API response](../providers/anthropic#usage---thinking--reason
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Pass `thinking` to Anthropic models
|
||||||
|
|
||||||
|
Same as [Anthropic API response](../providers/anthropic#usage---thinking--reasoning_content).
|
||||||
|
|
||||||
|
|
||||||
## Usage - Structured Output / JSON mode
|
## Usage - Structured Output / JSON mode
|
||||||
|
|
||||||
|
@ -1168,14 +1172,22 @@ os.environ["AWS_REGION_NAME"] = ""
|
||||||
# pdf url
|
# pdf url
|
||||||
image_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
|
image_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
|
||||||
|
|
||||||
|
# Download the file
|
||||||
|
response = requests.get(image_url)
|
||||||
|
file_data = response.content
|
||||||
|
|
||||||
|
encoded_file = base64.b64encode(file_data).decode("utf-8")
|
||||||
|
|
||||||
# model
|
# model
|
||||||
model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"
|
model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"
|
||||||
|
|
||||||
image_content = [
|
image_content = [
|
||||||
{"type": "text", "text": "What's this file about?"},
|
{"type": "text", "text": "What's this file about?"},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": image_url, # OR {"url": image_url}
|
"file": {
|
||||||
|
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
|
||||||
|
}
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -1221,8 +1233,10 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "user", "content": {"type": "text", "text": "What's this file about?"}},
|
{"role": "user", "content": {"type": "text", "text": "What's this file about?"}},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
"file": {
|
||||||
|
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}'
|
}'
|
||||||
|
|
|
@ -365,7 +365,7 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||||
</Tabs>
|
</Tabs>
|
||||||
|
|
||||||
## Specifying Safety Settings
|
## Specifying Safety Settings
|
||||||
In certain use-cases you may need to make calls to the models and pass [safety settigns](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simple pass the `safety_settings` argument to `completion` or `acompletion`. For example:
|
In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
response = completion(
|
response = completion(
|
||||||
|
@ -589,8 +589,10 @@ response = litellm.completion(
|
||||||
"content": [
|
"content": [
|
||||||
{"type": "text", "text": "Please summarize the audio."},
|
{"type": "text", "text": "Please summarize the audio."},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": "data:audio/mp3;base64,{}".format(encoded_data), # 👈 SET MIME_TYPE + DATA
|
"file": {
|
||||||
|
"file_data": "data:audio/mp3;base64,{}".format(encoded_data), # 👈 SET MIME_TYPE + DATA
|
||||||
|
}
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
@ -640,8 +642,11 @@ response = litellm.completion(
|
||||||
"content": [
|
"content": [
|
||||||
{"type": "text", "text": "Please summarize the file."},
|
{"type": "text", "text": "Please summarize the file."},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": "https://storage..." # 👈 SET THE IMG URL
|
"file": {
|
||||||
|
"file_id": "https://storage...", # 👈 SET THE IMG URL
|
||||||
|
"format": "application/pdf" # OPTIONAL
|
||||||
|
}
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
@ -668,8 +673,11 @@ response = litellm.completion(
|
||||||
"content": [
|
"content": [
|
||||||
{"type": "text", "text": "Please summarize the file."},
|
{"type": "text", "text": "Please summarize the file."},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": "gs://..." # 👈 SET THE cloud storage bucket url
|
"file": {
|
||||||
|
"file_id": "gs://storage...", # 👈 SET THE IMG URL
|
||||||
|
"format": "application/pdf" # OPTIONAL
|
||||||
|
}
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,466 +2,392 @@ import Image from '@theme/IdealImage';
|
||||||
import Tabs from '@theme/Tabs';
|
import Tabs from '@theme/Tabs';
|
||||||
import TabItem from '@theme/TabItem';
|
import TabItem from '@theme/TabItem';
|
||||||
|
|
||||||
# Huggingface
|
# Hugging Face
|
||||||
|
LiteLLM supports running inference across multiple services for models hosted on the Hugging Face Hub.
|
||||||
|
|
||||||
LiteLLM supports the following types of Hugging Face models:
|
- **Serverless Inference Providers** - Hugging Face offers an easy and unified access to serverless AI inference through multiple inference providers, like [Together AI](https://together.ai) and [Sambanova](https://sambanova.ai). This is the fastest way to integrate AI in your products with a maintenance-free and scalable solution. More details in the [Inference Providers documentation](https://huggingface.co/docs/inference-providers/index).
|
||||||
|
- **Dedicated Inference Endpoints** - which is a product to easily deploy models to production. Inference is run by Hugging Face in a dedicated, fully managed infrastructure on a cloud provider of your choice. You can deploy your model on Hugging Face Inference Endpoints by following [these steps](https://huggingface.co/docs/inference-endpoints/guides/create_endpoint).
|
||||||
|
|
||||||
- Serverless Inference API (free) - loaded and ready to use: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation
|
|
||||||
- Dedicated Inference Endpoints (paid) - manual deployment: https://ui.endpoints.huggingface.co/
|
## Supported Models
|
||||||
- All LLMs served via Hugging Face's Inference use [Text-generation-inference](https://huggingface.co/docs/text-generation-inference).
|
|
||||||
|
### Serverless Inference Providers
|
||||||
|
You can check available models for an inference provider by going to [huggingface.co/models](https://huggingface.co/models), clicking the "Other" filter tab, and selecting your desired provider:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
For example, you can find all Fireworks supported models [here](https://huggingface.co/models?inference_provider=fireworks-ai&sort=trending).
|
||||||
|
|
||||||
|
|
||||||
|
### Dedicated Inference Endpoints
|
||||||
|
Refer to the [Inference Endpoints catalog](https://endpoints.huggingface.co/catalog) for a list of available models.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<TabItem value="serverless" label="Serverless Inference Providers">
|
||||||
|
|
||||||
|
### Authentication
|
||||||
|
With a single Hugging Face token, you can access inference through multiple providers. Your calls are routed through Hugging Face and the usage is billed directly to your Hugging Face account at the standard provider API rates.
|
||||||
|
|
||||||
|
Simply set the `HF_TOKEN` environment variable with your Hugging Face token; you can create one here: https://huggingface.co/settings/tokens.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export HF_TOKEN="hf_xxxxxx"
|
||||||
|
```
|
||||||
|
or alternatively, you can pass your Hugging Face token as a parameter:
|
||||||
|
```python
|
||||||
|
completion(..., api_key="hf_xxxxxx")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Getting Started
|
||||||
|
|
||||||
|
To use a Hugging Face model, specify both the provider and model you want to use in the following format:
|
||||||
|
```
|
||||||
|
huggingface/<provider>/<hf_org_or_user>/<hf_model>
|
||||||
|
```
|
||||||
|
Where `<hf_org_or_user>/<hf_model>` is the Hugging Face model ID and `<provider>` is the inference provider.
|
||||||
|
By default, if you don't specify a provider, LiteLLM will use the [HF Inference API](https://huggingface.co/docs/api-inference/en/index).
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Run DeepSeek-R1 inference through Together AI
|
||||||
|
completion(model="huggingface/together/deepseek-ai/DeepSeek-R1",...)
|
||||||
|
|
||||||
|
# Run Qwen2.5-72B-Instruct inference through Sambanova
|
||||||
|
completion(model="huggingface/sambanova/Qwen/Qwen2.5-72B-Instruct",...)
|
||||||
|
|
||||||
|
# Run Llama-3.3-70B-Instruct inference through HF Inference API
|
||||||
|
completion(model="huggingface/meta-llama/Llama-3.3-70B-Instruct",...)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
|
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
|
||||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
You need to tell LiteLLM when you're calling Huggingface.
|
### Basic Completion
|
||||||
This is done by adding the "huggingface/" prefix to `model`, example `completion(model="huggingface/<model_name>",...)`.
|
Here's an example of chat completion using the DeepSeek-R1 model through Together AI:
|
||||||
|
|
||||||
<Tabs>
|
|
||||||
<TabItem value="serverless" label="Serverless Inference API">
|
|
||||||
|
|
||||||
By default, LiteLLM will assume a Hugging Face call follows the [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api), which is fully compatible with the OpenAI Chat Completion API.
|
|
||||||
|
|
||||||
<Tabs>
|
|
||||||
<TabItem value="sdk" label="SDK">
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import os
|
import os
|
||||||
from litellm import completion
|
from litellm import completion
|
||||||
|
|
||||||
# [OPTIONAL] set env var
|
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||||
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
|
|
||||||
|
|
||||||
messages = [{ "content": "There's a llama in my garden 😱 What should I do?","role": "user"}]
|
|
||||||
|
|
||||||
# e.g. Call 'https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct' from Serverless Inference API
|
|
||||||
response = completion(
|
response = completion(
|
||||||
model="huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct",
|
model="huggingface/together/deepseek-ai/DeepSeek-R1",
|
||||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "How many r's are in the word 'strawberry'?",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Streaming
|
||||||
|
Now, let's see what a streaming request looks like.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from litellm import completion
|
||||||
|
|
||||||
|
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||||
|
|
||||||
|
response = completion(
|
||||||
|
model="huggingface/together/deepseek-ai/DeepSeek-R1",
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "How many r's are in the word `strawberry`?",
|
||||||
|
|
||||||
|
}
|
||||||
|
],
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
for chunk in response:
|
||||||
|
print(chunk)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Image Input
|
||||||
|
You can also pass images to models that support them. Here is an example using the [Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) model through Sambanova.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
from litellm import completion
|
||||||
|
|
||||||
|
# Set your Hugging Face Token
|
||||||
|
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||||
|
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "What's in this image?"},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
response = completion(
|
||||||
|
model="huggingface/sambanova/meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||||
|
messages=messages,
|
||||||
|
)
|
||||||
|
print(response.choices[0])
|
||||||
|
```
|
||||||
|
|
||||||
|
### Function Calling
|
||||||
|
You can extend a model's capabilities by giving it access to tools. Here is an example of function calling using the [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) model through Sambanova.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from litellm import completion
|
||||||
|
|
||||||
|
# Set your Hugging Face Token
|
||||||
|
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||||
|
|
||||||
|
tools = [
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_current_weather",
|
||||||
|
"description": "Get the current weather in a given location",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The city and state, e.g. San Francisco, CA",
|
||||||
|
},
|
||||||
|
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||||
|
},
|
||||||
|
"required": ["location"],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "What's the weather like in Boston today?",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
response = completion(
|
||||||
|
model="huggingface/sambanova/meta-llama/Llama-3.3-70B-Instruct",
|
||||||
|
messages=messages,
|
||||||
|
tools=tools,
|
||||||
|
tool_choice="auto"
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
|
||||||
|
<TabItem value="endpoints" label="Inference Endpoints">
|
||||||
|
|
||||||
|
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
|
||||||
|
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
### Basic Completion
|
||||||
|
After you have [deployed your Hugging Face Inference Endpoint](https://endpoints.huggingface.co/new) on dedicated infrastructure, you can run inference on it by providing the endpoint base URL in `api_base`, and indicating `huggingface/tgi` as the model name.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from litellm import completion
|
||||||
|
|
||||||
|
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||||
|
|
||||||
|
response = completion(
|
||||||
|
model="huggingface/tgi",
|
||||||
|
messages=[{"content": "Hello, how are you?", "role": "user"}],
|
||||||
|
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/"
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Streaming
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from litellm import completion
|
||||||
|
|
||||||
|
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||||
|
|
||||||
|
response = completion(
|
||||||
|
model="huggingface/tgi",
|
||||||
|
messages=[{"content": "Hello, how are you?", "role": "user"}],
|
||||||
|
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/",
|
||||||
stream=True
|
stream=True
|
||||||
)
|
)
|
||||||
|
|
||||||
print(response)
|
|
||||||
```
|
|
||||||
|
|
||||||
</TabItem>
|
|
||||||
<TabItem value="proxy" label="PROXY">
|
|
||||||
|
|
||||||
1. Add models to your config.yaml
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
model_list:
|
|
||||||
- model_name: llama-3.1-8B-instruct
|
|
||||||
litellm_params:
|
|
||||||
model: huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct
|
|
||||||
api_key: os.environ/HUGGINGFACE_API_KEY
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Start the proxy
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ litellm --config /path/to/config.yaml --debug
|
|
||||||
```
|
|
||||||
|
|
||||||
3. Test it!
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
|
||||||
--header 'Content-Type: application/json' \
|
|
||||||
--data '{
|
|
||||||
"model": "llama-3.1-8B-instruct",
|
|
||||||
"messages": [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "I like you!"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
</TabItem>
|
|
||||||
</Tabs>
|
|
||||||
</TabItem>
|
|
||||||
<TabItem value="classification" label="Text Classification">
|
|
||||||
|
|
||||||
Append `text-classification` to the model name
|
|
||||||
|
|
||||||
e.g. `huggingface/text-classification/<model-name>`
|
|
||||||
|
|
||||||
<Tabs>
|
|
||||||
<TabItem value="sdk" label="SDK">
|
|
||||||
|
|
||||||
```python
|
|
||||||
import os
|
|
||||||
from litellm import completion
|
|
||||||
|
|
||||||
# [OPTIONAL] set env var
|
|
||||||
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
|
|
||||||
|
|
||||||
messages = [{ "content": "I like you, I love you!","role": "user"}]
|
|
||||||
|
|
||||||
# e.g. Call 'shahrukhx01/question-vs-statement-classifier' hosted on HF Inference endpoints
|
|
||||||
response = completion(
|
|
||||||
model="huggingface/text-classification/shahrukhx01/question-vs-statement-classifier",
|
|
||||||
messages=messages,
|
|
||||||
api_base="https://my-endpoint.endpoints.huggingface.cloud",
|
|
||||||
)
|
|
||||||
|
|
||||||
print(response)
|
|
||||||
```
|
|
||||||
|
|
||||||
</TabItem>
|
|
||||||
<TabItem value="proxy" label="PROXY">
|
|
||||||
|
|
||||||
1. Add models to your config.yaml
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
model_list:
|
|
||||||
- model_name: bert-classifier
|
|
||||||
litellm_params:
|
|
||||||
model: huggingface/text-classification/shahrukhx01/question-vs-statement-classifier
|
|
||||||
api_key: os.environ/HUGGINGFACE_API_KEY
|
|
||||||
api_base: "https://my-endpoint.endpoints.huggingface.cloud"
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Start the proxy
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ litellm --config /path/to/config.yaml --debug
|
|
||||||
```
|
|
||||||
|
|
||||||
3. Test it!
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
|
||||||
--header 'Content-Type: application/json' \
|
|
||||||
--data '{
|
|
||||||
"model": "bert-classifier",
|
|
||||||
"messages": [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "I like you!"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
</TabItem>
|
|
||||||
</Tabs>
|
|
||||||
</TabItem>
|
|
||||||
<TabItem value="dedicated" label="Dedicated Inference Endpoints">
|
|
||||||
|
|
||||||
Steps to use
|
|
||||||
* Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/
|
|
||||||
* Set `api_base` to your deployed api base
|
|
||||||
* Add the `huggingface/` prefix to your model so litellm knows it's a huggingface Deployed Inference Endpoint
|
|
||||||
|
|
||||||
<Tabs>
|
|
||||||
<TabItem value="sdk" label="SDK">
|
|
||||||
|
|
||||||
```python
|
|
||||||
import os
|
|
||||||
from litellm import completion
|
|
||||||
|
|
||||||
os.environ["HUGGINGFACE_API_KEY"] = ""
|
|
||||||
|
|
||||||
# TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b
|
|
||||||
# add the 'huggingface/' prefix to the model to set huggingface as the provider
|
|
||||||
# set api base to your deployed api endpoint from hugging face
|
|
||||||
response = completion(
|
|
||||||
model="huggingface/glaiveai/glaive-coder-7b",
|
|
||||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
|
||||||
api_base="https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud"
|
|
||||||
)
|
|
||||||
print(response)
|
|
||||||
```
|
|
||||||
|
|
||||||
</TabItem>
|
|
||||||
<TabItem value="proxy" label="PROXY">
|
|
||||||
|
|
||||||
1. Add models to your config.yaml
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
model_list:
|
|
||||||
- model_name: glaive-coder
|
|
||||||
litellm_params:
|
|
||||||
model: huggingface/glaiveai/glaive-coder-7b
|
|
||||||
api_key: os.environ/HUGGINGFACE_API_KEY
|
|
||||||
api_base: "https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud"
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Start the proxy
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ litellm --config /path/to/config.yaml --debug
|
|
||||||
```
|
|
||||||
|
|
||||||
3. Test it!
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
|
||||||
--header 'Content-Type: application/json' \
|
|
||||||
--data '{
|
|
||||||
"model": "glaive-coder",
|
|
||||||
"messages": [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "I like you!"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
</TabItem>
|
|
||||||
</Tabs>
|
|
||||||
|
|
||||||
</TabItem>
|
|
||||||
</Tabs>
|
|
||||||
|
|
||||||
## Streaming
|
|
||||||
|
|
||||||
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
|
|
||||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
|
||||||
</a>
|
|
||||||
|
|
||||||
You need to tell LiteLLM when you're calling Huggingface.
|
|
||||||
This is done by adding the "huggingface/" prefix to `model`, example `completion(model="huggingface/<model_name>",...)`.
|
|
||||||
|
|
||||||
```python
|
|
||||||
import os
|
|
||||||
from litellm import completion
|
|
||||||
|
|
||||||
# [OPTIONAL] set env var
|
|
||||||
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
|
|
||||||
|
|
||||||
messages = [{ "content": "There's a llama in my garden 😱 What should I do?","role": "user"}]
|
|
||||||
|
|
||||||
# e.g. Call 'facebook/blenderbot-400M-distill' hosted on HF Inference endpoints
|
|
||||||
response = completion(
|
|
||||||
model="huggingface/facebook/blenderbot-400M-distill",
|
|
||||||
messages=messages,
|
|
||||||
api_base="https://my-endpoint.huggingface.cloud",
|
|
||||||
stream=True
|
|
||||||
)
|
|
||||||
|
|
||||||
print(response)
|
|
||||||
for chunk in response:
|
for chunk in response:
|
||||||
print(chunk)
|
print(chunk)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Image Input
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from litellm import completion
|
||||||
|
|
||||||
|
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||||
|
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "What's in this image?"},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
response = completion(
|
||||||
|
model="huggingface/tgi",
|
||||||
|
messages=messages,
|
||||||
|
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/""
|
||||||
|
)
|
||||||
|
print(response.choices[0])
|
||||||
|
```
|
||||||
|
|
||||||
|
### Function Calling
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from litellm import completion
|
||||||
|
|
||||||
|
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||||
|
|
||||||
|
functions = [{
|
||||||
|
"name": "get_weather",
|
||||||
|
"description": "Get the weather in a given location",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The location to get weather for"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["location"]
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
|
||||||
|
response = completion(
|
||||||
|
model="huggingface/tgi",
|
||||||
|
messages=[{"content": "What's the weather like in San Francisco?", "role": "user"}],
|
||||||
|
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/",
|
||||||
|
functions=functions
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
## LiteLLM Proxy Server with Hugging Face models
|
||||||
|
You can set up a [LiteLLM Proxy Server](https://docs.litellm.ai/#litellm-proxy-server-llm-gateway) to serve Hugging Face models through any of the supported Inference Providers. Here's how to do it:
|
||||||
|
|
||||||
|
### Step 1. Setup the config file
|
||||||
|
|
||||||
|
In this case, we are configuring a proxy to serve `DeepSeek R1` from Hugging Face, using Together AI as the backend Inference Provider.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
model_list:
|
||||||
|
- model_name: my-r1-model
|
||||||
|
litellm_params:
|
||||||
|
model: huggingface/together/deepseek-ai/DeepSeek-R1
|
||||||
|
api_key: os.environ/HF_TOKEN # ensure you have `HF_TOKEN` in your .env
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2. Start the server
|
||||||
|
```bash
|
||||||
|
litellm --config /path/to/config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3. Make a request to the server
|
||||||
|
<Tabs>
|
||||||
|
<TabItem value="curl" label="curl">
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data '{
|
||||||
|
"model": "my-r1-model",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Hello, how are you?"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="python" label="python">
|
||||||
|
|
||||||
|
```python
|
||||||
|
# pip install openai
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
client = OpenAI(
|
||||||
|
base_url="http://0.0.0.0:4000",
|
||||||
|
api_key="anything",
|
||||||
|
)
|
||||||
|
|
||||||
|
response = client.chat.completions.create(
|
||||||
|
model="my-r1-model",
|
||||||
|
messages=[
|
||||||
|
{"role": "user", "content": "Hello, how are you?"}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
|
||||||
## Embedding
|
## Embedding
|
||||||
|
|
||||||
LiteLLM supports Hugging Face's [text-embedding-inference](https://github.com/huggingface/text-embeddings-inference) format.
|
LiteLLM supports Hugging Face's [text-embedding-inference](https://github.com/huggingface/text-embeddings-inference) models as well.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from litellm import embedding
|
from litellm import embedding
|
||||||
import os
|
import os
|
||||||
os.environ['HUGGINGFACE_API_KEY'] = ""
|
os.environ['HF_TOKEN'] = "hf_xxxxxx"
|
||||||
response = embedding(
|
response = embedding(
|
||||||
model='huggingface/microsoft/codebert-base',
|
model='huggingface/microsoft/codebert-base',
|
||||||
input=["good morning from litellm"]
|
input=["good morning from litellm"]
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Advanced
|
|
||||||
|
|
||||||
### Setting API KEYS + API BASE
|
|
||||||
|
|
||||||
If required, you can set the api key + api base, set it in your os environment. [Code for how it's sent](https://github.com/BerriAI/litellm/blob/0100ab2382a0e720c7978fbf662cc6e6920e7e03/litellm/llms/huggingface_restapi.py#L25)
|
|
||||||
|
|
||||||
```python
|
|
||||||
import os
|
|
||||||
os.environ["HUGGINGFACE_API_KEY"] = ""
|
|
||||||
os.environ["HUGGINGFACE_API_BASE"] = ""
|
|
||||||
```
|
|
||||||
|
|
||||||
### Viewing Log probs
|
|
||||||
|
|
||||||
#### Using `decoder_input_details` - OpenAI `echo`
|
|
||||||
|
|
||||||
The `echo` param is supported by OpenAI Completions - Use `litellm.text_completion()` for this
|
|
||||||
|
|
||||||
```python
|
|
||||||
from litellm import text_completion
|
|
||||||
response = text_completion(
|
|
||||||
model="huggingface/bigcode/starcoder",
|
|
||||||
prompt="good morning",
|
|
||||||
max_tokens=10, logprobs=10,
|
|
||||||
echo=True
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Output
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"id": "chatcmpl-3fc71792-c442-4ba1-a611-19dd0ac371ad",
|
|
||||||
"object": "text_completion",
|
|
||||||
"created": 1698801125.936519,
|
|
||||||
"model": "bigcode/starcoder",
|
|
||||||
"choices": [
|
|
||||||
{
|
|
||||||
"text": ", I'm going to make you a sand",
|
|
||||||
"index": 0,
|
|
||||||
"logprobs": {
|
|
||||||
"tokens": [
|
|
||||||
"good",
|
|
||||||
" morning",
|
|
||||||
",",
|
|
||||||
" I",
|
|
||||||
"'m",
|
|
||||||
" going",
|
|
||||||
" to",
|
|
||||||
" make",
|
|
||||||
" you",
|
|
||||||
" a",
|
|
||||||
" s",
|
|
||||||
"and"
|
|
||||||
],
|
|
||||||
"token_logprobs": [
|
|
||||||
"None",
|
|
||||||
-14.96875,
|
|
||||||
-2.2285156,
|
|
||||||
-2.734375,
|
|
||||||
-2.0957031,
|
|
||||||
-2.0917969,
|
|
||||||
-0.09429932,
|
|
||||||
-3.1132812,
|
|
||||||
-1.3203125,
|
|
||||||
-1.2304688,
|
|
||||||
-1.6201172,
|
|
||||||
-0.010292053
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"finish_reason": "length"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"usage": {
|
|
||||||
"completion_tokens": 9,
|
|
||||||
"prompt_tokens": 2,
|
|
||||||
"total_tokens": 11
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Models with Prompt Formatting
|
|
||||||
|
|
||||||
For models with special prompt templates (e.g. Llama2), we format the prompt to fit their template.
|
|
||||||
|
|
||||||
#### Models with natively Supported Prompt Templates
|
|
||||||
|
|
||||||
| Model Name | Works for Models | Function Call | Required OS Variables |
|
|
||||||
| ------------------------------------ | ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------- |
|
|
||||||
| mistralai/Mistral-7B-Instruct-v0.1 | mistralai/Mistral-7B-Instruct-v0.1 | `completion(model='huggingface/mistralai/Mistral-7B-Instruct-v0.1', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
|
||||||
| meta-llama/Llama-2-7b-chat | All meta-llama llama2 chat models | `completion(model='huggingface/meta-llama/Llama-2-7b', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
|
||||||
| tiiuae/falcon-7b-instruct | All falcon instruct models | `completion(model='huggingface/tiiuae/falcon-7b-instruct', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
|
||||||
| mosaicml/mpt-7b-chat | All mpt chat models | `completion(model='huggingface/mosaicml/mpt-7b-chat', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
|
||||||
| codellama/CodeLlama-34b-Instruct-hf | All codellama instruct models | `completion(model='huggingface/codellama/CodeLlama-34b-Instruct-hf', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
|
||||||
| WizardLM/WizardCoder-Python-34B-V1.0 | All wizardcoder models | `completion(model='huggingface/WizardLM/WizardCoder-Python-34B-V1.0', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
|
||||||
| Phind/Phind-CodeLlama-34B-v2 | All phind-codellama models | `completion(model='huggingface/Phind/Phind-CodeLlama-34B-v2', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
|
||||||
|
|
||||||
**What if we don't support a model you need?**
|
|
||||||
You can also specify you're own custom prompt formatting, in case we don't have your model covered yet.
|
|
||||||
|
|
||||||
**Does this mean you have to specify a prompt for all models?**
|
|
||||||
No. By default we'll concatenate your message content to make a prompt.
|
|
||||||
|
|
||||||
**Default Prompt Template**
|
|
||||||
|
|
||||||
```python
|
|
||||||
def default_pt(messages):
|
|
||||||
return " ".join(message["content"] for message in messages)
|
|
||||||
```
|
|
||||||
|
|
||||||
[Code for how prompt formats work in LiteLLM](https://github.com/BerriAI/litellm/blob/main/litellm/llms/prompt_templates/factory.py)
|
|
||||||
|
|
||||||
#### Custom prompt templates
|
|
||||||
|
|
||||||
```python
|
|
||||||
import litellm
|
|
||||||
|
|
||||||
# Create your own custom prompt template works
|
|
||||||
litellm.register_prompt_template(
|
|
||||||
model="togethercomputer/LLaMA-2-7B-32K",
|
|
||||||
roles={
|
|
||||||
"system": {
|
|
||||||
"pre_message": "[INST] <<SYS>>\n",
|
|
||||||
"post_message": "\n<</SYS>>\n [/INST]\n"
|
|
||||||
},
|
|
||||||
"user": {
|
|
||||||
"pre_message": "[INST] ",
|
|
||||||
"post_message": " [/INST]\n"
|
|
||||||
},
|
|
||||||
"assistant": {
|
|
||||||
"post_message": "\n"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_huggingface_custom_model():
|
|
||||||
model = "huggingface/togethercomputer/LLaMA-2-7B-32K"
|
|
||||||
response = completion(model=model, messages=messages, api_base="https://ecd4sb5n09bo4ei2.us-east-1.aws.endpoints.huggingface.cloud")
|
|
||||||
print(response['choices'][0]['message']['content'])
|
|
||||||
return response
|
|
||||||
|
|
||||||
test_huggingface_custom_model()
|
|
||||||
```
|
|
||||||
|
|
||||||
[Implementation Code](https://github.com/BerriAI/litellm/blob/c0b3da2c14c791a0b755f0b1e5a9ef065951ecbf/litellm/llms/huggingface_restapi.py#L52)
|
|
||||||
|
|
||||||
### Deploying a model on huggingface
|
|
||||||
|
|
||||||
You can use any chat/text model from Hugging Face with the following steps:
|
|
||||||
|
|
||||||
- Copy your model id/url from Huggingface Inference Endpoints
|
|
||||||
- [ ] Go to https://ui.endpoints.huggingface.co/
|
|
||||||
- [ ] Copy the url of the specific model you'd like to use
|
|
||||||
<Image img={require('../../img/hf_inference_endpoint.png')} alt="HF_Dashboard" style={{ maxWidth: '50%', height: 'auto' }}/>
|
|
||||||
- Set it as your model name
|
|
||||||
- Set your HUGGINGFACE_API_KEY as an environment variable
|
|
||||||
|
|
||||||
Need help deploying a model on huggingface? [Check out this guide.](https://huggingface.co/docs/inference-endpoints/guides/create_endpoint)
|
|
||||||
|
|
||||||
# output
|
|
||||||
|
|
||||||
Same as the OpenAI format, but also includes logprobs. [See the code](https://github.com/BerriAI/litellm/blob/b4b2dbf005142e0a483d46a07a88a19814899403/litellm/llms/huggingface_restapi.py#L115)
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"choices": [
|
|
||||||
{
|
|
||||||
"finish_reason": "stop",
|
|
||||||
"index": 0,
|
|
||||||
"message": {
|
|
||||||
"content": "\ud83d\ude31\n\nComment: @SarahSzabo I'm",
|
|
||||||
"role": "assistant",
|
|
||||||
"logprobs": -22.697942825499993
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"created": 1693436637.38206,
|
|
||||||
"model": "https://ji16r2iys9a8rjk2.us-east-1.aws.endpoints.huggingface.cloud",
|
|
||||||
"usage": {
|
|
||||||
"prompt_tokens": 14,
|
|
||||||
"completion_tokens": 11,
|
|
||||||
"total_tokens": 25
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
# FAQ
|
# FAQ
|
||||||
|
|
||||||
**Does this support stop sequences?**
|
**How does billing work with Hugging Face Inference Providers?**
|
||||||
|
|
||||||
Yes, we support stop sequences - and you can pass as many as allowed by Hugging Face (or any provider!)
|
> Billing is centralized on your Hugging Face account, no matter which providers you are using. You are billed the standard provider API rates with no additional markup - Hugging Face simply passes through the provider costs. Note that [Hugging Face PRO](https://huggingface.co/subscribe/pro) users get $2 worth of Inference credits every month that can be used across providers.
|
||||||
|
|
||||||
**How do you deal with repetition penalty?**
|
**Do I need to create an account for each Inference Provider?**
|
||||||
|
|
||||||
We map the presence penalty parameter in openai to the repetition penalty parameter on Hugging Face. [See code](https://github.com/BerriAI/litellm/blob/b4b2dbf005142e0a483d46a07a88a19814899403/litellm/utils.py#L757).
|
> No, you don't need to create separate accounts. All requests are routed through Hugging Face, so you only need your HF token. This allows you to easily benchmark different providers and choose the one that best fits your needs.
|
||||||
|
|
||||||
We welcome any suggestions for improving our Hugging Face integration - Create an [issue](https://github.com/BerriAI/litellm/issues/new/choose)/[Join the Discord](https://discord.com/invite/wuPM9dRgDw)!
|
**Will more inference providers be supported by Hugging Face in the future?**
|
||||||
|
|
||||||
|
> Yes! New inference providers (and models) are being added gradually.
|
||||||
|
|
||||||
|
We welcome any suggestions for improving our Hugging Face integration - Create an [issue](https://github.com/BerriAI/litellm/issues/new/choose)/[Join the Discord](https://discord.com/invite/wuPM9dRgDw)!
|
|
@ -1720,23 +1720,25 @@ assert isinstance(
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
## Usage - PDF / Videos / etc. Files
|
## Usage - PDF / Videos / Audio etc. Files
|
||||||
|
|
||||||
Pass any file supported by Vertex AI, through LiteLLM.
|
Pass any file supported by Vertex AI, through LiteLLM.
|
||||||
|
|
||||||
LiteLLM Supports the following image types passed in url
|
LiteLLM supports the following file types passed as a URL.
|
||||||
|
|
||||||
|
The `file` message type for Vertex AI is available from v1.65.1+
|
||||||
|
|
||||||
```
|
```
|
||||||
Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
|
Files with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
|
||||||
Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
|
Files with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
|
||||||
Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
|
Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
|
||||||
Base64 Encoded Local Images
|
Base64 Encoded Local Files
|
||||||
```
|
```
|
||||||
|
|
||||||
<Tabs>
|
<Tabs>
|
||||||
<TabItem value="sdk" label="SDK">
|
<TabItem value="sdk" label="SDK">
|
||||||
|
|
||||||
### **Using `gs://`**
|
### **Using `gs://` or any URL**
|
||||||
```python
|
```python
|
||||||
from litellm import completion
|
from litellm import completion
|
||||||
|
|
||||||
|
@ -1748,8 +1750,11 @@ response = completion(
|
||||||
"content": [
|
"content": [
|
||||||
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
|
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf", # 👈 PDF
|
"file": {
|
||||||
|
"file_id": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf",
|
||||||
|
"format": "application/pdf" # OPTIONAL - specify mime-type
|
||||||
|
}
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
@ -1783,8 +1788,16 @@ response = completion(
|
||||||
"content": [
|
"content": [
|
||||||
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
|
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
|
"file": {
|
||||||
|
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "audio_input",
|
||||||
|
"audio_input {
|
||||||
|
"audio_input": f"data:audio/mp3;base64,{encoded_file}", # 👈 AUDIO File ('file' message works as too)
|
||||||
|
}
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
@ -1830,8 +1843,11 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
"text": "You are a very professional document summarization specialist. Please summarize the given document"
|
"text": "You are a very professional document summarization specialist. Please summarize the given document"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf" # 👈 PDF
|
"file": {
|
||||||
|
"file_id": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf",
|
||||||
|
"format": "application/pdf" # OPTIONAL
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
@ -1858,11 +1874,18 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
"text": "You are a very professional document summarization specialist. Please summarize the given document"
|
"text": "You are a very professional document summarization specialist. Please summarize the given document"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "file",
|
||||||
"image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF
|
"file": {
|
||||||
}
|
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
|
||||||
}
|
},
|
||||||
]
|
},
|
||||||
|
{
|
||||||
|
"type": "audio_input",
|
||||||
|
"audio_input {
|
||||||
|
"audio_input": f"data:audio/mp3;base64,{encoded_file}", # 👈 AUDIO File ('file' message works as too)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"max_tokens": 300
|
"max_tokens": 300
|
||||||
|
@ -1872,6 +1895,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
</TabItem>
|
</TabItem>
|
||||||
</Tabs>
|
</Tabs>
|
||||||
|
|
||||||
|
|
||||||
## Chat Models
|
## Chat Models
|
||||||
| Model Name | Function Call |
|
| Model Name | Function Call |
|
||||||
|------------------|--------------------------------------|
|
|------------------|--------------------------------------|
|
||||||
|
|
|
@ -82,7 +82,7 @@ from litellm import completion
|
||||||
os.environ["XAI_API_KEY"] = "your-api-key"
|
os.environ["XAI_API_KEY"] = "your-api-key"
|
||||||
|
|
||||||
response = completion(
|
response = completion(
|
||||||
model="xai/grok-2-latest",
|
model="xai/grok-2-vision-latest",
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
|
|
|
@ -156,7 +156,7 @@ PROXY_LOGOUT_URL="https://www.google.com"
|
||||||
|
|
||||||
Set this in your .env (so the proxy can set the correct redirect url)
|
Set this in your .env (so the proxy can set the correct redirect url)
|
||||||
```shell
|
```shell
|
||||||
PROXY_BASE_URL=https://litellm-api.up.railway.app/
|
PROXY_BASE_URL=https://litellm-api.up.railway.app
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Step 4. Test flow
|
#### Step 4. Test flow
|
||||||
|
|
|
@ -406,6 +406,7 @@ router_settings:
|
||||||
| HELICONE_API_KEY | API key for Helicone service
|
| HELICONE_API_KEY | API key for Helicone service
|
||||||
| HOSTNAME | Hostname for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog)
|
| HOSTNAME | Hostname for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog)
|
||||||
| HUGGINGFACE_API_BASE | Base URL for Hugging Face API
|
| HUGGINGFACE_API_BASE | Base URL for Hugging Face API
|
||||||
|
| HUGGINGFACE_API_KEY | API key for Hugging Face API
|
||||||
| IAM_TOKEN_DB_AUTH | IAM token for database authentication
|
| IAM_TOKEN_DB_AUTH | IAM token for database authentication
|
||||||
| JSON_LOGS | Enable JSON formatted logging
|
| JSON_LOGS | Enable JSON formatted logging
|
||||||
| JWT_AUDIENCE | Expected audience for JWT tokens
|
| JWT_AUDIENCE | Expected audience for JWT tokens
|
||||||
|
|
docs/my-website/docs/proxy/db_deadlocks.md (new file, 86 lines)
|
@ -0,0 +1,86 @@
|
||||||
|
import Image from '@theme/IdealImage';
|
||||||
|
import Tabs from '@theme/Tabs';
|
||||||
|
import TabItem from '@theme/TabItem';
|
||||||
|
|
||||||
|
# High Availability Setup (Resolve DB Deadlocks)
|
||||||
|
|
||||||
|
Resolve any Database Deadlocks you see in high traffic by using this setup
|
||||||
|
|
||||||
|
## What causes the problem?
|
||||||
|
|
||||||
|
LiteLLM writes `UPDATE` and `UPSERT` queries to the DB. When using 10+ instances of LiteLLM, these queries can cause deadlocks since each instance could simultaneously attempt to update the same `user_id`, `team_id`, `key` etc.
|
||||||
|
|
||||||
|
## How the high availability setup fixes the problem
|
||||||
|
- All instances will write to a Redis queue instead of the DB.
|
||||||
|
- A single instance will acquire a lock on the DB and flush the redis queue to the DB.
|
||||||
|
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
### Stage 1. Each instance writes updates to redis
|
||||||
|
|
||||||
|
Each instance will accumulate the spend updates for a key, user, team, etc., and write the updates to a redis queue.
|
||||||
|
|
||||||
|
<Image img={require('../../img/deadlock_fix_1.png')} style={{ width: '900px', height: 'auto' }} />
|
||||||
|
<p style={{textAlign: 'left', color: '#666'}}>
|
||||||
|
Each instance writes updates to redis
|
||||||
|
</p>
|
||||||
|
|
||||||
|
|
||||||
|
### Stage 2. A single instance flushes the redis queue to the DB
|
||||||
|
|
||||||
|
A single instance will acquire a lock on the DB and flush all elements in the redis queue to the DB.
|
||||||
|
|
||||||
|
- 1 instance will attempt to acquire the lock for the DB update job
|
||||||
|
- The status of the lock is stored in redis
|
||||||
|
- If the instance acquires the lock to write to DB
|
||||||
|
- It will read all updates from redis
|
||||||
|
- Aggregate all updates into 1 transaction
|
||||||
|
- Write updates to DB
|
||||||
|
- Release the lock
|
||||||
|
- Note: Only 1 instance can acquire the lock at a time, this limits the number of instances that can write to the DB at once
|
||||||
|
|
||||||
|
|
||||||
|
<Image img={require('../../img/deadlock_fix_2.png')} style={{ width: '900px', height: 'auto' }} />
|
||||||
|
<p style={{textAlign: 'left', color: '#666'}}>
|
||||||
|
A single instance flushes the redis queue to the DB
|
||||||
|
</p>
|
||||||
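A rough sketch of the pattern described above, assuming a plain Redis list as the queue and a Redis lock for the flush job. The key names and function names are made up for illustration; LiteLLM's actual implementation differs:

```python
import json
import redis

r = redis.Redis(host="localhost", port=6379)

QUEUE_KEY = "litellm:spend_update_queue"  # hypothetical key names for this sketch
LOCK_KEY = "litellm:db_update_lock"

def enqueue_spend_update(entity_id: str, spend: float) -> None:
    # Stage 1: every instance only appends its update to the Redis queue.
    r.rpush(QUEUE_KEY, json.dumps({"entity_id": entity_id, "spend": spend}))

def flush_spend_updates(write_to_db) -> None:
    # Stage 2: only the instance that acquires the lock aggregates the queue
    # and writes it to the DB.
    lock = r.lock(LOCK_KEY, timeout=60)
    if not lock.acquire(blocking=False):
        return  # another pod holds the lock
    try:
        totals = {}
        while (item := r.lpop(QUEUE_KEY)) is not None:
            update = json.loads(item)
            totals[update["entity_id"]] = totals.get(update["entity_id"], 0.0) + update["spend"]
        if totals:
            write_to_db(totals)  # one aggregated write instead of many concurrent UPDATEs
    finally:
        lock.release()
```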
|
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Required components
|
||||||
|
|
||||||
|
- Redis
|
||||||
|
- Postgres
|
||||||
|
|
||||||
|
### Setup on LiteLLM config
|
||||||
|
|
||||||
|
You can enable the Redis buffer by setting `use_redis_transaction_buffer: true` in the `general_settings` section of your `proxy_config.yaml` file.
|
||||||
|
|
||||||
|
Note: This setup requires litellm to be connected to a redis instance.
|
||||||
|
|
||||||
|
```yaml showLineNumbers title="litellm proxy_config.yaml"
|
||||||
|
general_settings:
|
||||||
|
use_redis_transaction_buffer: true
|
||||||
|
|
||||||
|
litellm_settings:
|
||||||
|
cache: True
|
||||||
|
cache_params:
|
||||||
|
type: redis
|
||||||
|
supported_call_types: [] # Optional: Set cache for proxy, but not on the actual llm api call
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring
|
||||||
|
|
||||||
|
LiteLLM emits the following Prometheus metrics to monitor the health/status of the in-memory buffer and the Redis buffer (an example alert rule follows the table).
|
||||||
|
|
||||||
|
|
||||||
|
| Metric Name | Description | Storage Type |
|
||||||
|
|-----------------------------------------------------|-----------------------------------------------------------------------------|--------------|
|
||||||
|
| `litellm_pod_lock_manager_size` | Indicates which pod has the lock to write updates to the database. | Redis |
|
||||||
|
| `litellm_in_memory_daily_spend_update_queue_size` | Number of items in the in-memory daily spend update queue. These are the aggregate spend logs for each user. | In-Memory |
|
||||||
|
| `litellm_redis_daily_spend_update_queue_size` | Number of items in the Redis daily spend update queue. These are the aggregate spend logs for each user. | Redis |
|
||||||
|
| `litellm_in_memory_spend_update_queue_size` | In-memory aggregate spend values for keys, users, teams, team members, etc.| In-Memory |
|
||||||
|
| `litellm_redis_spend_update_queue_size` | Redis aggregate spend values for keys, users, teams, etc. | Redis |
|
||||||
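For example, these metrics can back a simple alert. A hedged sketch of a Prometheus alerting rule; the threshold and duration are arbitrary placeholders:

```yaml
groups:
  - name: litellm-db-transaction-queue
    rules:
      - alert: LiteLLMSpendUpdateQueueBacklog
        # Fires if the Redis spend-update queue stays large, i.e. the flush job is falling behind.
        expr: litellm_redis_spend_update_queue_size > 1000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "LiteLLM Redis spend update queue is backing up"
```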
|
|
|
@ -23,6 +23,12 @@ In the newly created guard's page, you can find a reference to the prompt policy
|
||||||
|
|
||||||
You can decide which detections will be enabled, and set the threshold for each detection.
|
You can decide which detections will be enabled, and set the threshold for each detection.
|
||||||
|
|
||||||
|
:::info
|
||||||
|
When using LiteLLM with virtual keys, key-specific policies can be set directly in Aim's guards page by specifying the virtual key alias when creating the guard.
|
||||||
|
|
||||||
|
Only the aliases of your virtual keys (and not the actual key secrets) will be sent to Aim.
|
||||||
|
:::
|
||||||
|
|
||||||
### 3. Add Aim Guardrail on your LiteLLM config.yaml
|
### 3. Add Aim Guardrail on your LiteLLM config.yaml
|
||||||
|
|
||||||
Define your guardrails under the `guardrails` section
|
Define your guardrails under the `guardrails` section
|
||||||
|
|
|
@ -17,6 +17,14 @@ model_list:
|
||||||
api_key: os.environ/OPENAI_API_KEY
|
api_key: os.environ/OPENAI_API_KEY
|
||||||
|
|
||||||
guardrails:
|
guardrails:
|
||||||
|
- guardrail_name: general-guard
|
||||||
|
litellm_params:
|
||||||
|
guardrail: aim
|
||||||
|
mode: [pre_call, post_call]
|
||||||
|
api_key: os.environ/AIM_API_KEY
|
||||||
|
api_base: os.environ/AIM_API_BASE
|
||||||
|
default_on: true # Optional
|
||||||
|
|
||||||
- guardrail_name: "aporia-pre-guard"
|
- guardrail_name: "aporia-pre-guard"
|
||||||
litellm_params:
|
litellm_params:
|
||||||
guardrail: aporia # supported values: "aporia", "lakera"
|
guardrail: aporia # supported values: "aporia", "lakera"
|
||||||
|
@ -45,6 +53,7 @@ guardrails:
|
||||||
- `pre_call` Run **before** LLM call, on **input**
|
- `pre_call` Run **before** LLM call, on **input**
|
||||||
- `post_call` Run **after** LLM call, on **input & output**
|
- `post_call` Run **after** LLM call, on **input & output**
|
||||||
- `during_call` Run **during** LLM call, on **input**. Same as `pre_call`, but runs in parallel with the LLM call. The response is not returned until the guardrail check completes
|
- `during_call` Run **during** LLM call, on **input**. Same as `pre_call`, but runs in parallel with the LLM call. The response is not returned until the guardrail check completes
|
||||||
|
- A list of the above values to run multiple modes, e.g. `mode: [pre_call, post_call]`
|
||||||
|
|
||||||
|
|
||||||
## 2. Start LiteLLM Gateway
|
## 2. Start LiteLLM Gateway
|
||||||
|
@ -569,4 +578,4 @@ guardrails: Union[
|
||||||
|
|
||||||
class DynamicGuardrailParams:
|
class DynamicGuardrailParams:
|
||||||
extra_body: Dict[str, Any] # Additional parameters for the guardrail
|
extra_body: Dict[str, Any] # Additional parameters for the guardrail
|
||||||
```
|
```
|
||||||
|
|
|
@ -242,6 +242,19 @@ litellm_settings:
|
||||||
| `litellm_redis_fails` | Number of failed redis calls |
|
| `litellm_redis_fails` | Number of failed redis calls |
|
||||||
| `litellm_self_latency` | Histogram latency for successful litellm api call |
|
| `litellm_self_latency` | Histogram latency for successful litellm api call |
|
||||||
|
|
||||||
|
#### DB Transaction Queue Health Metrics
|
||||||
|
|
||||||
|
Use these metrics to monitor the health of the DB Transaction Queue, e.g. monitoring the size of the in-memory and Redis buffers.
|
||||||
|
|
||||||
|
| Metric Name | Description | Storage Type |
|
||||||
|
|-----------------------------------------------------|-----------------------------------------------------------------------------|--------------|
|
||||||
|
| `litellm_pod_lock_manager_size` | Indicates which pod has the lock to write updates to the database. | Redis |
|
||||||
|
| `litellm_in_memory_daily_spend_update_queue_size` | Number of items in the in-memory daily spend update queue. These are the aggregate spend logs for each user. | In-Memory |
|
||||||
|
| `litellm_redis_daily_spend_update_queue_size` | Number of items in the Redis daily spend update queue. These are the aggregate spend logs for each user. | Redis |
|
||||||
|
| `litellm_in_memory_spend_update_queue_size` | In-memory aggregate spend values for keys, users, teams, team members, etc.| In-Memory |
|
||||||
|
| `litellm_redis_spend_update_queue_size` | Redis aggregate spend values for keys, users, teams, etc. | Redis |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## **🔥 LiteLLM Maintained Grafana Dashboards**
|
## **🔥 LiteLLM Maintained Grafana Dashboards**
|
||||||
|
|
||||||
|
@ -268,6 +281,17 @@ Here is a screenshot of the metrics you can monitor with the LiteLLM Grafana Das
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Add authentication on /metrics endpoint
|
||||||
|
|
||||||
|
**By default, the `/metrics` endpoint is unauthenticated.**
|
||||||
|
|
||||||
|
You can opt into running LiteLLM authentication on the `/metrics` endpoint by setting the following in the config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
litellm_settings:
|
||||||
|
require_auth_for_metrics_endpoint: true
|
||||||
|
```
|
||||||
|
|
||||||
## FAQ
|
## FAQ
|
||||||
|
|
||||||
### What are `_created` vs. `_total` metrics?
|
### What are `_created` vs. `_total` metrics?
|
||||||
|
|
|
@ -48,7 +48,7 @@ response = completion(
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "user", "content": "What is the capital of France?"},
|
{"role": "user", "content": "What is the capital of France?"},
|
||||||
],
|
],
|
||||||
thinking={"type": "enabled", "budget_tokens": 1024} # 👈 REQUIRED FOR ANTHROPIC models (on `anthropic/`, `bedrock/`, `vertexai/`)
|
reasoning_effort="low",
|
||||||
)
|
)
|
||||||
print(response.choices[0].message.content)
|
print(response.choices[0].message.content)
|
||||||
```
|
```
|
||||||
|
@ -68,7 +68,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
"content": "What is the capital of France?"
|
"content": "What is the capital of France?"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thinking": {"type": "enabled", "budget_tokens": 1024}
|
"reasoning_effort": "low"
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
</TabItem>
|
</TabItem>
|
||||||
|
@ -150,7 +150,7 @@ response = litellm.completion(
|
||||||
messages=messages,
|
messages=messages,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
tool_choice="auto", # auto is default, but we'll be explicit
|
tool_choice="auto", # auto is default, but we'll be explicit
|
||||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
reasoning_effort="low",
|
||||||
)
|
)
|
||||||
print("Response\n", response)
|
print("Response\n", response)
|
||||||
response_message = response.choices[0].message
|
response_message = response.choices[0].message
|
||||||
|
@ -198,9 +198,9 @@ if tool_calls:
|
||||||
model=model,
|
model=model,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
seed=22,
|
seed=22,
|
||||||
|
reasoning_effort="low",
|
||||||
# tools=tools,
|
# tools=tools,
|
||||||
drop_params=True,
|
drop_params=True,
|
||||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
|
||||||
) # get a new response from the model where it can see the function response
|
) # get a new response from the model where it can see the function response
|
||||||
print("second response\n", second_response)
|
print("second response\n", second_response)
|
||||||
```
|
```
|
||||||
|
@ -340,7 +340,7 @@ litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
|
||||||
response = litellm.completion(
|
response = litellm.completion(
|
||||||
model="anthropic/claude-3-7-sonnet-20250219",
|
model="anthropic/claude-3-7-sonnet-20250219",
|
||||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
reasoning_effort="low",
|
||||||
drop_params=True,
|
drop_params=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -348,7 +348,7 @@ response = litellm.completion(
|
||||||
response = litellm.completion(
|
response = litellm.completion(
|
||||||
model="deepseek/deepseek-chat",
|
model="deepseek/deepseek-chat",
|
||||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
reasoning_effort="low",
|
||||||
drop_params=True,
|
drop_params=True,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
@ -364,3 +364,36 @@ These fields can be accessed via `response.choices[0].message.reasoning_content`
|
||||||
- `thinking` - str: The thinking from the model.
|
- `thinking` - str: The thinking from the model.
|
||||||
- `signature` - str: The signature delta from the model.
|
- `signature` - str: The signature delta from the model.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Pass `thinking` to Anthropic models
|
||||||
|
|
||||||
|
You can also pass the `thinking` parameter to Anthropic models.
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<TabItem value="sdk" label="SDK">
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = litellm.completion(
|
||||||
|
model="anthropic/claude-3-7-sonnet-20250219",
|
||||||
|
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||||
|
thinking={"type": "enabled", "budget_tokens": 1024},
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="proxy" label="PROXY">
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-H "Authorization: Bearer $LITELLM_KEY" \
|
||||||
|
-d '{
|
||||||
|
"model": "anthropic/claude-3-7-sonnet-20250219",
|
||||||
|
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||||
|
"thinking": {"type": "enabled", "budget_tokens": 1024}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
</Tabs>
|
||||||
|
|
BIN
docs/my-website/img/deadlock_fix_1.png
Normal file
BIN
docs/my-website/img/deadlock_fix_1.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 60 KiB |
BIN
docs/my-website/img/deadlock_fix_2.png
Normal file
BIN
docs/my-website/img/deadlock_fix_2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 70 KiB |
BIN
docs/my-website/img/enterprise_vs_oss.png
Normal file
BIN
docs/my-website/img/enterprise_vs_oss.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 61 KiB |
BIN
docs/my-website/img/hf_filter_inference_providers.png
Normal file
BIN
docs/my-website/img/hf_filter_inference_providers.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 120 KiB |
7
docs/my-website/package-lock.json
generated
7
docs/my-website/package-lock.json
generated
|
@ -12559,9 +12559,10 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/image-size": {
|
"node_modules/image-size": {
|
||||||
"version": "1.1.1",
|
"version": "1.2.1",
|
||||||
"resolved": "https://registry.npmjs.org/image-size/-/image-size-1.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/image-size/-/image-size-1.2.1.tgz",
|
||||||
"integrity": "sha512-541xKlUw6jr/6gGuk92F+mYM5zaFAc5ahphvkqvNe2bQ6gVBkd6bfrmVJ2t4KDAfikAYZyIqTnktX3i6/aQDrQ==",
|
"integrity": "sha512-rH+46sQJ2dlwfjfhCyNx5thzrv+dtmBIhPHk0zgRUukHzZ/kRueTJXoYYsclBaKcSMBWuGbOFXtioLpzTb5euw==",
|
||||||
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"queue": "6.0.2"
|
"queue": "6.0.2"
|
||||||
},
|
},
|
||||||
|
|
42
docs/my-website/release_notes/v1.65.4-stable/index.md
Normal file
42
docs/my-website/release_notes/v1.65.4-stable/index.md
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
---
|
||||||
|
title: v1.65.4-stable
|
||||||
|
slug: v1.65.4-stable
|
||||||
|
date: 2025-04-05T10:00:00
|
||||||
|
authors:
|
||||||
|
- name: Krrish Dholakia
|
||||||
|
title: CEO, LiteLLM
|
||||||
|
url: https://www.linkedin.com/in/krish-d/
|
||||||
|
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
|
||||||
|
- name: Ishaan Jaffer
|
||||||
|
title: CTO, LiteLLM
|
||||||
|
url: https://www.linkedin.com/in/reffajnaahsi/
|
||||||
|
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
|
||||||
|
|
||||||
|
tags: []
|
||||||
|
hide_table_of_contents: false
|
||||||
|
---
|
||||||
|
|
||||||
|
import Image from '@theme/IdealImage';
|
||||||
|
import Tabs from '@theme/Tabs';
|
||||||
|
import TabItem from '@theme/TabItem';
|
||||||
|
|
||||||
|
## Deploy this version
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<TabItem value="docker" label="Docker">
|
||||||
|
|
||||||
|
``` showLineNumbers title="docker run litellm"
|
||||||
|
docker run
|
||||||
|
-e STORE_MODEL_IN_DB=True
|
||||||
|
-p 4000:4000
|
||||||
|
ghcr.io/berriai/litellm:main-v1.65.4-stable
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
|
||||||
|
<TabItem value="pip" label="Pip">
|
||||||
|
|
||||||
|
``` showLineNumbers title="pip install litellm"
|
||||||
|
pip install litellm==1.65.4.post1
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
</Tabs>
|
|
@ -53,7 +53,7 @@ const sidebars = {
|
||||||
{
|
{
|
||||||
type: "category",
|
type: "category",
|
||||||
label: "Architecture",
|
label: "Architecture",
|
||||||
items: ["proxy/architecture", "proxy/db_info", "router_architecture", "proxy/user_management_heirarchy", "proxy/jwt_auth_arch", "proxy/image_handling"],
|
items: ["proxy/architecture", "proxy/db_info", "proxy/db_deadlocks", "router_architecture", "proxy/user_management_heirarchy", "proxy/jwt_auth_arch", "proxy/image_handling"],
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
type: "link",
|
type: "link",
|
||||||
|
@ -137,15 +137,17 @@ const sidebars = {
|
||||||
label: "[Beta] Guardrails",
|
label: "[Beta] Guardrails",
|
||||||
items: [
|
items: [
|
||||||
"proxy/guardrails/quick_start",
|
"proxy/guardrails/quick_start",
|
||||||
"proxy/guardrails/aim_security",
|
...[
|
||||||
"proxy/guardrails/aporia_api",
|
"proxy/guardrails/aim_security",
|
||||||
"proxy/guardrails/bedrock",
|
"proxy/guardrails/aporia_api",
|
||||||
"proxy/guardrails/guardrails_ai",
|
"proxy/guardrails/bedrock",
|
||||||
"proxy/guardrails/lakera_ai",
|
"proxy/guardrails/guardrails_ai",
|
||||||
"proxy/guardrails/pii_masking_v2",
|
"proxy/guardrails/lakera_ai",
|
||||||
"proxy/guardrails/secret_detection",
|
"proxy/guardrails/pii_masking_v2",
|
||||||
"proxy/guardrails/custom_guardrail",
|
"proxy/guardrails/secret_detection",
|
||||||
"prompt_injection"
|
"proxy/guardrails/custom_guardrail",
|
||||||
|
"proxy/guardrails/prompt_injection",
|
||||||
|
].sort(),
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
BIN
litellm-proxy-extras/dist/litellm_proxy_extras-0.1.2-py3-none-any.whl
vendored
Normal file
BIN
litellm-proxy-extras/dist/litellm_proxy_extras-0.1.2-py3-none-any.whl
vendored
Normal file
Binary file not shown.
BIN
litellm-proxy-extras/dist/litellm_proxy_extras-0.1.2.tar.gz
vendored
Normal file
BIN
litellm-proxy-extras/dist/litellm_proxy_extras-0.1.2.tar.gz
vendored
Normal file
Binary file not shown.
BIN
litellm-proxy-extras/dist/litellm_proxy_extras-0.1.3-py3-none-any.whl
vendored
Normal file
BIN
litellm-proxy-extras/dist/litellm_proxy_extras-0.1.3-py3-none-any.whl
vendored
Normal file
Binary file not shown.
BIN
litellm-proxy-extras/dist/litellm_proxy_extras-0.1.3.tar.gz
vendored
Normal file
BIN
litellm-proxy-extras/dist/litellm_proxy_extras-0.1.3.tar.gz
vendored
Normal file
Binary file not shown.
|
@ -0,0 +1,4 @@
|
||||||
|
-- AlterTable
|
||||||
|
ALTER TABLE "LiteLLM_DailyUserSpend" ADD COLUMN "failed_requests" INTEGER NOT NULL DEFAULT 0,
|
||||||
|
ADD COLUMN "successful_requests" INTEGER NOT NULL DEFAULT 0;
|
||||||
|
|
356
litellm-proxy-extras/litellm_proxy_extras/schema.prisma
Normal file
356
litellm-proxy-extras/litellm_proxy_extras/schema.prisma
Normal file
|
@ -0,0 +1,356 @@
|
||||||
|
datasource client {
|
||||||
|
provider = "postgresql"
|
||||||
|
url = env("DATABASE_URL")
|
||||||
|
}
|
||||||
|
|
||||||
|
generator client {
|
||||||
|
provider = "prisma-client-py"
|
||||||
|
}
|
||||||
|
|
||||||
|
// Budget / Rate Limits for an org
|
||||||
|
model LiteLLM_BudgetTable {
|
||||||
|
budget_id String @id @default(uuid())
|
||||||
|
max_budget Float?
|
||||||
|
soft_budget Float?
|
||||||
|
max_parallel_requests Int?
|
||||||
|
tpm_limit BigInt?
|
||||||
|
rpm_limit BigInt?
|
||||||
|
model_max_budget Json?
|
||||||
|
budget_duration String?
|
||||||
|
budget_reset_at DateTime?
|
||||||
|
created_at DateTime @default(now()) @map("created_at")
|
||||||
|
created_by String
|
||||||
|
updated_at DateTime @default(now()) @updatedAt @map("updated_at")
|
||||||
|
updated_by String
|
||||||
|
organization LiteLLM_OrganizationTable[] // multiple orgs can have the same budget
|
||||||
|
keys LiteLLM_VerificationToken[] // multiple keys can have the same budget
|
||||||
|
end_users LiteLLM_EndUserTable[] // multiple end-users can have the same budget
|
||||||
|
team_membership LiteLLM_TeamMembership[] // budgets of Users within a Team
|
||||||
|
organization_membership LiteLLM_OrganizationMembership[] // budgets of Users within a Organization
|
||||||
|
}
|
||||||
|
|
||||||
|
// Models on proxy
|
||||||
|
model LiteLLM_CredentialsTable {
|
||||||
|
credential_id String @id @default(uuid())
|
||||||
|
credential_name String @unique
|
||||||
|
credential_values Json
|
||||||
|
credential_info Json?
|
||||||
|
created_at DateTime @default(now()) @map("created_at")
|
||||||
|
created_by String
|
||||||
|
updated_at DateTime @default(now()) @updatedAt @map("updated_at")
|
||||||
|
updated_by String
|
||||||
|
}
|
||||||
|
|
||||||
|
// Models on proxy
|
||||||
|
model LiteLLM_ProxyModelTable {
|
||||||
|
model_id String @id @default(uuid())
|
||||||
|
model_name String
|
||||||
|
litellm_params Json
|
||||||
|
model_info Json?
|
||||||
|
created_at DateTime @default(now()) @map("created_at")
|
||||||
|
created_by String
|
||||||
|
updated_at DateTime @default(now()) @updatedAt @map("updated_at")
|
||||||
|
updated_by String
|
||||||
|
}
|
||||||
|
|
||||||
|
model LiteLLM_OrganizationTable {
|
||||||
|
organization_id String @id @default(uuid())
|
||||||
|
organization_alias String
|
||||||
|
budget_id String
|
||||||
|
metadata Json @default("{}")
|
||||||
|
models String[]
|
||||||
|
spend Float @default(0.0)
|
||||||
|
model_spend Json @default("{}")
|
||||||
|
created_at DateTime @default(now()) @map("created_at")
|
||||||
|
created_by String
|
||||||
|
updated_at DateTime @default(now()) @updatedAt @map("updated_at")
|
||||||
|
updated_by String
|
||||||
|
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
|
||||||
|
teams LiteLLM_TeamTable[]
|
||||||
|
users LiteLLM_UserTable[]
|
||||||
|
keys LiteLLM_VerificationToken[]
|
||||||
|
members LiteLLM_OrganizationMembership[] @relation("OrganizationToMembership")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Model info for teams, just has model aliases for now.
|
||||||
|
model LiteLLM_ModelTable {
|
||||||
|
id Int @id @default(autoincrement())
|
||||||
|
model_aliases Json? @map("aliases")
|
||||||
|
created_at DateTime @default(now()) @map("created_at")
|
||||||
|
created_by String
|
||||||
|
updated_at DateTime @default(now()) @updatedAt @map("updated_at")
|
||||||
|
updated_by String
|
||||||
|
team LiteLLM_TeamTable?
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Assign prod keys to groups, not individuals
|
||||||
|
model LiteLLM_TeamTable {
|
||||||
|
team_id String @id @default(uuid())
|
||||||
|
team_alias String?
|
||||||
|
organization_id String?
|
||||||
|
admins String[]
|
||||||
|
members String[]
|
||||||
|
members_with_roles Json @default("{}")
|
||||||
|
metadata Json @default("{}")
|
||||||
|
max_budget Float?
|
||||||
|
spend Float @default(0.0)
|
||||||
|
models String[]
|
||||||
|
max_parallel_requests Int?
|
||||||
|
tpm_limit BigInt?
|
||||||
|
rpm_limit BigInt?
|
||||||
|
budget_duration String?
|
||||||
|
budget_reset_at DateTime?
|
||||||
|
blocked Boolean @default(false)
|
||||||
|
created_at DateTime @default(now()) @map("created_at")
|
||||||
|
updated_at DateTime @default(now()) @updatedAt @map("updated_at")
|
||||||
|
model_spend Json @default("{}")
|
||||||
|
model_max_budget Json @default("{}")
|
||||||
|
model_id Int? @unique // id for LiteLLM_ModelTable -> stores team-level model aliases
|
||||||
|
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
|
||||||
|
litellm_model_table LiteLLM_ModelTable? @relation(fields: [model_id], references: [id])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track spend, rate limit, budget Users
|
||||||
|
model LiteLLM_UserTable {
|
||||||
|
user_id String @id
|
||||||
|
user_alias String?
|
||||||
|
team_id String?
|
||||||
|
sso_user_id String? @unique
|
||||||
|
organization_id String?
|
||||||
|
password String?
|
||||||
|
teams String[] @default([])
|
||||||
|
user_role String?
|
||||||
|
max_budget Float?
|
||||||
|
spend Float @default(0.0)
|
||||||
|
user_email String?
|
||||||
|
models String[]
|
||||||
|
metadata Json @default("{}")
|
||||||
|
max_parallel_requests Int?
|
||||||
|
tpm_limit BigInt?
|
||||||
|
rpm_limit BigInt?
|
||||||
|
budget_duration String?
|
||||||
|
budget_reset_at DateTime?
|
||||||
|
allowed_cache_controls String[] @default([])
|
||||||
|
model_spend Json @default("{}")
|
||||||
|
model_max_budget Json @default("{}")
|
||||||
|
created_at DateTime? @default(now()) @map("created_at")
|
||||||
|
updated_at DateTime? @default(now()) @updatedAt @map("updated_at")
|
||||||
|
|
||||||
|
// relations
|
||||||
|
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
|
||||||
|
organization_memberships LiteLLM_OrganizationMembership[]
|
||||||
|
invitations_created LiteLLM_InvitationLink[] @relation("CreatedBy")
|
||||||
|
invitations_updated LiteLLM_InvitationLink[] @relation("UpdatedBy")
|
||||||
|
invitations_user LiteLLM_InvitationLink[] @relation("UserId")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate Tokens for Proxy
|
||||||
|
model LiteLLM_VerificationToken {
|
||||||
|
token String @id
|
||||||
|
key_name String?
|
||||||
|
key_alias String?
|
||||||
|
soft_budget_cooldown Boolean @default(false) // key-level state on if budget alerts need to be cooled down
|
||||||
|
spend Float @default(0.0)
|
||||||
|
expires DateTime?
|
||||||
|
models String[]
|
||||||
|
aliases Json @default("{}")
|
||||||
|
config Json @default("{}")
|
||||||
|
user_id String?
|
||||||
|
team_id String?
|
||||||
|
permissions Json @default("{}")
|
||||||
|
max_parallel_requests Int?
|
||||||
|
metadata Json @default("{}")
|
||||||
|
blocked Boolean?
|
||||||
|
tpm_limit BigInt?
|
||||||
|
rpm_limit BigInt?
|
||||||
|
max_budget Float?
|
||||||
|
budget_duration String?
|
||||||
|
budget_reset_at DateTime?
|
||||||
|
allowed_cache_controls String[] @default([])
|
||||||
|
model_spend Json @default("{}")
|
||||||
|
model_max_budget Json @default("{}")
|
||||||
|
budget_id String?
|
||||||
|
organization_id String?
|
||||||
|
created_at DateTime? @default(now()) @map("created_at")
|
||||||
|
created_by String?
|
||||||
|
updated_at DateTime? @default(now()) @updatedAt @map("updated_at")
|
||||||
|
updated_by String?
|
||||||
|
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
|
||||||
|
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
|
||||||
|
}
|
||||||
|
|
||||||
|
model LiteLLM_EndUserTable {
|
||||||
|
user_id String @id
|
||||||
|
alias String? // admin-facing alias
|
||||||
|
spend Float @default(0.0)
|
||||||
|
allowed_model_region String? // require all user requests to use models in this specific region
|
||||||
|
default_model String? // use along with 'allowed_model_region'. if no available model in region, default to this model.
|
||||||
|
budget_id String?
|
||||||
|
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
|
||||||
|
blocked Boolean @default(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
// store proxy config.yaml
|
||||||
|
model LiteLLM_Config {
|
||||||
|
param_name String @id
|
||||||
|
param_value Json?
|
||||||
|
}
|
||||||
|
|
||||||
|
// View spend, model, api_key per request
|
||||||
|
model LiteLLM_SpendLogs {
|
||||||
|
request_id String @id
|
||||||
|
call_type String
|
||||||
|
api_key String @default ("") // Hashed API Token. Not the actual Virtual Key. Equivalent to 'token' column in LiteLLM_VerificationToken
|
||||||
|
spend Float @default(0.0)
|
||||||
|
total_tokens Int @default(0)
|
||||||
|
prompt_tokens Int @default(0)
|
||||||
|
completion_tokens Int @default(0)
|
||||||
|
startTime DateTime // Assuming start_time is a DateTime field
|
||||||
|
endTime DateTime // Assuming end_time is a DateTime field
|
||||||
|
completionStartTime DateTime? // Assuming completionStartTime is a DateTime field
|
||||||
|
model String @default("")
|
||||||
|
model_id String? @default("") // the model id stored in proxy model db
|
||||||
|
model_group String? @default("") // public model_name / model_group
|
||||||
|
custom_llm_provider String? @default("") // litellm used custom_llm_provider
|
||||||
|
api_base String? @default("")
|
||||||
|
user String? @default("")
|
||||||
|
metadata Json? @default("{}")
|
||||||
|
cache_hit String? @default("")
|
||||||
|
cache_key String? @default("")
|
||||||
|
request_tags Json? @default("[]")
|
||||||
|
team_id String?
|
||||||
|
end_user String?
|
||||||
|
requester_ip_address String?
|
||||||
|
messages Json? @default("{}")
|
||||||
|
response Json? @default("{}")
|
||||||
|
@@index([startTime])
|
||||||
|
@@index([end_user])
|
||||||
|
}
|
||||||
|
|
||||||
|
// View spend, model, api_key per request
|
||||||
|
model LiteLLM_ErrorLogs {
|
||||||
|
request_id String @id @default(uuid())
|
||||||
|
startTime DateTime // Assuming start_time is a DateTime field
|
||||||
|
endTime DateTime // Assuming end_time is a DateTime field
|
||||||
|
api_base String @default("")
|
||||||
|
model_group String @default("") // public model_name / model_group
|
||||||
|
litellm_model_name String @default("") // model passed to litellm
|
||||||
|
model_id String @default("") // ID of model in ProxyModelTable
|
||||||
|
request_kwargs Json @default("{}")
|
||||||
|
exception_type String @default("")
|
||||||
|
exception_string String @default("")
|
||||||
|
status_code String @default("")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Beta - allow team members to request access to a model
|
||||||
|
model LiteLLM_UserNotifications {
|
||||||
|
request_id String @id
|
||||||
|
user_id String
|
||||||
|
models String[]
|
||||||
|
justification String
|
||||||
|
status String // approved, disapproved, pending
|
||||||
|
}
|
||||||
|
|
||||||
|
model LiteLLM_TeamMembership {
|
||||||
|
// Use this table to track the Internal User's Spend within a Team + Set Budgets, rpm limits for the user within the team
|
||||||
|
user_id String
|
||||||
|
team_id String
|
||||||
|
spend Float @default(0.0)
|
||||||
|
budget_id String?
|
||||||
|
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
|
||||||
|
@@id([user_id, team_id])
|
||||||
|
}
|
||||||
|
|
||||||
|
model LiteLLM_OrganizationMembership {
|
||||||
|
// Use this table to track Internal User and Organization membership. Helps tracking a users role within an Organization
|
||||||
|
user_id String
|
||||||
|
organization_id String
|
||||||
|
user_role String?
|
||||||
|
spend Float? @default(0.0)
|
||||||
|
budget_id String?
|
||||||
|
created_at DateTime? @default(now()) @map("created_at")
|
||||||
|
updated_at DateTime? @default(now()) @updatedAt @map("updated_at")
|
||||||
|
|
||||||
|
// relations
|
||||||
|
user LiteLLM_UserTable @relation(fields: [user_id], references: [user_id])
|
||||||
|
organization LiteLLM_OrganizationTable @relation("OrganizationToMembership", fields: [organization_id], references: [organization_id])
|
||||||
|
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@@id([user_id, organization_id])
|
||||||
|
@@unique([user_id, organization_id])
|
||||||
|
}
|
||||||
|
|
||||||
|
model LiteLLM_InvitationLink {
|
||||||
|
// use this table to track invite links sent by admin for people to join the proxy
|
||||||
|
id String @id @default(uuid())
|
||||||
|
user_id String
|
||||||
|
is_accepted Boolean @default(false)
|
||||||
|
accepted_at DateTime? // when link is claimed (user successfully onboards via link)
|
||||||
|
expires_at DateTime // till when is link valid
|
||||||
|
created_at DateTime // when did admin create the link
|
||||||
|
created_by String // who created the link
|
||||||
|
updated_at DateTime // when was invite status updated
|
||||||
|
updated_by String // who updated the status (admin/user who accepted invite)
|
||||||
|
|
||||||
|
// Relations
|
||||||
|
liteLLM_user_table_user LiteLLM_UserTable @relation("UserId", fields: [user_id], references: [user_id])
|
||||||
|
liteLLM_user_table_created LiteLLM_UserTable @relation("CreatedBy", fields: [created_by], references: [user_id])
|
||||||
|
liteLLM_user_table_updated LiteLLM_UserTable @relation("UpdatedBy", fields: [updated_by], references: [user_id])
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
model LiteLLM_AuditLog {
|
||||||
|
id String @id @default(uuid())
|
||||||
|
updated_at DateTime @default(now())
|
||||||
|
changed_by String @default("") // user or system that performed the action
|
||||||
|
changed_by_api_key String @default("") // api key hash that performed the action
|
||||||
|
action String // create, update, delete
|
||||||
|
table_name String // on of LitellmTableNames.TEAM_TABLE_NAME, LitellmTableNames.USER_TABLE_NAME, LitellmTableNames.PROXY_MODEL_TABLE_NAME,
|
||||||
|
object_id String // id of the object being audited. This can be the key id, team id, user id, model id
|
||||||
|
before_value Json? // value of the row
|
||||||
|
updated_values Json? // value of the row after change
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track daily user spend metrics per model and key
|
||||||
|
model LiteLLM_DailyUserSpend {
|
||||||
|
id String @id @default(uuid())
|
||||||
|
user_id String
|
||||||
|
date String
|
||||||
|
api_key String
|
||||||
|
model String
|
||||||
|
model_group String?
|
||||||
|
custom_llm_provider String?
|
||||||
|
prompt_tokens Int @default(0)
|
||||||
|
completion_tokens Int @default(0)
|
||||||
|
spend Float @default(0.0)
|
||||||
|
api_requests Int @default(0)
|
||||||
|
successful_requests Int @default(0)
|
||||||
|
failed_requests Int @default(0)
|
||||||
|
created_at DateTime @default(now())
|
||||||
|
updated_at DateTime @updatedAt
|
||||||
|
|
||||||
|
@@unique([user_id, date, api_key, model, custom_llm_provider])
|
||||||
|
@@index([date])
|
||||||
|
@@index([user_id])
|
||||||
|
@@index([api_key])
|
||||||
|
@@index([model])
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Track the status of cron jobs running. Only allow one pod to run the job at a time
|
||||||
|
model LiteLLM_CronJob {
|
||||||
|
cronjob_id String @id @default(cuid()) // Unique ID for the record
|
||||||
|
pod_id String // Unique identifier for the pod acting as the leader
|
||||||
|
status JobStatus @default(INACTIVE) // Status of the cron job (active or inactive)
|
||||||
|
last_updated DateTime @default(now()) // Timestamp for the last update of the cron job record
|
||||||
|
ttl DateTime // Time when the leader's lease expires
|
||||||
|
}
|
||||||
|
|
||||||
|
enum JobStatus {
|
||||||
|
ACTIVE
|
||||||
|
INACTIVE
|
||||||
|
}
|
||||||
|
|
|
@ -30,21 +30,23 @@ class ProxyExtrasDBManager:
|
||||||
use_migrate = str_to_bool(os.getenv("USE_PRISMA_MIGRATE")) or use_migrate
|
use_migrate = str_to_bool(os.getenv("USE_PRISMA_MIGRATE")) or use_migrate
|
||||||
for attempt in range(4):
|
for attempt in range(4):
|
||||||
original_dir = os.getcwd()
|
original_dir = os.getcwd()
|
||||||
schema_dir = os.path.dirname(schema_path)
|
migrations_dir = os.path.dirname(__file__)
|
||||||
os.chdir(schema_dir)
|
os.chdir(migrations_dir)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if use_migrate:
|
if use_migrate:
|
||||||
logger.info("Running prisma migrate deploy")
|
logger.info("Running prisma migrate deploy")
|
||||||
try:
|
try:
|
||||||
# Set migrations directory for Prisma
|
# Set migrations directory for Prisma
|
||||||
subprocess.run(
|
result = subprocess.run(
|
||||||
["prisma", "migrate", "deploy"],
|
["prisma", "migrate", "deploy"],
|
||||||
timeout=60,
|
timeout=60,
|
||||||
check=True,
|
check=True,
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
)
|
)
|
||||||
|
logger.info(f"prisma migrate deploy stdout: {result.stdout}")
|
||||||
|
|
||||||
logger.info("prisma migrate deploy completed")
|
logger.info("prisma migrate deploy completed")
|
||||||
return True
|
return True
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
|
@ -77,4 +79,5 @@ class ProxyExtrasDBManager:
|
||||||
time.sleep(random.randrange(5, 15))
|
time.sleep(random.randrange(5, 15))
|
||||||
finally:
|
finally:
|
||||||
os.chdir(original_dir)
|
os.chdir(original_dir)
|
||||||
|
pass
|
||||||
return False
|
return False
|
||||||
|
|
7
litellm-proxy-extras/poetry.lock
generated
Normal file
7
litellm-proxy-extras/poetry.lock
generated
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
|
||||||
|
package = []
|
||||||
|
|
||||||
|
[metadata]
|
||||||
|
lock-version = "2.0"
|
||||||
|
python-versions = ">=3.8.1,<4.0, !=3.9.7"
|
||||||
|
content-hash = "2cf39473e67ff0615f0a61c9d2ac9f02b38cc08cbb1bdb893d89bee002646623"
|
|
@ -1,6 +1,6 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "litellm-proxy-extras"
|
name = "litellm-proxy-extras"
|
||||||
version = "0.1.1"
|
version = "0.1.3"
|
||||||
description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package."
|
description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package."
|
||||||
authors = ["BerriAI"]
|
authors = ["BerriAI"]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
@ -22,7 +22,7 @@ requires = ["poetry-core"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
[tool.commitizen]
|
[tool.commitizen]
|
||||||
version = "0.1.1"
|
version = "0.1.3"
|
||||||
version_files = [
|
version_files = [
|
||||||
"pyproject.toml:version",
|
"pyproject.toml:version",
|
||||||
"../requirements.txt:litellm-proxy-extras==",
|
"../requirements.txt:litellm-proxy-extras==",
|
||||||
|
|
|
@ -56,6 +56,9 @@ from litellm.constants import (
|
||||||
bedrock_embedding_models,
|
bedrock_embedding_models,
|
||||||
known_tokenizer_config,
|
known_tokenizer_config,
|
||||||
BEDROCK_INVOKE_PROVIDERS_LITERAL,
|
BEDROCK_INVOKE_PROVIDERS_LITERAL,
|
||||||
|
DEFAULT_MAX_TOKENS,
|
||||||
|
DEFAULT_SOFT_BUDGET,
|
||||||
|
DEFAULT_ALLOWED_FAILS,
|
||||||
)
|
)
|
||||||
from litellm.types.guardrails import GuardrailItem
|
from litellm.types.guardrails import GuardrailItem
|
||||||
from litellm.proxy._types import (
|
from litellm.proxy._types import (
|
||||||
|
@ -120,6 +123,7 @@ callbacks: List[
|
||||||
langfuse_default_tags: Optional[List[str]] = None
|
langfuse_default_tags: Optional[List[str]] = None
|
||||||
langsmith_batch_size: Optional[int] = None
|
langsmith_batch_size: Optional[int] = None
|
||||||
prometheus_initialize_budget_metrics: Optional[bool] = False
|
prometheus_initialize_budget_metrics: Optional[bool] = False
|
||||||
|
require_auth_for_metrics_endpoint: Optional[bool] = False
|
||||||
argilla_batch_size: Optional[int] = None
|
argilla_batch_size: Optional[int] = None
|
||||||
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
|
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
|
||||||
gcs_pub_sub_use_v1: Optional[
|
gcs_pub_sub_use_v1: Optional[
|
||||||
|
@ -155,7 +159,7 @@ token: Optional[
|
||||||
str
|
str
|
||||||
] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
|
] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
|
||||||
telemetry = True
|
telemetry = True
|
||||||
max_tokens = 256 # OpenAI Defaults
|
max_tokens: int = DEFAULT_MAX_TOKENS # OpenAI Defaults
|
||||||
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
|
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
|
||||||
modify_params = False
|
modify_params = False
|
||||||
retry = True
|
retry = True
|
||||||
|
@ -244,7 +248,7 @@ budget_duration: Optional[
|
||||||
str
|
str
|
||||||
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||||
default_soft_budget: float = (
|
default_soft_budget: float = (
|
||||||
50.0 # by default all litellm proxy keys have a soft budget of 50.0
|
DEFAULT_SOFT_BUDGET # by default all litellm proxy keys have a soft budget of 50.0
|
||||||
)
|
)
|
||||||
forward_traceparent_to_llm_provider: bool = False
|
forward_traceparent_to_llm_provider: bool = False
|
||||||
|
|
||||||
|
@ -796,9 +800,8 @@ from .llms.aiohttp_openai.chat.transformation import AiohttpOpenAIChatConfig
|
||||||
from .llms.galadriel.chat.transformation import GaladrielChatConfig
|
from .llms.galadriel.chat.transformation import GaladrielChatConfig
|
||||||
from .llms.github.chat.transformation import GithubChatConfig
|
from .llms.github.chat.transformation import GithubChatConfig
|
||||||
from .llms.empower.chat.transformation import EmpowerChatConfig
|
from .llms.empower.chat.transformation import EmpowerChatConfig
|
||||||
from .llms.huggingface.chat.transformation import (
|
from .llms.huggingface.chat.transformation import HuggingFaceChatConfig
|
||||||
HuggingfaceChatConfig as HuggingfaceConfig,
|
from .llms.huggingface.embedding.transformation import HuggingFaceEmbeddingConfig
|
||||||
)
|
|
||||||
from .llms.oobabooga.chat.transformation import OobaboogaConfig
|
from .llms.oobabooga.chat.transformation import OobaboogaConfig
|
||||||
from .llms.maritalk import MaritalkConfig
|
from .llms.maritalk import MaritalkConfig
|
||||||
from .llms.openrouter.chat.transformation import OpenrouterConfig
|
from .llms.openrouter.chat.transformation import OpenrouterConfig
|
||||||
|
@ -1038,6 +1041,7 @@ from .cost_calculator import response_cost_calculator, cost_per_token
|
||||||
|
|
||||||
### ADAPTERS ###
|
### ADAPTERS ###
|
||||||
from .types.adapter import AdapterItem
|
from .types.adapter import AdapterItem
|
||||||
|
import litellm.anthropic_interface as anthropic
|
||||||
|
|
||||||
adapters: List[AdapterItem] = []
|
adapters: List[AdapterItem] = []
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,7 @@ import redis # type: ignore
|
||||||
import redis.asyncio as async_redis # type: ignore
|
import redis.asyncio as async_redis # type: ignore
|
||||||
|
|
||||||
from litellm import get_secret, get_secret_str
|
from litellm import get_secret, get_secret_str
|
||||||
|
from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT
|
||||||
|
|
||||||
from ._logging import verbose_logger
|
from ._logging import verbose_logger
|
||||||
|
|
||||||
|
@ -214,8 +215,8 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
|
||||||
|
|
||||||
# Set up the Sentinel client
|
# Set up the Sentinel client
|
||||||
sentinel = redis.Sentinel(
|
sentinel = redis.Sentinel(
|
||||||
sentinel_nodes,
|
sentinel_nodes,
|
||||||
socket_timeout=0.1,
|
socket_timeout=REDIS_SOCKET_TIMEOUT,
|
||||||
password=sentinel_password,
|
password=sentinel_password,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -239,7 +240,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
|
||||||
# Set up the Sentinel client
|
# Set up the Sentinel client
|
||||||
sentinel = async_redis.Sentinel(
|
sentinel = async_redis.Sentinel(
|
||||||
sentinel_nodes,
|
sentinel_nodes,
|
||||||
socket_timeout=0.1,
|
socket_timeout=REDIS_SOCKET_TIMEOUT,
|
||||||
password=sentinel_password,
|
password=sentinel_password,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -319,7 +320,7 @@ def get_redis_connection_pool(**env_overrides):
|
||||||
verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
|
verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
|
||||||
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
|
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
|
||||||
return async_redis.BlockingConnectionPool.from_url(
|
return async_redis.BlockingConnectionPool.from_url(
|
||||||
timeout=5, url=redis_kwargs["url"]
|
timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"]
|
||||||
)
|
)
|
||||||
connection_class = async_redis.Connection
|
connection_class = async_redis.Connection
|
||||||
if "ssl" in redis_kwargs:
|
if "ssl" in redis_kwargs:
|
||||||
|
@ -327,4 +328,6 @@ def get_redis_connection_pool(**env_overrides):
|
||||||
redis_kwargs.pop("ssl", None)
|
redis_kwargs.pop("ssl", None)
|
||||||
redis_kwargs["connection_class"] = connection_class
|
redis_kwargs["connection_class"] = connection_class
|
||||||
redis_kwargs.pop("startup_nodes", None)
|
redis_kwargs.pop("startup_nodes", None)
|
||||||
return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
|
return async_redis.BlockingConnectionPool(
|
||||||
|
timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs
|
||||||
|
)
|
||||||
|
|
|
@ -124,6 +124,7 @@ class ServiceLogging(CustomLogger):
|
||||||
service=service,
|
service=service,
|
||||||
duration=duration,
|
duration=duration,
|
||||||
call_type=call_type,
|
call_type=call_type,
|
||||||
|
event_metadata=event_metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
for callback in litellm.service_callback:
|
for callback in litellm.service_callback:
|
||||||
|
@ -229,6 +230,7 @@ class ServiceLogging(CustomLogger):
|
||||||
service=service,
|
service=service,
|
||||||
duration=duration,
|
duration=duration,
|
||||||
call_type=call_type,
|
call_type=call_type,
|
||||||
|
event_metadata=event_metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
for callback in litellm.service_callback:
|
for callback in litellm.service_callback:
|
||||||
|
|
|
@ -3,4 +3,4 @@ import importlib_metadata
|
||||||
try:
|
try:
|
||||||
version = importlib_metadata.version("litellm")
|
version = importlib_metadata.version("litellm")
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
version = "unknown"
|
||||||
|
|
6
litellm/anthropic_interface/__init__.py
Normal file
6
litellm/anthropic_interface/__init__.py
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
"""
|
||||||
|
Anthropic module for LiteLLM
|
||||||
|
"""
|
||||||
|
from .messages import acreate, create
|
||||||
|
|
||||||
|
__all__ = ["acreate", "create"]
|
117
litellm/anthropic_interface/messages/__init__.py
Normal file
117
litellm/anthropic_interface/messages/__init__.py
Normal file
|
@ -0,0 +1,117 @@
|
||||||
|
"""
|
||||||
|
Interface for Anthropic's messages API
|
||||||
|
|
||||||
|
Use this to call LLMs in Anthropic /messages Request/Response format
|
||||||
|
|
||||||
|
This is an __init__.py file to allow the following interface
|
||||||
|
|
||||||
|
- litellm.messages.acreate
|
||||||
|
- litellm.messages.create
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
|
||||||
|
|
||||||
|
from litellm.llms.anthropic.experimental_pass_through.messages.handler import (
|
||||||
|
anthropic_messages as _async_anthropic_messages,
|
||||||
|
)
|
||||||
|
from litellm.types.llms.anthropic_messages.anthropic_response import (
|
||||||
|
AnthropicMessagesResponse,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def acreate(
|
||||||
|
max_tokens: int,
|
||||||
|
messages: List[Dict],
|
||||||
|
model: str,
|
||||||
|
metadata: Optional[Dict] = None,
|
||||||
|
stop_sequences: Optional[List[str]] = None,
|
||||||
|
stream: Optional[bool] = False,
|
||||||
|
system: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = 1.0,
|
||||||
|
thinking: Optional[Dict] = None,
|
||||||
|
tool_choice: Optional[Dict] = None,
|
||||||
|
tools: Optional[List[Dict]] = None,
|
||||||
|
top_k: Optional[int] = None,
|
||||||
|
top_p: Optional[float] = None,
|
||||||
|
**kwargs
|
||||||
|
) -> Union[AnthropicMessagesResponse, AsyncIterator]:
|
||||||
|
"""
|
||||||
|
Async wrapper for Anthropic's messages API
|
||||||
|
|
||||||
|
Args:
|
||||||
|
max_tokens (int): Maximum tokens to generate (required)
|
||||||
|
messages (List[Dict]): List of message objects with role and content (required)
|
||||||
|
model (str): Model name to use (required)
|
||||||
|
metadata (Dict, optional): Request metadata
|
||||||
|
stop_sequences (List[str], optional): Custom stop sequences
|
||||||
|
stream (bool, optional): Whether to stream the response
|
||||||
|
system (str, optional): System prompt
|
||||||
|
temperature (float, optional): Sampling temperature (0.0 to 1.0)
|
||||||
|
thinking (Dict, optional): Extended thinking configuration
|
||||||
|
tool_choice (Dict, optional): Tool choice configuration
|
||||||
|
tools (List[Dict], optional): List of tool definitions
|
||||||
|
top_k (int, optional): Top K sampling parameter
|
||||||
|
top_p (float, optional): Nucleus sampling parameter
|
||||||
|
**kwargs: Additional arguments
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict: Response from the API
|
||||||
|
"""
|
||||||
|
return await _async_anthropic_messages(
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
messages=messages,
|
||||||
|
model=model,
|
||||||
|
metadata=metadata,
|
||||||
|
stop_sequences=stop_sequences,
|
||||||
|
stream=stream,
|
||||||
|
system=system,
|
||||||
|
temperature=temperature,
|
||||||
|
thinking=thinking,
|
||||||
|
tool_choice=tool_choice,
|
||||||
|
tools=tools,
|
||||||
|
top_k=top_k,
|
||||||
|
top_p=top_p,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def create(
|
||||||
|
max_tokens: int,
|
||||||
|
messages: List[Dict],
|
||||||
|
model: str,
|
||||||
|
metadata: Optional[Dict] = None,
|
||||||
|
stop_sequences: Optional[List[str]] = None,
|
||||||
|
stream: Optional[bool] = False,
|
||||||
|
system: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = 1.0,
|
||||||
|
thinking: Optional[Dict] = None,
|
||||||
|
tool_choice: Optional[Dict] = None,
|
||||||
|
tools: Optional[List[Dict]] = None,
|
||||||
|
top_k: Optional[int] = None,
|
||||||
|
top_p: Optional[float] = None,
|
||||||
|
**kwargs
|
||||||
|
) -> Union[AnthropicMessagesResponse, Iterator]:
|
||||||
|
"""
|
||||||
|
Async wrapper for Anthropic's messages API
|
||||||
|
|
||||||
|
Args:
|
||||||
|
max_tokens (int): Maximum tokens to generate (required)
|
||||||
|
messages (List[Dict]): List of message objects with role and content (required)
|
||||||
|
model (str): Model name to use (required)
|
||||||
|
metadata (Dict, optional): Request metadata
|
||||||
|
stop_sequences (List[str], optional): Custom stop sequences
|
||||||
|
stream (bool, optional): Whether to stream the response
|
||||||
|
system (str, optional): System prompt
|
||||||
|
temperature (float, optional): Sampling temperature (0.0 to 1.0)
|
||||||
|
thinking (Dict, optional): Extended thinking configuration
|
||||||
|
tool_choice (Dict, optional): Tool choice configuration
|
||||||
|
tools (List[Dict], optional): List of tool definitions
|
||||||
|
top_k (int, optional): Top K sampling parameter
|
||||||
|
top_p (float, optional): Nucleus sampling parameter
|
||||||
|
**kwargs: Additional arguments
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict: Response from the API
|
||||||
|
"""
|
||||||
|
raise NotImplementedError("This function is not implemented")
|
116
litellm/anthropic_interface/readme.md
Normal file
116
litellm/anthropic_interface/readme.md
Normal file
|
@ -0,0 +1,116 @@
|
||||||
|
## Use LLM API endpoints in Anthropic Interface
|
||||||
|
|
||||||
|
Note: This is called `anthropic_interface` because `anthropic` is an existing Python package, and reusing that name was failing mypy type checking.
|
||||||
|
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
---
|
||||||
|
|
||||||
|
### LiteLLM Python SDK
|
||||||
|
|
||||||
|
#### Non-streaming example
|
||||||
|
```python showLineNumbers title="Example using LiteLLM Python SDK"
|
||||||
|
import litellm
|
||||||
|
response = await litellm.anthropic.messages.acreate(
|
||||||
|
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
|
||||||
|
api_key=api_key,
|
||||||
|
model="anthropic/claude-3-haiku-20240307",
|
||||||
|
max_tokens=100,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Example response:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"text": "Hi! this is a very short joke",
|
||||||
|
"type": "text"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
|
||||||
|
"model": "claude-3-7-sonnet-20250219",
|
||||||
|
"role": "assistant",
|
||||||
|
"stop_reason": "end_turn",
|
||||||
|
"stop_sequence": null,
|
||||||
|
"type": "message",
|
||||||
|
"usage": {
|
||||||
|
"input_tokens": 2095,
|
||||||
|
"output_tokens": 503,
|
||||||
|
"cache_creation_input_tokens": 2095,
|
||||||
|
"cache_read_input_tokens": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Streaming example
|
||||||
|
```python showLineNumbers title="Example using LiteLLM Python SDK"
|
||||||
|
import litellm
|
||||||
|
response = await litellm.anthropic.messages.acreate(
|
||||||
|
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
|
||||||
|
api_key=api_key,
|
||||||
|
model="anthropic/claude-3-haiku-20240307",
|
||||||
|
max_tokens=100,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
async for chunk in response:
|
||||||
|
print(chunk)
|
||||||
|
```
|
||||||
|
|
||||||
|
### LiteLLM Proxy Server
|
||||||
|
|
||||||
|
|
||||||
|
1. Setup config.yaml
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
model_list:
|
||||||
|
- model_name: anthropic-claude
|
||||||
|
litellm_params:
|
||||||
|
model: claude-3-7-sonnet-latest
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Start proxy
|
||||||
|
|
||||||
|
```bash
|
||||||
|
litellm --config /path/to/config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Test it!
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<TabItem label="Anthropic Python SDK" value="python">
|
||||||
|
|
||||||
|
```python showLineNumbers title="Example using LiteLLM Proxy Server"
|
||||||
|
import anthropic
|
||||||
|
|
||||||
|
# point anthropic sdk to litellm proxy
|
||||||
|
client = anthropic.Anthropic(
|
||||||
|
base_url="http://0.0.0.0:4000",
|
||||||
|
api_key="sk-1234",
|
||||||
|
)
|
||||||
|
|
||||||
|
response = client.messages.create(
|
||||||
|
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
|
||||||
|
model="anthropic/claude-3-haiku-20240307",
|
||||||
|
max_tokens=100,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
<TabItem label="curl" value="curl">
|
||||||
|
|
||||||
|
```bash showLineNumbers title="Example using LiteLLM Proxy Server"
|
||||||
|
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
|
||||||
|
-H 'content-type: application/json' \
|
||||||
|
-H 'x-api-key: $LITELLM_API_KEY' \
|
||||||
|
-H 'anthropic-version: 2023-06-01' \
|
||||||
|
-d '{
|
||||||
|
"model": "anthropic-claude",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Hello, can you tell me a short joke?"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"max_tokens": 100
|
||||||
|
}'
|
||||||
|
```
|
|
@ -14,6 +14,12 @@ import time
|
||||||
from typing import Literal, Optional
|
from typing import Literal, Optional
|
||||||
|
|
||||||
import litellm
|
import litellm
|
||||||
|
from litellm.constants import (
|
||||||
|
DAYS_IN_A_MONTH,
|
||||||
|
DAYS_IN_A_WEEK,
|
||||||
|
DAYS_IN_A_YEAR,
|
||||||
|
HOURS_IN_A_DAY,
|
||||||
|
)
|
||||||
from litellm.utils import ModelResponse
|
from litellm.utils import ModelResponse
|
||||||
|
|
||||||
|
|
||||||
|
@ -81,11 +87,11 @@ class BudgetManager:
|
||||||
if duration == "daily":
|
if duration == "daily":
|
||||||
duration_in_days = 1
|
duration_in_days = 1
|
||||||
elif duration == "weekly":
|
elif duration == "weekly":
|
||||||
duration_in_days = 7
|
duration_in_days = DAYS_IN_A_WEEK
|
||||||
elif duration == "monthly":
|
elif duration == "monthly":
|
||||||
duration_in_days = 28
|
duration_in_days = DAYS_IN_A_MONTH
|
||||||
elif duration == "yearly":
|
elif duration == "yearly":
|
||||||
duration_in_days = 365
|
duration_in_days = DAYS_IN_A_YEAR
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"""duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
|
"""duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
|
||||||
|
@ -182,7 +188,9 @@ class BudgetManager:
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
|
|
||||||
# Convert duration from days to seconds
|
# Convert duration from days to seconds
|
||||||
duration_in_seconds = self.user_dict[user]["duration"] * 24 * 60 * 60
|
duration_in_seconds = (
|
||||||
|
self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60
|
||||||
|
)
|
||||||
|
|
||||||
# Check if duration has elapsed
|
# Check if duration has elapsed
|
||||||
if current_time - last_updated_at >= duration_in_seconds:
|
if current_time - last_updated_at >= duration_in_seconds:
|
||||||
|
|
|
@ -19,6 +19,7 @@ from pydantic import BaseModel
|
||||||
|
|
||||||
import litellm
|
import litellm
|
||||||
from litellm._logging import verbose_logger
|
from litellm._logging import verbose_logger
|
||||||
|
from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
|
||||||
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
|
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
|
||||||
from litellm.types.caching import *
|
from litellm.types.caching import *
|
||||||
from litellm.types.utils import all_litellm_params
|
from litellm.types.utils import all_litellm_params
|
||||||
|
@ -406,7 +407,7 @@ class Cache:
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
time.sleep(0.02)
|
time.sleep(CACHED_STREAMING_CHUNK_DELAY)
|
||||||
|
|
||||||
def _get_cache_logic(
|
def _get_cache_logic(
|
||||||
self,
|
self,
|
||||||
|
|
|
@ -15,7 +15,8 @@ from typing import Any, List, Optional
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
|
from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
|
||||||
|
|
||||||
from .base_cache import BaseCache
|
from .base_cache import BaseCache
|
||||||
|
|
||||||
|
|
||||||
|
@ -52,7 +53,8 @@ class InMemoryCache(BaseCache):
|
||||||
# Fast path for common primitive types that are typically small
|
# Fast path for common primitive types that are typically small
|
||||||
if (
|
if (
|
||||||
isinstance(value, (bool, int, float, str))
|
isinstance(value, (bool, int, float, str))
|
||||||
and len(str(value)) < self.max_size_per_item * 512
|
and len(str(value))
|
||||||
|
< self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
|
||||||
): # Conservative estimate
|
): # Conservative estimate
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
|
@@ -11,10 +11,12 @@ Has 4 methods:
 import ast
 import asyncio
 import json
-from typing import Any
+from typing import Any, cast

 import litellm
 from litellm._logging import print_verbose
+from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
+from litellm.types.utils import EmbeddingResponse

 from .base_cache import BaseCache

@@ -118,7 +120,11 @@ class QdrantSemanticCache(BaseCache):
             }
         elif quantization_config == "scalar":
             quantization_params = {
-                "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False}
+                "scalar": {
+                    "type": "int8",
+                    "quantile": QDRANT_SCALAR_QUANTILE,
+                    "always_ram": False,
+                }
             }
         elif quantization_config == "product":
             quantization_params = {

@@ -132,7 +138,7 @@ class QdrantSemanticCache(BaseCache):
             new_collection_status = self.sync_client.put(
                 url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
                 json={
-                    "vectors": {"size": 1536, "distance": "Cosine"},
+                    "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
                     "quantization_config": quantization_params,
                 },
                 headers=self.headers,

@@ -171,10 +177,13 @@ class QdrantSemanticCache(BaseCache):
                 prompt += message["content"]

         # create an embedding for prompt
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
         )

         # get the embedding

@@ -212,10 +221,13 @@ class QdrantSemanticCache(BaseCache):
                 prompt += message["content"]

         # convert to embedding
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
         )

         # get the embedding
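The Qdrant cache change above wraps litellm.embedding() in typing.cast so the result is treated as an EmbeddingResponse by the type checker. A hedged, self-contained sketch of the same pattern (the helper and the OpenAI-style response indexing are assumptions; running it requires a configured embedding provider):

# Sketch of the cast() narrowing pattern; cast() is a no-op at runtime.
from typing import List, cast

import litellm
from litellm.types.utils import EmbeddingResponse


def embed_prompt(prompt: str, embedding_model: str) -> List[float]:
    embedding_response = cast(
        EmbeddingResponse,
        litellm.embedding(
            model=embedding_model,
            input=prompt,
            cache={"no-store": True, "no-cache": True},
        ),
    )
    # assumes the usual OpenAI-style response shape: data[0]["embedding"]
    return embedding_response.data[0]["embedding"]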
@@ -304,12 +304,18 @@ class RedisCache(BaseCache):

         key = self.check_and_fix_namespace(key=key)
         ttl = self.get_ttl(**kwargs)
+        nx = kwargs.get("nx", False)
         print_verbose(f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}")

         try:
             if not hasattr(_redis_client, "set"):
                 raise Exception("Redis client cannot set cache. Attribute not found.")
-            await _redis_client.set(name=key, value=json.dumps(value), ex=ttl)
+            result = await _redis_client.set(
+                name=key,
+                value=json.dumps(value),
+                nx=nx,
+                ex=ttl,
+            )
             print_verbose(
                 f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
             )

@@ -326,6 +332,7 @@ class RedisCache(BaseCache):
                     event_metadata={"key": key},
                 )
             )
+            return result
         except Exception as e:
             end_time = time.time()
             _duration = end_time - start_time

@@ -931,7 +938,7 @@ class RedisCache(BaseCache):
         # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `delete`
         _redis_client: Any = self.init_async_client()
         # keys is str
-        await _redis_client.delete(key)
+        return await _redis_client.delete(key)

     def delete_cache(self, key):
         self.redis_client.delete(key)
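The Redis change above threads an optional nx flag through to the underlying SET and returns its result. A standalone sketch of that SET NX semantics with redis-py (not the litellm code; assumes a local Redis server):

# With nx=True, redis only writes if the key is absent and returns None otherwise.
import json

import redis.asyncio as redis


async def set_if_absent(key: str, value: dict, ttl: int) -> bool:
    client = redis.Redis()  # defaults to localhost:6379
    result = await client.set(name=key, value=json.dumps(value), nx=True, ex=ttl)
    await client.close()
    return result is not None  # True only if this call created the key

# e.g. asyncio.run(set_if_absent("litellm:lock", {"owner": "worker-1"}, ttl=60))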
@@ -9,6 +9,7 @@ DEFAULT_FAILURE_THRESHOLD_PERCENT = (
     0.5 # default cooldown a deployment if 50% of requests fail in a given minute
 )
 DEFAULT_MAX_TOKENS = 4096
+DEFAULT_ALLOWED_FAILS = 3
 DEFAULT_REDIS_SYNC_INTERVAL = 1
 DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5

@@ -16,15 +17,76 @@ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+DEFAULT_MAX_TOKENS = 256  # used when providers need a default
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.

+########### v2 Architecture constants for managing writing updates to the database ###########
 REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
+REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
 MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
+MAX_SIZE_IN_MEMORY_QUEUE = 10000
+MAX_IN_MEMORY_QUEUE_FLUSH_COUNT = 1000
+###############################################################################################
+MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
+    1024  # minimum number of tokens to cache a prompt by Anthropic
+)
+DEFAULT_TRIM_RATIO = 0.75  # default ratio of tokens to trim from the end of a prompt
+HOURS_IN_A_DAY = 24
+DAYS_IN_A_WEEK = 7
+DAYS_IN_A_MONTH = 28
+DAYS_IN_A_YEAR = 365
+REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
+#### TOKEN COUNTING ####
+FUNCTION_DEFINITION_TOKEN_COUNT = 9
+SYSTEM_MESSAGE_TOKEN_COUNT = 4
+TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
+DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
+DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
+MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
+MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
+MAX_TILE_WIDTH = 512
+MAX_TILE_HEIGHT = 512
+OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
+MIN_NON_ZERO_TEMPERATURE = 0.0001
 #### RELIABILITY ####
 REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
+DEFAULT_MAX_LRU_CACHE_SIZE = 16
+INITIAL_RETRY_DELAY = 0.5
+MAX_RETRY_DELAY = 8.0
+JITTER = 0.75
+DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
+DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
+AZURE_OPERATION_POLLING_TIMEOUT = 120
+REDIS_SOCKET_TIMEOUT = 0.1
+REDIS_CONNECTION_POOL_TIMEOUT = 5
+NON_LLM_CONNECTION_TIMEOUT = 15  # timeout for adjacent services (e.g. jwt auth)
+MAX_EXCEPTION_MESSAGE_LENGTH = 2000
+BEDROCK_MAX_POLICY_SIZE = 75
+REPLICATE_POLLING_DELAY_SECONDS = 0.5
+DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
+TOGETHER_AI_4_B = 4
+TOGETHER_AI_8_B = 8
+TOGETHER_AI_21_B = 21
+TOGETHER_AI_41_B = 41
+TOGETHER_AI_80_B = 80
+TOGETHER_AI_110_B = 110
+TOGETHER_AI_EMBEDDING_150_M = 150
+TOGETHER_AI_EMBEDDING_350_M = 350
+QDRANT_SCALAR_QUANTILE = 0.99
+QDRANT_VECTOR_SIZE = 1536
+CACHED_STREAMING_CHUNK_DELAY = 0.02
+MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
+DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
 #### Networking settings ####
 request_timeout: float = 6000 # time in seconds
 STREAM_SSE_DONE_STRING: str = "[DONE]"
+### SPEND TRACKING ###
+DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400  # price per second for a100 80GB
+FIREWORKS_AI_56_B_MOE = 56
+FIREWORKS_AI_176_B_MOE = 176
+FIREWORKS_AI_16_B = 16
+FIREWORKS_AI_80_B = 80

 LITELLM_CHAT_PROVIDERS = [
     "openai",

@@ -425,6 +487,9 @@ MCP_TOOL_NAME_PREFIX = "mcp_tool"
 MAX_SPENDLOG_ROWS_TO_QUERY = (
     1_000_000 # if spendLogs has more than 1M rows, do not query the DB
 )
+DEFAULT_SOFT_BUDGET = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 # makes it clear this is a rate limit error for a litellm virtual key
 RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"

@@ -450,3 +515,14 @@ LITELLM_PROXY_ADMIN_NAME = "default_user_id"
 ########################### DB CRON JOB NAMES ###########################
 DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
 DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60 # 1 minute
+PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
+PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605
+PROXY_BATCH_WRITE_AT = 10  # in seconds
+DEFAULT_HEALTH_CHECK_INTERVAL = 300  # 5 minutes
+PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9
+DEFAULT_MODEL_CREATED_AT_TIME = 1677610602  # returns on `/models` endpoint
+DEFAULT_SLACK_ALERTING_THRESHOLD = 300
+MAX_TEAM_LIST_LIMIT = 20
+DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7
+LENGTH_OF_LITELLM_GENERATED_KEY = 16
+SECRET_MANAGER_REFRESH_INTERVAL = 86400
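Call sites elsewhere in this commit import these values instead of repeating magic numbers. A tiny sketch, assuming a litellm build that contains this commit:

# Importing named constants from litellm.constants instead of inlining literals.
from litellm.constants import DAYS_IN_A_MONTH, HOURS_IN_A_DAY, MAX_RETRY_DELAY

ONE_MONTH_TTL_SECONDS = DAYS_IN_A_MONTH * HOURS_IN_A_DAY * 60 * 60  # 28 days, per the constant
print(ONE_MONTH_TTL_SECONDS, MAX_RETRY_DELAY)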
@@ -9,6 +9,10 @@ from pydantic import BaseModel
 import litellm
 import litellm._logging
 from litellm import verbose_logger
+from litellm.constants import (
+    DEFAULT_MAX_LRU_CACHE_SIZE,
+    DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND,
+)
 from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
     StandardBuiltInToolCostTracking,
 )

@@ -355,9 +359,7 @@ def cost_per_token( # noqa: PLR0915
 def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
     # see https://replicate.com/pricing
     # for all litellm currently supported LLMs, almost all requests go to a100_80gb
-    a100_80gb_price_per_second_public = (
-        0.001400 # assume all calls sent to A100 80GB for now
-    )
+    a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND # assume all calls sent to A100 80GB for now
     if total_time == 0.0: # total time is in ms
         start_time = completion_response.get("created", time.time())
         end_time = getattr(completion_response, "ended", time.time())

@@ -450,7 +452,7 @@ def _select_model_name_for_cost_calc(
     return return_model


-@lru_cache(maxsize=16)
+@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
 def _model_contains_known_llm_provider(model: str) -> bool:
     """
     Check if the model contains a known llm provider

@@ -550,6 +552,7 @@ def completion_cost( # noqa: PLR0915
     custom_pricing: Optional[bool] = None,
     base_model: Optional[str] = None,
     standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
+    litellm_model_name: Optional[str] = None,
 ) -> float:
     """
     Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm.

@@ -602,7 +605,7 @@ def completion_cost( # noqa: PLR0915
             completion_response=completion_response
         )
         rerank_billed_units: Optional[RerankBilledUnits] = None
-        model = _select_model_name_for_cost_calc(
+        selected_model = _select_model_name_for_cost_calc(
             model=model,
             completion_response=completion_response,
             custom_llm_provider=custom_llm_provider,
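The decorator change above sizes the LRU cache from DEFAULT_MAX_LRU_CACHE_SIZE instead of a literal. A minimal sketch with a stand-in function, not the litellm helper:

# The cache size is read from a named constant so it can be tuned in one place.
from functools import lru_cache

DEFAULT_MAX_LRU_CACHE_SIZE = 16  # mirrors the constant added in this commit


@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def normalize_model_name(model: str) -> str:
    # repeated calls with the same model string hit the cache
    return model.lower().strip()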
@@ -610,232 +613,268 @@ def completion_cost( # noqa: PLR0915
|
||||||
base_model=base_model,
|
base_model=base_model,
|
||||||
)
|
)
|
||||||
|
|
||||||
verbose_logger.info(f"selected model name for cost calculation: {model}")
|
potential_model_names = [selected_model]
|
||||||
|
if model is not None:
|
||||||
|
potential_model_names.append(model)
|
||||||
|
|
||||||
if completion_response is not None and (
|
for idx, model in enumerate(potential_model_names):
|
||||||
isinstance(completion_response, BaseModel)
|
|
||||||
or isinstance(completion_response, dict)
|
|
||||||
): # tts returns a custom class
|
|
||||||
if isinstance(completion_response, dict):
|
|
||||||
usage_obj: Optional[Union[dict, Usage]] = completion_response.get(
|
|
||||||
"usage", {}
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
usage_obj = getattr(completion_response, "usage", {})
|
|
||||||
if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
|
|
||||||
usage_obj=usage_obj
|
|
||||||
):
|
|
||||||
setattr(
|
|
||||||
completion_response,
|
|
||||||
"usage",
|
|
||||||
litellm.Usage(**usage_obj.model_dump()),
|
|
||||||
)
|
|
||||||
if usage_obj is None:
|
|
||||||
_usage = {}
|
|
||||||
elif isinstance(usage_obj, BaseModel):
|
|
||||||
_usage = usage_obj.model_dump()
|
|
||||||
else:
|
|
||||||
_usage = usage_obj
|
|
||||||
|
|
||||||
if ResponseAPILoggingUtils._is_response_api_usage(_usage):
|
|
||||||
_usage = (
|
|
||||||
ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
|
|
||||||
_usage
|
|
||||||
).model_dump()
|
|
||||||
)
|
|
||||||
|
|
||||||
# get input/output tokens from completion_response
|
|
||||||
prompt_tokens = _usage.get("prompt_tokens", 0)
|
|
||||||
completion_tokens = _usage.get("completion_tokens", 0)
|
|
||||||
cache_creation_input_tokens = _usage.get("cache_creation_input_tokens", 0)
|
|
||||||
cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
|
|
||||||
if (
|
|
||||||
"prompt_tokens_details" in _usage
|
|
||||||
and _usage["prompt_tokens_details"] != {}
|
|
||||||
and _usage["prompt_tokens_details"]
|
|
||||||
):
|
|
||||||
prompt_tokens_details = _usage.get("prompt_tokens_details", {})
|
|
||||||
cache_read_input_tokens = prompt_tokens_details.get("cached_tokens", 0)
|
|
||||||
|
|
||||||
total_time = getattr(completion_response, "_response_ms", 0)
|
|
||||||
|
|
||||||
hidden_params = getattr(completion_response, "_hidden_params", None)
|
|
||||||
if hidden_params is not None:
|
|
||||||
custom_llm_provider = hidden_params.get(
|
|
||||||
"custom_llm_provider", custom_llm_provider or None
|
|
||||||
)
|
|
||||||
region_name = hidden_params.get("region_name", region_name)
|
|
||||||
size = hidden_params.get("optional_params", {}).get(
|
|
||||||
"size", "1024-x-1024"
|
|
||||||
) # openai default
|
|
||||||
quality = hidden_params.get("optional_params", {}).get(
|
|
||||||
"quality", "standard"
|
|
||||||
) # openai default
|
|
||||||
n = hidden_params.get("optional_params", {}).get(
|
|
||||||
"n", 1
|
|
||||||
) # openai default
|
|
||||||
else:
|
|
||||||
if model is None:
|
|
||||||
raise ValueError(
|
|
||||||
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
|
|
||||||
)
|
|
||||||
if len(messages) > 0:
|
|
||||||
prompt_tokens = token_counter(model=model, messages=messages)
|
|
||||||
elif len(prompt) > 0:
|
|
||||||
prompt_tokens = token_counter(model=model, text=prompt)
|
|
||||||
completion_tokens = token_counter(model=model, text=completion)
|
|
||||||
|
|
||||||
if model is None:
|
|
||||||
raise ValueError(
|
|
||||||
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
|
|
||||||
)
|
|
||||||
if custom_llm_provider is None:
|
|
||||||
try:
|
try:
|
||||||
model, custom_llm_provider, _, _ = litellm.get_llm_provider(
|
verbose_logger.info(
|
||||||
model=model
|
f"selected model name for cost calculation: {model}"
|
||||||
) # strip the llm provider from the model name -> for image gen cost calculation
|
)
|
||||||
|
|
||||||
|
if completion_response is not None and (
|
||||||
|
isinstance(completion_response, BaseModel)
|
||||||
|
or isinstance(completion_response, dict)
|
||||||
|
): # tts returns a custom class
|
||||||
|
if isinstance(completion_response, dict):
|
||||||
|
usage_obj: Optional[
|
||||||
|
Union[dict, Usage]
|
||||||
|
] = completion_response.get("usage", {})
|
||||||
|
else:
|
||||||
|
usage_obj = getattr(completion_response, "usage", {})
|
||||||
|
if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
|
||||||
|
usage_obj=usage_obj
|
||||||
|
):
|
||||||
|
setattr(
|
||||||
|
completion_response,
|
||||||
|
"usage",
|
||||||
|
litellm.Usage(**usage_obj.model_dump()),
|
||||||
|
)
|
||||||
|
if usage_obj is None:
|
||||||
|
_usage = {}
|
||||||
|
elif isinstance(usage_obj, BaseModel):
|
||||||
|
_usage = usage_obj.model_dump()
|
||||||
|
else:
|
||||||
|
_usage = usage_obj
|
||||||
|
|
||||||
|
if ResponseAPILoggingUtils._is_response_api_usage(_usage):
|
||||||
|
_usage = ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
|
||||||
|
_usage
|
||||||
|
).model_dump()
|
||||||
|
|
||||||
|
# get input/output tokens from completion_response
|
||||||
|
prompt_tokens = _usage.get("prompt_tokens", 0)
|
||||||
|
completion_tokens = _usage.get("completion_tokens", 0)
|
||||||
|
cache_creation_input_tokens = _usage.get(
|
||||||
|
"cache_creation_input_tokens", 0
|
||||||
|
)
|
||||||
|
cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
|
||||||
|
if (
|
||||||
|
"prompt_tokens_details" in _usage
|
||||||
|
and _usage["prompt_tokens_details"] != {}
|
||||||
|
and _usage["prompt_tokens_details"]
|
||||||
|
):
|
||||||
|
prompt_tokens_details = _usage.get("prompt_tokens_details", {})
|
||||||
|
cache_read_input_tokens = prompt_tokens_details.get(
|
||||||
|
"cached_tokens", 0
|
||||||
|
)
|
||||||
|
|
||||||
|
total_time = getattr(completion_response, "_response_ms", 0)
|
||||||
|
|
||||||
|
hidden_params = getattr(completion_response, "_hidden_params", None)
|
||||||
|
if hidden_params is not None:
|
||||||
|
custom_llm_provider = hidden_params.get(
|
||||||
|
"custom_llm_provider", custom_llm_provider or None
|
||||||
|
)
|
||||||
|
region_name = hidden_params.get("region_name", region_name)
|
||||||
|
size = hidden_params.get("optional_params", {}).get(
|
||||||
|
"size", "1024-x-1024"
|
||||||
|
) # openai default
|
||||||
|
quality = hidden_params.get("optional_params", {}).get(
|
||||||
|
"quality", "standard"
|
||||||
|
) # openai default
|
||||||
|
n = hidden_params.get("optional_params", {}).get(
|
||||||
|
"n", 1
|
||||||
|
) # openai default
|
||||||
|
else:
|
||||||
|
if model is None:
|
||||||
|
raise ValueError(
|
||||||
|
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
|
||||||
|
)
|
||||||
|
if len(messages) > 0:
|
||||||
|
prompt_tokens = token_counter(model=model, messages=messages)
|
||||||
|
elif len(prompt) > 0:
|
||||||
|
prompt_tokens = token_counter(model=model, text=prompt)
|
||||||
|
completion_tokens = token_counter(model=model, text=completion)
|
||||||
|
|
||||||
|
if model is None:
|
||||||
|
raise ValueError(
|
||||||
|
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
|
||||||
|
)
|
||||||
|
if custom_llm_provider is None:
|
||||||
|
try:
|
||||||
|
model, custom_llm_provider, _, _ = litellm.get_llm_provider(
|
||||||
|
model=model
|
||||||
|
) # strip the llm provider from the model name -> for image gen cost calculation
|
||||||
|
except Exception as e:
|
||||||
|
verbose_logger.debug(
|
||||||
|
"litellm.cost_calculator.py::completion_cost() - Error inferring custom_llm_provider - {}".format(
|
||||||
|
str(e)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
call_type == CallTypes.image_generation.value
|
||||||
|
or call_type == CallTypes.aimage_generation.value
|
||||||
|
or call_type
|
||||||
|
== PassthroughCallTypes.passthrough_image_generation.value
|
||||||
|
):
|
||||||
|
### IMAGE GENERATION COST CALCULATION ###
|
||||||
|
if custom_llm_provider == "vertex_ai":
|
||||||
|
if isinstance(completion_response, ImageResponse):
|
||||||
|
return vertex_ai_image_cost_calculator(
|
||||||
|
model=model,
|
||||||
|
image_response=completion_response,
|
||||||
|
)
|
||||||
|
elif custom_llm_provider == "bedrock":
|
||||||
|
if isinstance(completion_response, ImageResponse):
|
||||||
|
return bedrock_image_cost_calculator(
|
||||||
|
model=model,
|
||||||
|
size=size,
|
||||||
|
image_response=completion_response,
|
||||||
|
optional_params=optional_params,
|
||||||
|
)
|
||||||
|
raise TypeError(
|
||||||
|
"completion_response must be of type ImageResponse for bedrock image cost calculation"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return default_image_cost_calculator(
|
||||||
|
model=model,
|
||||||
|
quality=quality,
|
||||||
|
custom_llm_provider=custom_llm_provider,
|
||||||
|
n=n,
|
||||||
|
size=size,
|
||||||
|
optional_params=optional_params,
|
||||||
|
)
|
||||||
|
elif (
|
||||||
|
call_type == CallTypes.speech.value
|
||||||
|
or call_type == CallTypes.aspeech.value
|
||||||
|
):
|
||||||
|
prompt_characters = litellm.utils._count_characters(text=prompt)
|
||||||
|
elif (
|
||||||
|
call_type == CallTypes.atranscription.value
|
||||||
|
or call_type == CallTypes.transcription.value
|
||||||
|
):
|
||||||
|
audio_transcription_file_duration = getattr(
|
||||||
|
completion_response, "duration", 0.0
|
||||||
|
)
|
||||||
|
elif (
|
||||||
|
call_type == CallTypes.rerank.value
|
||||||
|
or call_type == CallTypes.arerank.value
|
||||||
|
):
|
||||||
|
if completion_response is not None and isinstance(
|
||||||
|
completion_response, RerankResponse
|
||||||
|
):
|
||||||
|
meta_obj = completion_response.meta
|
||||||
|
if meta_obj is not None:
|
||||||
|
billed_units = meta_obj.get("billed_units", {}) or {}
|
||||||
|
else:
|
||||||
|
billed_units = {}
|
||||||
|
|
||||||
|
rerank_billed_units = RerankBilledUnits(
|
||||||
|
search_units=billed_units.get("search_units"),
|
||||||
|
total_tokens=billed_units.get("total_tokens"),
|
||||||
|
)
|
||||||
|
|
||||||
|
search_units = (
|
||||||
|
billed_units.get("search_units") or 1
|
||||||
|
) # cohere charges per request by default.
|
||||||
|
completion_tokens = search_units
|
||||||
|
# Calculate cost based on prompt_tokens, completion_tokens
|
||||||
|
if (
|
||||||
|
"togethercomputer" in model
|
||||||
|
or "together_ai" in model
|
||||||
|
or custom_llm_provider == "together_ai"
|
||||||
|
):
|
||||||
|
# together ai prices based on size of llm
|
||||||
|
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
|
||||||
|
|
||||||
|
model = get_model_params_and_category(
|
||||||
|
model, call_type=CallTypes(call_type)
|
||||||
|
)
|
||||||
|
|
||||||
|
# replicate llms are calculate based on time for request running
|
||||||
|
# see https://replicate.com/pricing
|
||||||
|
elif (
|
||||||
|
model in litellm.replicate_models or "replicate" in model
|
||||||
|
) and model not in litellm.model_cost:
|
||||||
|
# for unmapped replicate model, default to replicate's time tracking logic
|
||||||
|
return get_replicate_completion_pricing(completion_response, total_time) # type: ignore
|
||||||
|
|
||||||
|
if model is None:
|
||||||
|
raise ValueError(
|
||||||
|
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if (
|
||||||
|
custom_llm_provider is not None
|
||||||
|
and custom_llm_provider == "vertex_ai"
|
||||||
|
):
|
||||||
|
# Calculate the prompt characters + response characters
|
||||||
|
if len(messages) > 0:
|
||||||
|
prompt_string = litellm.utils.get_formatted_prompt(
|
||||||
|
data={"messages": messages}, call_type="completion"
|
||||||
|
)
|
||||||
|
|
||||||
|
prompt_characters = litellm.utils._count_characters(
|
||||||
|
text=prompt_string
|
||||||
|
)
|
||||||
|
if completion_response is not None and isinstance(
|
||||||
|
completion_response, ModelResponse
|
||||||
|
):
|
||||||
|
completion_string = litellm.utils.get_response_string(
|
||||||
|
response_obj=completion_response
|
||||||
|
)
|
||||||
|
completion_characters = litellm.utils._count_characters(
|
||||||
|
text=completion_string
|
||||||
|
)
|
||||||
|
|
||||||
|
(
|
||||||
|
prompt_tokens_cost_usd_dollar,
|
||||||
|
completion_tokens_cost_usd_dollar,
|
||||||
|
) = cost_per_token(
|
||||||
|
model=model,
|
||||||
|
prompt_tokens=prompt_tokens,
|
||||||
|
completion_tokens=completion_tokens,
|
||||||
|
custom_llm_provider=custom_llm_provider,
|
||||||
|
response_time_ms=total_time,
|
||||||
|
region_name=region_name,
|
||||||
|
custom_cost_per_second=custom_cost_per_second,
|
||||||
|
custom_cost_per_token=custom_cost_per_token,
|
||||||
|
prompt_characters=prompt_characters,
|
||||||
|
completion_characters=completion_characters,
|
||||||
|
cache_creation_input_tokens=cache_creation_input_tokens,
|
||||||
|
cache_read_input_tokens=cache_read_input_tokens,
|
||||||
|
usage_object=cost_per_token_usage_object,
|
||||||
|
call_type=cast(CallTypesLiteral, call_type),
|
||||||
|
audio_transcription_file_duration=audio_transcription_file_duration,
|
||||||
|
rerank_billed_units=rerank_billed_units,
|
||||||
|
)
|
||||||
|
_final_cost = (
|
||||||
|
prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
|
||||||
|
)
|
||||||
|
_final_cost += (
|
||||||
|
StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
|
||||||
|
model=model,
|
||||||
|
response_object=completion_response,
|
||||||
|
standard_built_in_tools_params=standard_built_in_tools_params,
|
||||||
|
custom_llm_provider=custom_llm_provider,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return _final_cost
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
verbose_logger.debug(
|
verbose_logger.debug(
|
||||||
"litellm.cost_calculator.py::completion_cost() - Error inferring custom_llm_provider - {}".format(
|
"litellm.cost_calculator.py::completion_cost() - Error calculating cost for model={} - {}".format(
|
||||||
str(e)
|
model, str(e)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if (
|
if idx == len(potential_model_names) - 1:
|
||||||
call_type == CallTypes.image_generation.value
|
raise e
|
||||||
or call_type == CallTypes.aimage_generation.value
|
raise Exception(
|
||||||
or call_type == PassthroughCallTypes.passthrough_image_generation.value
|
"Unable to calculat cost for received potential model names - {}".format(
|
||||||
):
|
potential_model_names
|
||||||
### IMAGE GENERATION COST CALCULATION ###
|
|
||||||
if custom_llm_provider == "vertex_ai":
|
|
||||||
if isinstance(completion_response, ImageResponse):
|
|
||||||
return vertex_ai_image_cost_calculator(
|
|
||||||
model=model,
|
|
||||||
image_response=completion_response,
|
|
||||||
)
|
|
||||||
elif custom_llm_provider == "bedrock":
|
|
||||||
if isinstance(completion_response, ImageResponse):
|
|
||||||
return bedrock_image_cost_calculator(
|
|
||||||
model=model,
|
|
||||||
size=size,
|
|
||||||
image_response=completion_response,
|
|
||||||
optional_params=optional_params,
|
|
||||||
)
|
|
||||||
raise TypeError(
|
|
||||||
"completion_response must be of type ImageResponse for bedrock image cost calculation"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
return default_image_cost_calculator(
|
|
||||||
model=model,
|
|
||||||
quality=quality,
|
|
||||||
custom_llm_provider=custom_llm_provider,
|
|
||||||
n=n,
|
|
||||||
size=size,
|
|
||||||
optional_params=optional_params,
|
|
||||||
)
|
|
||||||
elif (
|
|
||||||
call_type == CallTypes.speech.value or call_type == CallTypes.aspeech.value
|
|
||||||
):
|
|
||||||
prompt_characters = litellm.utils._count_characters(text=prompt)
|
|
||||||
elif (
|
|
||||||
call_type == CallTypes.atranscription.value
|
|
||||||
or call_type == CallTypes.transcription.value
|
|
||||||
):
|
|
||||||
audio_transcription_file_duration = getattr(
|
|
||||||
completion_response, "duration", 0.0
|
|
||||||
)
|
)
|
||||||
elif (
|
|
||||||
call_type == CallTypes.rerank.value or call_type == CallTypes.arerank.value
|
|
||||||
):
|
|
||||||
if completion_response is not None and isinstance(
|
|
||||||
completion_response, RerankResponse
|
|
||||||
):
|
|
||||||
meta_obj = completion_response.meta
|
|
||||||
if meta_obj is not None:
|
|
||||||
billed_units = meta_obj.get("billed_units", {}) or {}
|
|
||||||
else:
|
|
||||||
billed_units = {}
|
|
||||||
|
|
||||||
rerank_billed_units = RerankBilledUnits(
|
|
||||||
search_units=billed_units.get("search_units"),
|
|
||||||
total_tokens=billed_units.get("total_tokens"),
|
|
||||||
)
|
|
||||||
|
|
||||||
search_units = (
|
|
||||||
billed_units.get("search_units") or 1
|
|
||||||
) # cohere charges per request by default.
|
|
||||||
completion_tokens = search_units
|
|
||||||
# Calculate cost based on prompt_tokens, completion_tokens
|
|
||||||
if (
|
|
||||||
"togethercomputer" in model
|
|
||||||
or "together_ai" in model
|
|
||||||
or custom_llm_provider == "together_ai"
|
|
||||||
):
|
|
||||||
# together ai prices based on size of llm
|
|
||||||
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
|
|
||||||
|
|
||||||
model = get_model_params_and_category(model, call_type=CallTypes(call_type))
|
|
||||||
|
|
||||||
# replicate llms are calculate based on time for request running
|
|
||||||
# see https://replicate.com/pricing
|
|
||||||
elif (
|
|
||||||
model in litellm.replicate_models or "replicate" in model
|
|
||||||
) and model not in litellm.model_cost:
|
|
||||||
# for unmapped replicate model, default to replicate's time tracking logic
|
|
||||||
return get_replicate_completion_pricing(completion_response, total_time) # type: ignore
|
|
||||||
|
|
||||||
if model is None:
|
|
||||||
raise ValueError(
|
|
||||||
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if custom_llm_provider is not None and custom_llm_provider == "vertex_ai":
|
|
||||||
# Calculate the prompt characters + response characters
|
|
||||||
if len(messages) > 0:
|
|
||||||
prompt_string = litellm.utils.get_formatted_prompt(
|
|
||||||
data={"messages": messages}, call_type="completion"
|
|
||||||
)
|
|
||||||
|
|
||||||
prompt_characters = litellm.utils._count_characters(text=prompt_string)
|
|
||||||
if completion_response is not None and isinstance(
|
|
||||||
completion_response, ModelResponse
|
|
||||||
):
|
|
||||||
completion_string = litellm.utils.get_response_string(
|
|
||||||
response_obj=completion_response
|
|
||||||
)
|
|
||||||
completion_characters = litellm.utils._count_characters(
|
|
||||||
text=completion_string
|
|
||||||
)
|
|
||||||
|
|
||||||
(
|
|
||||||
prompt_tokens_cost_usd_dollar,
|
|
||||||
completion_tokens_cost_usd_dollar,
|
|
||||||
) = cost_per_token(
|
|
||||||
model=model,
|
|
||||||
prompt_tokens=prompt_tokens,
|
|
||||||
completion_tokens=completion_tokens,
|
|
||||||
custom_llm_provider=custom_llm_provider,
|
|
||||||
response_time_ms=total_time,
|
|
||||||
region_name=region_name,
|
|
||||||
custom_cost_per_second=custom_cost_per_second,
|
|
||||||
custom_cost_per_token=custom_cost_per_token,
|
|
||||||
prompt_characters=prompt_characters,
|
|
||||||
completion_characters=completion_characters,
|
|
||||||
cache_creation_input_tokens=cache_creation_input_tokens,
|
|
||||||
cache_read_input_tokens=cache_read_input_tokens,
|
|
||||||
usage_object=cost_per_token_usage_object,
|
|
||||||
call_type=call_type,
|
|
||||||
audio_transcription_file_duration=audio_transcription_file_duration,
|
|
||||||
rerank_billed_units=rerank_billed_units,
|
|
||||||
)
|
)
|
||||||
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
|
|
||||||
_final_cost += StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
|
|
||||||
model=model,
|
|
||||||
response_object=completion_response,
|
|
||||||
standard_built_in_tools_params=standard_built_in_tools_params,
|
|
||||||
custom_llm_provider=custom_llm_provider,
|
|
||||||
)
|
|
||||||
|
|
||||||
return _final_cost
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
@@ -897,6 +936,7 @@ def response_cost_calculator(
     custom_pricing: Optional[bool] = None,
     prompt: str = "",
     standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
+    litellm_model_name: Optional[str] = None,
 ) -> float:
     """
     Returns
@@ -15,7 +15,9 @@ import httpx

 import litellm
 from litellm import get_secret_str
+from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.azure.files.handler import AzureOpenAIFilesAPI
+from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
 from litellm.llms.openai.openai import FileDeleted, FileObject, OpenAIFilesAPI
 from litellm.llms.vertex_ai.files.handler import VertexAIFilesHandler
 from litellm.types.llms.openai import (

@@ -23,9 +25,18 @@ from litellm.types.llms.openai import (
     FileContentRequest,
     FileTypes,
     HttpxBinaryResponseContent,
+    OpenAIFileObject,
 )
 from litellm.types.router import *
-from litellm.utils import get_litellm_params, supports_httpx_timeout
+from litellm.types.utils import LlmProviders
+from litellm.utils import (
+    ProviderConfigManager,
+    client,
+    get_litellm_params,
+    supports_httpx_timeout,
+)
+
+base_llm_http_handler = BaseLLMHTTPHandler()

 ####### ENVIRONMENT VARIABLES ###################
 openai_files_instance = OpenAIFilesAPI()
@@ -34,6 +45,227 @@ vertex_ai_files_instance = VertexAIFilesHandler()
|
||||||
#################################################
|
#################################################
|
||||||
|
|
||||||
|
|
||||||
|
@client
|
||||||
|
async def acreate_file(
|
||||||
|
file: FileTypes,
|
||||||
|
purpose: Literal["assistants", "batch", "fine-tune"],
|
||||||
|
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
|
||||||
|
extra_headers: Optional[Dict[str, str]] = None,
|
||||||
|
extra_body: Optional[Dict[str, str]] = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> OpenAIFileObject:
|
||||||
|
"""
|
||||||
|
Async: Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
|
||||||
|
|
||||||
|
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
kwargs["acreate_file"] = True
|
||||||
|
|
||||||
|
call_args = {
|
||||||
|
"file": file,
|
||||||
|
"purpose": purpose,
|
||||||
|
"custom_llm_provider": custom_llm_provider,
|
||||||
|
"extra_headers": extra_headers,
|
||||||
|
"extra_body": extra_body,
|
||||||
|
**kwargs,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Use a partial function to pass your keyword arguments
|
||||||
|
func = partial(create_file, **call_args)
|
||||||
|
|
||||||
|
# Add the context to the function
|
||||||
|
ctx = contextvars.copy_context()
|
||||||
|
func_with_context = partial(ctx.run, func)
|
||||||
|
init_response = await loop.run_in_executor(None, func_with_context)
|
||||||
|
if asyncio.iscoroutine(init_response):
|
||||||
|
response = await init_response
|
||||||
|
else:
|
||||||
|
response = init_response # type: ignore
|
||||||
|
|
||||||
|
return response
|
||||||
|
except Exception as e:
|
||||||
|
raise e
|
||||||
|
|
||||||
|
|
||||||
|
@client
|
||||||
|
def create_file(
|
||||||
|
file: FileTypes,
|
||||||
|
purpose: Literal["assistants", "batch", "fine-tune"],
|
||||||
|
custom_llm_provider: Optional[Literal["openai", "azure", "vertex_ai"]] = None,
|
||||||
|
extra_headers: Optional[Dict[str, str]] = None,
|
||||||
|
extra_body: Optional[Dict[str, str]] = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> Union[OpenAIFileObject, Coroutine[Any, Any, OpenAIFileObject]]:
|
||||||
|
"""
|
||||||
|
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
|
||||||
|
|
||||||
|
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
|
||||||
|
|
||||||
|
Specify either provider_list or custom_llm_provider.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
_is_async = kwargs.pop("acreate_file", False) is True
|
||||||
|
optional_params = GenericLiteLLMParams(**kwargs)
|
||||||
|
litellm_params_dict = get_litellm_params(**kwargs)
|
||||||
|
logging_obj = cast(
|
||||||
|
Optional[LiteLLMLoggingObj], kwargs.get("litellm_logging_obj")
|
||||||
|
)
|
||||||
|
if logging_obj is None:
|
||||||
|
raise ValueError("logging_obj is required")
|
||||||
|
client = kwargs.get("client")
|
||||||
|
|
||||||
|
### TIMEOUT LOGIC ###
|
||||||
|
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||||
|
# set timeout for 10 minutes by default
|
||||||
|
|
||||||
|
if (
|
||||||
|
timeout is not None
|
||||||
|
and isinstance(timeout, httpx.Timeout)
|
||||||
|
and supports_httpx_timeout(cast(str, custom_llm_provider)) is False
|
||||||
|
):
|
||||||
|
read_timeout = timeout.read or 600
|
||||||
|
timeout = read_timeout # default 10 min timeout
|
||||||
|
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
|
||||||
|
timeout = float(timeout) # type: ignore
|
||||||
|
elif timeout is None:
|
||||||
|
timeout = 600.0
|
||||||
|
|
||||||
|
_create_file_request = CreateFileRequest(
|
||||||
|
file=file,
|
||||||
|
purpose=purpose,
|
||||||
|
extra_headers=extra_headers,
|
||||||
|
extra_body=extra_body,
|
||||||
|
)
|
||||||
|
|
||||||
|
provider_config = ProviderConfigManager.get_provider_files_config(
|
||||||
|
model="",
|
||||||
|
provider=LlmProviders(custom_llm_provider),
|
||||||
|
)
|
||||||
|
if provider_config is not None:
|
||||||
|
response = base_llm_http_handler.create_file(
|
||||||
|
provider_config=provider_config,
|
||||||
|
litellm_params=litellm_params_dict,
|
||||||
|
create_file_data=_create_file_request,
|
||||||
|
headers=extra_headers or {},
|
||||||
|
api_base=optional_params.api_base,
|
||||||
|
api_key=optional_params.api_key,
|
||||||
|
logging_obj=logging_obj,
|
||||||
|
_is_async=_is_async,
|
||||||
|
client=client
|
||||||
|
if client is not None
|
||||||
|
and isinstance(client, (HTTPHandler, AsyncHTTPHandler))
|
||||||
|
else None,
|
||||||
|
timeout=timeout,
|
||||||
|
)
|
||||||
|
elif custom_llm_provider == "openai":
|
||||||
|
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
|
||||||
|
api_base = (
|
||||||
|
optional_params.api_base
|
||||||
|
or litellm.api_base
|
||||||
|
or os.getenv("OPENAI_API_BASE")
|
||||||
|
or "https://api.openai.com/v1"
|
||||||
|
)
|
||||||
|
organization = (
|
||||||
|
optional_params.organization
|
||||||
|
or litellm.organization
|
||||||
|
or os.getenv("OPENAI_ORGANIZATION", None)
|
||||||
|
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
|
||||||
|
)
|
||||||
|
# set API KEY
|
||||||
|
api_key = (
|
||||||
|
optional_params.api_key
|
||||||
|
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
|
||||||
|
or litellm.openai_key
|
||||||
|
or os.getenv("OPENAI_API_KEY")
|
||||||
|
)
|
||||||
|
|
||||||
|
response = openai_files_instance.create_file(
|
||||||
|
_is_async=_is_async,
|
||||||
|
api_base=api_base,
|
||||||
|
api_key=api_key,
|
||||||
|
timeout=timeout,
|
||||||
|
max_retries=optional_params.max_retries,
|
||||||
|
organization=organization,
|
||||||
|
create_file_data=_create_file_request,
|
||||||
|
)
|
||||||
|
elif custom_llm_provider == "azure":
|
||||||
|
api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
|
||||||
|
api_version = (
|
||||||
|
optional_params.api_version
|
||||||
|
or litellm.api_version
|
||||||
|
or get_secret_str("AZURE_API_VERSION")
|
||||||
|
) # type: ignore
|
||||||
|
|
||||||
|
api_key = (
|
||||||
|
optional_params.api_key
|
||||||
|
or litellm.api_key
|
||||||
|
or litellm.azure_key
|
||||||
|
or get_secret_str("AZURE_OPENAI_API_KEY")
|
||||||
|
or get_secret_str("AZURE_API_KEY")
|
||||||
|
) # type: ignore
|
||||||
|
|
||||||
|
extra_body = optional_params.get("extra_body", {})
|
||||||
|
if extra_body is not None:
|
||||||
|
extra_body.pop("azure_ad_token", None)
|
||||||
|
else:
|
||||||
|
get_secret_str("AZURE_AD_TOKEN") # type: ignore
|
||||||
|
|
||||||
|
response = azure_files_instance.create_file(
|
||||||
|
_is_async=_is_async,
|
||||||
|
api_base=api_base,
|
||||||
|
api_key=api_key,
|
||||||
|
api_version=api_version,
|
||||||
|
timeout=timeout,
|
||||||
|
max_retries=optional_params.max_retries,
|
||||||
|
create_file_data=_create_file_request,
|
||||||
|
litellm_params=litellm_params_dict,
|
||||||
|
)
|
||||||
|
elif custom_llm_provider == "vertex_ai":
|
||||||
|
api_base = optional_params.api_base or ""
|
||||||
|
vertex_ai_project = (
|
||||||
|
optional_params.vertex_project
|
||||||
|
or litellm.vertex_project
|
||||||
|
or get_secret_str("VERTEXAI_PROJECT")
|
||||||
|
)
|
||||||
|
vertex_ai_location = (
|
||||||
|
optional_params.vertex_location
|
||||||
|
or litellm.vertex_location
|
||||||
|
or get_secret_str("VERTEXAI_LOCATION")
|
||||||
|
)
|
||||||
|
vertex_credentials = optional_params.vertex_credentials or get_secret_str(
|
||||||
|
"VERTEXAI_CREDENTIALS"
|
||||||
|
)
|
||||||
|
|
||||||
|
response = vertex_ai_files_instance.create_file(
|
||||||
|
_is_async=_is_async,
|
||||||
|
api_base=api_base,
|
||||||
|
vertex_project=vertex_ai_project,
|
||||||
|
vertex_location=vertex_ai_location,
|
||||||
|
vertex_credentials=vertex_credentials,
|
||||||
|
timeout=timeout,
|
||||||
|
max_retries=optional_params.max_retries,
|
||||||
|
create_file_data=_create_file_request,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise litellm.exceptions.BadRequestError(
|
||||||
|
message="LiteLLM doesn't support {} for 'create_file'. Only ['openai', 'azure', 'vertex_ai'] are supported.".format(
|
||||||
|
custom_llm_provider
|
||||||
|
),
|
||||||
|
model="n/a",
|
||||||
|
llm_provider=custom_llm_provider,
|
||||||
|
response=httpx.Response(
|
||||||
|
status_code=400,
|
||||||
|
content="Unsupported provider",
|
||||||
|
request=httpx.Request(method="create_file", url="https://github.com/BerriAI/litellm"), # type: ignore
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
except Exception as e:
|
||||||
|
raise e
|
||||||
|
|
||||||
|
|
||||||
async def afile_retrieve(
|
async def afile_retrieve(
|
||||||
file_id: str,
|
file_id: str,
|
||||||
custom_llm_provider: Literal["openai", "azure"] = "openai",
|
custom_llm_provider: Literal["openai", "azure"] = "openai",
|
||||||
|
@@ -488,195 +720,6 @@ def file_list(
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
|
||||||
async def acreate_file(
|
|
||||||
file: FileTypes,
|
|
||||||
purpose: Literal["assistants", "batch", "fine-tune"],
|
|
||||||
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
|
|
||||||
extra_headers: Optional[Dict[str, str]] = None,
|
|
||||||
extra_body: Optional[Dict[str, str]] = None,
|
|
||||||
**kwargs,
|
|
||||||
) -> FileObject:
|
|
||||||
"""
|
|
||||||
Async: Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
|
|
||||||
|
|
||||||
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
loop = asyncio.get_event_loop()
|
|
||||||
kwargs["acreate_file"] = True
|
|
||||||
|
|
||||||
# Use a partial function to pass your keyword arguments
|
|
||||||
func = partial(
|
|
||||||
create_file,
|
|
||||||
file,
|
|
||||||
purpose,
|
|
||||||
custom_llm_provider,
|
|
||||||
extra_headers,
|
|
||||||
extra_body,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add the context to the function
|
|
||||||
ctx = contextvars.copy_context()
|
|
||||||
func_with_context = partial(ctx.run, func)
|
|
||||||
init_response = await loop.run_in_executor(None, func_with_context)
|
|
||||||
if asyncio.iscoroutine(init_response):
|
|
||||||
response = await init_response
|
|
||||||
else:
|
|
||||||
response = init_response # type: ignore
|
|
||||||
|
|
||||||
return response
|
|
||||||
except Exception as e:
|
|
||||||
raise e
|
|
||||||
|
|
||||||
|
|
||||||
def create_file(
|
|
||||||
file: FileTypes,
|
|
||||||
purpose: Literal["assistants", "batch", "fine-tune"],
|
|
||||||
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
|
|
||||||
extra_headers: Optional[Dict[str, str]] = None,
|
|
||||||
extra_body: Optional[Dict[str, str]] = None,
|
|
||||||
**kwargs,
|
|
||||||
) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
|
|
||||||
"""
|
|
||||||
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
|
|
||||||
|
|
||||||
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
_is_async = kwargs.pop("acreate_file", False) is True
|
|
||||||
optional_params = GenericLiteLLMParams(**kwargs)
|
|
||||||
litellm_params_dict = get_litellm_params(**kwargs)
|
|
||||||
|
|
||||||
### TIMEOUT LOGIC ###
|
|
||||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
|
||||||
# set timeout for 10 minutes by default
|
|
||||||
|
|
||||||
if (
|
|
||||||
timeout is not None
|
|
||||||
and isinstance(timeout, httpx.Timeout)
|
|
||||||
and supports_httpx_timeout(custom_llm_provider) is False
|
|
||||||
):
|
|
||||||
read_timeout = timeout.read or 600
|
|
||||||
timeout = read_timeout # default 10 min timeout
|
|
||||||
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
|
|
||||||
timeout = float(timeout) # type: ignore
|
|
||||||
elif timeout is None:
|
|
||||||
timeout = 600.0
|
|
||||||
|
|
||||||
_create_file_request = CreateFileRequest(
|
|
||||||
file=file,
|
|
||||||
purpose=purpose,
|
|
||||||
extra_headers=extra_headers,
|
|
||||||
extra_body=extra_body,
|
|
||||||
)
|
|
||||||
if custom_llm_provider == "openai":
|
|
||||||
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
|
|
||||||
api_base = (
|
|
||||||
optional_params.api_base
|
|
||||||
or litellm.api_base
|
|
||||||
or os.getenv("OPENAI_API_BASE")
|
|
||||||
or "https://api.openai.com/v1"
|
|
||||||
)
|
|
||||||
organization = (
|
|
||||||
optional_params.organization
|
|
||||||
or litellm.organization
|
|
||||||
or os.getenv("OPENAI_ORGANIZATION", None)
|
|
||||||
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
|
|
||||||
)
|
|
||||||
# set API KEY
|
|
||||||
api_key = (
|
|
||||||
optional_params.api_key
|
|
||||||
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
|
|
||||||
or litellm.openai_key
|
|
||||||
or os.getenv("OPENAI_API_KEY")
|
|
||||||
)
|
|
||||||
|
|
||||||
response = openai_files_instance.create_file(
|
|
||||||
_is_async=_is_async,
|
|
||||||
api_base=api_base,
|
|
||||||
api_key=api_key,
|
|
||||||
timeout=timeout,
|
|
||||||
max_retries=optional_params.max_retries,
|
|
||||||
organization=organization,
|
|
||||||
create_file_data=_create_file_request,
|
|
||||||
)
|
|
||||||
elif custom_llm_provider == "azure":
|
|
||||||
api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
|
|
||||||
api_version = (
|
|
||||||
optional_params.api_version
|
|
||||||
or litellm.api_version
|
|
||||||
or get_secret_str("AZURE_API_VERSION")
|
|
||||||
) # type: ignore
|
|
||||||
|
|
||||||
api_key = (
|
|
||||||
optional_params.api_key
|
|
||||||
or litellm.api_key
|
|
||||||
or litellm.azure_key
|
|
||||||
or get_secret_str("AZURE_OPENAI_API_KEY")
|
|
||||||
or get_secret_str("AZURE_API_KEY")
|
|
||||||
) # type: ignore
|
|
||||||
|
|
||||||
extra_body = optional_params.get("extra_body", {})
|
|
||||||
if extra_body is not None:
|
|
||||||
extra_body.pop("azure_ad_token", None)
|
|
||||||
else:
|
|
||||||
get_secret_str("AZURE_AD_TOKEN") # type: ignore
|
|
||||||
|
|
||||||
response = azure_files_instance.create_file(
|
|
||||||
_is_async=_is_async,
|
|
||||||
api_base=api_base,
|
|
||||||
api_key=api_key,
|
|
||||||
api_version=api_version,
|
|
||||||
timeout=timeout,
|
|
||||||
max_retries=optional_params.max_retries,
|
|
||||||
create_file_data=_create_file_request,
|
|
||||||
litellm_params=litellm_params_dict,
|
|
||||||
)
|
|
||||||
elif custom_llm_provider == "vertex_ai":
|
|
||||||
api_base = optional_params.api_base or ""
|
|
||||||
vertex_ai_project = (
|
|
||||||
optional_params.vertex_project
|
|
||||||
or litellm.vertex_project
|
|
||||||
or get_secret_str("VERTEXAI_PROJECT")
|
|
||||||
)
|
|
||||||
vertex_ai_location = (
|
|
||||||
optional_params.vertex_location
|
|
||||||
or litellm.vertex_location
|
|
||||||
or get_secret_str("VERTEXAI_LOCATION")
|
|
||||||
)
|
|
||||||
vertex_credentials = optional_params.vertex_credentials or get_secret_str(
|
|
||||||
"VERTEXAI_CREDENTIALS"
|
|
||||||
)
|
|
||||||
|
|
||||||
response = vertex_ai_files_instance.create_file(
|
|
||||||
_is_async=_is_async,
|
|
||||||
api_base=api_base,
|
|
||||||
vertex_project=vertex_ai_project,
|
|
||||||
vertex_location=vertex_ai_location,
|
|
||||||
vertex_credentials=vertex_credentials,
|
|
||||||
timeout=timeout,
|
|
||||||
max_retries=optional_params.max_retries,
|
|
||||||
create_file_data=_create_file_request,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise litellm.exceptions.BadRequestError(
|
|
||||||
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
|
|
||||||
custom_llm_provider
|
|
||||||
),
|
|
||||||
model="n/a",
|
|
||||||
llm_provider=custom_llm_provider,
|
|
||||||
response=httpx.Response(
|
|
||||||
status_code=400,
|
|
||||||
content="Unsupported provider",
|
|
||||||
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
|
|
||||||
),
|
|
||||||
)
|
|
||||||
return response
|
|
||||||
except Exception as e:
|
|
||||||
raise e
|
|
||||||
|
|
||||||
|
|
||||||
async def afile_content(
|
async def afile_content(
|
||||||
file_id: str,
|
file_id: str,
|
||||||
custom_llm_provider: Literal["openai", "azure"] = "openai",
|
custom_llm_provider: Literal["openai", "azure"] = "openai",
|
||||||
|
|
|
@@ -16,6 +16,7 @@ import litellm.litellm_core_utils.litellm_logging
 import litellm.types
 from litellm._logging import verbose_logger, verbose_proxy_logger
 from litellm.caching.caching import DualCache
+from litellm.constants import HOURS_IN_A_DAY
 from litellm.integrations.custom_batch_logger import CustomBatchLogger
 from litellm.litellm_core_utils.duration_parser import duration_in_seconds
 from litellm.litellm_core_utils.exception_mapping_utils import (

@@ -649,10 +650,10 @@ class SlackAlerting(CustomBatchLogger):
             event_message += (
                 f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
             )
-        elif percent_left <= 0.05:
+        elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
             event = "threshold_crossed"
             event_message += "5% Threshold Crossed "
-        elif percent_left <= 0.15:
+        elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
             event = "threshold_crossed"
             event_message += "15% Threshold Crossed"
         elif user_info.soft_budget is not None:

@@ -1718,7 +1719,7 @@ Model Info:
             await self.internal_usage_cache.async_set_cache(
                 key=_event_cache_key,
                 value="SENT",
-                ttl=(30 * 24 * 60 * 60), # 1 month
+                ttl=(30 * HOURS_IN_A_DAY * 60 * 60), # 1 month
             )

         except Exception as e:
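The alerting change above replaces the 0.05 and 0.15 literals with named threshold constants. A small sketch with local stand-ins for those (assumed) constants:

# Stand-ins mirror the literals they replace; the real values live in litellm.constants.
SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15


def budget_alert_event(percent_left: float) -> str:
    if percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
        return "5% Threshold Crossed"
    if percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
        return "15% Threshold Crossed"
    return "no_alert"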
@@ -41,7 +41,7 @@ from litellm.types.utils import StandardLoggingPayload
 from ..additional_logging_utils import AdditionalLoggingUtils

 # max number of logs DD API can accept
-DD_MAX_BATCH_SIZE = 1000

 # specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
 DD_LOGGED_SUCCESS_SERVICE_TYPES = [
@@ -20,10 +20,6 @@ else:
     VertexBase = Any


-GCS_DEFAULT_BATCH_SIZE = 2048
-GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20


 class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils):
     def __init__(self, bucket_name: Optional[str] = None) -> None:
         from litellm.proxy.proxy_server import premium_user
@@ -125,6 +121,7 @@ class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils):
             gcs_logging_config: GCSLoggingConfig = await self.get_gcs_logging_config(
                 kwargs
             )

             headers = await self.construct_request_headers(
                 vertex_instance=gcs_logging_config["vertex_instance"],
                 service_account_json=gcs_logging_config["path_service_account"],
@@ -818,7 +818,7 @@ class PrometheusLogger(CustomLogger):
                 requested_model=request_data.get("model", ""),
                 status_code=str(getattr(original_exception, "status_code", None)),
                 exception_status=str(getattr(original_exception, "status_code", None)),
-                exception_class=str(original_exception.__class__.__name__),
+                exception_class=self._get_exception_class_name(original_exception),
                 tags=_tags,
             )
             _labels = prometheus_label_factory(
@@ -917,7 +917,7 @@ class PrometheusLogger(CustomLogger):
                 api_base=api_base,
                 api_provider=llm_provider,
                 exception_status=str(getattr(exception, "status_code", None)),
-                exception_class=exception.__class__.__name__,
+                exception_class=self._get_exception_class_name(exception),
                 requested_model=model_group,
                 hashed_api_key=standard_logging_payload["metadata"][
                     "user_api_key_hash"
@@ -1146,6 +1146,22 @@ class PrometheusLogger(CustomLogger):
             )
             return

+    @staticmethod
+    def _get_exception_class_name(exception: Exception) -> str:
+        exception_class_name = ""
+        if hasattr(exception, "llm_provider"):
+            exception_class_name = getattr(exception, "llm_provider") or ""
+
+        # pretty print the provider name on prometheus
+        # eg. `openai` -> `Openai.`
+        if len(exception_class_name) >= 1:
+            exception_class_name = (
+                exception_class_name[0].upper() + exception_class_name[1:] + "."
+            )
+
+        exception_class_name += exception.__class__.__name__
+        return exception_class_name
+
     async def log_success_fallback_event(
         self, original_model_group: str, kwargs: dict, original_exception: Exception
     ):
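Note: a minimal sketch of the label the new helper produces (the import path and the sample exception are assumptions, not part of the diff):

from litellm.integrations.prometheus import PrometheusLogger  # assumed module path

class FakeProviderError(Exception):
    llm_provider = "openai"  # litellm's mapped exceptions expose this attribute

# provider name is capitalized and prefixed, then the exception class name is appended
print(PrometheusLogger._get_exception_class_name(FakeProviderError()))
# -> "Openai.FakeProviderError"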
@@ -1181,7 +1197,7 @@ class PrometheusLogger(CustomLogger):
                 team=standard_metadata["user_api_key_team_id"],
                 team_alias=standard_metadata["user_api_key_team_alias"],
                 exception_status=str(getattr(original_exception, "status_code", None)),
-                exception_class=str(original_exception.__class__.__name__),
+                exception_class=self._get_exception_class_name(original_exception),
                 tags=_tags,
             )
             _labels = prometheus_label_factory(
@@ -1225,7 +1241,7 @@ class PrometheusLogger(CustomLogger):
                 team=standard_metadata["user_api_key_team_id"],
                 team_alias=standard_metadata["user_api_key_team_alias"],
                 exception_status=str(getattr(original_exception, "status_code", None)),
-                exception_class=str(original_exception.__class__.__name__),
+                exception_class=self._get_exception_class_name(original_exception),
                 tags=_tags,
             )

@@ -1721,6 +1737,36 @@ class PrometheusLogger(CustomLogger):
             return (end_time - start_time).total_seconds()
         return None

+    @staticmethod
+    def _mount_metrics_endpoint(premium_user: bool):
+        """
+        Mount the Prometheus metrics endpoint with optional authentication.
+
+        Args:
+            premium_user (bool): Whether the user is a premium user
+            require_auth (bool, optional): Whether to require authentication for the metrics endpoint.
+                Defaults to False.
+        """
+        from prometheus_client import make_asgi_app
+
+        from litellm._logging import verbose_proxy_logger
+        from litellm.proxy._types import CommonProxyErrors
+        from litellm.proxy.proxy_server import app
+
+        if premium_user is not True:
+            verbose_proxy_logger.warning(
+                f"Prometheus metrics are only available for premium users. {CommonProxyErrors.not_premium_user.value}"
+            )
+
+        # Create metrics ASGI app
+        metrics_app = make_asgi_app()
+
+        # Mount the metrics app to the app
+        app.mount("/metrics", metrics_app)
+        verbose_proxy_logger.debug(
+            "Starting Prometheus Metrics on /metrics (no authentication)"
+        )
+
+
 def prometheus_label_factory(
     supported_enum_labels: List[str],
@@ -3,11 +3,16 @@
 # On success + failure, log events to Prometheus for litellm / adjacent services (litellm, redis, postgres, llm api providers)


-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union

 from litellm._logging import print_verbose, verbose_logger
 from litellm.types.integrations.prometheus import LATENCY_BUCKETS
-from litellm.types.services import ServiceLoggerPayload, ServiceTypes
+from litellm.types.services import (
+    DEFAULT_SERVICE_CONFIGS,
+    ServiceLoggerPayload,
+    ServiceMetrics,
+    ServiceTypes,
+)

 FAILED_REQUESTS_LABELS = ["error_class", "function_name"]

@@ -23,7 +28,8 @@ class PrometheusServicesLogger:
     ):
         try:
             try:
-                from prometheus_client import REGISTRY, Counter, Histogram
+                from prometheus_client import REGISTRY, Counter, Gauge, Histogram
+                from prometheus_client.gc_collector import Collector
             except ImportError:
                 raise Exception(
                     "Missing prometheus_client. Run `pip install prometheus-client`"
@@ -31,36 +37,51 @@ class PrometheusServicesLogger:

             self.Histogram = Histogram
             self.Counter = Counter
+            self.Gauge = Gauge
             self.REGISTRY = REGISTRY

             verbose_logger.debug("in init prometheus services metrics")

-            self.services = [item.value for item in ServiceTypes]
-
-            self.payload_to_prometheus_map = (
-                {}
-            )  # store the prometheus histogram/counter we need to call for each field in payload
-
-            for service in self.services:
-                histogram = self.create_histogram(service, type_of_request="latency")
-                counter_failed_request = self.create_counter(
-                    service,
-                    type_of_request="failed_requests",
-                    additional_labels=FAILED_REQUESTS_LABELS,
-                )
-                counter_total_requests = self.create_counter(
-                    service, type_of_request="total_requests"
-                )
-                self.payload_to_prometheus_map[service] = [
-                    histogram,
-                    counter_failed_request,
-                    counter_total_requests,
-                ]
-
-            self.prometheus_to_amount_map: dict = (
-                {}
-            )  # the field / value in ServiceLoggerPayload the object needs to be incremented by
+            self.payload_to_prometheus_map: Dict[
+                str, List[Union[Histogram, Counter, Gauge, Collector]]
+            ] = {}
+
+            for service in ServiceTypes:
+                service_metrics: List[Union[Histogram, Counter, Gauge, Collector]] = []
+
+                metrics_to_initialize = self._get_service_metrics_initialize(service)
+
+                # Initialize only the configured metrics for each service
+                if ServiceMetrics.HISTOGRAM in metrics_to_initialize:
+                    histogram = self.create_histogram(
+                        service.value, type_of_request="latency"
+                    )
+                    if histogram:
+                        service_metrics.append(histogram)
+
+                if ServiceMetrics.COUNTER in metrics_to_initialize:
+                    counter_failed_request = self.create_counter(
+                        service.value,
+                        type_of_request="failed_requests",
+                        additional_labels=FAILED_REQUESTS_LABELS,
+                    )
+                    if counter_failed_request:
+                        service_metrics.append(counter_failed_request)
+                    counter_total_requests = self.create_counter(
+                        service.value, type_of_request="total_requests"
+                    )
+                    if counter_total_requests:
+                        service_metrics.append(counter_total_requests)
+
+                if ServiceMetrics.GAUGE in metrics_to_initialize:
+                    gauge = self.create_gauge(service.value, type_of_request="size")
+                    if gauge:
+                        service_metrics.append(gauge)
+
+                if service_metrics:
+                    self.payload_to_prometheus_map[service.value] = service_metrics
+
+            self.prometheus_to_amount_map: dict = {}
             ### MOCK TESTING ###
             self.mock_testing = mock_testing
             self.mock_testing_success_calls = 0
@@ -70,6 +91,19 @@ class PrometheusServicesLogger:
             print_verbose(f"Got exception on init prometheus client {str(e)}")
             raise e

+    def _get_service_metrics_initialize(
+        self, service: ServiceTypes
+    ) -> List[ServiceMetrics]:
+        DEFAULT_METRICS = [ServiceMetrics.COUNTER, ServiceMetrics.HISTOGRAM]
+        if service not in DEFAULT_SERVICE_CONFIGS:
+            return DEFAULT_METRICS
+
+        metrics = DEFAULT_SERVICE_CONFIGS.get(service, {}).get("metrics", [])
+        if not metrics:
+            verbose_logger.debug(f"No metrics found for service {service}")
+            return DEFAULT_METRICS
+        return metrics
+
     def is_metric_registered(self, metric_name) -> bool:
         for metric in self.REGISTRY.collect():
             if metric_name == metric.name:
@@ -94,6 +128,15 @@ class PrometheusServicesLogger:
             buckets=LATENCY_BUCKETS,
         )

+    def create_gauge(self, service: str, type_of_request: str):
+        metric_name = "litellm_{}_{}".format(service, type_of_request)
+        is_registered = self.is_metric_registered(metric_name)
+        if is_registered:
+            return self._get_metric(metric_name)
+        return self.Gauge(
+            metric_name, "Gauge for {} service".format(service), labelnames=[service]
+        )
+
     def create_counter(
         self,
         service: str,
@@ -120,6 +163,15 @@ class PrometheusServicesLogger:

         histogram.labels(labels).observe(amount)

+    def update_gauge(
+        self,
+        gauge,
+        labels: str,
+        amount: float,
+    ):
+        assert isinstance(gauge, self.Gauge)
+        gauge.labels(labels).set(amount)
+
     def increment_counter(
         self,
         counter,
@@ -190,6 +242,13 @@ class PrometheusServicesLogger:
                     labels=payload.service.value,
                     amount=1,  # LOG TOTAL REQUESTS TO PROMETHEUS
                 )
+            elif isinstance(obj, self.Gauge):
+                if payload.event_metadata:
+                    self.update_gauge(
+                        gauge=obj,
+                        labels=payload.event_metadata.get("gauge_labels") or "",
+                        amount=payload.event_metadata.get("gauge_value") or 0,
+                    )

     async def async_service_failure_hook(
         self,
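Note: a rough usage sketch of the new gauge plumbing (the module path, the "redis" service label, and the sample value are assumptions; prometheus_client must be installed):

from litellm.integrations.prometheus_services import PrometheusServicesLogger  # assumed path

logger = PrometheusServicesLogger()
gauge = logger.create_gauge(service="redis", type_of_request="size")
if gauge is not None:
    # `labels` is the single label value keyed by the service name
    logger.update_gauge(gauge=gauge, labels="redis", amount=128.0)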
@@ -10,6 +10,7 @@ class CredentialAccessor:
     @staticmethod
     def get_credential_values(credential_name: str) -> dict:
         """Safe accessor for credentials."""
+
         if not litellm.credential_list:
             return {}
         for credential in litellm.credential_list:
@@ -3,6 +3,7 @@ from typing import Optional, Tuple
 import httpx

 import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
 from litellm.secret_managers.main import get_secret, get_secret_str

 from ..types.router import LiteLLM_Params
@@ -256,10 +257,13 @@ def get_llm_provider(  # noqa: PLR0915
         elif model in litellm.cohere_chat_models:
             custom_llm_provider = "cohere_chat"
         ## replicate
-        elif model in litellm.replicate_models or (":" in model and len(model) > 64):
+        elif model in litellm.replicate_models or (
+            ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH
+        ):
             model_parts = model.split(":")
             if (
-                len(model_parts) > 1 and len(model_parts[1]) == 64
+                len(model_parts) > 1
+                and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
             ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
                 custom_llm_provider = "replicate"
             elif model in litellm.replicate_models:
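Note: the new constant is not defined in this hunk; from the literal it replaces it is presumably:

# inferred from the replaced literal (assumed to live in litellm/constants.py)
REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64  # length of the version hash in "owner/model:<64-char-id>"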
@@ -123,7 +123,7 @@ def get_supported_openai_params(  # noqa: PLR0915
     elif custom_llm_provider == "replicate":
         return litellm.ReplicateConfig().get_supported_openai_params(model=model)
     elif custom_llm_provider == "huggingface":
-        return litellm.HuggingfaceConfig().get_supported_openai_params(model=model)
+        return litellm.HuggingFaceChatConfig().get_supported_openai_params(model=model)
     elif custom_llm_provider == "jina_ai":
         if request_type == "embeddings":
             return litellm.JinaAIEmbeddingConfig().get_supported_openai_params()
@@ -28,6 +28,10 @@ from litellm._logging import _is_debugging_on, verbose_logger
 from litellm.batches.batch_utils import _handle_completed_batch
 from litellm.caching.caching import DualCache, InMemoryCache
 from litellm.caching.caching_handler import LLMCachingHandler
+from litellm.constants import (
+    DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+)
 from litellm.cost_calculator import _select_model_name_for_cost_calc
 from litellm.integrations.arize.arize import ArizeLogger
 from litellm.integrations.custom_guardrail import CustomGuardrail
@@ -290,6 +294,7 @@ class Logging(LiteLLMLoggingBaseClass):
             "input": _input,
             "litellm_params": litellm_params,
             "applied_guardrails": applied_guardrails,
+            "model": model,
         }

     def process_dynamic_callbacks(self):
@@ -452,8 +457,12 @@ class Logging(LiteLLMLoggingBaseClass):
         non_default_params: dict,
         prompt_id: str,
         prompt_variables: Optional[dict],
+        prompt_management_logger: Optional[CustomLogger] = None,
     ) -> Tuple[str, List[AllMessageValues], dict]:
-        custom_logger = self.get_custom_logger_for_prompt_management(model)
+        custom_logger = (
+            prompt_management_logger
+            or self.get_custom_logger_for_prompt_management(model)
+        )
         if custom_logger:
             (
                 model,
@@ -892,6 +901,7 @@ class Logging(LiteLLMLoggingBaseClass):
             ResponseCompletedEvent,
         ],
         cache_hit: Optional[bool] = None,
+        litellm_model_name: Optional[str] = None,
     ) -> Optional[float]:
         """
         Calculate response cost using result + logging object variables.
@@ -917,7 +927,7 @@ class Logging(LiteLLMLoggingBaseClass):
         try:
             response_cost_calculator_kwargs = {
                 "response_object": result,
-                "model": self.model,
+                "model": litellm_model_name or self.model,
                 "cache_hit": cache_hit,
                 "custom_llm_provider": self.model_call_details.get(
                     "custom_llm_provider", None
@@ -1009,6 +1019,10 @@ class Logging(LiteLLMLoggingBaseClass):
             return False
         return True

+    def _update_completion_start_time(self, completion_start_time: datetime.datetime):
+        self.completion_start_time = completion_start_time
+        self.model_call_details["completion_start_time"] = self.completion_start_time
+
     def _success_handler_helper_fn(
         self,
         result=None,
@@ -3739,9 +3753,12 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
         response_cost=response_cost,
         response_cost_failure_debug_info=None,
         status=str("success"),
-        total_tokens=int(30),
-        prompt_tokens=int(20),
-        completion_tokens=int(10),
+        total_tokens=int(
+            DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+            + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT
+        ),
+        prompt_tokens=int(DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT),
+        completion_tokens=int(DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT),
         startTime=start_time,
         endTime=end_time,
         completionStartTime=completion_start_time,
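Note: the mock-usage constants are not defined in this hunk; from the literals they replace they presumably are:

# inferred from the replaced literals (assumed to live in litellm/constants.py)
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 20
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 10
# total_tokens stays 20 + 10 == 30, matching the old hard-coded value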
@@ -5,6 +5,7 @@ Helper utilities for tracking the cost of built-in tools.
 from typing import Any, Dict, List, Optional

 import litellm
+from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
 from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
 from litellm.types.utils import (
     ModelInfo,
@@ -132,7 +133,7 @@ class StandardBuiltInToolCostTracking:
         """
         if file_search is None:
             return 0.0
-        return 2.5 / 1000
+        return OPENAI_FILE_SEARCH_COST_PER_1K_CALLS

     @staticmethod
     def chat_completion_response_includes_annotations(
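Note: the new cost constant presumably carries the value it replaces:

# inferred from the replaced literal (assumed to live in litellm/constants.py)
OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000  # $2.50 per 1,000 file-search calls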
@@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
 import litellm
 from litellm._logging import verbose_logger
 from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.types.llms.databricks import DatabricksTool
 from litellm.types.llms.openai import ChatCompletionThinkingBlock
 from litellm.types.utils import (
     ChatCompletionDeltaToolCall,
@@ -35,6 +36,25 @@ from litellm.types.utils import (
 from .get_headers import get_response_headers


+def convert_tool_call_to_json_mode(
+    tool_calls: List[ChatCompletionMessageToolCall],
+    convert_tool_call_to_json_mode: bool,
+) -> Tuple[Optional[Message], Optional[str]]:
+    if _should_convert_tool_call_to_json_mode(
+        tool_calls=tool_calls,
+        convert_tool_call_to_json_mode=convert_tool_call_to_json_mode,
+    ):
+        # to support 'json_schema' logic on older models
+        json_mode_content_str: Optional[str] = tool_calls[0]["function"].get(
+            "arguments"
+        )
+        if json_mode_content_str is not None:
+            message = litellm.Message(content=json_mode_content_str)
+            finish_reason = "stop"
+            return message, finish_reason
+    return None, None
+
+
 async def convert_to_streaming_response_async(response_object: Optional[dict] = None):
     """
     Asynchronously converts a response object to a streaming response.
@@ -335,21 +355,14 @@ class LiteLLMResponseObjectHandler:
         Only supported for HF TGI models
         """
         transformed_logprobs: Optional[TextCompletionLogprobs] = None
-        if custom_llm_provider == "huggingface":
-            # only supported for TGI models
-            try:
-                raw_response = response._hidden_params.get("original_response", None)
-                transformed_logprobs = litellm.huggingface._transform_logprobs(
-                    hf_response=raw_response
-                )
-            except Exception as e:
-                verbose_logger.exception(f"LiteLLM non blocking exception: {e}")
-
         return transformed_logprobs


 def _should_convert_tool_call_to_json_mode(
-    tool_calls: Optional[List[ChatCompletionMessageToolCall]] = None,
+    tool_calls: Optional[
+        Union[List[ChatCompletionMessageToolCall], List[DatabricksTool]]
+    ] = None,
     convert_tool_call_to_json_mode: Optional[bool] = None,
 ) -> bool:
     """
@@ -7,6 +7,7 @@ from typing import Dict, List, Literal, Optional, Union, cast
 from litellm.types.llms.openai import (
     AllMessageValues,
     ChatCompletionAssistantMessage,
+    ChatCompletionFileObject,
     ChatCompletionUserMessage,
 )
 from litellm.types.utils import Choices, ModelResponse, StreamingChoices
@@ -34,7 +35,7 @@ def handle_messages_with_content_list_to_str_conversion(


 def strip_name_from_messages(
-    messages: List[AllMessageValues],
+    messages: List[AllMessageValues], allowed_name_roles: List[str] = ["user"]
 ) -> List[AllMessageValues]:
     """
     Removes 'name' from messages
@@ -43,7 +44,7 @@ def strip_name_from_messages(
     for message in messages:
         msg_role = message.get("role")
         msg_copy = message.copy()
-        if msg_role == "user":
+        if msg_role not in allowed_name_roles:
             msg_copy.pop("name", None)  # type: ignore
         new_messages.append(msg_copy)
     return new_messages
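Note: with the new allowed_name_roles parameter the default behavior flips from "strip name only on user turns" to "keep name only on user turns"; a small sketch (module path and messages are hypothetical):

from litellm.litellm_core_utils.prompt_templates.common_utils import strip_name_from_messages  # assumed path

messages = [
    {"role": "user", "name": "alice", "content": "hi"},
    {"role": "assistant", "name": "bot", "content": "hello"},
]
# default allowed_name_roles=["user"]: the user turn keeps "name",
# every other role has it popped
stripped = strip_name_from_messages(messages)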
@@ -292,3 +293,58 @@ def get_completion_messages(
         messages, assistant_continue_message, ensure_alternating_roles
     )
     return messages
+
+
+def get_file_ids_from_messages(messages: List[AllMessageValues]) -> List[str]:
+    """
+    Gets file ids from messages
+    """
+    file_ids = []
+    for message in messages:
+        if message.get("role") == "user":
+            content = message.get("content")
+            if content:
+                if isinstance(content, str):
+                    continue
+                for c in content:
+                    if c["type"] == "file":
+                        file_object = cast(ChatCompletionFileObject, c)
+                        file_object_file_field = file_object["file"]
+                        file_id = file_object_file_field.get("file_id")
+                        if file_id:
+                            file_ids.append(file_id)
+    return file_ids
+
+
+def update_messages_with_model_file_ids(
+    messages: List[AllMessageValues],
+    model_id: str,
+    model_file_id_mapping: Dict[str, Dict[str, str]],
+) -> List[AllMessageValues]:
+    """
+    Updates messages with model file ids.
+
+    model_file_id_mapping: Dict[str, Dict[str, str]] = {
+        "litellm_proxy/file_id": {
+            "model_id": "provider_file_id"
+        }
+    }
+    """
+    for message in messages:
+        if message.get("role") == "user":
+            content = message.get("content")
+            if content:
+                if isinstance(content, str):
+                    continue
+                for c in content:
+                    if c["type"] == "file":
+                        file_object = cast(ChatCompletionFileObject, c)
+                        file_object_file_field = file_object["file"]
+                        file_id = file_object_file_field.get("file_id")
+                        if file_id:
+                            provider_file_id = (
+                                model_file_id_mapping.get(file_id, {}).get(model_id)
+                                or file_id
+                            )
+                            file_object_file_field["file_id"] = provider_file_id
+    return messages
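Note: a sketch of the message shape the two new helpers operate on (the ids and model name are hypothetical):

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Summarize this document"},
            {"type": "file", "file": {"file_id": "litellm_proxy/file_abc"}},
        ],
    }
]
# get_file_ids_from_messages(messages) -> ["litellm_proxy/file_abc"]
# update_messages_with_model_file_ids(
#     messages, model_id="model-1",
#     model_file_id_mapping={"litellm_proxy/file_abc": {"model-1": "provider_file_123"}},
# ) rewrites the file part in place to use "provider_file_123"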
@@ -22,6 +22,7 @@ from litellm.types.llms.openai import (
     AllMessageValues,
     ChatCompletionAssistantMessage,
     ChatCompletionAssistantToolCall,
+    ChatCompletionFileObject,
     ChatCompletionFunctionMessage,
     ChatCompletionImageObject,
     ChatCompletionTextObject,
@@ -1299,20 +1300,37 @@ def convert_to_anthropic_tool_invoke(
         ]
     }
     """
-    anthropic_tool_invoke = [
-        AnthropicMessagesToolUseParam(
+    anthropic_tool_invoke = []
+
+    for tool in tool_calls:
+        if not get_attribute_or_key(tool, "type") == "function":
+            continue
+
+        _anthropic_tool_use_param = AnthropicMessagesToolUseParam(
            type="tool_use",
-            id=get_attribute_or_key(tool, "id"),
-            name=get_attribute_or_key(get_attribute_or_key(tool, "function"), "name"),
+            id=cast(str, get_attribute_or_key(tool, "id")),
+            name=cast(
+                str,
+                get_attribute_or_key(get_attribute_or_key(tool, "function"), "name"),
+            ),
            input=json.loads(
                get_attribute_or_key(
                    get_attribute_or_key(tool, "function"), "arguments"
                )
            ),
        )
-        for tool in tool_calls
-        if get_attribute_or_key(tool, "type") == "function"
-    ]
+
+        _content_element = add_cache_control_to_content(
+            anthropic_content_element=_anthropic_tool_use_param,
+            orignal_content_element=dict(tool),
+        )
+
+        if "cache_control" in _content_element:
+            _anthropic_tool_use_param["cache_control"] = _content_element[
+                "cache_control"
+            ]
+
+        anthropic_tool_invoke.append(_anthropic_tool_use_param)

     return anthropic_tool_invoke
@@ -1323,6 +1341,7 @@ def add_cache_control_to_content(
         AnthropicMessagesImageParam,
         AnthropicMessagesTextParam,
         AnthropicMessagesDocumentParam,
+        AnthropicMessagesToolUseParam,
         ChatCompletionThinkingBlock,
     ],
     orignal_content_element: Union[dict, AllMessageValues],
@@ -1455,6 +1474,25 @@ def anthropic_messages_pt(  # noqa: PLR0915
                         user_content.append(_content_element)
                     elif m.get("type", "") == "document":
                         user_content.append(cast(AnthropicMessagesDocumentParam, m))
+                    elif m.get("type", "") == "file":
+                        file_message = cast(ChatCompletionFileObject, m)
+                        file_data = file_message["file"].get("file_data")
+                        if file_data:
+                            image_chunk = convert_to_anthropic_image_obj(
+                                openai_image_url=file_data,
+                                format=file_message["file"].get("format"),
+                            )
+                            anthropic_document_param = (
+                                AnthropicMessagesDocumentParam(
+                                    type="document",
+                                    source=AnthropicContentParamSource(
+                                        type="base64",
+                                        media_type=image_chunk["media_type"],
+                                        data=image_chunk["data"],
+                                    ),
+                                )
+                            )
+                            user_content.append(anthropic_document_param)
                 elif isinstance(user_message_types_block["content"], str):
                     _anthropic_content_text_element: AnthropicMessagesTextParam = {
                         "type": "text",
@@ -2885,6 +2923,11 @@ class BedrockConverseMessagesProcessor:
                                 image_url=image_url, format=format
                             )
                             _parts.append(_part)  # type: ignore
+                        elif element["type"] == "file":
+                            _part = await BedrockConverseMessagesProcessor._async_process_file_message(
+                                message=cast(ChatCompletionFileObject, element)
+                            )
+                            _parts.append(_part)
                         _cache_point_block = (
                             litellm.AmazonConverseConfig()._get_cache_point_block(
                                 message_block=cast(
@@ -3054,6 +3097,45 @@ class BedrockConverseMessagesProcessor:
                 reasoning_content_blocks.append(bedrock_content_block)
         return reasoning_content_blocks

+    @staticmethod
+    def _process_file_message(message: ChatCompletionFileObject) -> BedrockContentBlock:
+        file_message = message["file"]
+        file_data = file_message.get("file_data")
+        file_id = file_message.get("file_id")
+
+        if file_data is None and file_id is None:
+            raise litellm.BadRequestError(
+                message="file_data and file_id cannot both be None. Got={}".format(
+                    message
+                ),
+                model="",
+                llm_provider="bedrock",
+            )
+        format = file_message.get("format")
+        return BedrockImageProcessor.process_image_sync(
+            image_url=cast(str, file_id or file_data), format=format
+        )
+
+    @staticmethod
+    async def _async_process_file_message(
+        message: ChatCompletionFileObject,
+    ) -> BedrockContentBlock:
+        file_message = message["file"]
+        file_data = file_message.get("file_data")
+        file_id = file_message.get("file_id")
+        format = file_message.get("format")
+        if file_data is None and file_id is None:
+            raise litellm.BadRequestError(
+                message="file_data and file_id cannot both be None. Got={}".format(
+                    message
+                ),
+                model="",
+                llm_provider="bedrock",
+            )
+        return await BedrockImageProcessor.process_image_async(
+            image_url=cast(str, file_id or file_data), format=format
+        )
+
+
 def _bedrock_converse_messages_pt(  # noqa: PLR0915
     messages: List,
@@ -3126,6 +3208,13 @@ def _bedrock_converse_messages_pt(  # noqa: PLR0915
                             format=format,
                         )
                         _parts.append(_part)  # type: ignore
+                    elif element["type"] == "file":
+                        _part = (
+                            BedrockConverseMessagesProcessor._process_file_message(
+                                message=cast(ChatCompletionFileObject, element)
+                            )
+                        )
+                        _parts.append(_part)
                     _cache_point_block = (
                         litellm.AmazonConverseConfig()._get_cache_point_block(
                             message_block=cast(
@@ -1,6 +1,6 @@
 import base64
 import time
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, cast

 from litellm.types.llms.openai import (
     ChatCompletionAssistantContentValue,
@@ -9,7 +9,9 @@ from litellm.types.llms.openai import (
 from litellm.types.utils import (
     ChatCompletionAudioResponse,
     ChatCompletionMessageToolCall,
+    Choices,
     CompletionTokensDetails,
+    CompletionTokensDetailsWrapper,
     Function,
     FunctionCall,
     ModelResponse,
@@ -203,14 +205,14 @@ class ChunkProcessor:
         )

     def get_combined_content(
-        self, chunks: List[Dict[str, Any]]
+        self, chunks: List[Dict[str, Any]], delta_key: str = "content"
     ) -> ChatCompletionAssistantContentValue:
         content_list: List[str] = []
         for chunk in chunks:
             choices = chunk["choices"]
             for choice in choices:
                 delta = choice.get("delta", {})
-                content = delta.get("content", "")
+                content = delta.get(delta_key, "")
                 if content is None:
                     continue  # openai v1.0.0 sets content = None for chunks
                 content_list.append(content)
@@ -221,6 +223,11 @@ class ChunkProcessor:
         # Update the "content" field within the response dictionary
         return combined_content

+    def get_combined_reasoning_content(
+        self, chunks: List[Dict[str, Any]]
+    ) -> ChatCompletionAssistantContentValue:
+        return self.get_combined_content(chunks, delta_key="reasoning_content")
+
     def get_combined_audio_content(
         self, chunks: List[Dict[str, Any]]
     ) -> ChatCompletionAudioResponse:
@@ -296,12 +303,27 @@ class ChunkProcessor:
             "prompt_tokens_details": prompt_tokens_details,
         }

+    def count_reasoning_tokens(self, response: ModelResponse) -> int:
+        reasoning_tokens = 0
+        for choice in response.choices:
+            if (
+                hasattr(cast(Choices, choice).message, "reasoning_content")
+                and cast(Choices, choice).message.reasoning_content is not None
+            ):
+                reasoning_tokens += token_counter(
+                    text=cast(Choices, choice).message.reasoning_content,
+                    count_response_tokens=True,
+                )
+
+        return reasoning_tokens
+
     def calculate_usage(
         self,
         chunks: List[Union[Dict[str, Any], ModelResponse]],
         model: str,
         completion_output: str,
         messages: Optional[List] = None,
+        reasoning_tokens: Optional[int] = None,
     ) -> Usage:
         """
         Calculate usage for the given chunks.
@@ -382,6 +404,19 @@ class ChunkProcessor:
             )  # for anthropic
             if completion_tokens_details is not None:
                 returned_usage.completion_tokens_details = completion_tokens_details
+
+        if reasoning_tokens is not None:
+            if returned_usage.completion_tokens_details is None:
+                returned_usage.completion_tokens_details = (
+                    CompletionTokensDetailsWrapper(reasoning_tokens=reasoning_tokens)
+                )
+            elif (
+                returned_usage.completion_tokens_details is not None
+                and returned_usage.completion_tokens_details.reasoning_tokens is None
+            ):
+                returned_usage.completion_tokens_details.reasoning_tokens = (
+                    reasoning_tokens
+                )
         if prompt_tokens_details is not None:
             returned_usage.prompt_tokens_details = prompt_tokens_details
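Note: read together, the chunk-builder hunks compose roughly like this (a sketch only; the ChunkProcessor construction and the stream rebuild are elided, names are taken from the hunks above):

# given a ChunkProcessor instance `processor`, the raw `chunks`,
# and the rebuilt ModelResponse `response`
reasoning_tokens = processor.count_reasoning_tokens(response)
usage = processor.calculate_usage(
    chunks=chunks,
    model=response.model,
    completion_output=processor.get_combined_content(chunks),
    reasoning_tokens=reasoning_tokens,
)
# usage.completion_tokens_details.reasoning_tokens now carries the count
# (unless the provider already reported one)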
@@ -1,5 +1,6 @@
 import asyncio
 import collections.abc
+import datetime
 import json
 import threading
 import time
@@ -213,10 +214,7 @@ class CustomStreamWrapper:
         Output parse <s> / </s> special tokens for sagemaker + hf streaming.
         """
         hold = False
-        if (
-            self.custom_llm_provider != "huggingface"
-            and self.custom_llm_provider != "sagemaker"
-        ):
+        if self.custom_llm_provider != "sagemaker":
             return hold, chunk

         if finish_reason:
@@ -289,49 +287,6 @@ class CustomStreamWrapper:
         except Exception as e:
             raise e

-    def handle_huggingface_chunk(self, chunk):
-        try:
-            if not isinstance(chunk, str):
-                chunk = chunk.decode(
-                    "utf-8"
-                )  # DO NOT REMOVE this: This is required for HF inference API + Streaming
-            text = ""
-            is_finished = False
-            finish_reason = ""
-            print_verbose(f"chunk: {chunk}")
-            if chunk.startswith("data:"):
-                data_json = json.loads(chunk[5:])
-                print_verbose(f"data json: {data_json}")
-                if "token" in data_json and "text" in data_json["token"]:
-                    text = data_json["token"]["text"]
-                if data_json.get("details", False) and data_json["details"].get(
-                    "finish_reason", False
-                ):
-                    is_finished = True
-                    finish_reason = data_json["details"]["finish_reason"]
-                elif data_json.get(
-                    "generated_text", False
-                ):  # if full generated text exists, then stream is complete
-                    text = ""  # don't return the final bos token
-                    is_finished = True
-                    finish_reason = "stop"
-                elif data_json.get("error", False):
-                    raise Exception(data_json.get("error"))
-                return {
-                    "text": text,
-                    "is_finished": is_finished,
-                    "finish_reason": finish_reason,
-                }
-            elif "error" in chunk:
-                raise ValueError(chunk)
-            return {
-                "text": text,
-                "is_finished": is_finished,
-                "finish_reason": finish_reason,
-            }
-        except Exception as e:
-            raise e
-
     def handle_ai21_chunk(self, chunk):  # fake streaming
         chunk = chunk.decode("utf-8")
         data_json = json.loads(chunk)
@@ -1048,11 +1003,6 @@ class CustomStreamWrapper:
                 completion_obj["content"] = response_obj["text"]
                 if response_obj["is_finished"]:
                     self.received_finish_reason = response_obj["finish_reason"]
-            elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
-                response_obj = self.handle_huggingface_chunk(chunk)
-                completion_obj["content"] = response_obj["text"]
-                if response_obj["is_finished"]:
-                    self.received_finish_reason = response_obj["finish_reason"]
             elif self.custom_llm_provider and self.custom_llm_provider == "predibase":
                 response_obj = self.handle_predibase_chunk(chunk)
                 completion_obj["content"] = response_obj["text"]
@@ -1567,6 +1517,10 @@ class CustomStreamWrapper:

             if response is None:
                 continue
+            if self.logging_obj.completion_start_time is None:
+                self.logging_obj._update_completion_start_time(
+                    completion_start_time=datetime.datetime.now()
+                )
             ## LOGGING
             executor.submit(
                 self.run_success_logging_and_cache_storage,
@@ -1721,6 +1675,11 @@ class CustomStreamWrapper:
                 if processed_chunk is None:
                     continue

+                if self.logging_obj.completion_start_time is None:
+                    self.logging_obj._update_completion_start_time(
+                        completion_start_time=datetime.datetime.now()
+                    )
+
                 choice = processed_chunk.choices[0]
                 if isinstance(choice, StreamingChoices):
                     self.response_uptil_now += choice.delta.get("content", "") or ""
@@ -11,6 +11,10 @@ from litellm.constants import (
     DEFAULT_IMAGE_HEIGHT,
     DEFAULT_IMAGE_TOKEN_COUNT,
     DEFAULT_IMAGE_WIDTH,
+    MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_TILE_HEIGHT,
+    MAX_TILE_WIDTH,
 )
 from litellm.llms.custom_httpx.http_handler import _get_httpx_client

@@ -97,11 +101,14 @@ def resize_image_high_res(
     height: int,
 ) -> Tuple[int, int]:
     # Maximum dimensions for high res mode
-    max_short_side = 768
-    max_long_side = 2000
+    max_short_side = MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    max_long_side = MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES

     # Return early if no resizing is needed
-    if width <= 768 and height <= 768:
+    if (
+        width <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+        and height <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    ):
         return width, height

     # Determine the longer and shorter sides
@@ -132,7 +139,10 @@ def resize_image_high_res(

 # Test the function with the given example
 def calculate_tiles_needed(
-    resized_width, resized_height, tile_width=512, tile_height=512
+    resized_width,
+    resized_height,
+    tile_width=MAX_TILE_WIDTH,
+    tile_height=MAX_TILE_HEIGHT,
 ):
     tiles_across = (resized_width + tile_width - 1) // tile_width
     tiles_down = (resized_height + tile_height - 1) // tile_height
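Note: the new image constants are not defined in this hunk; from the literals they replace they presumably are:

# inferred from the replaced literals (assumed to live in litellm/constants.py)
MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
MAX_TILE_WIDTH = 512
MAX_TILE_HEIGHT = 512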
@@ -27,6 +27,7 @@ class AiohttpOpenAIChatConfig(OpenAILikeChatConfig):
     def get_complete_url(
         self,
         api_base: Optional[str],
+        api_key: Optional[str],
         model: str,
         optional_params: dict,
         litellm_params: dict,
@@ -21,7 +21,6 @@ from litellm.llms.custom_httpx.http_handler import (
     get_async_httpx_client,
 )
 from litellm.types.llms.anthropic import (
-    AnthropicChatCompletionUsageBlock,
     ContentBlockDelta,
     ContentBlockStart,
     ContentBlockStop,
@@ -32,13 +31,13 @@ from litellm.types.llms.anthropic import (
 from litellm.types.llms.openai import (
     ChatCompletionThinkingBlock,
     ChatCompletionToolCallChunk,
-    ChatCompletionUsageBlock,
 )
 from litellm.types.utils import (
     Delta,
     GenericStreamingChunk,
     ModelResponseStream,
     StreamingChoices,
+    Usage,
 )
 from litellm.utils import CustomStreamWrapper, ModelResponse, ProviderConfigManager
@@ -487,10 +486,8 @@ class ModelResponseIterator:
             return True
         return False

-    def _handle_usage(
-        self, anthropic_usage_chunk: Union[dict, UsageDelta]
-    ) -> AnthropicChatCompletionUsageBlock:
-        usage_block = AnthropicChatCompletionUsageBlock(
+    def _handle_usage(self, anthropic_usage_chunk: Union[dict, UsageDelta]) -> Usage:
+        usage_block = Usage(
             prompt_tokens=anthropic_usage_chunk.get("input_tokens", 0),
             completion_tokens=anthropic_usage_chunk.get("output_tokens", 0),
             total_tokens=anthropic_usage_chunk.get("input_tokens", 0)
@@ -581,7 +578,7 @@ class ModelResponseIterator:
             text = ""
             tool_use: Optional[ChatCompletionToolCallChunk] = None
             finish_reason = ""
-            usage: Optional[ChatCompletionUsageBlock] = None
+            usage: Optional[Usage] = None
             provider_specific_fields: Dict[str, Any] = {}
             reasoning_content: Optional[str] = None
             thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
@@ -5,7 +5,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast
 import httpx

 import litellm
-from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.constants import (
+    DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
+    RESPONSE_FORMAT_TOOL_NAME,
+)
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt
 from litellm.llms.base_llm.base_utils import type_to_response_format_param
@@ -18,8 +21,10 @@ from litellm.types.llms.anthropic import (
     AnthropicMessagesTool,
     AnthropicMessagesToolChoice,
     AnthropicSystemMessageContent,
+    AnthropicThinkingParam,
 )
 from litellm.types.llms.openai import (
+    REASONING_EFFORT,
     AllMessageValues,
     ChatCompletionCachedContent,
     ChatCompletionSystemMessage,
@@ -28,9 +33,16 @@ from litellm.types.llms.openai import (
     ChatCompletionToolCallFunctionChunk,
     ChatCompletionToolParam,
 )
+from litellm.types.utils import CompletionTokensDetailsWrapper
 from litellm.types.utils import Message as LitellmMessage
 from litellm.types.utils import PromptTokensDetailsWrapper
-from litellm.utils import ModelResponse, Usage, add_dummy_tool, has_tool_call_blocks
+from litellm.utils import (
+    ModelResponse,
+    Usage,
+    add_dummy_tool,
+    has_tool_call_blocks,
+    token_counter,
+)

 from ..common_utils import AnthropicError, process_anthropic_headers
@@ -51,7 +63,7 @@ class AnthropicConfig(BaseConfig):

     max_tokens: Optional[
         int
-    ] = 4096  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
+    ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
     stop_sequences: Optional[list] = None
     temperature: Optional[int] = None
     top_p: Optional[int] = None
@@ -63,7 +75,7 @@ class AnthropicConfig(BaseConfig):
         self,
         max_tokens: Optional[
             int
-        ] = 4096,  # You can pass in a value yourself or use the default value 4096
+        ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,  # You can pass in a value yourself or use the default value 4096
         stop_sequences: Optional[list] = None,
         temperature: Optional[int] = None,
         top_p: Optional[int] = None,
@@ -94,6 +106,7 @@ class AnthropicConfig(BaseConfig):
             "parallel_tool_calls",
             "response_format",
             "user",
+            "reasoning_effort",
         ]

         if "claude-3-7-sonnet" in model:
@@ -141,15 +154,9 @@ class AnthropicConfig(BaseConfig):
         if user_anthropic_beta_headers is not None:
             betas.update(user_anthropic_beta_headers)

-        # Handle beta headers for Vertex AI
-        # We allow prompt caching beta header for Vertex, but exclude other beta headers that might cause issues
+        # Don't send any beta headers to Vertex, Vertex has failed requests when they are sent
         if is_vertex_request is True:
-            vertex_safe_betas = set()
-            # Allow prompt caching beta header for Vertex
-            if "prompt-caching-2024-07-31" in betas:
-                vertex_safe_betas.add("prompt-caching-2024-07-31")
-            if len(vertex_safe_betas) > 0:
-                headers["anthropic-beta"] = ",".join(vertex_safe_betas)
+            pass
         elif len(betas) > 0:
             headers["anthropic-beta"] = ",".join(betas)

@@ -297,6 +304,48 @@ class AnthropicConfig(BaseConfig):
                 new_stop = new_v
         return new_stop

+    @staticmethod
+    def _map_reasoning_effort(
+        reasoning_effort: Optional[Union[REASONING_EFFORT, str]]
+    ) -> Optional[AnthropicThinkingParam]:
+        if reasoning_effort is None:
+            return None
+        elif reasoning_effort == "low":
+            return AnthropicThinkingParam(type="enabled", budget_tokens=1024)
+        elif reasoning_effort == "medium":
+            return AnthropicThinkingParam(type="enabled", budget_tokens=2048)
+        elif reasoning_effort == "high":
+            return AnthropicThinkingParam(type="enabled", budget_tokens=4096)
+        else:
+            raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}")
+
|
def map_response_format_to_anthropic_tool(
|
||||||
|
self, value: Optional[dict], optional_params: dict, is_thinking_enabled: bool
|
||||||
|
) -> Optional[AnthropicMessagesTool]:
|
||||||
|
ignore_response_format_types = ["text"]
|
||||||
|
if (
|
||||||
|
value is None or value["type"] in ignore_response_format_types
|
||||||
|
): # value is a no-op
|
||||||
|
return None
|
||||||
|
|
||||||
|
json_schema: Optional[dict] = None
|
||||||
|
if "response_schema" in value:
|
||||||
|
json_schema = value["response_schema"]
|
||||||
|
elif "json_schema" in value:
|
||||||
|
json_schema = value["json_schema"]["schema"]
|
||||||
|
"""
|
||||||
|
When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode
|
||||||
|
- You usually want to provide a single tool
|
||||||
|
- You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool
|
||||||
|
- Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_tool = self._create_json_tool_call_for_response_format(
|
||||||
|
json_schema=json_schema,
|
||||||
|
)
|
||||||
|
|
||||||
|
return _tool
|
||||||
|
|
||||||
def map_openai_params(
|
def map_openai_params(
|
||||||
self,
|
self,
|
||||||
non_default_params: dict,
|
non_default_params: dict,
|
||||||
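Note on the hunk above: `_map_reasoning_effort` translates OpenAI-style `reasoning_effort` values into Anthropic `thinking` parameters. A minimal usage sketch (the import path follows the AnthropicConfig import used elsewhere in this commit; the budgets are the ones hard-coded above):

# Sketch only: mirrors the mapping introduced in the hunk above.
from litellm.llms.anthropic.chat.transformation import AnthropicConfig

for effort in (None, "low", "medium", "high"):
    # None -> None, "low" -> 1024, "medium" -> 2048, "high" -> 4096 budget_tokens
    print(effort, AnthropicConfig._map_reasoning_effort(effort))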
@@ -308,10 +357,6 @@ class AnthropicConfig(BaseConfig):
             non_default_params=non_default_params
         )

-        ## handle thinking tokens
-        self.update_optional_params_with_thinking_tokens(
-            non_default_params=non_default_params, optional_params=optional_params
-        )
         for param, value in non_default_params.items():
             if param == "max_tokens":
                 optional_params["max_tokens"] = value

@@ -344,39 +389,31 @@ class AnthropicConfig(BaseConfig):
             if param == "top_p":
                 optional_params["top_p"] = value
             if param == "response_format" and isinstance(value, dict):
-                ignore_response_format_types = ["text"]
-                if value["type"] in ignore_response_format_types:  # value is a no-op
+                _tool = self.map_response_format_to_anthropic_tool(
+                    value, optional_params, is_thinking_enabled
+                )
+                if _tool is None:
                     continue
-
-                json_schema: Optional[dict] = None
-                if "response_schema" in value:
-                    json_schema = value["response_schema"]
-                elif "json_schema" in value:
-                    json_schema = value["json_schema"]["schema"]
-                """
-                When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode
-                - You usually want to provide a single tool
-                - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool
-                - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective.
-                """
-
                 if not is_thinking_enabled:
                     _tool_choice = {"name": RESPONSE_FORMAT_TOOL_NAME, "type": "tool"}
                     optional_params["tool_choice"] = _tool_choice
-                _tool = self._create_json_tool_call_for_response_format(
-                    json_schema=json_schema,
-                )
+                    optional_params["json_mode"] = True
                 optional_params = self._add_tools_to_optional_params(
                     optional_params=optional_params, tools=[_tool]
                 )
-
-                optional_params["json_mode"] = True
             if param == "user":
                 optional_params["metadata"] = {"user_id": value}
             if param == "thinking":
                 optional_params["thinking"] = value
+            elif param == "reasoning_effort" and isinstance(value, str):
+                optional_params["thinking"] = AnthropicConfig._map_reasoning_effort(
+                    value
+                )
+
+        ## handle thinking tokens
+        self.update_optional_params_with_thinking_tokens(
+            non_default_params=non_default_params, optional_params=optional_params
+        )
         return optional_params

     def _create_json_tool_call_for_response_format(

@@ -753,6 +790,15 @@ class AnthropicConfig(BaseConfig):
         prompt_tokens_details = PromptTokensDetailsWrapper(
             cached_tokens=cache_read_input_tokens
         )
+        completion_token_details = (
+            CompletionTokensDetailsWrapper(
+                reasoning_tokens=token_counter(
+                    text=reasoning_content, count_response_tokens=True
+                )
+            )
+            if reasoning_content
+            else None
+        )
         total_tokens = prompt_tokens + completion_tokens
         usage = Usage(
             prompt_tokens=prompt_tokens,

@@ -761,6 +807,7 @@ class AnthropicConfig(BaseConfig):
             prompt_tokens_details=prompt_tokens_details,
             cache_creation_input_tokens=cache_creation_input_tokens,
             cache_read_input_tokens=cache_read_input_tokens,
+            completion_tokens_details=completion_token_details,
         )

         setattr(model_response, "usage", usage)  # type: ignore
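With the two hunks above, reasoning text is token-counted via `token_counter` and surfaced on the usage object as `completion_tokens_details`. A rough sketch of what a caller could read back (attribute names are taken from the diff; `response` is a stand-in for the ModelResponse produced after this change):

# Hedged illustration only.
usage = response.usage
if usage.completion_tokens_details is not None:
    print("reasoning tokens:", usage.completion_tokens_details.reasoning_tokens)
if usage.prompt_tokens_details is not None:
    print("cached prompt tokens:", usage.prompt_tokens_details.cached_tokens)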
@@ -11,6 +11,7 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
 import httpx

 import litellm
+from litellm.constants import DEFAULT_MAX_TOKENS
 from litellm.litellm_core_utils.prompt_templates.factory import (
     custom_prompt,
     prompt_factory,

@@ -65,7 +66,9 @@ class AnthropicTextConfig(BaseConfig):

     def __init__(
         self,
-        max_tokens_to_sample: Optional[int] = 256,  # anthropic requires a default
+        max_tokens_to_sample: Optional[
+            int
+        ] = DEFAULT_MAX_TOKENS,  # anthropic requires a default
         stop_sequences: Optional[list] = None,
         temperature: Optional[int] = None,
         top_p: Optional[int] = None,

@@ -6,7 +6,7 @@
 """

 import json
-from typing import Any, AsyncIterator, Dict, Optional, Union, cast
+from typing import AsyncIterator, Dict, List, Optional, Union, cast

 import httpx


@@ -19,6 +19,9 @@ from litellm.llms.custom_httpx.http_handler import (
     AsyncHTTPHandler,
     get_async_httpx_client,
 )
+from litellm.types.llms.anthropic_messages.anthropic_response import (
+    AnthropicMessagesResponse,
+)
 from litellm.types.router import GenericLiteLLMParams
 from litellm.types.utils import ProviderSpecificHeader
 from litellm.utils import ProviderConfigManager, client

@@ -60,14 +63,25 @@ class AnthropicMessagesHandler:

 @client
 async def anthropic_messages(
-    api_key: str,
+    max_tokens: int,
+    messages: List[Dict],
     model: str,
-    stream: bool = False,
+    metadata: Optional[Dict] = None,
+    stop_sequences: Optional[List[str]] = None,
+    stream: Optional[bool] = False,
+    system: Optional[str] = None,
+    temperature: Optional[float] = None,
+    thinking: Optional[Dict] = None,
+    tool_choice: Optional[Dict] = None,
+    tools: Optional[List[Dict]] = None,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    api_key: Optional[str] = None,
     api_base: Optional[str] = None,
     client: Optional[AsyncHTTPHandler] = None,
     custom_llm_provider: Optional[str] = None,
     **kwargs,
-) -> Union[Dict[str, Any], AsyncIterator]:
+) -> Union[AnthropicMessagesResponse, AsyncIterator]:
     """
     Makes Anthropic `/v1/messages` API calls In the Anthropic API Spec
     """
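The widened signature above lets the pass-through handler accept the full Anthropic `/v1/messages` parameter set rather than only `api_key`, `model`, and `stream`. A hedged usage sketch (the import path for `anthropic_messages` is not shown in this hunk and is assumed):

import asyncio

async def main():
    # anthropic_messages is the coroutine defined above; import path assumed.
    response = await anthropic_messages(
        max_tokens=256,
        messages=[{"role": "user", "content": "Hello"}],
        model="claude-3-5-sonnet-20241022",  # illustrative model name
        api_key="sk-ant-...",  # placeholder
    )
    print(response)

asyncio.run(main())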
@@ -129,10 +143,8 @@ async def anthropic_messages(
         },
         custom_llm_provider=_custom_llm_provider,
     )
-    litellm_logging_obj.model_call_details.update(kwargs)

     # Prepare request body
-    request_body = kwargs.copy()
+    request_body = locals().copy()
     request_body = {
         k: v
         for k, v in request_body.items()

@@ -140,10 +152,12 @@ async def anthropic_messages(
         in anthropic_messages_provider_config.get_supported_anthropic_messages_params(
             model=model
         )
+        and v is not None
     }
     request_body["stream"] = stream
     request_body["model"] = model
     litellm_logging_obj.stream = stream
+    litellm_logging_obj.model_call_details.update(request_body)

     # Make the request
     request_url = anthropic_messages_provider_config.get_complete_url(

@@ -164,7 +178,7 @@ async def anthropic_messages(
         url=request_url,
         headers=headers,
         data=json.dumps(request_body),
-        stream=stream,
+        stream=stream or False,
     )
     response.raise_for_status()


@@ -7,7 +7,7 @@ import httpx  # type: ignore
 from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI

 import litellm
-from litellm.constants import DEFAULT_MAX_RETRIES
+from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
 from litellm.llms.custom_httpx.http_handler import (

@@ -857,7 +857,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):

         await response.aread()

-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
         start_time = time.time()
         if "status" not in response.json():
             raise Exception(

@@ -955,7 +955,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):

         response.read()

-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
         start_time = time.time()
         if "status" not in response.json():
             raise Exception(

@@ -7,6 +7,10 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
     convert_to_azure_openai_messages,
 )
 from litellm.llms.base_llm.chat.transformation import BaseLLMException
+from litellm.types.llms.azure import (
+    API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT,
+    API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT,
+)
 from litellm.types.utils import ModelResponse
 from litellm.utils import supports_response_schema


@@ -123,7 +127,10 @@ class AzureOpenAIConfig(BaseConfig):
        - check if api_version is supported for response_format
        """

-        is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8
+        is_supported = (
+            int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
+            and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
+        )

         return is_supported


@@ -14,6 +14,7 @@ Translations handled by LiteLLM:

 from typing import List, Optional

+import litellm
 from litellm import verbose_logger
 from litellm.types.llms.openai import AllMessageValues
 from litellm.utils import get_model_info

@@ -22,6 +23,27 @@ from ...openai.chat.o_series_transformation import OpenAIOSeriesConfig


 class AzureOpenAIO1Config(OpenAIOSeriesConfig):
+    def get_supported_openai_params(self, model: str) -> list:
+        """
+        Get the supported OpenAI params for the Azure O-Series models
+        """
+        all_openai_params = litellm.OpenAIGPTConfig().get_supported_openai_params(
+            model=model
+        )
+        non_supported_params = [
+            "logprobs",
+            "top_p",
+            "presence_penalty",
+            "frequency_penalty",
+            "top_logprobs",
+        ]
+
+        o_series_only_param = ["reasoning_effort"]
+        all_openai_params.extend(o_series_only_param)
+        return [
+            param for param in all_openai_params if param not in non_supported_params
+        ]
+
     def should_fake_stream(
         self,
         model: Optional[str],
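The new override starts from the generic OpenAI GPT parameter list, drops parameters the O-Series models reject, and appends `reasoning_effort`. A quick sketch of the resulting behavior (module path is an assumption based on the imports shown above):

# Illustration only; module path assumed.
from litellm.llms.azure.chat.o_series_transformation import AzureOpenAIO1Config

params = AzureOpenAIO1Config().get_supported_openai_params(model="azure/o1")
assert "reasoning_effort" in params
assert "top_p" not in params and "logprobs" not in params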
@@ -28,11 +28,11 @@ class AzureOpenAIFilesAPI(BaseAzureLLM):
         self,
         create_file_data: CreateFileRequest,
         openai_client: AsyncAzureOpenAI,
-    ) -> FileObject:
+    ) -> OpenAIFileObject:
         verbose_logger.debug("create_file_data=%s", create_file_data)
         response = await openai_client.files.create(**create_file_data)
         verbose_logger.debug("create_file_response=%s", response)
-        return response
+        return OpenAIFileObject(**response.model_dump())

     def create_file(
         self,

@@ -45,7 +45,7 @@ class AzureOpenAIFilesAPI(BaseAzureLLM):
         max_retries: Optional[int],
         client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
         litellm_params: Optional[dict] = None,
-    ) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
+    ) -> Union[OpenAIFileObject, Coroutine[Any, Any, OpenAIFileObject]]:
         openai_client: Optional[
             Union[AzureOpenAI, AsyncAzureOpenAI]
         ] = self.get_azure_openai_client(

@@ -66,11 +66,11 @@ class AzureOpenAIFilesAPI(BaseAzureLLM):
                 raise ValueError(
                     "AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
                 )
-            return self.acreate_file(  # type: ignore
+            return self.acreate_file(
                 create_file_data=create_file_data, openai_client=openai_client
             )
-        response = openai_client.files.create(**create_file_data)
-        return response
+        response = cast(AzureOpenAI, openai_client).files.create(**create_file_data)
+        return OpenAIFileObject(**response.model_dump())

     async def afile_content(
         self,

@@ -65,6 +65,7 @@ class AzureAIStudioConfig(OpenAIConfig):
     def get_complete_url(
         self,
         api_base: Optional[str],
+        api_key: Optional[str],
         model: str,
         optional_params: dict,
         litellm_params: dict,

@@ -28,6 +28,7 @@ class BaseAudioTranscriptionConfig(BaseConfig, ABC):
     def get_complete_url(
         self,
         api_base: Optional[str],
+        api_key: Optional[str],
         model: str,
         optional_params: dict,
         litellm_params: dict,

@@ -2,6 +2,7 @@ import json
 from abc import abstractmethod
 from typing import Optional, Union

+import litellm
 from litellm.types.utils import GenericStreamingChunk, ModelResponseStream


@@ -33,6 +34,18 @@ class BaseModelResponseIterator:
         self, str_line: str
     ) -> Union[GenericStreamingChunk, ModelResponseStream]:
         # chunk is a str at this point
+        stripped_chunk = litellm.CustomStreamWrapper._strip_sse_data_from_chunk(
+            str_line
+        )
+        try:
+            if stripped_chunk is not None:
+                stripped_json_chunk: Optional[dict] = json.loads(stripped_chunk)
+            else:
+                stripped_json_chunk = None
+        except json.JSONDecodeError:
+            stripped_json_chunk = None
+
         if "[DONE]" in str_line:
             return GenericStreamingChunk(
                 text="",

@@ -42,9 +55,8 @@ class BaseModelResponseIterator:
                 index=0,
                 tool_use=None,
             )
-        elif str_line.startswith("data:"):
-            data_json = json.loads(str_line[5:])
-            return self.chunk_parser(chunk=data_json)
+        elif stripped_json_chunk:
+            return self.chunk_parser(chunk=stripped_json_chunk)
         else:
             return GenericStreamingChunk(
                 text="",
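The iterator now delegates SSE prefix handling to `CustomStreamWrapper._strip_sse_data_from_chunk` and JSON-decodes the stripped payload up front, instead of slicing `data:` lines inline. A hedged sketch of the expected behavior:

import json
import litellm

# Assumption: the helper returns the payload portion of an SSE line, or None when there is nothing to parse.
line = 'data: {"type": "content_block_delta"}'
payload = litellm.CustomStreamWrapper._strip_sse_data_from_chunk(line)
if payload is not None:
    chunk = json.loads(payload)
    print(chunk["type"])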
@@ -85,6 +97,7 @@ class BaseModelResponseIterator:
     async def __anext__(self):
         try:
             chunk = await self.async_response_iterator.__anext__()
+
         except StopAsyncIteration:
             raise StopAsyncIteration
         except ValueError as e:

@@ -99,7 +112,9 @@ class BaseModelResponseIterator:
                 str_line = str_line[index:]

             # chunk is a str at this point
-            return self._handle_string_chunk(str_line=str_line)
+            chunk = self._handle_string_chunk(str_line=str_line)
+
+            return chunk
         except StopAsyncIteration:
             raise StopAsyncIteration
         except ValueError as e:

@@ -3,6 +3,7 @@ Utility functions for base LLM classes.
 """

 import copy
+import json
 from abc import ABC, abstractmethod
 from typing import List, Optional, Type, Union


@@ -10,8 +11,8 @@ from openai.lib import _parsing, _pydantic
 from pydantic import BaseModel

 from litellm._logging import verbose_logger
-from litellm.types.llms.openai import AllMessageValues
-from litellm.types.utils import ProviderSpecificModelInfo
+from litellm.types.llms.openai import AllMessageValues, ChatCompletionToolCallChunk
+from litellm.types.utils import Message, ProviderSpecificModelInfo


 class BaseLLMModelInfo(ABC):

@@ -55,6 +56,32 @@ class BaseLLMModelInfo(ABC):
         pass


+def _convert_tool_response_to_message(
+    tool_calls: List[ChatCompletionToolCallChunk],
+) -> Optional[Message]:
+    """
+    In JSON mode, Anthropic API returns JSON schema as a tool call, we need to convert it to a message to follow the OpenAI format
+
+    """
+    ## HANDLE JSON MODE - anthropic returns single function call
+    json_mode_content_str: Optional[str] = tool_calls[0]["function"].get("arguments")
+    try:
+        if json_mode_content_str is not None:
+            args = json.loads(json_mode_content_str)
+            if isinstance(args, dict) and (values := args.get("values")) is not None:
+                _message = Message(content=json.dumps(values))
+                return _message
+            else:
+                # a lot of the times the `values` key is not present in the tool response
+                # relevant issue: https://github.com/BerriAI/litellm/issues/6741
+                _message = Message(content=json.dumps(args))
+                return _message
+    except json.JSONDecodeError:
+        # json decode error does occur, return the original tool response str
+        return Message(content=json_mode_content_str)
+    return None
+
+
 def _dict_to_response_format_helper(
     response_format: dict, ref_template: Optional[str] = None
 ) -> dict:
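`_convert_tool_response_to_message` unwraps the single tool call returned in Anthropic JSON mode into a plain assistant `Message`. A small sketch of the happy path (assumes the helper is imported from the base_utils module above; the tool-call dict shape follows `ChatCompletionToolCallChunk`):

import json

tool_call = {
    "id": "call_1",
    "type": "function",
    "index": 0,
    "function": {"name": "json_output", "arguments": json.dumps({"values": {"city": "Paris"}})},
}
message = _convert_tool_response_to_message([tool_call])  # helper defined above
print(message.content)  # expected: '{"city": "Paris"}'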
@@ -104,7 +104,10 @@ class BaseConfig(ABC):
         return type_to_response_format_param(response_format=response_format)

     def is_thinking_enabled(self, non_default_params: dict) -> bool:
-        return non_default_params.get("thinking", {}).get("type", None) == "enabled"
+        return (
+            non_default_params.get("thinking", {}).get("type") == "enabled"
+            or non_default_params.get("reasoning_effort") is not None
+        )

     def update_optional_params_with_thinking_tokens(
         self, non_default_params: dict, optional_params: dict
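After this change, thinking is treated as enabled either when an explicit `thinking` block is passed or when any `reasoning_effort` value is set. Roughly (where `config` stands in for any concrete provider config instance):

assert config.is_thinking_enabled({"thinking": {"type": "enabled", "budget_tokens": 1024}})
assert config.is_thinking_enabled({"reasoning_effort": "low"})
assert not config.is_thinking_enabled({})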
@@ -116,9 +119,9 @@ class BaseConfig(ABC):

        if 'thinking' is enabled and 'max_tokens' is not specified, set 'max_tokens' to the thinking token budget + DEFAULT_MAX_TOKENS
        """
-        is_thinking_enabled = self.is_thinking_enabled(non_default_params)
+        is_thinking_enabled = self.is_thinking_enabled(optional_params)
         if is_thinking_enabled and "max_tokens" not in non_default_params:
-            thinking_token_budget = cast(dict, non_default_params["thinking"]).get(
+            thinking_token_budget = cast(dict, optional_params["thinking"]).get(
                 "budget_tokens", None
             )
             if thinking_token_budget is not None:

@@ -291,6 +294,7 @@ class BaseConfig(ABC):
     def get_complete_url(
         self,
         api_base: Optional[str],
+        api_key: Optional[str],
         model: str,
         optional_params: dict,
         litellm_params: dict,

@@ -29,6 +29,7 @@ class BaseTextCompletionConfig(BaseConfig, ABC):
     def get_complete_url(
         self,
         api_base: Optional[str],
+        api_key: Optional[str],
         model: str,
         optional_params: dict,
         litellm_params: dict,

@@ -43,6 +43,7 @@ class BaseEmbeddingConfig(BaseConfig, ABC):
     def get_complete_url(
         self,
         api_base: Optional[str],
+        api_key: Optional[str],
         model: str,
         optional_params: dict,
         litellm_params: dict,

litellm/llms/base_llm/files/transformation.py (new file, 102 lines)
@@ -0,0 +1,102 @@
+from abc import abstractmethod
+from typing import TYPE_CHECKING, Any, List, Optional
+
+import httpx
+
+from litellm.types.llms.openai import (
+    AllMessageValues,
+    CreateFileRequest,
+    OpenAICreateFileRequestOptionalParams,
+    OpenAIFileObject,
+)
+from litellm.types.utils import LlmProviders, ModelResponse
+
+from ..chat.transformation import BaseConfig
+
+if TYPE_CHECKING:
+    from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
+
+    LiteLLMLoggingObj = _LiteLLMLoggingObj
+else:
+    LiteLLMLoggingObj = Any
+
+
+class BaseFilesConfig(BaseConfig):
+    @property
+    @abstractmethod
+    def custom_llm_provider(self) -> LlmProviders:
+        pass
+
+    @abstractmethod
+    def get_supported_openai_params(
+        self, model: str
+    ) -> List[OpenAICreateFileRequestOptionalParams]:
+        pass
+
+    def get_complete_url(
+        self,
+        api_base: Optional[str],
+        api_key: Optional[str],
+        model: str,
+        optional_params: dict,
+        litellm_params: dict,
+        stream: Optional[bool] = None,
+    ) -> str:
+        """
+        OPTIONAL
+
+        Get the complete url for the request
+
+        Some providers need `model` in `api_base`
+        """
+        return api_base or ""
+
+    @abstractmethod
+    def transform_create_file_request(
+        self,
+        model: str,
+        create_file_data: CreateFileRequest,
+        optional_params: dict,
+        litellm_params: dict,
+    ) -> dict:
+        pass
+
+    @abstractmethod
+    def transform_create_file_response(
+        self,
+        model: Optional[str],
+        raw_response: httpx.Response,
+        logging_obj: LiteLLMLoggingObj,
+        litellm_params: dict,
+    ) -> OpenAIFileObject:
+        pass
+
+    def transform_request(
+        self,
+        model: str,
+        messages: List[AllMessageValues],
+        optional_params: dict,
+        litellm_params: dict,
+        headers: dict,
+    ) -> dict:
+        raise NotImplementedError(
+            "AudioTranscriptionConfig does not need a request transformation for audio transcription models"
+        )
+
+    def transform_response(
+        self,
+        model: str,
+        raw_response: httpx.Response,
+        model_response: ModelResponse,
+        logging_obj: LiteLLMLoggingObj,
+        request_data: dict,
+        messages: List[AllMessageValues],
+        optional_params: dict,
+        litellm_params: dict,
+        encoding: Any,
+        api_key: Optional[str] = None,
+        json_mode: Optional[bool] = None,
+    ) -> ModelResponse:
+        raise NotImplementedError(
+            "AudioTranscriptionConfig does not need a response transformation for audio transcription models"
+        )
@@ -34,6 +34,7 @@ class BaseImageVariationConfig(BaseConfig, ABC):
     def get_complete_url(
         self,
         api_base: Optional[str],
+        api_key: Optional[str],
         model: str,
         optional_params: dict,
         litellm_params: dict,

@@ -9,7 +9,7 @@ from pydantic import BaseModel

 from litellm._logging import verbose_logger
 from litellm.caching.caching import DualCache
-from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL
+from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL, BEDROCK_MAX_POLICY_SIZE
 from litellm.litellm_core_utils.dd_tracing import tracer
 from litellm.secret_managers.main import get_secret


@@ -381,7 +381,7 @@ class BaseAWSLLM:
             "region_name": aws_region_name,
         }

-        if sts_response["PackedPolicySize"] > 75:
+        if sts_response["PackedPolicySize"] > BEDROCK_MAX_POLICY_SIZE:
             verbose_logger.warning(
                 f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
             )

@@ -17,6 +17,7 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
     _bedrock_converse_messages_pt,
     _bedrock_tools_pt,
 )
+from litellm.llms.anthropic.chat.transformation import AnthropicConfig
 from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
 from litellm.types.llms.bedrock import *
 from litellm.types.llms.openai import (

@@ -128,6 +129,7 @@ class AmazonConverseConfig(BaseConfig):
             "claude-3-7" in model
         ):  # [TODO]: move to a 'supports_reasoning_content' param from model cost map
             supported_params.append("thinking")
+            supported_params.append("reasoning_effort")
         return supported_params

     def map_tool_choice_values(

@@ -218,9 +220,7 @@ class AmazonConverseConfig(BaseConfig):
         messages: Optional[List[AllMessageValues]] = None,
     ) -> dict:
         is_thinking_enabled = self.is_thinking_enabled(non_default_params)
-        self.update_optional_params_with_thinking_tokens(
-            non_default_params=non_default_params, optional_params=optional_params
-        )
         for param, value in non_default_params.items():
             if param == "response_format" and isinstance(value, dict):
                 ignore_response_format_types = ["text"]

@@ -297,6 +297,14 @@ class AmazonConverseConfig(BaseConfig):
                 optional_params["tool_choice"] = _tool_choice_value
             if param == "thinking":
                 optional_params["thinking"] = value
+            elif param == "reasoning_effort" and isinstance(value, str):
+                optional_params["thinking"] = AnthropicConfig._map_reasoning_effort(
+                    value
+                )
+
+        self.update_optional_params_with_thinking_tokens(
+            non_default_params=non_default_params, optional_params=optional_params
+        )

         return optional_params


@@ -74,6 +74,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
     def get_complete_url(
         self,
         api_base: Optional[str],
+        api_key: Optional[str],
         model: str,
         optional_params: dict,
         litellm_params: dict,

@@ -77,6 +77,7 @@ class CloudflareChatConfig(BaseConfig):
     def get_complete_url(
         self,
         api_base: Optional[str],
+        api_key: Optional[str],
         model: str,
         optional_params: dict,
         litellm_params: dict,

@@ -54,7 +54,8 @@ class CohereChatConfig(BaseConfig):
         search_queries_only (bool, optional): When true, the response will only contain a list of generated search queries.
         documents (List[Dict[str, str]], optional): A list of relevant documents that the model can cite.
         temperature (float, optional): A non-negative float that tunes the degree of randomness in generation.
-        max_tokens (int, optional): The maximum number of tokens the model will generate as part of the response.
+        max_tokens [DEPRECATED - use max_completion_tokens] (int, optional): The maximum number of tokens the model will generate as part of the response.
+        max_completion_tokens (int, optional): The maximum number of tokens the model will generate as part of the response.
         k (int, optional): Ensures only the top k most likely tokens are considered for generation at each step.
         p (float, optional): Ensures that only the most likely tokens, with total probability mass of p, are considered for generation.
         frequency_penalty (float, optional): Used to reduce repetitiveness of generated tokens.

@@ -75,6 +76,7 @@ class CohereChatConfig(BaseConfig):
     documents: Optional[list] = None
     temperature: Optional[int] = None
     max_tokens: Optional[int] = None
+    max_completion_tokens: Optional[int] = None
     k: Optional[int] = None
     p: Optional[int] = None
     frequency_penalty: Optional[int] = None

@@ -96,6 +98,7 @@ class CohereChatConfig(BaseConfig):
         documents: Optional[list] = None,
         temperature: Optional[int] = None,
         max_tokens: Optional[int] = None,
+        max_completion_tokens: Optional[int] = None,
         k: Optional[int] = None,
         p: Optional[int] = None,
         frequency_penalty: Optional[int] = None,

@@ -131,6 +134,7 @@ class CohereChatConfig(BaseConfig):
             "stream",
             "temperature",
             "max_tokens",
+            "max_completion_tokens",
             "top_p",
             "frequency_penalty",
             "presence_penalty",

@@ -156,6 +160,8 @@ class CohereChatConfig(BaseConfig):
                 optional_params["temperature"] = value
             if param == "max_tokens":
                 optional_params["max_tokens"] = value
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
             if param == "n":
                 optional_params["num_generations"] = value
             if param == "top_p":
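With the hunk above, `max_completion_tokens` is accepted for Cohere chat and mapped onto Cohere's `max_tokens` field, while the older `max_tokens` alias keeps working. For example (model name illustrative):

import litellm

# max_completion_tokens is translated to Cohere's max_tokens by CohereChatConfig.map_openai_params
response = litellm.completion(
    model="cohere_chat/command-r",
    messages=[{"role": "user", "content": "Say hi"}],
    max_completion_tokens=64,
)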
@@ -230,6 +230,7 @@ class BaseLLMAIOHTTPHandler:

         api_base = provider_config.get_complete_url(
             api_base=api_base,
+            api_key=api_key,
             model=model,
             optional_params=optional_params,
             litellm_params=litellm_params,

@@ -480,6 +481,7 @@ class BaseLLMAIOHTTPHandler:

         api_base = provider_config.get_complete_url(
             api_base=api_base,
+            api_key=api_key,
             model=model,
             optional_params=optional_params,
             litellm_params=litellm_params,

@@ -519,7 +521,6 @@ class BaseLLMAIOHTTPHandler:
             data=data,
             headers=headers,
             model_response=model_response,
-            api_key=api_key,
             logging_obj=logging_obj,
             model=model,
             timeout=timeout,

@@ -7,11 +7,13 @@ import litellm
 import litellm.litellm_core_utils
 import litellm.types
 import litellm.types.utils
+from litellm._logging import verbose_logger
 from litellm.llms.base_llm.audio_transcription.transformation import (
     BaseAudioTranscriptionConfig,
 )
 from litellm.llms.base_llm.chat.transformation import BaseConfig
 from litellm.llms.base_llm.embedding.transformation import BaseEmbeddingConfig
+from litellm.llms.base_llm.files.transformation import BaseFilesConfig
 from litellm.llms.base_llm.rerank.transformation import BaseRerankConfig
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
 from litellm.llms.custom_httpx.http_handler import (

@@ -26,7 +28,12 @@ from litellm.responses.streaming_iterator import (
     ResponsesAPIStreamingIterator,
     SyncResponsesAPIStreamingIterator,
 )
-from litellm.types.llms.openai import ResponseInputParam, ResponsesAPIResponse
+from litellm.types.llms.openai import (
+    CreateFileRequest,
+    OpenAIFileObject,
+    ResponseInputParam,
+    ResponsesAPIResponse,
+)
 from litellm.types.rerank import OptionalRerankParams, RerankResponse
 from litellm.types.router import GenericLiteLLMParams
 from litellm.types.utils import EmbeddingResponse, FileTypes, TranscriptionResponse

@@ -240,6 +247,7 @@ class BaseLLMHTTPHandler:

         api_base = provider_config.get_complete_url(
             api_base=api_base,
+            api_key=api_key,
             model=model,
             optional_params=optional_params,
             stream=stream,

@@ -360,6 +368,7 @@ class BaseLLMHTTPHandler:
                     else None
                 ),
                 litellm_params=litellm_params,
+                json_mode=json_mode,
             )
             return CustomStreamWrapper(
                 completion_stream=completion_stream,

@@ -412,6 +421,7 @@ class BaseLLMHTTPHandler:
         timeout: Union[float, httpx.Timeout],
         fake_stream: bool = False,
         client: Optional[HTTPHandler] = None,
+        json_mode: bool = False,
     ) -> Tuple[Any, dict]:
         if client is None or not isinstance(client, HTTPHandler):
             sync_httpx_client = _get_httpx_client(

@@ -439,11 +449,15 @@ class BaseLLMHTTPHandler:

         if fake_stream is True:
             completion_stream = provider_config.get_model_response_iterator(
-                streaming_response=response.json(), sync_stream=True
+                streaming_response=response.json(),
+                sync_stream=True,
+                json_mode=json_mode,
             )
         else:
             completion_stream = provider_config.get_model_response_iterator(
-                streaming_response=response.iter_lines(), sync_stream=True
+                streaming_response=response.iter_lines(),
+                sync_stream=True,
+                json_mode=json_mode,
             )

         # LOGGING

@@ -611,6 +625,7 @@ class BaseLLMHTTPHandler:

         api_base = provider_config.get_complete_url(
             api_base=api_base,
+            api_key=api_key,
             model=model,
             optional_params=optional_params,
             litellm_params=litellm_params,

@@ -884,6 +899,7 @@ class BaseLLMHTTPHandler:

         complete_url = provider_config.get_complete_url(
             api_base=api_base,
+            api_key=api_key,
             model=model,
             optional_params=optional_params,
             litellm_params=litellm_params,

@@ -1185,6 +1201,188 @@ class BaseLLMHTTPHandler:
             logging_obj=logging_obj,
         )

+    def create_file(
+        self,
+        create_file_data: CreateFileRequest,
+        litellm_params: dict,
+        provider_config: BaseFilesConfig,
+        headers: dict,
+        api_base: Optional[str],
+        api_key: Optional[str],
+        logging_obj: LiteLLMLoggingObj,
+        _is_async: bool = False,
+        client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
+        timeout: Optional[Union[float, httpx.Timeout]] = None,
+    ) -> Union[OpenAIFileObject, Coroutine[Any, Any, OpenAIFileObject]]:
+        """
+        Creates a file using Gemini's two-step upload process
+        """
+        # get config from model, custom llm provider
+        headers = provider_config.validate_environment(
+            api_key=api_key,
+            headers=headers,
+            model="",
+            messages=[],
+            optional_params={},
+        )
+
+        api_base = provider_config.get_complete_url(
+            api_base=api_base,
+            api_key=api_key,
+            model="",
+            optional_params={},
+            litellm_params=litellm_params,
+        )
+
+        # Get the transformed request data for both steps
+        transformed_request = provider_config.transform_create_file_request(
+            model="",
+            create_file_data=create_file_data,
+            litellm_params=litellm_params,
+            optional_params={},
+        )
+
+        if _is_async:
+            return self.async_create_file(
+                transformed_request=transformed_request,
+                litellm_params=litellm_params,
+                provider_config=provider_config,
+                headers=headers,
+                api_base=api_base,
+                logging_obj=logging_obj,
+                client=client,
+                timeout=timeout,
+            )
+
+        if client is None or not isinstance(client, HTTPHandler):
+            sync_httpx_client = _get_httpx_client()
+        else:
+            sync_httpx_client = client
+
+        try:
+            # Step 1: Initial request to get upload URL
+            initial_response = sync_httpx_client.post(
+                url=api_base,
+                headers={
+                    **headers,
+                    **transformed_request["initial_request"]["headers"],
+                },
+                data=json.dumps(transformed_request["initial_request"]["data"]),
+                timeout=timeout,
+            )
+
+            # Extract upload URL from response headers
+            upload_url = initial_response.headers.get("X-Goog-Upload-URL")
+
+            if not upload_url:
+                raise ValueError("Failed to get upload URL from initial request")
+
+            # Step 2: Upload the actual file
+            upload_response = sync_httpx_client.post(
+                url=upload_url,
+                headers=transformed_request["upload_request"]["headers"],
+                data=transformed_request["upload_request"]["data"],
+                timeout=timeout,
+            )
+
+            return provider_config.transform_create_file_response(
+                model=None,
+                raw_response=upload_response,
+                logging_obj=logging_obj,
+                litellm_params=litellm_params,
+            )
+
+        except Exception as e:
+            raise self._handle_error(
+                e=e,
+                provider_config=provider_config,
+            )
+
+    async def async_create_file(
+        self,
+        transformed_request: dict,
+        litellm_params: dict,
+        provider_config: BaseFilesConfig,
+        headers: dict,
+        api_base: str,
+        logging_obj: LiteLLMLoggingObj,
+        client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
+        timeout: Optional[Union[float, httpx.Timeout]] = None,
+    ):
+        """
+        Creates a file using Gemini's two-step upload process
+        """
+        if client is None or not isinstance(client, AsyncHTTPHandler):
+            async_httpx_client = get_async_httpx_client(
+                llm_provider=provider_config.custom_llm_provider
+            )
+        else:
+            async_httpx_client = client
+
+        try:
+            # Step 1: Initial request to get upload URL
+            initial_response = await async_httpx_client.post(
+                url=api_base,
+                headers={
+                    **headers,
+                    **transformed_request["initial_request"]["headers"],
+                },
+                data=json.dumps(transformed_request["initial_request"]["data"]),
+                timeout=timeout,
+            )
+
+            # Extract upload URL from response headers
+            upload_url = initial_response.headers.get("X-Goog-Upload-URL")
+
+            if not upload_url:
+                raise ValueError("Failed to get upload URL from initial request")
+
+            # Step 2: Upload the actual file
+            upload_response = await async_httpx_client.post(
+                url=upload_url,
+                headers=transformed_request["upload_request"]["headers"],
+                data=transformed_request["upload_request"]["data"],
+                timeout=timeout,
+            )
+
+            return provider_config.transform_create_file_response(
+                model=None,
+                raw_response=upload_response,
+                logging_obj=logging_obj,
+                litellm_params=litellm_params,
+            )
+
+        except Exception as e:
+            verbose_logger.exception(f"Error creating file: {e}")
+            raise self._handle_error(
+                e=e,
+                provider_config=provider_config,
+            )
+
+    def list_files(self):
+        """
+        Lists all files
+        """
+        pass
+
+    def delete_file(self):
+        """
+        Deletes a file
+        """
+        pass
+
+    def retrieve_file(self):
+        """
+        Returns the metadata of the file
+        """
+        pass
+
+    def retrieve_file_content(self):
+        """
+        Returns the content of the file
+        """
+        pass
+
     def _prepare_fake_stream_request(
         self,
         stream: bool,
|
|
|
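The two-step flow above mirrors Gemini's resumable file upload: an initial metadata request returns an X-Goog-Upload-URL header, and the file bytes are then POSTed to that URL. Below is a minimal standalone sketch of the same flow with httpx; the endpoint, API-key parameter, and the start/finalize X-Goog-Upload-* headers are assumptions based on Gemini's documented Files API, not values taken from this diff.

import json
import httpx

API_KEY = "GEMINI_API_KEY_HERE"  # assumption: placeholder, not from the diff
BASE = "https://generativelanguage.googleapis.com/upload/v1beta/files"  # assumed endpoint

file_bytes = b"hello world"

# Step 1: start a resumable upload and read the upload URL from the response headers
initial = httpx.post(
    f"{BASE}?key={API_KEY}",
    headers={
        "X-Goog-Upload-Protocol": "resumable",  # assumed standard resumable-upload headers
        "X-Goog-Upload-Command": "start",
        "X-Goog-Upload-Header-Content-Length": str(len(file_bytes)),
        "X-Goog-Upload-Header-Content-Type": "text/plain",
        "Content-Type": "application/json",
    },
    content=json.dumps({"file": {"display_name": "example.txt"}}),
)
upload_url = initial.headers.get("X-Goog-Upload-URL")

# Step 2: upload the bytes and finalize
upload = httpx.post(
    upload_url,
    headers={
        "X-Goog-Upload-Command": "upload, finalize",
        "X-Goog-Upload-Offset": "0",
    },
    content=file_bytes,
)
print(upload.json())  # file metadata returned by Gemini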
@@ -1,84 +0,0 @@
-"""
-Handles the chat completion request for Databricks
-"""
-
-from typing import Callable, List, Optional, Union, cast
-
-from httpx._config import Timeout
-
-from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
-from litellm.types.llms.openai import AllMessageValues
-from litellm.types.utils import CustomStreamingDecoder
-from litellm.utils import ModelResponse
-
-from ...openai_like.chat.handler import OpenAILikeChatHandler
-from ..common_utils import DatabricksBase
-from .transformation import DatabricksConfig
-
-
-class DatabricksChatCompletion(OpenAILikeChatHandler, DatabricksBase):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def completion(
-        self,
-        *,
-        model: str,
-        messages: list,
-        api_base: str,
-        custom_llm_provider: str,
-        custom_prompt_dict: dict,
-        model_response: ModelResponse,
-        print_verbose: Callable,
-        encoding,
-        api_key: Optional[str],
-        logging_obj,
-        optional_params: dict,
-        acompletion=None,
-        litellm_params=None,
-        logger_fn=None,
-        headers: Optional[dict] = None,
-        timeout: Optional[Union[float, Timeout]] = None,
-        client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
-        custom_endpoint: Optional[bool] = None,
-        streaming_decoder: Optional[CustomStreamingDecoder] = None,
-        fake_stream: bool = False,
-    ):
-        messages = DatabricksConfig()._transform_messages(
-            messages=cast(List[AllMessageValues], messages), model=model
-        )
-        api_base, headers = self.databricks_validate_environment(
-            api_base=api_base,
-            api_key=api_key,
-            endpoint_type="chat_completions",
-            custom_endpoint=custom_endpoint,
-            headers=headers,
-        )
-
-        if optional_params.get("stream") is True:
-            fake_stream = DatabricksConfig()._should_fake_stream(optional_params)
-        else:
-            fake_stream = False
-
-        return super().completion(
-            model=model,
-            messages=messages,
-            api_base=api_base,
-            custom_llm_provider=custom_llm_provider,
-            custom_prompt_dict=custom_prompt_dict,
-            model_response=model_response,
-            print_verbose=print_verbose,
-            encoding=encoding,
-            api_key=api_key,
-            logging_obj=logging_obj,
-            optional_params=optional_params,
-            acompletion=acompletion,
-            litellm_params=litellm_params,
-            logger_fn=logger_fn,
-            headers=headers,
-            timeout=timeout,
-            client=client,
-            custom_endpoint=True,
-            streaming_decoder=streaming_decoder,
-            fake_stream=fake_stream,
-        )
@@ -2,21 +2,68 @@
 Translates from OpenAI's `/v1/chat/completions` to Databricks' `/chat/completions`
 """
 
-from typing import List, Optional, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncIterator,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
 
+import httpx
 from pydantic import BaseModel
 
+from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
+    _handle_invalid_parallel_tool_calls,
+    _should_convert_tool_call_to_json_mode,
+)
 from litellm.litellm_core_utils.prompt_templates.common_utils import (
     handle_messages_with_content_list_to_str_conversion,
     strip_name_from_messages,
 )
-from litellm.types.llms.openai import AllMessageValues
-from litellm.types.utils import ProviderField
+from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
+from litellm.types.llms.anthropic import AnthropicMessagesTool
+from litellm.types.llms.databricks import (
+    AllDatabricksContentValues,
+    DatabricksChoice,
+    DatabricksFunction,
+    DatabricksResponse,
+    DatabricksTool,
+)
+from litellm.types.llms.openai import (
+    AllMessageValues,
+    ChatCompletionThinkingBlock,
+    ChatCompletionToolChoiceFunctionParam,
+    ChatCompletionToolChoiceObjectParam,
+)
+from litellm.types.utils import (
+    ChatCompletionMessageToolCall,
+    Choices,
+    Message,
+    ModelResponse,
+    ModelResponseStream,
+    ProviderField,
+    Usage,
+)
 
+from ...anthropic.chat.transformation import AnthropicConfig
 from ...openai_like.chat.transformation import OpenAILikeChatConfig
+from ..common_utils import DatabricksBase, DatabricksException
+
+if TYPE_CHECKING:
+    from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
+
+    LiteLLMLoggingObj = _LiteLLMLoggingObj
+else:
+    LiteLLMLoggingObj = Any
 
 
-class DatabricksConfig(OpenAILikeChatConfig):
+class DatabricksConfig(DatabricksBase, OpenAILikeChatConfig, AnthropicConfig):
     """
     Reference: https://docs.databricks.com/en/machine-learning/foundation-models/api-reference.html#chat-request
     """
@@ -63,6 +110,39 @@ class DatabricksConfig(OpenAILikeChatConfig):
             ),
         ]
 
+    def validate_environment(
+        self,
+        headers: dict,
+        model: str,
+        messages: List[AllMessageValues],
+        optional_params: dict,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+    ) -> dict:
+        api_base, headers = self.databricks_validate_environment(
+            api_base=api_base,
+            api_key=api_key,
+            endpoint_type="chat_completions",
+            custom_endpoint=False,
+            headers=headers,
+        )
+        # Ensure Content-Type header is set
+        headers["Content-Type"] = "application/json"
+        return headers
+
+    def get_complete_url(
+        self,
+        api_base: Optional[str],
+        api_key: Optional[str],
+        model: str,
+        optional_params: dict,
+        litellm_params: dict,
+        stream: Optional[bool] = None,
+    ) -> str:
+        api_base = self._get_api_base(api_base)
+        complete_url = f"{api_base}/chat/completions"
+        return complete_url
+
     def get_supported_openai_params(self, model: Optional[str] = None) -> list:
         return [
             "stream",
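For illustration, the URL construction added above simply appends the chat-completions route to whatever base is resolved; the workspace host below is a made-up assumption.

api_base = "https://adb-1234567890123456.7.azuredatabricks.net/serving-endpoints"  # assumed example host
complete_url = f"{api_base}/chat/completions"
# -> https://adb-1234567890123456.7.azuredatabricks.net/serving-endpoints/chat/completions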
@@ -75,8 +155,98 @@ class DatabricksConfig(OpenAILikeChatConfig):
             "response_format",
             "tools",
             "tool_choice",
+            "reasoning_effort",
+            "thinking",
         ]
 
+    def convert_anthropic_tool_to_databricks_tool(
+        self, tool: Optional[AnthropicMessagesTool]
+    ) -> Optional[DatabricksTool]:
+        if tool is None:
+            return None
+
+        return DatabricksTool(
+            type="function",
+            function=DatabricksFunction(
+                name=tool["name"],
+                parameters=cast(dict, tool.get("input_schema") or {}),
+            ),
+        )
+
+    def map_response_format_to_databricks_tool(
+        self,
+        model: str,
+        value: Optional[dict],
+        optional_params: dict,
+        is_thinking_enabled: bool,
+    ) -> Optional[DatabricksTool]:
+        if value is None:
+            return None
+
+        tool = self.map_response_format_to_anthropic_tool(
+            value, optional_params, is_thinking_enabled
+        )
+
+        databricks_tool = self.convert_anthropic_tool_to_databricks_tool(tool)
+        return databricks_tool
+
+    def map_openai_params(
+        self,
+        non_default_params: dict,
+        optional_params: dict,
+        model: str,
+        drop_params: bool,
+        replace_max_completion_tokens_with_max_tokens: bool = True,
+    ) -> dict:
+        is_thinking_enabled = self.is_thinking_enabled(non_default_params)
+        mapped_params = super().map_openai_params(
+            non_default_params, optional_params, model, drop_params
+        )
+        if (
+            "max_completion_tokens" in non_default_params
+            and replace_max_completion_tokens_with_max_tokens
+        ):
+            mapped_params["max_tokens"] = non_default_params[
+                "max_completion_tokens"
+            ]  # most openai-compatible providers support 'max_tokens' not 'max_completion_tokens'
+            mapped_params.pop("max_completion_tokens", None)
+
+        if "response_format" in non_default_params and "claude" in model:
+            _tool = self.map_response_format_to_databricks_tool(
+                model,
+                non_default_params["response_format"],
+                mapped_params,
+                is_thinking_enabled,
+            )
+
+            if _tool is not None:
+                self._add_tools_to_optional_params(
+                    optional_params=optional_params, tools=[_tool]
+                )
+                optional_params["json_mode"] = True
+                if not is_thinking_enabled:
+                    _tool_choice = ChatCompletionToolChoiceObjectParam(
+                        type="function",
+                        function=ChatCompletionToolChoiceFunctionParam(
+                            name=RESPONSE_FORMAT_TOOL_NAME
+                        ),
+                    )
+                    optional_params["tool_choice"] = _tool_choice
+                optional_params.pop(
+                    "response_format", None
+                )  # unsupported for claude models - if json_schema -> convert to tool call
+
+        if "reasoning_effort" in non_default_params and "claude" in model:
+            optional_params["thinking"] = AnthropicConfig._map_reasoning_effort(
+                non_default_params.get("reasoning_effort")
+            )
+        ## handle thinking tokens
+        self.update_optional_params_with_thinking_tokens(
+            non_default_params=non_default_params, optional_params=mapped_params
+        )
+
+        return mapped_params
+
     def _should_fake_stream(self, optional_params: dict) -> bool:
         """
         Databricks doesn't support 'response_format' while streaming
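With this mapping, a response_format request against a Databricks-hosted Claude model is rewritten into a single function tool (named by RESPONSE_FORMAT_TOOL_NAME) and, when thinking is disabled, a forced tool_choice. A rough usage sketch follows; the model id and schema are illustrative assumptions, not values from this diff.

import litellm

response = litellm.completion(
    model="databricks/databricks-claude-3-7-sonnet",  # hypothetical model id
    messages=[{"role": "user", "content": "Return a user profile as JSON."}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "user_profile",
            "schema": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "age": {"type": "integer"},
                },
                "required": ["name", "age"],
            },
        },
    },
)
# The JSON produced via the forced tool call is surfaced as plain message content.
print(response.choices[0].message.content)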
@@ -104,3 +274,259 @@ class DatabricksConfig(OpenAILikeChatConfig):
         new_messages = handle_messages_with_content_list_to_str_conversion(new_messages)
         new_messages = strip_name_from_messages(new_messages)
         return super()._transform_messages(messages=new_messages, model=model)
+
+    @staticmethod
+    def extract_content_str(
+        content: Optional[AllDatabricksContentValues],
+    ) -> Optional[str]:
+        if content is None:
+            return None
+        if isinstance(content, str):
+            return content
+        elif isinstance(content, list):
+            content_str = ""
+            for item in content:
+                if item["type"] == "text":
+                    content_str += item["text"]
+            return content_str
+        else:
+            raise Exception(f"Unsupported content type: {type(content)}")
+
+    @staticmethod
+    def extract_reasoning_content(
+        content: Optional[AllDatabricksContentValues],
+    ) -> Tuple[Optional[str], Optional[List[ChatCompletionThinkingBlock]]]:
+        """
+        Extract and return the reasoning content and thinking blocks
+        """
+        if content is None:
+            return None, None
+        thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
+        reasoning_content: Optional[str] = None
+        if isinstance(content, list):
+            for item in content:
+                if item["type"] == "reasoning":
+                    for sum in item["summary"]:
+                        if reasoning_content is None:
+                            reasoning_content = ""
+                        reasoning_content += sum["text"]
+                        thinking_block = ChatCompletionThinkingBlock(
+                            type="thinking",
+                            thinking=sum["text"],
+                            signature=sum["signature"],
+                        )
+                        if thinking_blocks is None:
+                            thinking_blocks = []
+                        thinking_blocks.append(thinking_block)
+        return reasoning_content, thinking_blocks
+
+    def _transform_choices(
+        self, choices: List[DatabricksChoice], json_mode: Optional[bool] = None
+    ) -> List[Choices]:
+        transformed_choices = []
+
+        for choice in choices:
+            ## HANDLE JSON MODE - anthropic returns single function call]
+            tool_calls = choice["message"].get("tool_calls", None)
+            if tool_calls is not None:
+                _openai_tool_calls = []
+                for _tc in tool_calls:
+                    _openai_tc = ChatCompletionMessageToolCall(**_tc)  # type: ignore
+                    _openai_tool_calls.append(_openai_tc)
+                fixed_tool_calls = _handle_invalid_parallel_tool_calls(
+                    _openai_tool_calls
+                )
+
+                if fixed_tool_calls is not None:
+                    tool_calls = fixed_tool_calls
+
+            translated_message: Optional[Message] = None
+            finish_reason: Optional[str] = None
+            if tool_calls and _should_convert_tool_call_to_json_mode(
+                tool_calls=tool_calls,
+                convert_tool_call_to_json_mode=json_mode,
+            ):
+                # to support response_format on claude models
+                json_mode_content_str: Optional[str] = (
+                    str(tool_calls[0]["function"].get("arguments", "")) or None
+                )
+                if json_mode_content_str is not None:
+                    translated_message = Message(content=json_mode_content_str)
+                    finish_reason = "stop"
+
+            if translated_message is None:
+                ## get the content str
+                content_str = DatabricksConfig.extract_content_str(
+                    choice["message"]["content"]
+                )
+
+                ## get the reasoning content
+                (
+                    reasoning_content,
+                    thinking_blocks,
+                ) = DatabricksConfig.extract_reasoning_content(
+                    choice["message"].get("content")
+                )
+
+                translated_message = Message(
+                    role="assistant",
+                    content=content_str,
+                    reasoning_content=reasoning_content,
+                    thinking_blocks=thinking_blocks,
+                    tool_calls=choice["message"].get("tool_calls"),
+                )
+
+            if finish_reason is None:
+                finish_reason = choice["finish_reason"]
+
+            translated_choice = Choices(
+                finish_reason=finish_reason,
+                index=choice["index"],
+                message=translated_message,
+                logprobs=None,
+                enhancements=None,
+            )
+
+            transformed_choices.append(translated_choice)
+
+        return transformed_choices
+
+    def transform_response(
+        self,
+        model: str,
+        raw_response: httpx.Response,
+        model_response: ModelResponse,
+        logging_obj: LiteLLMLoggingObj,
+        request_data: dict,
+        messages: List[AllMessageValues],
+        optional_params: dict,
+        litellm_params: dict,
+        encoding: Any,
+        api_key: Optional[str] = None,
+        json_mode: Optional[bool] = None,
+    ) -> ModelResponse:
+        ## LOGGING
+        logging_obj.post_call(
+            input=messages,
+            api_key=api_key,
+            original_response=raw_response.text,
+            additional_args={"complete_input_dict": request_data},
+        )
+
+        ## RESPONSE OBJECT
+        try:
+            completion_response = DatabricksResponse(**raw_response.json())  # type: ignore
+        except Exception as e:
+            response_headers = getattr(raw_response, "headers", None)
+            raise DatabricksException(
+                message="Unable to get json response - {}, Original Response: {}".format(
+                    str(e), raw_response.text
+                ),
+                status_code=raw_response.status_code,
+                headers=response_headers,
+            )
+
+        model_response.model = completion_response["model"]
+        model_response.id = completion_response["id"]
+        model_response.created = completion_response["created"]
+        setattr(model_response, "usage", Usage(**completion_response["usage"]))
+
+        model_response.choices = self._transform_choices(  # type: ignore
+            choices=completion_response["choices"],
+            json_mode=json_mode,
+        )
+
+        return model_response
+
+    def get_model_response_iterator(
+        self,
+        streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],
+        sync_stream: bool,
+        json_mode: Optional[bool] = False,
+    ):
+        return DatabricksChatResponseIterator(
+            streaming_response=streaming_response,
+            sync_stream=sync_stream,
+            json_mode=json_mode,
+        )
+
+
+class DatabricksChatResponseIterator(BaseModelResponseIterator):
+    def __init__(
+        self,
+        streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],
+        sync_stream: bool,
+        json_mode: Optional[bool] = False,
+    ):
+        super().__init__(streaming_response, sync_stream)
+
+        self.json_mode = json_mode
+        self._last_function_name = None  # Track the last seen function name
+
+    def chunk_parser(self, chunk: dict) -> ModelResponseStream:
+        try:
+            translated_choices = []
+            for choice in chunk["choices"]:
+                tool_calls = choice["delta"].get("tool_calls")
+                if tool_calls and self.json_mode:
+                    # 1. Check if the function name is set and == RESPONSE_FORMAT_TOOL_NAME
+                    # 2. If no function name, just args -> check last function name (saved via state variable)
+                    # 3. Convert args to json
+                    # 4. Convert json to message
+                    # 5. Set content to message.content
+                    # 6. Set tool_calls to None
+                    from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+                    from litellm.llms.base_llm.base_utils import (
+                        _convert_tool_response_to_message,
+                    )
+
+                    # Check if this chunk has a function name
+                    function_name = tool_calls[0].get("function", {}).get("name")
+                    if function_name is not None:
+                        self._last_function_name = function_name
+
+                    # If we have a saved function name that matches RESPONSE_FORMAT_TOOL_NAME
+                    # or this chunk has the matching function name
+                    if (
+                        self._last_function_name == RESPONSE_FORMAT_TOOL_NAME
+                        or function_name == RESPONSE_FORMAT_TOOL_NAME
+                    ):
+                        # Convert tool calls to message format
+                        message = _convert_tool_response_to_message(tool_calls)
+                        if message is not None:
+                            if message.content == "{}":  # empty json
+                                message.content = ""
+                            choice["delta"]["content"] = message.content
+                            choice["delta"]["tool_calls"] = None
+
+                # extract the content str
+                content_str = DatabricksConfig.extract_content_str(
+                    choice["delta"].get("content")
+                )
+
+                # extract the reasoning content
+                (
+                    reasoning_content,
+                    thinking_blocks,
+                ) = DatabricksConfig.extract_reasoning_content(
+                    choice["delta"]["content"]
+                )
+
+                choice["delta"]["content"] = content_str
+                choice["delta"]["reasoning_content"] = reasoning_content
+                choice["delta"]["thinking_blocks"] = thinking_blocks
+                translated_choices.append(choice)
+            return ModelResponseStream(
+                id=chunk["id"],
+                object="chat.completion.chunk",
+                created=chunk["created"],
+                model=chunk["model"],
+                choices=translated_choices,
+            )
+        except KeyError as e:
+            raise DatabricksException(
+                message=f"KeyError: {e}, Got unexpected response from Databricks: {chunk}",
+                status_code=400,
+            )
+        except Exception as e:
+            raise e
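As a small sanity check of the two static helpers added above, the sketch below feeds them a Databricks-style content list; the import path and the sample items (a reasoning part with a summary/signature and a text part) are assumptions inferred from the relative imports and field names used in this hunk.

from litellm.llms.databricks.chat.transformation import DatabricksConfig  # assumed module path

content = [
    {
        "type": "reasoning",
        "summary": [{"text": "First, recall that 6 * 7 = 42. ", "signature": "sig-abc"}],
    },
    {"type": "text", "text": "The answer is 42."},
]

# Only "text" parts contribute to the visible content string.
print(DatabricksConfig.extract_content_str(content))  # "The answer is 42."

# "reasoning" parts are split into a reasoning string plus thinking blocks.
reasoning, thinking_blocks = DatabricksConfig.extract_reasoning_content(content)
print(reasoning)        # "First, recall that 6 * 7 = 42. "
print(thinking_blocks)  # one thinking block carrying the text and its signature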
@@ -1,9 +1,35 @@
 from typing import Literal, Optional, Tuple
 
-from .exceptions import DatabricksError
+from litellm.llms.base_llm.chat.transformation import BaseLLMException
+
+
+class DatabricksException(BaseLLMException):
+    pass
 
 
 class DatabricksBase:
+    def _get_api_base(self, api_base: Optional[str]) -> str:
+        if api_base is None:
+            try:
+                from databricks.sdk import WorkspaceClient
+
+                databricks_client = WorkspaceClient()
+
+                api_base = (
+                    api_base or f"{databricks_client.config.host}/serving-endpoints"
+                )
+
+                return api_base
+            except ImportError:
+                raise DatabricksException(
+                    status_code=400,
+                    message=(
+                        "Either set the DATABRICKS_API_BASE and DATABRICKS_API_KEY environment variables, "
+                        "or install the databricks-sdk Python library."
+                    ),
+                )
+        return api_base
+
     def _get_databricks_credentials(
         self, api_key: Optional[str], api_base: Optional[str], headers: Optional[dict]
     ) -> Tuple[str, dict]:
@@ -23,7 +49,7 @@ class DatabricksBase:
 
             return api_base, headers
         except ImportError:
-            raise DatabricksError(
+            raise DatabricksException(
                 status_code=400,
                 message=(
                     "If the Databricks base URL and API key are not set, the databricks-sdk "
@@ -41,9 +67,9 @@ class DatabricksBase:
         custom_endpoint: Optional[bool],
         headers: Optional[dict],
     ) -> Tuple[str, dict]:
-        if api_key is None and headers is None:
+        if api_key is None and not headers:  # handle empty headers
             if custom_endpoint is not None:
-                raise DatabricksError(
+                raise DatabricksException(
                     status_code=400,
                     message="Missing API Key - A call is being made to LLM Provider but no key is set either in the environment variables ({LLM_PROVIDER}_API_KEY) or via params",
                 )
@@ -54,7 +80,7 @@ class DatabricksBase:
 
         if api_base is None:
             if custom_endpoint:
-                raise DatabricksError(
+                raise DatabricksException(
                     status_code=400,
                     message="Missing API Base - A call is being made to LLM Provider but no api base is set either in the environment variables ({LLM_PROVIDER}_API_KEY) or via params",
                 )
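The _get_api_base fallback above leans on the databricks-sdk's own credential resolution. A minimal sketch of that resolution on its own is below; it assumes databricks-sdk is installed and a workspace host/token is available via environment variables or a profile.

from databricks.sdk import WorkspaceClient

w = WorkspaceClient()  # resolves DATABRICKS_HOST / DATABRICKS_TOKEN, ~/.databrickscfg, etc.
api_base = f"{w.config.host}/serving-endpoints"
print(api_base)  # e.g. https://<your-workspace-host>/serving-endpoints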
Some files were not shown because too many files have changed in this diff.