Merge branch 'main' into litellm_embedding_caching_updates
|
@ -40,7 +40,9 @@ jobs:
|
|||
pip install "httpx==0.24.1"
|
||||
pip install "gunicorn==21.2.0"
|
||||
pip install "anyio==3.7.1"
|
||||
pip install "aiodynamo==23.10.1"
|
||||
pip install "asyncio==3.4.3"
|
||||
pip install "apscheduler==3.10.4"
|
||||
pip install "PyGithub==1.59.1"
|
||||
- save_cache:
|
||||
paths:
|
||||
|
@ -96,6 +98,43 @@ jobs:
|
|||
command: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
|
||||
- run:
|
||||
name: Install Python 3.9
|
||||
command: |
|
||||
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
|
||||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
conda init bash
|
||||
source ~/.bashrc
|
||||
conda create -n myenv python=3.9 -y
|
||||
conda activate myenv
|
||||
python --version
|
||||
- run:
|
||||
name: Install Dependencies
|
||||
command: |
|
||||
pip install "pytest==7.3.1"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install aiohttp
|
||||
pip install openai
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install -r .circleci/requirements.txt
|
||||
pip install "pytest==7.3.1"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install mypy
|
||||
pip install "google-generativeai>=0.3.2"
|
||||
pip install "google-cloud-aiplatform>=1.38.0"
|
||||
pip install "boto3>=1.28.57"
|
||||
pip install langchain
|
||||
pip install "langfuse>=2.0.0"
|
||||
pip install numpydoc
|
||||
pip install prisma
|
||||
pip install "httpx==0.24.1"
|
||||
pip install "gunicorn==21.2.0"
|
||||
pip install "anyio==3.7.1"
|
||||
pip install "aiodynamo==23.10.1"
|
||||
pip install "asyncio==3.4.3"
|
||||
pip install "PyGithub==1.59.1"
|
||||
# Run pytest and generate JUnit XML report
|
||||
- run:
|
||||
name: Build Docker image
|
||||
command: docker build -t my-app:latest -f Dockerfile.database .
|
||||
|
@ -105,15 +144,20 @@ jobs:
|
|||
docker run -d \
|
||||
-p 4000:4000 \
|
||||
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
|
||||
-e AZURE_API_KEY=$AZURE_FRANCE_API_KEY \
|
||||
-e AZURE_API_KEY=$AZURE_API_KEY \
|
||||
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
|
||||
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
|
||||
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
|
||||
-e AWS_REGION_NAME=$AWS_REGION_NAME \
|
||||
--name my-app \
|
||||
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
|
||||
my-app:latest \
|
||||
--config /app/config.yaml \
|
||||
--port 4000 \
|
||||
--num_workers 8
|
||||
--num_workers 8 \
|
||||
--detailed_debug \
|
||||
--run_gunicorn \
|
||||
- run:
|
||||
name: Install curl and dockerize
|
||||
command: |
|
||||
|
@ -124,63 +168,22 @@ jobs:
|
|||
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
|
||||
- run:
|
||||
name: Start outputting logs
|
||||
command: |
|
||||
while true; do
|
||||
docker logs my-app
|
||||
sleep 10
|
||||
done
|
||||
command: docker logs -f my-app
|
||||
background: true
|
||||
- run:
|
||||
name: Wait for app to be ready
|
||||
command: dockerize -wait http://localhost:4000 -timeout 1m
|
||||
- run:
|
||||
name: Test the application
|
||||
name: Run tests
|
||||
command: |
|
||||
mkdir -p /tmp/responses
|
||||
for i in {1..10}; do
|
||||
status_file="/tmp/responses/status_${i}.txt"
|
||||
response_file="/tmp/responses/response_${i}.json"
|
||||
pwd
|
||||
ls
|
||||
python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5
|
||||
no_output_timeout: 120m
|
||||
|
||||
(curl --location --request POST 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}' \
|
||||
--silent --output "${response_file}" --write-out '%{http_code}' > "${status_file}") &
|
||||
|
||||
# Capture PIDs of background processes
|
||||
pids[${i}]=$!
|
||||
done
|
||||
|
||||
# Wait for all background processes to finish
|
||||
for pid in ${pids[*]}; do
|
||||
wait $pid
|
||||
done
|
||||
|
||||
# Check all responses and status codes
|
||||
fail=false
|
||||
for i in {1..10}; do
|
||||
status=$(cat "/tmp/responses/status_${i}.txt")
|
||||
|
||||
# Here, we need to set the correct response file path for each iteration
|
||||
response_file="/tmp/responses/response_${i}.json" # This was missing in the provided script
|
||||
|
||||
response=$(cat "${response_file}")
|
||||
echo "Response ${i} (Status code: ${status}):"
|
||||
echo "${response}" # Use echo here to print the contents
|
||||
echo # Additional newline for readability
|
||||
|
||||
if [ "$status" -ne 200 ]; then
|
||||
echo "A request did not return a 200 status code: $status"
|
||||
fail=true
|
||||
fi
|
||||
done
|
||||
|
||||
# If any request did not return status code 200, fail the job
|
||||
if [ "$fail" = true ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "All requests returned a 200 status code."
|
||||
# Store test results
|
||||
- store_test_results:
|
||||
path: test-results
|
||||
|
||||
publish_to_pypi:
|
||||
docker:
|
||||
|
|
33
.github/workflows/ghcr_deploy.yml
vendored
|
@ -41,6 +41,7 @@ jobs:
|
|||
push: true
|
||||
file: Dockerfile.database
|
||||
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
|
||||
|
||||
build-and-push-image:
|
||||
runs-on: ubuntu-latest
|
||||
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
|
||||
|
@ -74,7 +75,9 @@ jobs:
|
|||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
build-and-push-image-alpine:
|
||||
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
|
||||
|
||||
build-and-push-image-ui:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
|
@ -90,20 +93,21 @@ jobs:
|
|||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Extract metadata (tags, labels) for Alpine Dockerfile
|
||||
id: meta-alpine
|
||||
- name: Extract metadata (tags, labels) for UI Dockerfile
|
||||
id: meta-ui
|
||||
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-alpine
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ui
|
||||
|
||||
- name: Build and push Alpine Docker image
|
||||
- name: Build and push UI Docker image
|
||||
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
||||
with:
|
||||
context: .
|
||||
file: Dockerfile.alpine
|
||||
context: ui/
|
||||
file: ui/Dockerfile
|
||||
push: true
|
||||
tags: ${{ steps.meta-alpine.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-alpine.outputs.tags }}-latest
|
||||
labels: ${{ steps.meta-alpine.outputs.labels }}
|
||||
tags: ${{ steps.meta-ui.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-ui.outputs.tags }}-latest
|
||||
labels: ${{ steps.meta-ui.outputs.labels }}
|
||||
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
|
||||
build-and-push-image-database:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
|
@ -168,3 +172,14 @@ jobs:
|
|||
} catch (error) {
|
||||
core.setFailed(error.message);
|
||||
}
|
||||
- name: Github Releases To Discord
|
||||
uses: SethCohen/github-releases-to-discord@v1.13.1
|
||||
with:
|
||||
webhook_url: ${{ secrets.WEBHOOK_URL }}
|
||||
color: "2105893"
|
||||
username: "Release Changelog"
|
||||
avatar_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
|
||||
content: "||@everyone||"
|
||||
footer_title: "Changelog"
|
||||
footer_icon_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
|
||||
footer_timestamp: true
|
8
.gitignore
vendored
|
@ -32,3 +32,11 @@ proxy_server_config_@.yaml
|
|||
proxy_server_config_2.yaml
|
||||
litellm/proxy/secret_managers/credentials.json
|
||||
hosted_config.yaml
|
||||
litellm/proxy/tests/node_modules
|
||||
litellm/proxy/tests/package.json
|
||||
litellm/proxy/tests/package-lock.json
|
||||
ui/litellm-dashboard/.next
|
||||
ui/litellm-dashboard/node_modules
|
||||
ui/litellm-dashboard/next-env.d.ts
|
||||
ui/litellm-dashboard/package.json
|
||||
ui/litellm-dashboard/package-lock.json
|
|
@ -52,4 +52,4 @@ RUN chmod +x entrypoint.sh
|
|||
EXPOSE 4000/tcp
|
||||
|
||||
ENTRYPOINT ["litellm"]
|
||||
CMD ["--port", "4000"]
|
||||
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]
|
|
@ -56,4 +56,4 @@ EXPOSE 4000/tcp
|
|||
# # Set your entrypoint and command
|
||||
|
||||
ENTRYPOINT ["litellm"]
|
||||
CMD ["--port", "4000"]
|
||||
CMD ["--port", "4000", "--run_gunicorn"]
|
||||
|
|
2
cookbook/misc/dev_release.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
python3 -m build
|
||||
twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ -
|
34
cookbook/misc/openai_timeouts.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
import os
|
||||
from openai import OpenAI
|
||||
from dotenv import load_dotenv
|
||||
import httpx
|
||||
import concurrent.futures
|
||||
|
||||
load_dotenv()
|
||||
|
||||
client = OpenAI(
|
||||
# This is the default and can be omitted
|
||||
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||
)
|
||||
|
||||
|
||||
def create_chat_completion():
|
||||
return client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Say this is a test. Respond in 20 lines",
|
||||
}
|
||||
],
|
||||
model="gpt-3.5-turbo",
|
||||
)
|
||||
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
# Submit the call; the result is awaited below with a very small timeout (0.00001 seconds)
|
||||
future = executor.submit(create_chat_completion)
|
||||
try:
|
||||
chat_completion = future.result(timeout=0.00001)
|
||||
print(chat_completion)
|
||||
except concurrent.futures.TimeoutError:
|
||||
print("Operation timed out.")
|
61
cookbook/misc/sagmaker_streaming.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
# Notes - on how to do sagemaker streaming using boto3
|
||||
import json
|
||||
import boto3
|
||||
|
||||
import sys, os
|
||||
import traceback
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
import os, io
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import pytest
|
||||
import litellm
|
||||
|
||||
import io
|
||||
import json
|
||||
|
||||
|
||||
class TokenIterator:
|
||||
def __init__(self, stream):
|
||||
self.byte_iterator = iter(stream)
|
||||
self.buffer = io.BytesIO()
|
||||
self.read_pos = 0
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
while True:
|
||||
self.buffer.seek(self.read_pos)
|
||||
line = self.buffer.readline()
|
||||
if line and line[-1] == ord("\n"):
|
||||
self.read_pos += len(line) + 1
|
||||
full_line = line[:-1].decode("utf-8")
|
||||
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
|
||||
return line_data["token"]["text"]
|
||||
chunk = next(self.byte_iterator)
|
||||
self.buffer.seek(0, io.SEEK_END)
|
||||
self.buffer.write(chunk["PayloadPart"]["Bytes"])
|
||||
|
||||
|
||||
payload = {
|
||||
"inputs": "How do I build a website?",
|
||||
"parameters": {"max_new_tokens": 256},
|
||||
"stream": True,
|
||||
}
|
||||
|
||||
import boto3
|
||||
|
||||
client = boto3.client("sagemaker-runtime", region_name="us-west-2")
|
||||
response = client.invoke_endpoint_with_response_stream(
|
||||
EndpointName="berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
Body=json.dumps(payload),
|
||||
ContentType="application/json",
|
||||
)
|
||||
|
||||
# for token in TokenIterator(response["Body"]):
|
||||
# print(token)
|
BIN
dist/litellm-1.16.21.dev1-py3-none-any.whl
vendored
Normal file
BIN
dist/litellm-1.16.21.dev1.tar.gz
vendored
Normal file
BIN
dist/litellm-1.16.21.dev2-py3-none-any.whl
vendored
Normal file
BIN
dist/litellm-1.16.21.dev2.tar.gz
vendored
Normal file
BIN
dist/litellm-1.16.21.dev3-py3-none-any.whl
vendored
Normal file
BIN
dist/litellm-1.16.21.dev3.tar.gz
vendored
Normal file
|
@ -1,12 +0,0 @@
|
|||
version: "3.9"
|
||||
services:
|
||||
litellm:
|
||||
image: ghcr.io/berriai/litellm:main
|
||||
ports:
|
||||
- "8000:8000" # Map the container port to the host, change the host port if necessary
|
||||
volumes:
|
||||
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
|
||||
# You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches with the container port defined above in `ports` value
|
||||
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
|
||||
|
||||
# ...rest of your docker-compose config if any
|
15
docker-compose.yml
Normal file
|
@ -0,0 +1,15 @@
|
|||
version: "3.9"
|
||||
services:
|
||||
litellm:
|
||||
image: ghcr.io/berriai/litellm:main-latest
|
||||
volumes:
|
||||
- ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
|
||||
ports:
|
||||
- "4000:4000"
|
||||
environment:
|
||||
- AZURE_API_KEY=sk-123
|
||||
litellm-ui:
|
||||
image: ghcr.io/berriai/litellm-ui:main-latest
|
||||
|
||||
|
||||
|
|
@ -204,6 +204,7 @@ def __init__(
|
|||
s3_bucket_name: Optional[str] = None,
|
||||
s3_region_name: Optional[str] = None,
|
||||
s3_api_version: Optional[str] = None,
|
||||
s3_path: Optional[str] = None, # if you wish to save to a specific path
|
||||
s3_use_ssl: Optional[bool] = True,
|
||||
s3_verify: Optional[Union[bool, str]] = None,
|
||||
s3_endpoint_url: Optional[str] = None,
|
||||
|
|
|
@ -150,5 +150,12 @@ litellm.register_model(model_cost=
|
|||
"https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json")
|
||||
```
|
||||
|
||||
**Don't pull hosted model_cost_map**
|
||||
If you have firewalls, and want to just use the local copy of the model cost map, you can do so like this:
|
||||
```bash
|
||||
export LITELLM_LOCAL_MODEL_COST_MAP="True"
|
||||
```
|
||||
|
||||
Note: this means you will need to upgrade to get updated pricing, and newer models.
|
||||
|
||||
|
||||
|
|
|
@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l
|
|||
|
||||
- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
|
||||
|
||||
- `input`: *array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
|
||||
```
|
||||
- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
|
||||
```python
|
||||
input=["good morning from litellm"]
|
||||
```
|
||||
|
||||
|
@ -22,7 +22,11 @@ input=["good morning from litellm"]
|
|||
|
||||
- `user`: *string (optional)* A unique identifier representing your end-user,
|
||||
|
||||
- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
|
||||
- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
|
||||
|
||||
- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
|
||||
|
||||
- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
|
||||
|
||||
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
|
||||
|
||||
|
@ -66,11 +70,18 @@ input=["good morning from litellm"]
|
|||
from litellm import embedding
|
||||
import os
|
||||
os.environ['OPENAI_API_KEY'] = ""
|
||||
response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
|
||||
response = embedding(
|
||||
model="text-embedding-3-small",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
metadata={"anything": "good day"},
|
||||
dimensions=5 # Only supported in text-embedding-3 and later models.
|
||||
)
|
||||
```
|
||||
|
||||
| Model Name | Function Call | Required OS Variables |
|
||||
|----------------------|---------------------------------------------|--------------------------------------|
|
||||
| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||
|
||||
## Azure OpenAI Embedding Models
|
||||
|
|
|
@ -57,7 +57,7 @@ print(f"response: {response}")
|
|||
|
||||
- `api_type`: *string (optional)* - The type of API to use.
|
||||
|
||||
### Output from `litellm.embedding()`
|
||||
### Output from `litellm.image_generation()`
|
||||
|
||||
```json
|
||||
|
||||
|
@ -85,7 +85,7 @@ response = image_generation(model='dall-e-2', prompt="cute baby otter")
|
|||
| Model Name | Function Call | Required OS Variables |
|
||||
|----------------------|---------------------------------------------|--------------------------------------|
|
||||
| dall-e-2 | `image_generation(model='dall-e-2', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
|
||||
| dall-e-3 | `image_generation(model='dall-e-2', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
|
||||
| dall-e-3 | `image_generation(model='dall-e-3', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
|
||||
|
||||
## Azure OpenAI Image Generation Models
|
||||
|
||||
|
@ -130,4 +130,24 @@ response = image_generation(
|
|||
api_base="http://0.0.0.0:8000/" # set API Base of your Custom OpenAI Endpoint
|
||||
prompt="cute baby otter"
|
||||
)
|
||||
```
|
||||
|
||||
## Bedrock - Stable Diffusion
|
||||
Use this for stable diffusion on bedrock
|
||||
|
||||
|
||||
### Usage
|
||||
```python
|
||||
import os
|
||||
from litellm import image_generation
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
response = image_generation(
|
||||
prompt="A cute baby sea otter",
|
||||
model="bedrock/stability.stable-diffusion-xl-v0",
|
||||
)
|
||||
print(f"response: {response}")
|
||||
```
|
|
@ -28,6 +28,8 @@ import litellm
|
|||
import os
|
||||
|
||||
os.environ["LANGSMITH_API_KEY"] = ""
|
||||
os.environ["LANGSMITH_PROJECT"] = "" # defaults to litellm-completion
|
||||
os.environ["LANGSMITH_DEFAULT_RUN_NAME"] = "" # defaults to LLMRun
|
||||
# LLM API Keys
|
||||
os.environ['OPENAI_API_KEY']=""
|
||||
|
||||
|
|
|
@ -116,6 +116,57 @@ response = completion(
|
|||
|
||||
```
|
||||
|
||||
### Usage - with Azure Vision enhancements
|
||||
|
||||
Note: **Azure requires the `base_url` to be set with `/extensions`**
|
||||
|
||||
Example
|
||||
```python
|
||||
base_url=https://gpt-4-vision-resource.openai.azure.com/openai/deployments/gpt-4-vision/extensions
|
||||
# base_url="{azure_endpoint}/openai/deployments/{azure_deployment}/extensions"
|
||||
```
|
||||
|
||||
**Usage**
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["AZURE_API_KEY"] = "your-api-key"
|
||||
|
||||
# azure call
|
||||
response = completion(
|
||||
model="azure/gpt-4-vision",
|
||||
timeout=5,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Whats in this image?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://avatars.githubusercontent.com/u/29436595?v=4"
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
base_url="https://gpt-4-vision-resource.openai.azure.com/openai/deployments/gpt-4-vision/extensions",
|
||||
api_key=os.getenv("AZURE_VISION_API_KEY"),
|
||||
enhancements={"ocr": {"enabled": True}, "grounding": {"enabled": True}},
|
||||
dataSources=[
|
||||
{
|
||||
"type": "AzureComputerVision",
|
||||
"parameters": {
|
||||
"endpoint": "https://gpt-4-vision-enhancement.cognitiveservices.azure.com/",
|
||||
"key": os.environ["AZURE_VISION_ENHANCE_KEY"],
|
||||
},
|
||||
}
|
||||
],
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
## Advanced
|
||||
### Azure API Load-Balancing
|
||||
|
||||
|
|
|
@ -195,6 +195,81 @@ response = completion(
|
|||
)
|
||||
```
|
||||
|
||||
### SSO Login (AWS Profile)
|
||||
- Set `AWS_PROFILE` environment variable
|
||||
- Make bedrock completion call
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
response = completion(
|
||||
model="bedrock/anthropic.claude-instant-v1",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||
)
|
||||
```
|
||||
|
||||
### STS based Auth
|
||||
|
||||
- Set `aws_role_name` and `aws_session_name` in completion() / embedding() function
|
||||
|
||||
Make the bedrock completion call
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
response = completion(
|
||||
model="bedrock/anthropic.claude-instant-v1",
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
temperature=0.1,
|
||||
aws_role_name=aws_role_name,
|
||||
aws_session_name="my-test-session",
|
||||
)
|
||||
```
|
||||
|
||||
If you also need to dynamically set the aws user accessing the role, add the additional args in the completion()/embedding() function
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
response = completion(
|
||||
model="bedrock/anthropic.claude-instant-v1",
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
temperature=0.1,
|
||||
aws_region_name=aws_region_name,
|
||||
aws_access_key_id=aws_access_key_id,
|
||||
aws_secret_access_key=aws_secret_access_key,
|
||||
aws_role_name=aws_role_name,
|
||||
aws_session_name="my-test-session",
|
||||
)
|
||||
```
|
||||
|
||||
## Provisioned throughput models
|
||||
To use provisioned throughput Bedrock models pass
|
||||
- `model=bedrock/<base-model>`, example `model=bedrock/anthropic.claude-v2`. Set `model` to any of the [Supported AWS models](#supported-aws-bedrock-models)
|
||||
- `model_id=provisioned-model-arn`
|
||||
|
||||
Completion
|
||||
```python
|
||||
import litellm
|
||||
response = litellm.completion(
|
||||
model="bedrock/anthropic.claude-instant-v1",
|
||||
model_id="provisioned-model-arn",
|
||||
messages=[{"content": "Hello, how are you?", "role": "user"}]
|
||||
)
|
||||
```
|
||||
|
||||
Embedding
|
||||
```python
|
||||
import litellm
|
||||
response = litellm.embedding(
|
||||
model="bedrock/amazon.titan-embed-text-v1",
|
||||
model_id="provisioned-model-arn",
|
||||
input=["hi"],
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
## Supported AWS Bedrock Models
|
||||
Here's an example of using a bedrock model with LiteLLM
|
||||
|
||||
|
@ -240,3 +315,50 @@ print(response)
|
|||
| Titan Embeddings - G1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` |
|
||||
| Cohere Embeddings - English | `embedding(model="bedrock/cohere.embed-english-v3", input=input)` |
|
||||
| Cohere Embeddings - Multilingual | `embedding(model="bedrock/cohere.embed-multilingual-v3", input=input)` |
|
||||
|
||||
## Image Generation
|
||||
Use this for stable diffusion on bedrock
|
||||
|
||||
|
||||
### Usage
|
||||
```python
|
||||
import os
|
||||
from litellm import image_generation
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
response = image_generation(
|
||||
prompt="A cute baby sea otter",
|
||||
model="bedrock/stability.stable-diffusion-xl-v0",
|
||||
)
|
||||
print(f"response: {response}")
|
||||
```
|
||||
|
||||
**Set optional params**
|
||||
```python
|
||||
import os
|
||||
from litellm import image_generation
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
response = image_generation(
|
||||
prompt="A cute baby sea otter",
|
||||
model="bedrock/stability.stable-diffusion-xl-v0",
|
||||
### OPENAI-COMPATIBLE ###
|
||||
size="128x512", # width=128, height=512
|
||||
### PROVIDER-SPECIFIC ### see `AmazonStabilityConfig` in bedrock.py for all params
|
||||
seed=30
|
||||
)
|
||||
print(f"response: {response}")
|
||||
```
|
||||
|
||||
## Supported AWS Bedrock Image Generation Models
|
||||
|
||||
| Model Name | Function Call |
|
||||
|----------------------|---------------------------------------------|
|
||||
| Stable Diffusion - v0 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v0", prompt=prompt)` |
|
||||
| Stable Diffusion - v1 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v1", prompt=prompt)` |
|
|
@ -6,7 +6,7 @@
|
|||
# Gemini-Pro
|
||||
## Sample Usage
|
||||
```python
|
||||
import litellm
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['GEMINI_API_KEY'] = ""
|
||||
|
@ -24,7 +24,7 @@ LiteLLM Supports the following image types passed in `url`
|
|||
## Sample Usage
|
||||
```python
|
||||
import os
|
||||
import litellm
|
||||
import litellm
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load the environment variables from .env file
|
||||
|
|
|
@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
|
|||
|
||||
| Model Name | Function Call |
|
||||
|-----------------------|-----------------------------------------------------------------|
|
||||
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
|
||||
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
|
||||
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
|
||||
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |
|
||||
|
@ -173,6 +174,31 @@ response = completion(
|
|||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||
)
|
||||
```
|
||||
|
||||
### Set `ssl_verify=False`
|
||||
|
||||
This is done by setting your own `httpx.Client`
|
||||
|
||||
- For `litellm.completion` set `litellm.client_session=httpx.Client(verify=False)`
|
||||
- For `litellm.acompletion` set `litellm.aclient_session=httpx.AsyncClient(verify=False)`
|
||||
```python
|
||||
import litellm, httpx
|
||||
|
||||
# for completion
|
||||
litellm.client_session = httpx.Client(verify=False)
|
||||
response = litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
)
|
||||
|
||||
# for acompletion
|
||||
litellm.aclient_session = httpx.AsyncClient(verify=False)
|
||||
response = litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
)
|
||||
```
|
||||
|
||||
### Using Helicone Proxy with LiteLLM
|
||||
```python
|
||||
import os
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
## Sample Usage
|
||||
```python
|
||||
import litellm
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['PALM_API_KEY'] = ""
|
||||
|
@ -17,7 +17,7 @@ response = completion(
|
|||
|
||||
## Sample Usage - Streaming
|
||||
```python
|
||||
import litellm
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['PALM_API_KEY'] = ""
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# VertexAI - Google [Gemini]
|
||||
# VertexAI - Google [Gemini, Model Garden]
|
||||
|
||||
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_VertextAI_Example.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
|
@ -17,7 +17,28 @@ import litellm
|
|||
litellm.vertex_project = "hardy-device-38811" # Your Project ID
|
||||
litellm.vertex_location = "us-central1" # proj location
|
||||
|
||||
response = completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
|
||||
response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
|
||||
```
|
||||
|
||||
## OpenAI Proxy Usage
|
||||
|
||||
1. Modify the config.yaml
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
vertex_project: "hardy-device-38811" # Your Project ID
|
||||
vertex_location: "us-central1" # proj location
|
||||
|
||||
model_list:
|
||||
- model_name: team1-gemini-pro
|
||||
litellm_params:
|
||||
model: gemini-pro
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
## Set Vertex Project & Vertex Location
|
||||
|
@ -46,16 +67,39 @@ os.environ["VERTEXAI_LOCATION"] = "us-central1 # Your Location
|
|||
# set directly on module
|
||||
litellm.vertex_location = "us-central1" # Your Location
|
||||
```
|
||||
## Model Garden
|
||||
| Model Name | Function Call |
|
||||
|------------------|--------------------------------------|
|
||||
| llama2 | `completion('vertex_ai/<endpoint_id>', messages)` |
|
||||
|
||||
#### Using Model Garden
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
## set ENV variables
|
||||
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
|
||||
os.environ["VERTEXAI_LOCATION"] = "us-central1"
|
||||
|
||||
response = completion(
|
||||
model="vertex_ai/<your-endpoint-id>",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||
)
|
||||
```
|
||||
|
||||
## Gemini Pro
|
||||
| Model Name | Function Call |
|
||||
|------------------|--------------------------------------|
|
||||
| gemini-pro | `completion('gemini-pro', messages)` |
|
||||
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
|
||||
|
||||
## Gemini Pro Vision
|
||||
| Model Name | Function Call |
|
||||
|------------------|--------------------------------------|
|
||||
| gemini-pro-vision | `completion('gemini-pro-vision', messages)` |
|
||||
| gemini-pro-vision | `completion('gemini-pro-vision', messages)`, `completion('vertex_ai/gemini-pro-vision', messages)`|
|
||||
|
||||
|
||||
|
||||
|
||||
#### Using Gemini Pro Vision
|
||||
|
||||
|
@ -93,6 +137,7 @@ response = litellm.completion(
|
|||
print(response)
|
||||
```
|
||||
|
||||
|
||||
## Chat Models
|
||||
| Model Name | Function Call |
|
||||
|------------------|--------------------------------------|
|
||||
|
|
|
@ -11,7 +11,7 @@ pip install litellm vllm
|
|||
```python
|
||||
import litellm
|
||||
|
||||
response = completion(
|
||||
response = litellm.completion(
|
||||
model="vllm/facebook/opt-125m", # add a vllm prefix so litellm knows the custom_llm_provider==vllm
|
||||
messages=messages,
|
||||
temperature=0.2,
|
||||
|
@ -29,7 +29,7 @@ In order to use litellm to call a hosted vllm server add the following to your c
|
|||
```python
|
||||
import litellm
|
||||
|
||||
response = completion(
|
||||
response = litellm.completion(
|
||||
model="openai/facebook/opt-125m", # pass the vllm model name
|
||||
messages=messages,
|
||||
api_base="https://hosted-vllm-api.co",
|
||||
|
|
|
@ -1,6 +1,13 @@
|
|||
# Slack Alerting
|
||||
|
||||
Get alerts for failed db read/writes, hanging api calls, failed api calls.
|
||||
Get alerts for:
|
||||
- hanging LLM api calls
|
||||
- failed LLM api calls
|
||||
- slow LLM api calls
|
||||
- budget Tracking per key/user:
|
||||
- When a User/Key crosses their Budget
|
||||
- When a User/Key is 15% away from crossing their Budget
|
||||
- failed db read/writes
|
||||
|
||||
## Quick Start
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Modify Incoming Data
|
||||
# Modify / Reject Incoming Requests
|
||||
|
||||
Modify data just before making litellm completion calls on the proxy
|
||||
|
||||
|
|
|
@ -1,31 +1,46 @@
|
|||
# CLI Arguments
|
||||
Cli arguments, --host, --port, --num_workers
|
||||
|
||||
#### --host
|
||||
## --host
|
||||
- **Default:** `'0.0.0.0'`
|
||||
- The host for the server to listen on.
|
||||
- **Usage:**
|
||||
```shell
|
||||
litellm --host 127.0.0.1
|
||||
```
|
||||
- **Usage - set Environment Variable:** `HOST`
|
||||
```shell
|
||||
export HOST=127.0.0.1
|
||||
litellm
|
||||
```
|
||||
|
||||
#### --port
|
||||
## --port
|
||||
- **Default:** `8000`
|
||||
- The port to bind the server to.
|
||||
- **Usage:**
|
||||
```shell
|
||||
litellm --port 8080
|
||||
```
|
||||
- **Usage - set Environment Variable:** `PORT`
|
||||
```shell
|
||||
export PORT=8080
|
||||
litellm
|
||||
```
|
||||
|
||||
#### --num_workers
|
||||
## --num_workers
|
||||
- **Default:** `1`
|
||||
- The number of uvicorn workers to spin up.
|
||||
- **Usage:**
|
||||
```shell
|
||||
litellm --num_workers 4
|
||||
```
|
||||
- **Usage - set Environment Variable:** `NUM_WORKERS`
|
||||
```shell
|
||||
export NUM_WORKERS=4
|
||||
litellm
|
||||
```
|
||||
|
||||
#### --api_base
|
||||
## --api_base
|
||||
- **Default:** `None`
|
||||
- The API base for the model litellm should call.
|
||||
- **Usage:**
|
||||
|
@ -33,7 +48,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --model huggingface/tinyllama --api_base https://k58ory32yinf1ly0.us-east-1.aws.endpoints.huggingface.cloud
|
||||
```
|
||||
|
||||
#### --api_version
|
||||
## --api_version
|
||||
- **Default:** `None`
|
||||
- For Azure services, specify the API version.
|
||||
- **Usage:**
|
||||
|
@ -41,7 +56,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --model azure/gpt-deployment --api_version 2023-08-01 --api_base https://<your api base>
|
||||
```
|
||||
|
||||
#### --model or -m
|
||||
## --model or -m
|
||||
- **Default:** `None`
|
||||
- The model name to pass to Litellm.
|
||||
- **Usage:**
|
||||
|
@ -49,7 +64,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --model gpt-3.5-turbo
|
||||
```
|
||||
|
||||
#### --test
|
||||
## --test
|
||||
- **Type:** `bool` (Flag)
|
||||
- Proxy chat completions URL to make a test request.
|
||||
- **Usage:**
|
||||
|
@ -57,7 +72,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --test
|
||||
```
|
||||
|
||||
#### --health
|
||||
## --health
|
||||
- **Type:** `bool` (Flag)
|
||||
- Runs a health check on all models in config.yaml
|
||||
- **Usage:**
|
||||
|
@ -65,7 +80,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --health
|
||||
```
|
||||
|
||||
#### --alias
|
||||
## --alias
|
||||
- **Default:** `None`
|
||||
- An alias for the model, for user-friendly reference.
|
||||
- **Usage:**
|
||||
|
@ -73,7 +88,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --alias my-gpt-model
|
||||
```
|
||||
|
||||
#### --debug
|
||||
## --debug
|
||||
- **Default:** `False`
|
||||
- **Type:** `bool` (Flag)
|
||||
- Enable debugging mode for the input.
|
||||
|
@ -81,15 +96,25 @@ Cli arguments, --host, --port, --num_workers
|
|||
```shell
|
||||
litellm --debug
|
||||
```
|
||||
- **Usage - set Environment Variable:** `DEBUG`
|
||||
```shell
|
||||
export DEBUG=True
|
||||
litellm
|
||||
```
|
||||
|
||||
#### --detailed_debug
|
||||
## --detailed_debug
|
||||
- **Default:** `False`
|
||||
- **Type:** `bool` (Flag)
|
||||
- Enable debugging mode for the input.
|
||||
- **Usage:**
|
||||
```shell
|
||||
litellm --detailed_debug
|
||||
``
|
||||
```
|
||||
- **Usage - set Environment Variable:** `DETAILED_DEBUG`
|
||||
```shell
|
||||
export DETAILED_DEBUG=True
|
||||
litellm
|
||||
```
|
||||
|
||||
#### --temperature
|
||||
- **Default:** `None`
|
||||
|
@ -100,7 +125,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --temperature 0.7
|
||||
```
|
||||
|
||||
#### --max_tokens
|
||||
## --max_tokens
|
||||
- **Default:** `None`
|
||||
- **Type:** `int`
|
||||
- Set the maximum number of tokens for the model output.
|
||||
|
@ -109,7 +134,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --max_tokens 50
|
||||
```
|
||||
|
||||
#### --request_timeout
|
||||
## --request_timeout
|
||||
- **Default:** `600`
|
||||
- **Type:** `int`
|
||||
- Set the timeout in seconds for completion calls.
|
||||
|
@ -118,7 +143,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --request_timeout 300
|
||||
```
|
||||
|
||||
#### --drop_params
|
||||
## --drop_params
|
||||
- **Type:** `bool` (Flag)
|
||||
- Drop any unmapped params.
|
||||
- **Usage:**
|
||||
|
@ -126,7 +151,7 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --drop_params
|
||||
```
|
||||
|
||||
#### --add_function_to_prompt
|
||||
## --add_function_to_prompt
|
||||
- **Type:** `bool` (Flag)
|
||||
- If a function passed but unsupported, pass it as a part of the prompt.
|
||||
- **Usage:**
|
||||
|
@ -134,14 +159,14 @@ Cli arguments, --host, --port, --num_workers
|
|||
litellm --add_function_to_prompt
|
||||
```
|
||||
|
||||
#### --config
|
||||
## --config
|
||||
- Configure Litellm by providing a configuration file path.
|
||||
- **Usage:**
|
||||
```shell
|
||||
litellm --config path/to/config.yaml
|
||||
```
|
||||
|
||||
#### --telemetry
|
||||
## --telemetry
|
||||
- **Default:** `True`
|
||||
- **Type:** `bool`
|
||||
- Help track usage of this feature.
|
||||
|
|
|
@ -22,18 +22,22 @@ Set a model alias for your deployments.
|
|||
|
||||
In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.
|
||||
|
||||
In the config below requests with:
|
||||
In the config below:
|
||||
- `model_name`: the name to pass TO litellm from the external client
|
||||
- `litellm_params.model`: the model string passed to the litellm.completion() function
|
||||
|
||||
E.g.:
|
||||
- `model=vllm-models` will route to `openai/facebook/opt-125m`.
|
||||
- `model=gpt-3.5-turbo` will load balance between `azure/gpt-turbo-small-eu` and `azure/gpt-turbo-small-ca`
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-3.5-turbo # user-facing model alias
|
||||
- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ###
|
||||
litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
|
||||
model: azure/gpt-turbo-small-eu
|
||||
model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
|
||||
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
|
||||
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
|
||||
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
|
||||
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
|
||||
- model_name: bedrock-claude-v1
|
||||
litellm_params:
|
||||
model: bedrock/anthropic.claude-instant-v1
|
||||
|
@ -43,6 +47,11 @@ model_list:
|
|||
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
||||
api_key: "os.environ/AZURE_API_KEY_CA"
|
||||
rpm: 6
|
||||
- model_name: anthropic-claude
|
||||
litellm_params:
|
||||
model: bedrock/anthropic.claude-instant-v1
|
||||
### [OPTIONAL] SET AWS REGION ###
|
||||
aws_region_name: us-east-1
|
||||
- model_name: vllm-models
|
||||
litellm_params:
|
||||
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
|
||||
|
@ -58,6 +67,11 @@ litellm_settings: # module level litellm settings - https://github.com/BerriAI/l
|
|||
general_settings:
|
||||
master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
|
||||
```
|
||||
:::info
|
||||
|
||||
For more provider-specific info, [go here](../providers/)
|
||||
|
||||
:::
|
||||
|
||||
#### Step 2: Start Proxy with config
|
||||
|
||||
|
@ -188,7 +202,7 @@ print(response)
|
|||
</Tabs>
|
||||
|
||||
|
||||
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Headers etc.)
|
||||
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
|
||||
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
|
||||
|
||||
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
|
||||
|
@ -210,6 +224,12 @@ model_list:
|
|||
api_key: sk-123
|
||||
api_base: https://openai-gpt-4-test-v-2.openai.azure.com/
|
||||
temperature: 0.2
|
||||
- model_name: openai-gpt-3.5
|
||||
litellm_params:
|
||||
model: openai/gpt-3.5-turbo
|
||||
api_key: sk-123
|
||||
organization: org-ikDc4ex8NB
|
||||
temperature: 0.2
|
||||
- model_name: mistral-7b
|
||||
litellm_params:
|
||||
model: ollama/mistral
|
||||
|
@ -318,6 +338,26 @@ See supported Embedding Providers & Models [here](https://docs.litellm.ai/docs/e
|
|||
#### Create Config.yaml
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="bedrock" label="Bedrock Completion/Chat">
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: bedrock-cohere
|
||||
litellm_params:
|
||||
model: "bedrock/cohere.command-text-v14"
|
||||
aws_region_name: "us-west-2"
|
||||
- model_name: bedrock-cohere
|
||||
litellm_params:
|
||||
model: "bedrock/cohere.command-text-v14"
|
||||
aws_region_name: "us-east-2"
|
||||
- model_name: bedrock-cohere
|
||||
litellm_params:
|
||||
model: "bedrock/cohere.command-text-v14"
|
||||
aws_region_name: "us-east-1"
|
||||
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="sagemaker" label="Sagemaker, Bedrock Embeddings">
|
||||
|
||||
|
@ -430,20 +470,26 @@ model_list:
|
|||
</Tabs>
|
||||
|
||||
#### Start Proxy
|
||||
|
||||
```shell
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
#### Make Request
|
||||
Sends Request to `deployed-codebert-base`
|
||||
Sends Request to `bedrock-cohere`
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/embeddings' \
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data ' {
|
||||
"model": "deployed-codebert-base",
|
||||
"input": ["write a litellm poem"]
|
||||
}'
|
||||
"model": "bedrock-cohere",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "gm"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
|
@ -483,3 +529,55 @@ general_settings:
|
|||
max_parallel_requests: 100 # max parallel requests for a user = 100
|
||||
```
|
||||
|
||||
## All settings
|
||||
|
||||
```python
|
||||
{
|
||||
"environment_variables": {},
|
||||
"model_list": [
|
||||
{
|
||||
"model_name": "string",
|
||||
"litellm_params": {},
|
||||
"model_info": {
|
||||
"id": "string",
|
||||
"mode": "embedding",
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"max_tokens": 2048,
|
||||
"base_model": "gpt-4-1106-preview",
|
||||
"additionalProp1": {}
|
||||
}
|
||||
}
|
||||
],
|
||||
"litellm_settings": {}, # ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
|
||||
"general_settings": {
|
||||
"completion_model": "string",
|
||||
"key_management_system": "google_kms", # either google_kms or azure_kms
|
||||
"master_key": "string",
|
||||
"database_url": "string",
|
||||
"database_type": "dynamo_db",
|
||||
"database_args": {
|
||||
"billing_mode": "PROVISIONED_THROUGHPUT",
|
||||
"read_capacity_units": 0,
|
||||
"write_capacity_units": 0,
|
||||
"ssl_verify": true,
|
||||
"region_name": "string",
|
||||
"user_table_name": "LiteLLM_UserTable",
|
||||
"key_table_name": "LiteLLM_VerificationToken",
|
||||
"config_table_name": "LiteLLM_Config",
|
||||
"spend_table_name": "LiteLLM_SpendLogs"
|
||||
},
|
||||
"otel": true,
|
||||
"custom_auth": "string",
|
||||
"max_parallel_requests": 0,
|
||||
"infer_model_from_keys": true,
|
||||
"background_health_checks": true,
|
||||
"health_check_interval": 300,
|
||||
"alerting": [
|
||||
"string"
|
||||
],
|
||||
"alerting_threshold": 0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
|
115
docs/my-website/docs/proxy/custom_pricing.md
Normal file
|
@ -0,0 +1,115 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Custom Pricing - Sagemaker, etc.
|
||||
|
||||
Use this to register custom pricing for models.
|
||||
|
||||
There are 2 ways to track cost:
|
||||
- cost per token
|
||||
- cost per second
|
||||
|
||||
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md)
|
||||
|
||||
## Quick Start
|
||||
|
||||
Register custom pricing for sagemaker completion model.
|
||||
|
||||
For cost per second pricing, you **just** need to register `input_cost_per_second`.
|
||||
|
||||
```python
|
||||
# !pip install boto3
|
||||
from litellm import completion, completion_cost
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
|
||||
def test_completion_sagemaker():
|
||||
try:
|
||||
print("testing sagemaker")
|
||||
response = completion(
|
||||
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
input_cost_per_second=0.000420,
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
cost = completion_cost(completion_response=response)
|
||||
print(cost)
|
||||
except Exception as e:
|
||||
raise Exception(f"Error occurred: {e}")
|
||||
|
||||
```
|
||||
|
||||
### Usage with OpenAI Proxy Server
|
||||
|
||||
**Step 1: Add pricing to config.yaml**
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: sagemaker-completion-model
|
||||
litellm_params:
|
||||
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
|
||||
input_cost_per_second: 0.000420
|
||||
- model_name: sagemaker-embedding-model
|
||||
litellm_params:
|
||||
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
|
||||
input_cost_per_second: 0.000420
|
||||
```
|
||||
|
||||
**Step 2: Start proxy**
|
||||
|
||||
```bash
|
||||
litellm /path/to/config.yaml
|
||||
```
|
||||
|
||||
**Step 3: View Spend Logs**
|
||||
|
||||
<Image img={require('../../img/spend_logs_table.png')} />
|
||||
|
||||
## Cost Per Token (e.g. Azure)
|
||||
|
||||
|
||||
```python
|
||||
# !pip install boto3
|
||||
from litellm import completion, completion_cost
|
||||
|
||||
## set ENV variables
|
||||
os.environ["AZURE_API_KEY"] = ""
|
||||
os.environ["AZURE_API_BASE"] = ""
|
||||
os.environ["AZURE_API_VERSION"] = ""
|
||||
|
||||
|
||||
def test_completion_azure_model():
|
||||
try:
|
||||
print("testing azure custom pricing")
|
||||
# azure call
|
||||
response = completion(
|
||||
model = "azure/<your_deployment_name>",
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}],
|
||||
input_cost_per_token=0.005,
|
||||
output_cost_per_token=1,
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
cost = completion_cost(completion_response=response)
|
||||
print(cost)
|
||||
except Exception as e:
|
||||
raise Exception(f"Error occurred: {e}")
|
||||
|
||||
test_completion_azure_model()
|
||||
```
|
||||
|
||||
### Usage with OpenAI Proxy Server
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: azure-model
|
||||
litellm_params:
|
||||
model: azure/<your_deployment_name>
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_version: os.environ/AZURE_API_VERSION
|
||||
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
|
||||
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
|
||||
```
|
34
docs/my-website/docs/proxy/debugging.md
Normal file
|
@ -0,0 +1,34 @@
|
|||
# Debugging
|
||||
|
||||
Two levels of debugging are supported.
|
||||
|
||||
- debug (prints info logs)
|
||||
- detailed debug (prints debug logs)
|
||||
|
||||
## `debug`
|
||||
|
||||
**via cli**
|
||||
|
||||
```bash
|
||||
$ litellm --debug
|
||||
```
|
||||
|
||||
**via env**
|
||||
|
||||
```python
|
||||
os.environ["LITELLM_LOG"] = "INFO"
|
||||
```
|
||||
|
||||
## `detailed debug`
|
||||
|
||||
**via cli**
|
||||
|
||||
```bash
|
||||
$ litellm --detailed_debug
|
||||
```
|
||||
|
||||
**via env**
|
||||
|
||||
```python
|
||||
os.environ["LITELLM_LOG"] = "DEBUG"
|
||||
```
|
|
@ -5,8 +5,10 @@ Use this to health check all LLMs defined in your config.yaml
|
|||
|
||||
The proxy exposes:
|
||||
* a /health endpoint which returns the health of the LLM APIs
|
||||
* a /test endpoint which makes a ping to the litellm server
|
||||
* a /health/readiness endpoint for returning if the proxy is ready to accept requests
|
||||
* a /health/liveliness endpoint for returning if the proxy is alive
|
||||
|
||||
## `/health`
|
||||
#### Request
|
||||
Make a GET Request to `/health` on the proxy
|
||||
```shell
|
||||
|
@ -39,7 +41,7 @@ litellm --health
|
|||
}
|
||||
```
|
||||
|
||||
## Background Health Checks
|
||||
### Background Health Checks
|
||||
|
||||
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
|
||||
|
||||
|
@ -61,7 +63,7 @@ $ litellm /path/to/config.yaml
|
|||
curl --location 'http://0.0.0.0:8000/health'
|
||||
```
|
||||
|
||||
## Embedding Models
|
||||
### Embedding Models
|
||||
|
||||
We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check
|
||||
|
||||
|
@ -77,3 +79,69 @@ model_list:
|
|||
mode: embedding # 👈 ADD THIS
|
||||
```
|
||||
|
||||
### Text Completion Models
|
||||
|
||||
We need some way to know if the model is a text completion model when running checks. If you have this in your config, specifying the mode makes a text completion health check
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: azure-text-completion
|
||||
litellm_params:
|
||||
model: azure/text-davinci-003
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
model_info:
|
||||
mode: completion # 👈 ADD THIS
|
||||
```
|
||||
|
||||
## `/health/readiness`
|
||||
|
||||
Unprotected endpoint for checking if proxy is ready to accept requests
|
||||
|
||||
Example Request:
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/health/readiness'
|
||||
```
|
||||
|
||||
Example Response:
|
||||
|
||||
*If proxy connected to a database*
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "healthy",
|
||||
"db": "connected",
|
||||
"litellm_version":"1.19.2"
|
||||
}
|
||||
```
|
||||
|
||||
*If proxy not connected to a database*
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "healthy",
|
||||
"db": "Not connected",
|
||||
"litellm_version":"1.19.2"
|
||||
}
|
||||
```
|
||||
|
||||
## `/health/liveliness`
|
||||
|
||||
Unprotected endpoint for checking if proxy is alive
|
||||
|
||||
|
||||
Example Request:
|
||||
|
||||
```
|
||||
curl -X 'GET' \
|
||||
'http://0.0.0.0:8000/health/liveliness' \
|
||||
-H 'accept: application/json'
|
||||
```
|
||||
|
||||
Example Response:
|
||||
|
||||
```json
|
||||
"I'm alive!"
|
||||
```
|
|
@ -1,5 +1,4 @@
|
|||
|
||||
# Load Balancing - Multiple Instances of 1 model
|
||||
# Multiple Instances of 1 model
|
||||
Load balance multiple instances of the same model
|
||||
|
||||
The proxy will handle routing requests (using LiteLLM's Router). **Set `rpm` in the config if you want to maximize throughput**
|
||||
|
|
|
@ -435,6 +435,7 @@ print(response)
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## Logging Proxy Input/Output - s3 Buckets
|
||||
|
||||
We will use the `--config` to set
|
||||
|
@ -490,6 +491,34 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
|
|||
|
||||
Your logs should be available on the specified s3 Bucket
|
||||
|
||||
## Team-based Logging
|
||||
|
||||
Set success callbacks (e.g. langfuse), for a specific team-id.
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
default_team_settings:
|
||||
- team_id: my-secret-project
|
||||
success_callback: ["langfuse"]
|
||||
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2
|
||||
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_2
|
||||
- team_id: ishaans-secret-project
|
||||
success_callback: ["langfuse"]
|
||||
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_3
|
||||
langfuse_secret: os.environ/LANGFUSE_SECRET_3
|
||||
```
|
||||
|
||||
Now, when you [generate keys](./virtual_keys.md) for this team-id
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:8000/key/generate' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"team_id": "ishaans-secret-project"}'
|
||||
```
|
||||
|
||||
All requests made with these keys will log data to their team-specific logging.
|
||||
|
||||
## Logging Proxy Input/Output - DynamoDB
|
||||
|
||||
We will use the `--config` to set
|
||||
|
|
|
@ -40,115 +40,6 @@ litellm --test
|
|||
|
||||
This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.
|
||||
|
||||
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data ' {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
]
|
||||
}
|
||||
'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="openai" label="OpenAI v1.0.0+">
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:8000"
|
||||
)
|
||||
|
||||
# request sent to model set on litellm proxy, `litellm --model`
|
||||
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
])
|
||||
|
||||
print(response)
|
||||
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="langchain" label="Langchain">
|
||||
|
||||
```python
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
|
||||
chat = ChatOpenAI(
|
||||
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
|
||||
model = "gpt-3.5-turbo",
|
||||
temperature=0.1
|
||||
)
|
||||
|
||||
messages = [
|
||||
SystemMessage(
|
||||
content="You are a helpful assistant that im using to make a test request to."
|
||||
),
|
||||
HumanMessage(
|
||||
content="test from litellm. tell me why it's amazing in 1 sentence"
|
||||
),
|
||||
]
|
||||
response = chat(messages)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="langchain-embedding" label="Langchain Embeddings">
|
||||
|
||||
```python
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"SAGEMAKER EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"BEDROCK EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"TITAN EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
### Supported LLMs
|
||||
All LiteLLM supported LLMs are supported on the Proxy. See all [supported llms](https://docs.litellm.ai/docs/providers)
|
||||
<Tabs>
|
||||
|
@ -331,6 +222,113 @@ $ litellm --model command-nightly
|
|||
</Tabs>
|
||||
|
||||
|
||||
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data ' {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
]
|
||||
}
|
||||
'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="openai" label="OpenAI v1.0.0+">
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:8000"
|
||||
)
|
||||
|
||||
# request sent to model set on litellm proxy, `litellm --model`
|
||||
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
])
|
||||
|
||||
print(response)
|
||||
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="langchain" label="Langchain">
|
||||
|
||||
```python
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
|
||||
chat = ChatOpenAI(
|
||||
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
|
||||
model = "gpt-3.5-turbo",
|
||||
temperature=0.1
|
||||
)
|
||||
|
||||
messages = [
|
||||
SystemMessage(
|
||||
content="You are a helpful assistant that im using to make a test request to."
|
||||
),
|
||||
HumanMessage(
|
||||
content="test from litellm. tell me why it's amazing in 1 sentence"
|
||||
),
|
||||
]
|
||||
response = chat(messages)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="langchain-embedding" label="Langchain Embeddings">
|
||||
|
||||
```python
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"SAGEMAKER EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"BEDROCK EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"TITAN EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## Quick Start - LiteLLM Proxy + Config.yaml
|
||||
|
|
|
@ -8,9 +8,12 @@ Use this to fail a request based on the output of an llm api call.
|
|||
|
||||
```python
|
||||
def my_custom_rule(input): # receives the model response
|
||||
if len(input) < 5: # trigger fallback if the model response is too short
|
||||
return False
|
||||
return True
|
||||
if len(input) < 5:
|
||||
return {
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. Response too short"
|
||||
}
|
||||
return {"decision": True} # message not required since, request will pass
|
||||
```
|
||||
|
||||
### Step 2. Point it to your proxy
|
||||
|
@ -18,7 +21,6 @@ def my_custom_rule(input): # receives the model response
|
|||
```python
|
||||
litellm_settings:
|
||||
post_call_rules: post_call_rules.my_custom_rule
|
||||
num_retries: 3
|
||||
```
|
||||
|
||||
### Step 3. Start + test your proxy
|
||||
|
@ -32,7 +34,7 @@ curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
|
|||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--data '{
|
||||
"model": "deepseek-coder",
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [{"role":"user","content":"What llm are you?"}],
|
||||
"temperature": 0.7,
|
||||
"max_tokens": 10,
|
||||
|
@ -40,4 +42,20 @@ curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
|
|||
```
|
||||
---
|
||||
|
||||
This will now check if a response is > len 5, and if it fails, it'll retry a call 3 times before failing.
|
||||
This will now check if a response is > len 5, and if it fails, it'll retry a call 3 times before failing.
|
||||
|
||||
### Response that fails the rule
|
||||
|
||||
This is the response from LiteLLM Proxy on failing a rule
|
||||
|
||||
```json
|
||||
{
|
||||
"error":
|
||||
{
|
||||
"message":"This violates LiteLLM Proxy Rules. Response too short",
|
||||
"type":null,
|
||||
"param":null,
|
||||
"code":500
|
||||
}
|
||||
}
|
||||
```
|
|
@ -1,8 +1,8 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# [BETA] Self-serve UI
|
||||
|
||||
Allow your users to create their own keys through a UI
|
||||
# [BETA] Admin UI
|
||||
|
||||
:::info
|
||||
|
||||
|
@ -10,40 +10,89 @@ This is in beta, so things may change. If you have feedback, [let us know](https
|
|||
|
||||
:::
|
||||
|
||||
Allow your users to create and view their own keys through a UI
|
||||
|
||||
<Image img={require('../../img/admin_ui_2.png')} />
|
||||
|
||||
|
||||
|
||||
## Quick Start
|
||||
|
||||
Requirements:
|
||||
## 1. Setup SSO/Auth for UI
|
||||
|
||||
- Need to a SMTP server connection to send emails (e.g. [Resend](https://resend.com/docs/send-with-smtp))
|
||||
<Tabs>
|
||||
|
||||
[**See code**](https://github.com/BerriAI/litellm/blob/61cd800b9ffbb02c286481d2056b65c7fb5447bf/litellm/proxy/proxy_server.py#L1782)
|
||||
<TabItem value="username" label="Quick Start - Username, Password">
|
||||
|
||||
### Step 1. Save SMTP server credentials
|
||||
Set the following in your .env on the Proxy
|
||||
|
||||
```env
|
||||
export SMTP_HOST="my-smtp-host"
|
||||
export SMTP_USERNAME="my-smtp-password"
|
||||
export SMTP_PASSWORD="my-smtp-password"
|
||||
export SMTP_SENDER_EMAIL="krrish@berri.ai"
|
||||
```shell
|
||||
UI_USERNAME=ishaan-litellm
|
||||
UI_PASSWORD=langchain
|
||||
```
|
||||
|
||||
### Step 2. Enable user auth
|
||||
On accessing the LiteLLM UI, you will be prompted to enter your username, password
|
||||
|
||||
In your config.yaml,
|
||||
</TabItem>
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
# other changes
|
||||
allow_user_auth: true
|
||||
<TabItem value="google" label="Google SSO">
|
||||
|
||||
- Create a new Oauth 2.0 Client on https://console.cloud.google.com/
|
||||
|
||||
**Required .env variables on your Proxy**
|
||||
```shell
|
||||
# for Google SSO Login
|
||||
GOOGLE_CLIENT_ID=
|
||||
GOOGLE_CLIENT_SECRET=
|
||||
```
|
||||
|
||||
This will enable:
|
||||
* Users to create keys via `/key/generate` (by default, only admin can create keys)
|
||||
* The `/user/auth` endpoint to send user's emails with their login credentials (key + user id)
|
||||
- Set Redirect URL on your Oauth 2.0 Client on https://console.cloud.google.com/
|
||||
- Set a redirect url = `<your proxy base url>/sso/callback`
|
||||
```shell
|
||||
https://litellm-production-7002.up.railway.app/sso/callback
|
||||
```
|
||||
|
||||
### Step 3. Connect to UI
|
||||
</TabItem>
|
||||
|
||||
You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
|
||||
<TabItem value="msft" label="Microsoft SSO">
|
||||
|
||||
- Create a new App Registration on https://portal.azure.com/
|
||||
- Create a client Secret for your App Registration
|
||||
|
||||
**Required .env variables on your Proxy**
|
||||
```shell
|
||||
MICROSOFT_CLIENT_ID="84583a4d-"
|
||||
MICROSOFT_CLIENT_SECRET="nbk8Q~"
|
||||
MICROSOFT_TENANT="5a39737"
|
||||
```
|
||||
- Set Redirect URI on your App Registration on https://portal.azure.com/
|
||||
- Set a redirect url = `<your proxy base url>/sso/callback`
|
||||
```shell
|
||||
http://localhost:4000/sso/callback
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
||||
## 2. Start Proxy Server
|
||||
|
||||
```shell
|
||||
litellm --config proxy_config.yaml --port 4000
|
||||
|
||||
# start proxy on port 4000
|
||||
```
|
||||
|
||||
## 3. Get Admin UI Link on Swagger
|
||||
|
||||
Your Proxy Swagger is available on the root of the Proxy: `http://localhost:4000/`
|
||||
|
||||
<Image img={require('../../img/ui_link.png')} />
|
||||
|
||||
|
||||
|
||||
|
||||
<!-- You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
|
||||
|
||||
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
|
||||
|
||||
|
@ -62,4 +111,13 @@ Connect your proxy to your UI, by entering:
|
|||
|
||||
### Create Keys
|
||||
|
||||
<Image img={require('../../img/user_create_key_screen.png')} />
|
||||
<Image img={require('../../img/user_create_key_screen.png')} />
|
||||
|
||||
### Spend Per Key
|
||||
|
||||
<Image img={require('../../img/spend_per_api_key.png')} />
|
||||
|
||||
### Spend Per User
|
||||
|
||||
<Image img={require('../../img/spend_per_user.png')} /> -->
|
||||
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
# 💰 Budgets, Rate Limits per user
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# 💰 Budgets, Rate Limits
|
||||
|
||||
Requirements:
|
||||
|
||||
|
@ -6,17 +9,74 @@ Requirements:
|
|||
|
||||
|
||||
## Set Budgets
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
||||
|
||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
||||
You can set budgets at 3 levels:
|
||||
- For the proxy
|
||||
- For a user
|
||||
- For a key
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="proxy" label="For Proxy">
|
||||
|
||||
Apply a budget across all calls on the proxy
|
||||
|
||||
**Step 1. Modify config.yaml**
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
|
||||
litellm_settings:
|
||||
# other litellm settings
|
||||
max_budget: 0 # (float) sets max budget as $0 USD
|
||||
budget_duration: 30d # (str) frequency of reset - You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
```
|
||||
|
||||
**Step 2. Start proxy**
|
||||
|
||||
```bash
|
||||
litellm /path/to/config.yaml
|
||||
```
|
||||
|
||||
**Step 3. Send test call**
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="per-user" label="For User">
|
||||
|
||||
Apply a budget across multiple keys.
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for this.
|
||||
|
||||
You can:
|
||||
- Add budgets to users [**Jump**](#add-budgets-to-users)
|
||||
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-users)
|
||||
|
||||
By default the `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
### **Add budgets to users**
|
||||
```shell
|
||||
curl --location 'http://localhost:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||
|
||||
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
|
||||
|
||||
**Sample Response**
|
||||
|
||||
|
@ -29,14 +89,201 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
|
|||
}
|
||||
```
|
||||
|
||||
### **Add budget duration to users**
|
||||
|
||||
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
|
||||
```
|
||||
curl 'http://0.0.0.0:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"team_id": "core-infra", # [OPTIONAL]
|
||||
"max_budget": 10,
|
||||
"budget_duration": "10s"
|
||||
}'
|
||||
```
|
||||
|
||||
### Create new keys for existing user
|
||||
|
||||
Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai) and:
|
||||
- **Budget Check**: krrish3@berri.ai's budget (i.e. $10) will be checked for this key
|
||||
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="per-key" label="For Key">
|
||||
|
||||
Apply a budget on a key.
|
||||
|
||||
You can:
|
||||
- Add budgets to keys [**Jump**](#add-budgets-to-keys)
|
||||
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-keys)
|
||||
|
||||
**Expected Behaviour**
|
||||
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
|
||||
- After the key crosses its `max_budget`, requests fail
|
||||
- If duration set, spend is reset at the end of the duration
|
||||
|
||||
By default the `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
### **Add budgets to keys**
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"team_id": "core-infra", # [OPTIONAL]
|
||||
"max_budget": 10
|
||||
}'
|
||||
```
|
||||
|
||||
Example Request to `/chat/completions` when key has crossed budget
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer <generated-key>' \
|
||||
--data ' {
|
||||
"model": "azure-gpt-3.5",
|
||||
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "respond in 50 lines"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
Expected Response from `/chat/completions` when key has crossed budget
|
||||
```shell
|
||||
{
|
||||
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
|
||||
}
|
||||
```
|
||||
|
||||
### **Add budget duration to keys**
|
||||
|
||||
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
|
||||
```
|
||||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"team_id": "core-infra", # [OPTIONAL]
|
||||
"max_budget": 10,
|
||||
"budget_duration": "10s"
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Set Rate Limits
|
||||
|
||||
Set max parallel requests a user can make, when you create user keys - `/key/generate`.
|
||||
You can set:
|
||||
- max parallel requests
|
||||
- tpm limits
|
||||
- rpm limits
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="per-user" label="Per User">
|
||||
|
||||
Use `/user/new`, to persist rate limits across multiple keys.
|
||||
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/user/new' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
||||
```
|
||||
|
||||
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```json
|
||||
{
|
||||
"key": "sk-sA7VDkyhlQ7m8Gt77Mbt3Q",
|
||||
"expires": "2024-01-19T01:21:12.816168",
|
||||
"user_id": "krrish@berri.ai"
|
||||
}
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="per-key" label="Per Key">
|
||||
|
||||
Use `/key/generate`, if you want them for just that key.
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"duration": "20m", "max_parallel_requests": 1}' # 👈 max parallel requests = 1
|
||||
--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
||||
```
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```json
|
||||
{
|
||||
"key": "sk-ulGNRXWtv7M0lFnnsQk0wQ",
|
||||
"expires": "2024-01-18T20:48:44.297973",
|
||||
"user_id": "78c2c8fc-c233-43b9-b0c3-eb931da27b84" // 👈 auto-generated
|
||||
}
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Grant Access to new model
|
||||
|
||||
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.).
|
||||
|
||||
Difference between doing this with `/key/generate` vs. `/user/new`? If you do it on `/user/new` it'll persist across multiple keys generated for that user.
|
||||
|
||||
**Step 1. Assign model, access group in config.yaml**
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: text-embedding-ada-002
|
||||
litellm_params:
|
||||
model: azure/azure-embedding-model
|
||||
api_base: "os.environ/AZURE_API_BASE"
|
||||
api_key: "os.environ/AZURE_API_KEY"
|
||||
api_version: "2023-07-01-preview"
|
||||
model_info:
|
||||
access_groups: ["beta-models"] # 👈 Model Access Group
|
||||
```
|
||||
|
||||
**Step 2. Create key with access group**
|
||||
|
||||
```bash
|
||||
curl --location 'http://localhost:8000/user/new' \
|
||||
-H 'Authorization: Bearer <your-master-key>' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"models": ["beta-models"], # 👈 Model Access Group
|
||||
"max_budget": 0}'
|
||||
```
|
||||
|
||||
|
||||
## Create new keys for existing user
|
||||
|
||||
Just include user_id in the `/key/generate` request.
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "user_id": "krrish@berri.ai"}'
|
||||
```
|
|
@ -1,4 +1,4 @@
|
|||
# Key Management
|
||||
# Virtual Keys, Users
|
||||
Track Spend, Set budgets and create virtual keys for the proxy
|
||||
|
||||
Grant others temporary access to your proxy, with keys that expire after a set duration.
|
||||
|
@ -12,7 +12,7 @@ Grant other's temporary access to your proxy, with keys that expire after a set
|
|||
|
||||
:::
|
||||
|
||||
## Quick Start
|
||||
## Setup
|
||||
|
||||
Requirements:
|
||||
|
||||
|
@ -56,38 +56,55 @@ litellm --config /path/to/config.yaml
|
|||
|
||||
```shell
|
||||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}'
|
||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
|
||||
```
|
||||
|
||||
- `models`: *list or null (optional)* - Specify the models a token has access to. If null, then token has access to all models on server.
|
||||
|
||||
- `duration`: *str or null (optional)* Specify the length of time the token is valid for. If null, default is set to 1 hour. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
## /key/generate
|
||||
|
||||
- `metadata`: *dict or null (optional)* Pass metadata for the created token. If null defaults to {}
|
||||
### Request
|
||||
```shell
|
||||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||
"duration": "20m",
|
||||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra",
|
||||
"max_budget": 10
|
||||
}'
|
||||
```
|
||||
|
||||
Expected response:
|
||||
|
||||
Request Params:
|
||||
|
||||
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
- `key_alias`: *Optional[str]* - User defined key alias
|
||||
- `team_id`: *Optional[str]* - The team id of the user
|
||||
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
|
||||
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
|
||||
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
|
||||
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
|
||||
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
|
||||
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
|
||||
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
|
||||
|
||||
|
||||
### Response
|
||||
|
||||
```python
|
||||
{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
|
||||
"expires": "2023-11-19T01:38:25.838000+00:00", # datetime object
|
||||
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
## Keys that don't expire
|
||||
|
||||
Just set duration to None.
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
|
||||
```
|
||||
|
||||
## Upgrade/Downgrade Models
|
||||
### Upgrade/Downgrade Models
|
||||
|
||||
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
|
||||
|
||||
|
@ -124,7 +141,7 @@ model_list:
|
|||
|
||||
```bash
|
||||
curl -X POST "https://0.0.0.0:8000/key/generate" \
|
||||
-H "Authorization: Bearer sk-1234" \
|
||||
-H "Authorization: Bearer <your-master-key>" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"models": ["my-free-tier"],
|
||||
|
@ -136,6 +153,291 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \
|
|||
- **How to upgrade / downgrade request?** Change the alias mapping
|
||||
- **How are routing between diff keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
|
||||
|
||||
|
||||
### Grant Access to new model
|
||||
|
||||
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.)
|
||||
|
||||
**Step 1. Assign model, access group in config.yaml**
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: text-embedding-ada-002
|
||||
litellm_params:
|
||||
model: azure/azure-embedding-model
|
||||
api_base: "os.environ/AZURE_API_BASE"
|
||||
api_key: "os.environ/AZURE_API_KEY"
|
||||
api_version: "2023-07-01-preview"
|
||||
model_info:
|
||||
access_groups: ["beta-models"] # 👈 Model Access Group
|
||||
```
|
||||
|
||||
**Step 2. Create key with access group**
|
||||
|
||||
```bash
|
||||
curl --location 'http://localhost:8000/key/generate' \
|
||||
-H 'Authorization: Bearer <your-master-key>' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"models": ["beta-models"], # 👈 Model Access Group
|
||||
"max_budget": 0}'
|
||||
```
|
||||
|
||||
|
||||
## /key/info
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
|
||||
-H "Authorization: Bearer sk-1234"
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- key: str - The key you want the info for
|
||||
|
||||
### Response
|
||||
|
||||
`token` is the hashed key (The DB stores the hashed key for security)
|
||||
```json
|
||||
{
|
||||
"key": "sk-02Wr4IAlN3NvPXvL5JVvDA",
|
||||
"info": {
|
||||
"token": "80321a12d03412c527f2bd9db5fabd746abead2e1d50b435a534432fbaca9ef5",
|
||||
"spend": 0.0,
|
||||
"expires": "2024-01-18T23:52:09.125000+00:00",
|
||||
"models": ["azure-gpt-3.5", "azure-embedding-model"],
|
||||
"aliases": {},
|
||||
"config": {},
|
||||
"user_id": "ishaan2@berri.ai",
|
||||
"team_id": "None",
|
||||
"max_parallel_requests": null,
|
||||
"metadata": {}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
```
|
||||
|
||||
## /key/update
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl 'http://0.0.0.0:8000/key/update' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
|
||||
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra"
|
||||
}'
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- key: str - The key that needs to be updated.
|
||||
|
||||
- models: list or null (optional) - Specify the models a token has access to. If null, then the token has access to all models on the server.
|
||||
|
||||
- metadata: dict or null (optional) - Pass metadata for the updated token. If null, defaults to an empty dictionary.
|
||||
|
||||
- team_id: str or null (optional) - Specify the team_id for the associated key.
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
|
||||
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||
"metadata": {
|
||||
"user": "ishaan@berri.ai"
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
## /key/delete
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl 'http://0.0.0.0:8000/key/delete' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||
}'
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- keys: List[str] - List of keys to delete
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||
}
|
||||
```
|
||||
|
||||
## /user/new
|
||||
|
||||
### Request
|
||||
|
||||
All [key/generate params supported](#keygenerate) for creating a user
|
||||
```shell
|
||||
curl 'http://0.0.0.0:4000/user/new' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"user_id": "ishaan1",
|
||||
"user_email": "ishaan@litellm.ai",
|
||||
"user_role": "admin",
|
||||
"team_id": "cto-team",
|
||||
"max_budget": 20,
|
||||
"budget_duration": "1h"
|
||||
|
||||
}'
|
||||
```
|
||||
|
||||
Request Params:
|
||||
|
||||
- user_id: str (optional - defaults to uuid) - The unique identifier for the user.
|
||||
- user_email: str (optional - defaults to "") - The email address associated with the user.
|
||||
- user_role: str (optional - defaults to "app_user") - The role assigned to the user. Can be "admin", "app_owner", "app_user"
|
||||
|
||||
**Possible `user_role` values**
|
||||
```
|
||||
"admin" - Maintaining the proxy and owning the overall budget
|
||||
"app_owner" - employees maintaining the apps, each owner may own more than one app
|
||||
"app_user" - users who know nothing about the proxy. These users get created when you pass `user` to /chat/completions
|
||||
```
|
||||
- team_id: str (optional - defaults to "") - The identifier for the team to which the user belongs.
|
||||
- max_budget: float (optional - defaults to `null`) - The maximum budget allocated for the user. No budget checks done if `max_budget==null`
|
||||
- budget_duration: str (optional - defaults to `null`) - The duration for which the budget is valid, e.g., "1h", "1d"
|
||||
|
||||
### Response
|
||||
A key will be generated for the new user created
|
||||
|
||||
```shell
|
||||
{
|
||||
"models": [],
|
||||
"spend": 0.0,
|
||||
"max_budget": null,
|
||||
"user_id": "ishaan1",
|
||||
"team_id": null,
|
||||
"max_parallel_requests": null,
|
||||
"metadata": {},
|
||||
"tpm_limit": null,
|
||||
"rpm_limit": null,
|
||||
"budget_duration": null,
|
||||
"allowed_cache_controls": [],
|
||||
"key_alias": null,
|
||||
"duration": null,
|
||||
"aliases": {},
|
||||
"config": {},
|
||||
"key": "sk-JflB33ucTqc2NYvNAgiBCA",
|
||||
"key_name": null,
|
||||
"expires": null
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- keys: List[str] - List of keys to delete
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||
}
|
||||
```
|
||||
|
||||
## Default /key/generate params
|
||||
Use this, if you need to control the default `max_budget` or any `key/generate` param per key.
|
||||
|
||||
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
|
||||
|
||||
Set `litellm_settings:default_key_generate_params`:
|
||||
```yaml
|
||||
litellm_settings:
|
||||
default_key_generate_params:
|
||||
max_budget: 1.5000
|
||||
models: ["azure-gpt-3.5"]
|
||||
duration: # blank means `null`
|
||||
metadata: {"setting":"default"}
|
||||
team_id: "core-infra"
|
||||
```
|
||||
## Set Budgets - Per Key
|
||||
|
||||
Set `max_budget` in (USD $) param in the `key/generate` request. By default the `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
```shell
|
||||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra",
|
||||
"max_budget": 10
|
||||
}'
|
||||
```
|
||||
|
||||
#### Expected Behaviour
|
||||
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
|
||||
- After the key crosses its `max_budget`, requests fail
|
||||
|
||||
Example Request to `/chat/completions` when key has crossed budget
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
|
||||
--data ' {
|
||||
"model": "azure-gpt-3.5",
|
||||
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "respond in 50 lines"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
Expected Response from `/chat/completions` when key has crossed budget
|
||||
```shell
|
||||
{
|
||||
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
## Set Budgets - Per User
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
||||
|
||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
||||
|
||||
```shell
|
||||
curl --location 'http://localhost:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||
|
||||
**Sample Response**
|
||||
|
||||
```shell
|
||||
{
|
||||
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
|
||||
"expires": "2023-12-22T09:53:13.861000Z",
|
||||
"user_id": "krrish3@berri.ai",
|
||||
"max_budget": 0.0
|
||||
}
|
||||
```
|
||||
|
||||
## Tracking Spend
|
||||
|
||||
You can get spend for a key by using the `/key/info` endpoint.
|
||||
|
@ -171,32 +473,6 @@ This is automatically updated (in USD) when calls are made to /completions, /cha
|
|||
```
|
||||
|
||||
|
||||
|
||||
## Set Budgets
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
||||
|
||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
||||
|
||||
```curl
|
||||
curl --location 'http://localhost:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||
|
||||
**Sample Response**
|
||||
|
||||
```curl
|
||||
{
|
||||
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
|
||||
"expires": "2023-12-22T09:53:13.861000Z",
|
||||
"user_id": "krrish3@berri.ai",
|
||||
"max_budget": 0.0
|
||||
}
|
||||
```
|
||||
|
||||
## Custom Auth
|
||||
|
||||
You can now override the default api key auth.
|
||||
|
@ -242,7 +518,133 @@ general_settings:
|
|||
[**Implementation Code**](https://github.com/BerriAI/litellm/blob/caf2a6b279ddbe89ebd1d8f4499f65715d684851/litellm/proxy/utils.py#L122)
|
||||
|
||||
### 3. Start the proxy
|
||||
```bash
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
## Custom /key/generate
|
||||
|
||||
If you need to add custom logic before generating a Proxy API Key (Example Validating `team_id`)
|
||||
|
||||
### 1. Write a custom `custom_generate_key_fn`
|
||||
|
||||
|
||||
The input to the custom_generate_key_fn function is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)
|
||||
|
||||
The output of your `custom_generate_key_fn` should be a dictionary with the following structure
|
||||
```python
|
||||
{
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
- decision (Type: bool): A boolean value indicating whether the key generation is allowed (True) or not (False).
|
||||
|
||||
- message (Type: str, Optional): An optional message providing additional information about the decision. This field is included when the decision is False.
|
||||
|
||||
|
||||
```python
|
||||
async def custom_generate_key_fn(data: GenerateKeyRequest)-> dict:
|
||||
"""
|
||||
Asynchronous function for generating a key based on the input data.
|
||||
|
||||
Args:
|
||||
data (GenerateKeyRequest): The input data for key generation.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the decision and an optional message.
|
||||
{
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
"""
|
||||
|
||||
# decide if a key should be generated or not
|
||||
print("using custom auth function!")
|
||||
data_json = data.json() # type: ignore
|
||||
|
||||
# Unpacking variables
|
||||
team_id = data_json.get("team_id")
|
||||
duration = data_json.get("duration")
|
||||
models = data_json.get("models")
|
||||
aliases = data_json.get("aliases")
|
||||
config = data_json.get("config")
|
||||
spend = data_json.get("spend")
|
||||
user_id = data_json.get("user_id")
|
||||
max_parallel_requests = data_json.get("max_parallel_requests")
|
||||
metadata = data_json.get("metadata")
|
||||
tpm_limit = data_json.get("tpm_limit")
|
||||
rpm_limit = data_json.get("rpm_limit")
|
||||
|
||||
if team_id is not None and team_id == "litellm-core-infra@gmail.com":
|
||||
# only team_id="litellm-core-infra@gmail.com" can make keys
|
||||
return {
|
||||
"decision": True,
|
||||
}
|
||||
else:
|
||||
print("Failed custom auth")
|
||||
return {
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### 2. Pass the filepath (relative to the config.yaml)
|
||||
|
||||
Pass the filepath to the config.yaml
|
||||
|
||||
e.g. if they're both in the same dir - `./config.yaml` and `./custom_auth.py`, this is what it looks like:
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: "openai-model"
|
||||
litellm_params:
|
||||
model: "gpt-3.5-turbo"
|
||||
|
||||
litellm_settings:
|
||||
drop_params: True
|
||||
set_verbose: True
|
||||
|
||||
general_settings:
|
||||
custom_key_generate: custom_auth.custom_generate_key_fn
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
## [BETA] Dynamo DB
|
||||
|
||||
Only live in `v1.16.21.dev1`.
|
||||
|
||||
### Step 1. Save keys to env
|
||||
|
||||
```shell
|
||||
AWS_ACCESS_KEY_ID = "your-aws-access-key-id"
|
||||
AWS_SECRET_ACCESS_KEY = "your-aws-secret-access-key"
|
||||
```
|
||||
|
||||
### Step 2. Add details to config
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
database_type: "dynamo_db"
|
||||
database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
|
||||
"billing_mode": "PAY_PER_REQUEST",
|
||||
"region_name": "us-west-2",
|
||||
"user_table_name": "your-user-table",
|
||||
"key_table_name": "your-token-table",
|
||||
"config_table_name": "your-config-table"
|
||||
}
|
||||
```
|
||||
|
||||
### Step 3. Generate Key
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
|
||||
```
|
|
@ -302,6 +302,7 @@ asyncio.run(router_acompletion())
|
|||
|
||||
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
|
||||
|
||||
**Global Timeouts**
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
|
@ -313,6 +314,36 @@ router = Router(model_list=model_list,
|
|||
print(response)
|
||||
```
|
||||
|
||||
**Timeouts per model**
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
import asyncio, os
|
||||
|
||||
model_list = [{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": os.getenv("AZURE_API_KEY"),
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE"),
|
||||
"timeout": 300, # sets a 5 minute timeout
|
||||
"stream_timeout": 30 # sets a 30s timeout for streaming calls
|
||||
}
|
||||
}]
|
||||
|
||||
# init router
|
||||
router = Router(model_list=model_list, routing_strategy="least-busy")
|
||||
async def router_acompletion():
|
||||
response = await router.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||
)
|
||||
print(response)
|
||||
return response
|
||||
|
||||
asyncio.run(router_acompletion())
|
||||
```
|
||||
### Cooldowns
|
||||
|
||||
Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
|
||||
|
@ -574,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
|
|||
print(f"response: {response}")
|
||||
```
|
||||
|
||||
## Custom Callbacks - Track API Key, API Endpoint, Model Used
|
||||
|
||||
If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can set up a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback)
|
||||
|
||||
### Usage
|
||||
|
||||
```python
|
||||
import litellm
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
class MyCustomHandler(CustomLogger):
|
||||
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Success")
|
||||
print("kwargs=", kwargs)
|
||||
litellm_params= kwargs.get("litellm_params")
|
||||
api_key = litellm_params.get("api_key")
|
||||
api_base = litellm_params.get("api_base")
|
||||
custom_llm_provider= litellm_params.get("custom_llm_provider")
|
||||
response_cost = kwargs.get("response_cost")
|
||||
|
||||
# print the values
|
||||
print("api_key=", api_key)
|
||||
print("api_base=", api_base)
|
||||
print("custom_llm_provider=", custom_llm_provider)
|
||||
print("response_cost=", response_cost)
|
||||
|
||||
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Failure")
|
||||
print("kwargs=", kwargs)
|
||||
|
||||
customHandler = MyCustomHandler()
|
||||
|
||||
litellm.callbacks = [customHandler]
|
||||
|
||||
# Init Router
|
||||
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
|
||||
|
||||
# router completion call
|
||||
response = router.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{ "role": "user", "content": "Hi who are you"}]
|
||||
)
|
||||
```
|
||||
|
||||
## Deploy Router
|
||||
|
||||
|
@ -602,17 +676,63 @@ def __init__(
|
|||
num_retries: int = 0,
|
||||
timeout: Optional[float] = None,
|
||||
default_litellm_params={}, # default params for Router.chat.completion.create
|
||||
set_verbose: bool = False,
|
||||
fallbacks: List = [],
|
||||
allowed_fails: Optional[int] = None,
|
||||
allowed_fails: Optional[int] = None, # Number of times a deployment can fail before being added to cooldown
|
||||
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
|
||||
context_window_fallbacks: List = [],
|
||||
model_group_alias: Optional[dict] = {},
|
||||
retry_after: int = 0, # min time to wait before retrying a failed request
|
||||
retry_after: int = 0, # (min) time to wait before retrying a failed request
|
||||
routing_strategy: Literal[
|
||||
"simple-shuffle",
|
||||
"least-busy",
|
||||
"usage-based-routing",
|
||||
"latency-based-routing",
|
||||
] = "simple-shuffle",
|
||||
|
||||
## DEBUGGING ##
|
||||
set_verbose: bool = False, # set this to True for seeing logs
|
||||
debug_level: Literal["DEBUG", "INFO"] = "INFO", # set this to "DEBUG" for detailed debugging
|
||||
):
|
||||
```
|
||||
```
|
||||
|
||||
## Debugging Router
|
||||
### Basic Debugging
|
||||
Set `Router(set_verbose=True)`
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=True
|
||||
)
|
||||
```
|
||||
|
||||
### Detailed Debugging
|
||||
Set `Router(set_verbose=True,debug_level="DEBUG")`
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=True,
|
||||
debug_level="DEBUG" # defaults to INFO
|
||||
)
|
||||
```
|
||||
|
||||
### Very Detailed Debugging
|
||||
Set `litellm.set_verbose=True` and `Router(set_verbose=True,debug_level="DEBUG")`
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
import litellm
|
||||
|
||||
litellm.set_verbose = True
|
||||
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=True,
|
||||
debug_level="DEBUG" # defaults to INFO
|
||||
)
|
||||
```
|
||||
|
|
BIN
docs/my-website/img/admin_ui_2.png
Normal file
After Width: | Height: | Size: 159 KiB |
BIN
docs/my-website/img/google_oauth2.png
Normal file
After Width: | Height: | Size: 351 KiB |
BIN
docs/my-website/img/google_redirect.png
Normal file
After Width: | Height: | Size: 297 KiB |
BIN
docs/my-website/img/spend_logs_table.png
Normal file
After Width: | Height: | Size: 189 KiB |
BIN
docs/my-website/img/spend_per_api_key.png
Normal file
After Width: | Height: | Size: 468 KiB |
BIN
docs/my-website/img/spend_per_user.png
Normal file
After Width: | Height: | Size: 249 KiB |
BIN
docs/my-website/img/ui_link.png
Normal file
After Width: | Height: | Size: 69 KiB |
|
@ -104,24 +104,49 @@ const sidebars = {
|
|||
items: [
|
||||
"proxy/quick_start",
|
||||
"proxy/configs",
|
||||
{
|
||||
type: 'link',
|
||||
label: '📖 All Endpoints',
|
||||
href: 'https://litellm-api.up.railway.app/',
|
||||
},
|
||||
"proxy/user_keys",
|
||||
"proxy/load_balancing",
|
||||
"proxy/virtual_keys",
|
||||
"proxy/ui",
|
||||
"proxy/users",
|
||||
"proxy/ui",
|
||||
"proxy/model_management",
|
||||
"proxy/reliability",
|
||||
"proxy/caching",
|
||||
"proxy/logging",
|
||||
"proxy/health",
|
||||
"proxy/call_hooks",
|
||||
"proxy/rules",
|
||||
"proxy/alerting",
|
||||
"proxy/streaming_logging",
|
||||
"proxy/debugging",
|
||||
{
|
||||
"type": "category",
|
||||
"label": "🔥 Load Balancing",
|
||||
"items": [
|
||||
"proxy/load_balancing",
|
||||
"proxy/reliability",
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "category",
|
||||
"label": "Logging, Alerting, Caching",
|
||||
"items": [
|
||||
"proxy/logging",
|
||||
"proxy/alerting",
|
||||
"proxy/streaming_logging",
|
||||
"proxy/caching",
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "category",
|
||||
"label": "Admin Controls",
|
||||
"items": [
|
||||
"proxy/call_hooks",
|
||||
"proxy/rules",
|
||||
]
|
||||
},
|
||||
"proxy/deploy",
|
||||
"proxy/cli",
|
||||
]
|
||||
},
|
||||
"proxy/custom_pricing",
|
||||
"routing",
|
||||
"rules",
|
||||
"set_keys",
|
||||
|
|
|
@ -1,11 +1,17 @@
|
|||
### INIT VARIABLES ###
|
||||
import threading, requests
|
||||
import threading, requests, os
|
||||
from typing import Callable, List, Optional, Dict, Union, Any
|
||||
from litellm.caching import Cache
|
||||
from litellm._logging import set_verbose
|
||||
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
|
||||
from litellm.proxy._types import KeyManagementSystem
|
||||
import httpx
|
||||
import dotenv
|
||||
|
||||
dotenv.load_dotenv()
|
||||
#############################################
|
||||
if set_verbose == True:
|
||||
_turn_on_debug()
|
||||
#############################################
|
||||
input_callback: List[Union[str, Callable]] = []
|
||||
success_callback: List[Union[str, Callable]] = []
|
||||
failure_callback: List[Union[str, Callable]] = []
|
||||
|
@ -58,6 +64,9 @@ cache: Optional[
|
|||
model_alias_map: Dict[str, str] = {}
|
||||
model_group_alias_map: Dict[str, str] = {}
|
||||
max_budget: float = 0.0 # set the max budget across all providers
|
||||
budget_duration: Optional[
|
||||
str
|
||||
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
_openai_completion_params = [
|
||||
"functions",
|
||||
"function_call",
|
||||
|
@ -136,6 +145,8 @@ model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/mai
|
|||
suppress_debug_info = False
|
||||
dynamodb_table_name: Optional[str] = None
|
||||
s3_callback_params: Optional[Dict] = None
|
||||
default_key_generate_params: Optional[Dict] = None
|
||||
default_team_settings: Optional[List] = None
|
||||
#### RELIABILITY ####
|
||||
request_timeout: Optional[float] = 6000
|
||||
num_retries: Optional[int] = None # per model endpoint
|
||||
|
@ -155,6 +166,19 @@ _key_management_system: Optional[KeyManagementSystem] = None
|
|||
|
||||
|
||||
def get_model_cost_map(url: str):
|
||||
if (
|
||||
os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True
|
||||
or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
|
||||
):
|
||||
import importlib.resources
|
||||
import json
|
||||
|
||||
with importlib.resources.open_text(
|
||||
"litellm", "model_prices_and_context_window_backup.json"
|
||||
) as f:
|
||||
content = json.load(f)
|
||||
return content
|
||||
|
||||
try:
|
||||
with requests.get(
|
||||
url, timeout=5
|
||||
|
@ -210,6 +234,7 @@ vertex_chat_models: List = []
|
|||
vertex_code_chat_models: List = []
|
||||
vertex_text_models: List = []
|
||||
vertex_code_text_models: List = []
|
||||
vertex_embedding_models: List = []
|
||||
ai21_models: List = []
|
||||
nlp_cloud_models: List = []
|
||||
aleph_alpha_models: List = []
|
||||
|
@ -239,6 +264,8 @@ for key, value in model_cost.items():
|
|||
vertex_chat_models.append(key)
|
||||
elif value.get("litellm_provider") == "vertex_ai-code-chat-models":
|
||||
vertex_code_chat_models.append(key)
|
||||
elif value.get("litellm_provider") == "vertex_ai-embedding-models":
|
||||
vertex_embedding_models.append(key)
|
||||
elif value.get("litellm_provider") == "ai21":
|
||||
ai21_models.append(key)
|
||||
elif value.get("litellm_provider") == "nlp_cloud":
|
||||
|
@ -475,7 +502,10 @@ bedrock_embedding_models: List = [
|
|||
]
|
||||
|
||||
all_embedding_models = (
|
||||
open_ai_embedding_models + cohere_embedding_models + bedrock_embedding_models
|
||||
open_ai_embedding_models
|
||||
+ cohere_embedding_models
|
||||
+ bedrock_embedding_models
|
||||
+ vertex_embedding_models
|
||||
)
|
||||
|
||||
####### IMAGE GENERATION MODELS ###################
|
||||
|
@ -530,6 +560,7 @@ from .llms.bedrock import (
|
|||
AmazonAnthropicConfig,
|
||||
AmazonCohereConfig,
|
||||
AmazonLlamaConfig,
|
||||
AmazonStabilityConfig,
|
||||
)
|
||||
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
|
||||
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
|
||||
|
|
|
@ -7,20 +7,14 @@ handler = logging.StreamHandler()
|
|||
handler.setLevel(logging.DEBUG)
|
||||
|
||||
# Create a formatter and set it for the handler
|
||||
formatter = logging.Formatter(
|
||||
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
|
||||
formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
|
||||
|
||||
handler.setFormatter(formatter)
|
||||
|
||||
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
if set_verbose:
|
||||
print(print_statement) # noqa
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
|
||||
verbose_router_logger = logging.getLogger("LiteLLM Router")
|
||||
verbose_logger = logging.getLogger("LiteLLM")
|
||||
|
@ -28,3 +22,18 @@ verbose_logger = logging.getLogger("LiteLLM")
|
|||
# Add the handler to the logger
|
||||
verbose_router_logger.addHandler(handler)
|
||||
verbose_proxy_logger.addHandler(handler)
|
||||
verbose_logger.addHandler(handler)
|
||||
|
||||
|
||||
def _turn_on_debug():
|
||||
verbose_logger.setLevel(level=logging.DEBUG) # set package log to debug
|
||||
verbose_router_logger.setLevel(level=logging.DEBUG) # set router logs to debug
|
||||
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
|
||||
|
||||
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
if set_verbose:
|
||||
print(print_statement) # noqa
|
||||
except:
|
||||
pass
|
||||
|
|
|
@ -1,3 +1,12 @@
|
|||
# +-----------------------------------------------+
|
||||
# | |
|
||||
# | NOT PROXY BUDGET MANAGER |
|
||||
# | proxy budget manager is in proxy_server.py |
|
||||
# | |
|
||||
# +-----------------------------------------------+
|
||||
#
|
||||
# Thank you users! We ❤️ you! - Krrish & Ishaan
|
||||
|
||||
import os, json, time
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse
|
||||
|
@ -11,10 +20,12 @@ class BudgetManager:
|
|||
project_name: str,
|
||||
client_type: str = "local",
|
||||
api_base: Optional[str] = None,
|
||||
headers: Optional[dict] = None,
|
||||
):
|
||||
self.client_type = client_type
|
||||
self.project_name = project_name
|
||||
self.api_base = api_base or "https://api.litellm.ai"
|
||||
self.headers = headers or {"Content-Type": "application/json"}
|
||||
## load the data or init the initial dictionaries
|
||||
self.load_data()
|
||||
|
||||
|
@ -43,7 +54,7 @@ class BudgetManager:
|
|||
url = self.api_base + "/get_budget"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {"project_name": self.project_name}
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
response = requests.post(url, headers=self.headers, json=data)
|
||||
response = response.json()
|
||||
if response["status"] == "error":
|
||||
self.user_dict = (
|
||||
|
@ -201,6 +212,6 @@ class BudgetManager:
|
|||
url = self.api_base + "/set_budget"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {"project_name": self.project_name, "user_dict": self.user_dict}
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
response = requests.post(url, headers=self.headers, json=data)
|
||||
response = response.json()
|
||||
return response
|
||||
|
|
|
@ -12,10 +12,12 @@ import time, logging, asyncio
|
|||
import json, traceback, ast, hashlib
|
||||
from typing import Optional, Literal, List, Union, Any
|
||||
from openai._models import BaseModel as OpenAIObject
|
||||
from litellm._logging import verbose_logger
|
||||
|
||||
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
verbose_logger.debug(print_statement)
|
||||
if litellm.set_verbose:
|
||||
print(print_statement) # noqa
|
||||
except:
|
||||
|
@ -81,9 +83,14 @@ class InMemoryCache(BaseCache):
|
|||
self.cache_dict.clear()
|
||||
self.ttl_dict.clear()
|
||||
|
||||
|
||||
async def disconnect(self):
|
||||
pass
|
||||
|
||||
def delete_cache(self, key):
|
||||
self.cache_dict.pop(key, None)
|
||||
self.ttl_dict.pop(key, None)
|
||||
|
||||
|
||||
class RedisCache(BaseCache):
|
||||
# if users don't provide one, use the default litellm cache
|
||||
|
@ -210,9 +217,14 @@ class RedisCache(BaseCache):
|
|||
def flush_cache(self):
|
||||
self.redis_client.flushall()
|
||||
|
||||
|
||||
async def disconnect(self):
|
||||
pass
|
||||
|
||||
def delete_cache(self, key):
|
||||
self.redis_client.delete(key)
|
||||
|
||||
|
||||
|
||||
class S3Cache(BaseCache):
|
||||
def __init__(
|
||||
|
@ -227,11 +239,13 @@ class S3Cache(BaseCache):
|
|||
s3_aws_secret_access_key=None,
|
||||
s3_aws_session_token=None,
|
||||
s3_config=None,
|
||||
s3_path=None,
|
||||
**kwargs,
|
||||
):
|
||||
import boto3
|
||||
|
||||
self.bucket_name = s3_bucket_name
|
||||
self.key_prefix = s3_path.rstrip("/") + "/" if s3_path else ""
|
||||
# Create an S3 client with custom endpoint URL
|
||||
self.s3_client = boto3.client(
|
||||
"s3",
|
||||
|
@ -253,6 +267,8 @@ class S3Cache(BaseCache):
|
|||
ttl = kwargs.get("ttl", None)
|
||||
# Convert value to JSON before storing in S3
|
||||
serialized_value = json.dumps(value)
|
||||
key = self.key_prefix + key
|
||||
|
||||
if ttl is not None:
|
||||
cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}"
|
||||
import datetime
|
||||
|
@ -294,6 +310,8 @@ class S3Cache(BaseCache):
|
|||
import boto3, botocore
|
||||
|
||||
try:
|
||||
key = self.key_prefix + key
|
||||
|
||||
print_verbose(f"Get S3 Cache: key: {key}")
|
||||
# Download the data from S3
|
||||
cached_response = self.s3_client.get_object(
|
||||
|
@ -400,6 +418,12 @@ class DualCache(BaseCache):
|
|||
if self.redis_cache is not None:
|
||||
self.redis_cache.flush_cache()
|
||||
|
||||
def delete_cache(self, key):
|
||||
if self.in_memory_cache is not None:
|
||||
self.in_memory_cache.delete_cache(key)
|
||||
if self.redis_cache is not None:
|
||||
self.redis_cache.delete_cache(key)
|
||||
|
||||
|
||||
#### LiteLLM.Completion / Embedding Cache ####
|
||||
class Cache:
|
||||
|
|
|
@ -21,6 +21,7 @@ from openai import (
|
|||
APIConnectionError,
|
||||
APIResponseValidationError,
|
||||
UnprocessableEntityError,
|
||||
PermissionDeniedError,
|
||||
)
|
||||
import httpx
|
||||
|
||||
|
@ -82,6 +83,17 @@ class Timeout(APITimeoutError): # type: ignore
|
|||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
|
||||
class PermissionDeniedError(PermissionDeniedError): # type:ignore
|
||||
def __init__(self, message, llm_provider, model, response: httpx.Response):
|
||||
self.status_code = 403
|
||||
self.message = message
|
||||
self.llm_provider = llm_provider
|
||||
self.model = model
|
||||
super().__init__(
|
||||
self.message, response=response, body=None
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
|
||||
class RateLimitError(RateLimitError): # type: ignore
|
||||
def __init__(self, message, llm_provider, model, response: httpx.Response):
|
||||
self.status_code = 429
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
# On success, logs events to Helicone
|
||||
import dotenv, os
|
||||
import requests
|
||||
import litellm
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
@ -56,6 +57,10 @@ class HeliconeLogger:
|
|||
else "gpt-3.5-turbo"
|
||||
)
|
||||
provider_request = {"model": model, "messages": messages}
|
||||
if isinstance(response_obj, litellm.EmbeddingResponse) or isinstance(
|
||||
response_obj, litellm.ModelResponse
|
||||
):
|
||||
response_obj = response_obj.json()
|
||||
|
||||
if "claude" in model:
|
||||
provider_request, response_obj = self.claude_mapping(
|
||||
|
|
|
@ -8,11 +8,13 @@ from datetime import datetime
|
|||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
from packaging.version import Version
|
||||
from litellm._logging import verbose_logger
|
||||
import litellm
|
||||
|
||||
|
||||
class LangFuseLogger:
|
||||
# Class variables or attributes
|
||||
def __init__(self):
|
||||
def __init__(self, langfuse_public_key=None, langfuse_secret=None):
|
||||
try:
|
||||
from langfuse import Langfuse
|
||||
except Exception as e:
|
||||
|
@ -20,8 +22,8 @@ class LangFuseLogger:
|
|||
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\033[0m"
|
||||
)
|
||||
# Instance variables
|
||||
self.secret_key = os.getenv("LANGFUSE_SECRET_KEY")
|
||||
self.public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
|
||||
self.secret_key = langfuse_secret or os.getenv("LANGFUSE_SECRET_KEY")
|
||||
self.public_key = langfuse_public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
|
||||
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
|
||||
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
|
||||
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
|
||||
|
@ -33,6 +35,26 @@ class LangFuseLogger:
|
|||
debug=self.langfuse_debug,
|
||||
)
|
||||
|
||||
if os.getenv("UPSTREAM_LANGFUSE_SECRET_KEY") is not None:
|
||||
self.upstream_langfuse_secret_key = os.getenv(
|
||||
"UPSTREAM_LANGFUSE_SECRET_KEY"
|
||||
)
|
||||
self.upstream_langfuse_public_key = os.getenv(
|
||||
"UPSTREAM_LANGFUSE_PUBLIC_KEY"
|
||||
)
|
||||
self.upstream_langfuse_host = os.getenv("UPSTREAM_LANGFUSE_HOST")
|
||||
self.upstream_langfuse_release = os.getenv("UPSTREAM_LANGFUSE_RELEASE")
|
||||
self.upstream_langfuse_debug = os.getenv("UPSTREAM_LANGFUSE_DEBUG")
|
||||
self.upstream_langfuse = Langfuse(
|
||||
public_key=self.upstream_langfuse_public_key,
|
||||
secret_key=self.upstream_langfuse_secret_key,
|
||||
host=self.upstream_langfuse_host,
|
||||
release=self.upstream_langfuse_release,
|
||||
debug=self.upstream_langfuse_debug,
|
||||
)
|
||||
else:
|
||||
self.upstream_langfuse = None
|
||||
|
||||
def log_event(
|
||||
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
|
||||
):
|
||||
|
@ -62,11 +84,15 @@ class LangFuseLogger:
|
|||
pass
|
||||
|
||||
# end of processing langfuse ########################
|
||||
input = prompt
|
||||
output = response_obj["choices"][0]["message"].json()
|
||||
print_verbose(
|
||||
f"OUTPUT IN LANGFUSE: {output}; original: {response_obj['choices'][0]['message']}"
|
||||
)
|
||||
if kwargs.get("call_type", None) == "embedding" or isinstance(
|
||||
response_obj, litellm.EmbeddingResponse
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj["data"]
|
||||
else:
|
||||
input = prompt
|
||||
output = response_obj["choices"][0]["message"].json()
|
||||
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
|
||||
self._log_langfuse_v2(
|
||||
user_id,
|
||||
metadata,
|
||||
|
@ -77,6 +103,7 @@ class LangFuseLogger:
|
|||
optional_params,
|
||||
input,
|
||||
response_obj,
|
||||
print_verbose,
|
||||
) if self._is_langfuse_v2() else self._log_langfuse_v1(
|
||||
user_id,
|
||||
metadata,
|
||||
|
@ -93,6 +120,7 @@ class LangFuseLogger:
|
|||
print_verbose(
|
||||
f"Langfuse Layer Logging - final response object: {response_obj}"
|
||||
)
|
||||
verbose_logger.info(f"Langfuse Layer Logging - logging success")
|
||||
except:
|
||||
traceback.print_exc()
|
||||
print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||
|
@ -165,18 +193,46 @@ class LangFuseLogger:
|
|||
optional_params,
|
||||
input,
|
||||
response_obj,
|
||||
print_verbose,
|
||||
):
|
||||
trace = self.Langfuse.trace(
|
||||
name=metadata.get("generation_name", "litellm-completion"),
|
||||
input=input,
|
||||
output=output,
|
||||
user_id=metadata.get("trace_user_id", user_id),
|
||||
id=metadata.get("trace_id", None),
|
||||
)
|
||||
import langfuse
|
||||
|
||||
tags = []
|
||||
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
|
||||
supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
|
||||
|
||||
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
|
||||
|
||||
generation_name = metadata.get("generation_name", None)
|
||||
if generation_name is None:
|
||||
# just log `litellm-{call_type}` as the generation name
|
||||
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
||||
trace_params = {
|
||||
"name": generation_name,
|
||||
"input": input,
|
||||
"output": output,
|
||||
"user_id": metadata.get("trace_user_id", user_id),
|
||||
"id": metadata.get("trace_id", None),
|
||||
}
|
||||
cost = kwargs["response_cost"]
|
||||
print_verbose(f"trace: {cost}")
|
||||
if supports_tags:
|
||||
for key, value in metadata.items():
|
||||
tags.append(f"{key}:{value}")
|
||||
if "cache_hit" in kwargs:
|
||||
tags.append(f"cache_hit:{kwargs['cache_hit']}")
|
||||
trace_params.update({"tags": tags})
|
||||
|
||||
trace = self.Langfuse.trace(**trace_params)
|
||||
|
||||
# get generation_id
|
||||
generation_id = None
|
||||
if response_obj.get("id", None) is not None:
|
||||
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
|
||||
trace.generation(
|
||||
name=metadata.get("generation_name", "litellm-completion"),
|
||||
id=metadata.get("generation_id", None),
|
||||
name=generation_name,
|
||||
id=metadata.get("generation_id", generation_id),
|
||||
startTime=start_time,
|
||||
endTime=end_time,
|
||||
model=kwargs["model"],
|
||||
|
@ -186,6 +242,7 @@ class LangFuseLogger:
|
|||
usage={
|
||||
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
|
||||
"completion_tokens": response_obj["usage"]["completion_tokens"],
|
||||
"total_cost": cost if supports_costs else None,
|
||||
},
|
||||
metadata=metadata,
|
||||
)
|
||||
|
|
|
@ -13,19 +13,22 @@ class LangsmithLogger:
|
|||
# Class variables or attributes
|
||||
def __init__(self):
|
||||
self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
|
||||
self.langsmith_project = os.getenv("LANGSMITH_PROJECT", "litellm-completion")
|
||||
self.langsmith_default_run_name = os.getenv(
|
||||
"LANGSMITH_DEFAULT_RUN_NAME", "LLMRun"
|
||||
)
|
||||
|
||||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||
# Method definition
|
||||
# inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb
|
||||
metadata = {}
|
||||
if "litellm_params" in kwargs:
|
||||
metadata = kwargs["litellm_params"].get("metadata", {})
|
||||
metadata = kwargs.get('litellm_params', {}).get("metadata", {}) or {} # if metadata is None
|
||||
|
||||
# set project name and run_name for langsmith logging
|
||||
# users can pass project_name and run name to litellm.completion()
|
||||
# Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"})
|
||||
# if not set litellm will use default project_name = litellm-completion, run_name = LLMRun
|
||||
project_name = metadata.get("project_name", "litellm-completion")
|
||||
run_name = metadata.get("run_name", "LLMRun")
|
||||
# if not set litellm will fallback to the environment variable LANGSMITH_PROJECT, then to the default project_name = litellm-completion, run_name = LLMRun
|
||||
project_name = metadata.get("project_name", self.langsmith_project)
|
||||
run_name = metadata.get("run_name", self.langsmith_default_run_name)
|
||||
print_verbose(
|
||||
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
|
||||
)
|
||||
|
|
|
@ -8,7 +8,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
|
|||
import traceback
|
||||
import datetime, subprocess, sys
|
||||
import litellm, uuid
|
||||
from litellm._logging import print_verbose
|
||||
from litellm._logging import print_verbose, verbose_logger
|
||||
|
||||
|
||||
class S3Logger:
|
||||
|
@ -16,6 +16,7 @@ class S3Logger:
|
|||
def __init__(
|
||||
self,
|
||||
s3_bucket_name=None,
|
||||
s3_path=None,
|
||||
s3_region_name=None,
|
||||
s3_api_version=None,
|
||||
s3_use_ssl=True,
|
||||
|
@ -30,7 +31,9 @@ class S3Logger:
|
|||
import boto3
|
||||
|
||||
try:
|
||||
print_verbose("in init s3 logger")
|
||||
verbose_logger.debug(
|
||||
f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}"
|
||||
)
|
||||
|
||||
if litellm.s3_callback_params is not None:
|
||||
# read in .env variables - example os.environ/AWS_BUCKET_NAME
|
||||
|
@ -41,7 +44,7 @@ class S3Logger:
|
|||
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
|
||||
s3_region_name = litellm.s3_callback_params.get("s3_region_name")
|
||||
s3_api_version = litellm.s3_callback_params.get("s3_api_version")
|
||||
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl")
|
||||
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl", True)
|
||||
s3_verify = litellm.s3_callback_params.get("s3_verify")
|
||||
s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
|
||||
s3_aws_access_key_id = litellm.s3_callback_params.get(
|
||||
|
@ -57,6 +60,8 @@ class S3Logger:
|
|||
# done reading litellm.s3_callback_params
|
||||
|
||||
self.bucket_name = s3_bucket_name
|
||||
self.s3_path = s3_path
|
||||
verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}")
|
||||
# Create an S3 client with custom endpoint URL
|
||||
self.s3_client = boto3.client(
|
||||
"s3",
|
||||
|
@ -82,7 +87,9 @@ class S3Logger:
|
|||
|
||||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||
try:
|
||||
print_verbose(f"s3 Logging - Enters logging function for model {kwargs}")
|
||||
verbose_logger.debug(
|
||||
f"s3 Logging - Enters logging function for model {kwargs}"
|
||||
)
|
||||
|
||||
# construct payload to send to s3
|
||||
# follows the same params as langfuse.py
|
||||
|
@ -121,9 +128,22 @@ class S3Logger:
|
|||
# non blocking if it can't cast to a str
|
||||
pass
|
||||
|
||||
s3_file_name = litellm.utils.get_logging_id(start_time, payload) or ""
|
||||
s3_object_key = (
|
||||
payload["id"] + "-time=" + str(start_time)
|
||||
(self.s3_path.rstrip("/") + "/" if self.s3_path else "")
|
||||
+ start_time.strftime("%Y-%m-%d")
|
||||
+ "/"
|
||||
+ s3_file_name
|
||||
) # we need the s3 key to include the time, so we log cache hits too
|
||||
s3_object_key += ".json"
|
||||
|
||||
s3_object_download_filename = (
|
||||
"time-"
|
||||
+ start_time.strftime("%Y-%m-%dT%H-%M-%S-%f")
|
||||
+ "_"
|
||||
+ payload["id"]
|
||||
+ ".json"
|
||||
)
|
||||
|
||||
import json
|
||||
|
||||
|
@ -137,7 +157,8 @@ class S3Logger:
|
|||
Body=payload,
|
||||
ContentType="application/json",
|
||||
ContentLanguage="en",
|
||||
ContentDisposition=f'inline; filename="{key}.json"',
|
||||
ContentDisposition=f'inline; filename="{s3_object_download_filename}"',
|
||||
CacheControl="private, immutable, max-age=31536000, s-maxage=0",
|
||||
)
|
||||
|
||||
print_verbose(f"Response from s3:{str(response)}")
|
||||
|
@ -146,5 +167,5 @@ class S3Logger:
|
|||
return response
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
print_verbose(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
|
||||
verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
|
||||
pass
|
||||
|
|
|
@ -78,7 +78,7 @@ class AnthropicConfig:
|
|||
|
||||
|
||||
# makes headers for API call
|
||||
def validate_environment(api_key):
|
||||
def validate_environment(api_key, user_headers):
|
||||
if api_key is None:
|
||||
raise ValueError(
|
||||
"Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
|
||||
|
@ -89,6 +89,8 @@ def validate_environment(api_key):
|
|||
"content-type": "application/json",
|
||||
"x-api-key": api_key,
|
||||
}
|
||||
if user_headers is not None and isinstance(user_headers, dict):
|
||||
headers = {**headers, **user_headers}
|
||||
return headers
|
||||
|
||||
|
||||
|
@ -105,8 +107,9 @@ def completion(
|
|||
optional_params=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
):
|
||||
headers = validate_environment(api_key)
|
||||
headers = validate_environment(api_key, headers)
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
|
@ -139,7 +142,11 @@ def completion(
|
|||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data, "api_base": api_base},
|
||||
additional_args={
|
||||
"complete_input_dict": data,
|
||||
"api_base": api_base,
|
||||
"headers": headers,
|
||||
},
|
||||
)
|
||||
|
||||
## COMPLETION CALL
|
||||
|
|
|
@ -95,6 +95,26 @@ class AzureOpenAIConfig(OpenAIConfig):
|
|||
)
|
||||
|
||||
|
||||
def select_azure_base_url_or_endpoint(azure_client_params: dict):
    """
    Route the configured Azure endpoint to the right client keyword.

    `azure_client_params` is the keyword dict the Azure client is built from,
    for example:

        {
            "api_version": api_version,
            "azure_endpoint": api_base,
            "azure_deployment": model,
            "http_client": litellm.client_session,
            "max_retries": max_retries,
            "timeout": timeout,
        }

    If the "azure_endpoint" value contains "/openai/deployments" it is treated
    as a full base URL: the value is stored under "base_url" and the
    "azure_endpoint" key is removed. In every other case the dict is left
    untouched.

    The dict is modified in place and the same object is returned.
    """
    endpoint = azure_client_params.get("azure_endpoint", None)

    # Nothing to rewrite when no endpoint is set or it is a plain resource endpoint.
    if endpoint is None or "/openai/deployments" not in endpoint:
        return azure_client_params

    # see : https://github.com/openai/openai-python/blob/3d61ed42aba652b547029095a7eb269ad4e1e957/src/openai/lib/azure.py#L192
    # A value that already carries the deployments path is a base_url, not an azure_endpoint.
    azure_client_params["base_url"] = azure_client_params.pop("azure_endpoint")
    return azure_client_params
|
||||
|
||||
|
||||
class AzureChatCompletion(BaseLLM):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
@ -239,6 +259,9 @@ class AzureChatCompletion(BaseLLM):
|
|||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
|
@ -303,6 +326,9 @@ class AzureChatCompletion(BaseLLM):
|
|||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
|
@ -364,6 +390,9 @@ class AzureChatCompletion(BaseLLM):
|
|||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
|
@ -414,6 +443,9 @@ class AzureChatCompletion(BaseLLM):
|
|||
"max_retries": data.pop("max_retries", 2),
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
|
@ -527,6 +559,9 @@ class AzureChatCompletion(BaseLLM):
|
|||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
|
@ -594,12 +629,23 @@ class AzureChatCompletion(BaseLLM):
|
|||
client_session = litellm.aclient_session or httpx.AsyncClient(
|
||||
transport=AsyncCustomHTTPTransport(),
|
||||
)
|
||||
openai_aclient = AsyncAzureOpenAI(
|
||||
azure_client = AsyncAzureOpenAI(
|
||||
http_client=client_session, **azure_client_params
|
||||
)
|
||||
else:
|
||||
openai_aclient = client
|
||||
response = await openai_aclient.images.generate(**data, timeout=timeout)
|
||||
azure_client = client
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=data["prompt"],
|
||||
api_key=azure_client.api_key,
|
||||
additional_args={
|
||||
"headers": {"api_key": azure_client.api_key},
|
||||
"api_base": azure_client._base_url._uri_reference,
|
||||
"acompletion": True,
|
||||
"complete_input_dict": data,
|
||||
},
|
||||
)
|
||||
response = await azure_client.images.generate(**data, timeout=timeout)
|
||||
stringified_response = response.model_dump()
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
|
@ -659,6 +705,9 @@ class AzureChatCompletion(BaseLLM):
|
|||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
|
@ -681,7 +730,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
input=prompt,
|
||||
api_key=azure_client.api_key,
|
||||
additional_args={
|
||||
"headers": {"Authorization": f"Bearer {azure_client.api_key}"},
|
||||
"headers": {"api_key": azure_client.api_key},
|
||||
"api_base": azure_client._base_url._uri_reference,
|
||||
"acompletion": False,
|
||||
"complete_input_dict": data,
|
||||
|
@ -753,6 +802,11 @@ class AzureChatCompletion(BaseLLM):
|
|||
completion = None
|
||||
|
||||
if mode == "completion":
|
||||
completion = await client.completions.with_raw_response.create(
|
||||
model=model, # type: ignore
|
||||
prompt=prompt, # type: ignore
|
||||
)
|
||||
elif mode == "chat":
|
||||
if messages is None:
|
||||
raise Exception("messages is not set")
|
||||
completion = await client.chat.completions.with_raw_response.create(
|
||||
|
|
|
@ -2,9 +2,9 @@ import json, copy, types
|
|||
import os
|
||||
from enum import Enum
|
||||
import time
|
||||
from typing import Callable, Optional, Any, Union
|
||||
from typing import Callable, Optional, Any, Union, List
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse, get_secret, Usage
|
||||
from litellm.utils import ModelResponse, get_secret, Usage, ImageResponse
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
import httpx
|
||||
|
||||
|
@ -282,12 +282,82 @@ class AmazonLlamaConfig:
|
|||
}
|
||||
|
||||
|
||||
class AmazonStabilityConfig:
    """
    Default inference parameters for the Stability AI (Stable Diffusion) models on Amazon Bedrock.

    Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=stability.stable-diffusion-xl-v0

    Every non-None value passed to the constructor is written onto the class
    itself (not the instance), so it is visible to all later `get_config()` calls.

    Supported params for the Amazon / Stable Diffusion models:

    - `cfg_scale` (integer): Default `7`. Between [ 0 .. 35 ]. How strictly the diffusion
      process adheres to the prompt text (higher values keep the image closer to the prompt).

    - `seed` (float): Default `0`. Between [ 0 .. 4294967295 ]. Random noise seed
      (omit this option or use 0 for a random seed).

    - `steps`: Default `30`. Between [ 10 .. 50 ]. Number of diffusion steps to run.
      NOTE(review): annotated here as a list of strings, but the documented range
      reads like a single number - confirm against the provider docs.

    - `width` (integer): Default `512`. Multiple of 64, >= 128. Width of the image to
      generate, in pixels, in an increment divisible by 64.

    - `height` (integer): Default `512`. Multiple of 64, >= 128. Height of the image to
      generate, in pixels, in an increment divisible by 64.

    Engine-specific dimension validation (applies to both `width` and `height`):

    - SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512.
    - SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152
    - SDXL v1.0: same as SDXL v0.9
    - SD v1.6: must be between 320x320 and 1536x1536
    """

    cfg_scale: Optional[int] = None
    seed: Optional[float] = None
    steps: Optional[List[str]] = None
    width: Optional[int] = None
    height: Optional[int] = None

    def __init__(
        self,
        cfg_scale: Optional[int] = None,
        seed: Optional[float] = None,
        steps: Optional[List[str]] = None,
        width: Optional[int] = None,
        height: Optional[int] = None,
    ) -> None:
        # Only explicitly provided (non-None) values override the class-level defaults.
        provided = (
            ("cfg_scale", cfg_scale),
            ("seed", seed),
            ("steps", steps),
            ("width", width),
            ("height", height),
        )
        for param_name, param_value in provided:
            if param_value is None:
                continue
            setattr(self.__class__, param_name, param_value)

    @classmethod
    def get_config(cls):
        """Return the class-level settings that are currently set (non-None), keyed by name."""
        callable_kinds = (
            types.FunctionType,
            types.BuiltinFunctionType,
            classmethod,
            staticmethod,
        )
        settings = {}
        for attr_name, attr_value in cls.__dict__.items():
            # Skip dunder entries, methods, and anything still unset.
            if attr_name.startswith("__"):
                continue
            if isinstance(attr_value, callable_kinds) or attr_value is None:
                continue
            settings[attr_name] = attr_value
        return settings
|
||||
|
||||
|
||||
def init_bedrock_client(
|
||||
region_name=None,
|
||||
aws_access_key_id: Optional[str] = None,
|
||||
aws_secret_access_key: Optional[str] = None,
|
||||
aws_region_name: Optional[str] = None,
|
||||
aws_bedrock_runtime_endpoint: Optional[str] = None,
|
||||
aws_session_name: Optional[str] = None,
|
||||
aws_role_name: Optional[str] = None,
|
||||
timeout: Optional[int] = None,
|
||||
):
|
||||
# check for custom AWS_REGION_NAME and use it if not passed to init_bedrock_client
|
||||
litellm_aws_region_name = get_secret("AWS_REGION_NAME", None)
|
||||
|
@ -300,6 +370,8 @@ def init_bedrock_client(
|
|||
aws_secret_access_key,
|
||||
aws_region_name,
|
||||
aws_bedrock_runtime_endpoint,
|
||||
aws_session_name,
|
||||
aws_role_name,
|
||||
]
|
||||
|
||||
# Iterate over parameters and update if needed
|
||||
|
@ -312,7 +384,11 @@ def init_bedrock_client(
|
|||
aws_secret_access_key,
|
||||
aws_region_name,
|
||||
aws_bedrock_runtime_endpoint,
|
||||
aws_session_name,
|
||||
aws_role_name,
|
||||
) = params_to_check
|
||||
|
||||
### SET REGION NAME
|
||||
if region_name:
|
||||
pass
|
||||
elif aws_region_name:
|
||||
|
@ -338,7 +414,31 @@ def init_bedrock_client(
|
|||
|
||||
import boto3
|
||||
|
||||
if aws_access_key_id != None:
|
||||
config = boto3.session.Config(connect_timeout=timeout, read_timeout=timeout)
|
||||
|
||||
### CHECK STS ###
|
||||
if aws_role_name is not None and aws_session_name is not None:
|
||||
# use sts if role name passed in
|
||||
sts_client = boto3.client(
|
||||
"sts",
|
||||
aws_access_key_id=aws_access_key_id,
|
||||
aws_secret_access_key=aws_secret_access_key,
|
||||
)
|
||||
|
||||
sts_response = sts_client.assume_role(
|
||||
RoleArn=aws_role_name, RoleSessionName=aws_session_name
|
||||
)
|
||||
|
||||
client = boto3.client(
|
||||
service_name="bedrock-runtime",
|
||||
aws_access_key_id=sts_response["Credentials"]["AccessKeyId"],
|
||||
aws_secret_access_key=sts_response["Credentials"]["SecretAccessKey"],
|
||||
aws_session_token=sts_response["Credentials"]["SessionToken"],
|
||||
region_name=region_name,
|
||||
endpoint_url=endpoint_url,
|
||||
config=config,
|
||||
)
|
||||
elif aws_access_key_id is not None:
|
||||
# uses auth params passed to completion
|
||||
# aws_access_key_id is not None, assume user is trying to auth using litellm.completion
|
||||
|
||||
|
@ -348,6 +448,7 @@ def init_bedrock_client(
|
|||
aws_secret_access_key=aws_secret_access_key,
|
||||
region_name=region_name,
|
||||
endpoint_url=endpoint_url,
|
||||
config=config,
|
||||
)
|
||||
else:
|
||||
# aws_access_key_id is None, assume user is trying to auth using env variables
|
||||
|
@ -357,6 +458,7 @@ def init_bedrock_client(
|
|||
service_name="bedrock-runtime",
|
||||
region_name=region_name,
|
||||
endpoint_url=endpoint_url,
|
||||
config=config,
|
||||
)
|
||||
|
||||
return client
|
||||
|
@ -419,6 +521,8 @@ def completion(
|
|||
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
|
||||
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
|
||||
aws_region_name = optional_params.pop("aws_region_name", None)
|
||||
aws_role_name = optional_params.pop("aws_role_name", None)
|
||||
aws_session_name = optional_params.pop("aws_session_name", None)
|
||||
aws_bedrock_runtime_endpoint = optional_params.pop(
|
||||
"aws_bedrock_runtime_endpoint", None
|
||||
)
|
||||
|
@ -433,9 +537,14 @@ def completion(
|
|||
aws_secret_access_key=aws_secret_access_key,
|
||||
aws_region_name=aws_region_name,
|
||||
aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
|
||||
aws_role_name=aws_role_name,
|
||||
aws_session_name=aws_session_name,
|
||||
)
|
||||
|
||||
model = model
|
||||
modelId = (
|
||||
optional_params.pop("model_id", None) or model
|
||||
) # default to model if not passed
|
||||
provider = model.split(".")[0]
|
||||
prompt = convert_messages_to_prompt(
|
||||
model, messages, provider, custom_prompt_dict
|
||||
|
@ -498,6 +607,8 @@ def completion(
|
|||
"textGenerationConfig": inference_params,
|
||||
}
|
||||
)
|
||||
else:
|
||||
data = json.dumps({})
|
||||
|
||||
## COMPLETION CALL
|
||||
accept = "application/json"
|
||||
|
@ -508,7 +619,7 @@ def completion(
|
|||
request_str = f"""
|
||||
response = client.invoke_model(
|
||||
body={data},
|
||||
modelId={model},
|
||||
modelId={modelId},
|
||||
accept=accept,
|
||||
contentType=contentType
|
||||
)
|
||||
|
@ -523,7 +634,7 @@ def completion(
|
|||
)
|
||||
|
||||
response = client.invoke_model(
|
||||
body=data, modelId=model, accept=accept, contentType=contentType
|
||||
body=data, modelId=modelId, accept=accept, contentType=contentType
|
||||
)
|
||||
|
||||
response = response.get("body").read()
|
||||
|
@ -533,7 +644,7 @@ def completion(
|
|||
request_str = f"""
|
||||
response = client.invoke_model_with_response_stream(
|
||||
body={data},
|
||||
modelId={model},
|
||||
modelId={modelId},
|
||||
accept=accept,
|
||||
contentType=contentType
|
||||
)
|
||||
|
@ -548,7 +659,7 @@ def completion(
|
|||
)
|
||||
|
||||
response = client.invoke_model_with_response_stream(
|
||||
body=data, modelId=model, accept=accept, contentType=contentType
|
||||
body=data, modelId=modelId, accept=accept, contentType=contentType
|
||||
)
|
||||
response = response.get("body")
|
||||
return response
|
||||
|
@ -557,7 +668,7 @@ def completion(
|
|||
request_str = f"""
|
||||
response = client.invoke_model(
|
||||
body={data},
|
||||
modelId={model},
|
||||
modelId={modelId},
|
||||
accept=accept,
|
||||
contentType=contentType
|
||||
)
|
||||
|
@ -571,8 +682,12 @@ def completion(
|
|||
},
|
||||
)
|
||||
response = client.invoke_model(
|
||||
body=data, modelId=model, accept=accept, contentType=contentType
|
||||
body=data, modelId=modelId, accept=accept, contentType=contentType
|
||||
)
|
||||
except client.exceptions.ValidationException as e:
|
||||
if "The provided model identifier is invalid" in str(e):
|
||||
raise BedrockError(status_code=404, message=str(e))
|
||||
raise BedrockError(status_code=400, message=str(e))
|
||||
except Exception as e:
|
||||
raise BedrockError(status_code=500, message=str(e))
|
||||
|
||||
|
@ -617,9 +732,16 @@ def completion(
|
|||
)
|
||||
|
||||
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
|
||||
prompt_tokens = len(encoding.encode(prompt))
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
prompt_tokens = response_metadata.get(
|
||||
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
|
||||
)
|
||||
completion_tokens = response_metadata.get(
|
||||
"x-amzn-bedrock-output-token-count",
|
||||
len(
|
||||
encoding.encode(
|
||||
model_response["choices"][0]["message"].get("content", "")
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
model_response["created"] = int(time.time())
|
||||
|
@ -630,6 +752,8 @@ def completion(
|
|||
total_tokens=prompt_tokens + completion_tokens,
|
||||
)
|
||||
model_response.usage = usage
|
||||
model_response._hidden_params["region_name"] = client.meta.region_name
|
||||
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
|
||||
return model_response
|
||||
except BedrockError as e:
|
||||
exception_mapping_worked = True
|
||||
|
@ -651,6 +775,11 @@ def _embedding_func_single(
|
|||
encoding=None,
|
||||
logging_obj=None,
|
||||
):
|
||||
if isinstance(input, str) is False:
|
||||
raise BedrockError(
|
||||
message="Bedrock Embedding API input must be type str | List[str]",
|
||||
status_code=400,
|
||||
)
|
||||
# logic for parsing in - calling - parsing out model embedding calls
|
||||
## FORMAT EMBEDDING INPUT ##
|
||||
provider = model.split(".")[0]
|
||||
|
@ -658,6 +787,9 @@ def _embedding_func_single(
|
|||
inference_params.pop(
|
||||
"user", None
|
||||
) # make sure user is not passed in for bedrock call
|
||||
modelId = (
|
||||
optional_params.pop("model_id", None) or model
|
||||
) # default to model if not passed
|
||||
if provider == "amazon":
|
||||
input = input.replace(os.linesep, " ")
|
||||
data = {"inputText": input, **inference_params}
|
||||
|
@ -672,7 +804,7 @@ def _embedding_func_single(
|
|||
request_str = f"""
|
||||
response = client.invoke_model(
|
||||
body={body},
|
||||
modelId={model},
|
||||
modelId={modelId},
|
||||
accept="*/*",
|
||||
contentType="application/json",
|
||||
)""" # type: ignore
|
||||
|
@ -680,14 +812,14 @@ def _embedding_func_single(
|
|||
input=input,
|
||||
api_key="", # boto3 is used for init.
|
||||
additional_args={
|
||||
"complete_input_dict": {"model": model, "texts": input},
|
||||
"complete_input_dict": {"model": modelId, "texts": input},
|
||||
"request_str": request_str,
|
||||
},
|
||||
)
|
||||
try:
|
||||
response = client.invoke_model(
|
||||
body=body,
|
||||
modelId=model,
|
||||
modelId=modelId,
|
||||
accept="*/*",
|
||||
contentType="application/json",
|
||||
)
|
||||
|
@ -726,6 +858,8 @@ def embedding(
|
|||
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
|
||||
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
|
||||
aws_region_name = optional_params.pop("aws_region_name", None)
|
||||
aws_role_name = optional_params.pop("aws_role_name", None)
|
||||
aws_session_name = optional_params.pop("aws_session_name", None)
|
||||
aws_bedrock_runtime_endpoint = optional_params.pop(
|
||||
"aws_bedrock_runtime_endpoint", None
|
||||
)
|
||||
|
@ -736,8 +870,11 @@ def embedding(
|
|||
aws_secret_access_key=aws_secret_access_key,
|
||||
aws_region_name=aws_region_name,
|
||||
aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
|
||||
aws_role_name=aws_role_name,
|
||||
aws_session_name=aws_session_name,
|
||||
)
|
||||
if type(input) == str:
|
||||
if isinstance(input, str):
|
||||
## Embedding Call
|
||||
embeddings = [
|
||||
_embedding_func_single(
|
||||
model,
|
||||
|
@ -747,8 +884,8 @@ def embedding(
|
|||
logging_obj=logging_obj,
|
||||
)
|
||||
]
|
||||
else:
|
||||
## Embedding Call
|
||||
elif isinstance(input, list):
|
||||
## Embedding Call - assuming this is a List[str]
|
||||
embeddings = [
|
||||
_embedding_func_single(
|
||||
model,
|
||||
|
@ -759,6 +896,12 @@ def embedding(
|
|||
)
|
||||
for i in input
|
||||
] # [TODO]: make these parallel calls
|
||||
else:
|
||||
# enters this branch if input = int, ex. input=2
|
||||
raise BedrockError(
|
||||
message="Bedrock Embedding API input must be type str | List[str]",
|
||||
status_code=400,
|
||||
)
|
||||
|
||||
## Populate OpenAI compliant dictionary
|
||||
embedding_response = []
|
||||
|
@ -785,3 +928,112 @@ def embedding(
|
|||
model_response.usage = usage
|
||||
|
||||
return model_response
|
||||
|
||||
|
||||
def image_generation(
    model: str,
    prompt: str,
    timeout=None,
    logging_obj=None,
    model_response=None,
    optional_params=None,
    aimg_generation=False,
):
    """
    Bedrock Image Gen endpoint support

    Builds a Bedrock runtime client, sends `prompt` to the given `model` and
    returns the generated images on `model_response.data`.

    Args:
        model: Bedrock model id; the text before the first "." selects the
            provider. Only "stability" is handled here.
        prompt: Text prompt for the image.
        timeout: Forwarded to `init_bedrock_client`.
        logging_obj: Logger whose `pre_call` / `post_call` are invoked around
            the request. It is called unconditionally, so callers must pass one.
        model_response: Response object to populate; a new `ImageResponse` is
            created when None.
        optional_params: Extra params. The AWS auth keys are popped from this
            dict (the caller's dict is mutated); the rest is sent as inference
            parameters. None is treated as an empty dict.
        aimg_generation: Accepted for interface compatibility; not used here.

    Returns:
        `model_response` with `data` set to a list holding one
        `{"url": <base64>}` dict per artifact in the response.

    Raises:
        BedrockError: 422 for an unsupported provider, 500 when the invoke
            call or response parsing fails.
    """
    # The default is None, but the code below needs a dict to pop from.
    if optional_params is None:
        optional_params = {}

    ### BOTO3 INIT ###
    # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
    aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
    aws_access_key_id = optional_params.pop("aws_access_key_id", None)
    aws_region_name = optional_params.pop("aws_region_name", None)
    aws_role_name = optional_params.pop("aws_role_name", None)
    aws_session_name = optional_params.pop("aws_session_name", None)
    aws_bedrock_runtime_endpoint = optional_params.pop(
        "aws_bedrock_runtime_endpoint", None
    )

    # use passed in BedrockRuntime.Client if provided, otherwise create a new one
    client = init_bedrock_client(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_region_name=aws_region_name,
        aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
        aws_role_name=aws_role_name,
        aws_session_name=aws_session_name,
        timeout=timeout,
    )

    ### FORMAT IMAGE GENERATION INPUT ###
    modelId = model
    provider = model.split(".")[0]
    inference_params = copy.deepcopy(optional_params)
    inference_params.pop(
        "user", None
    )  # make sure user is not passed in for bedrock call
    data = {}
    if provider == "stability":
        prompt = prompt.replace(os.linesep, " ")
        ## LOAD CONFIG
        config = litellm.AmazonStabilityConfig.get_config()
        for k, v in config.items():
            if (
                k not in inference_params
            ):  # image_generation(width=512) > AmazonStabilityConfig(width=512) <- allows for dynamic variables to be passed in
                inference_params[k] = v
        data = {"text_prompts": [{"text": prompt, "weight": 1}], **inference_params}
    else:
        raise BedrockError(
            status_code=422, message=f"Unsupported model={model}, passed in"
        )

    body = json.dumps(data).encode("utf-8")
    ## LOGGING
    request_str = f"""
    response = client.invoke_model(
        body={body},
        modelId={modelId},
        accept="application/json",
        contentType="application/json",
    )"""  # type: ignore
    logging_obj.pre_call(
        input=prompt,
        api_key="",  # boto3 is used for init.
        additional_args={
            "complete_input_dict": {"model": modelId, "texts": prompt},
            "request_str": request_str,
        },
    )
    try:
        response = client.invoke_model(
            body=body,
            modelId=modelId,
            accept="application/json",
            contentType="application/json",
        )
        response_body = json.loads(response.get("body").read())
        ## LOGGING
        logging_obj.post_call(
            input=prompt,
            api_key="",
            additional_args={"complete_input_dict": data},
            original_response=json.dumps(response_body),
        )
    except Exception as e:
        # This is the image path; the previous message wrongly said "Embedding Error".
        raise BedrockError(
            message=f"Image Generation Error with model {model}: {e}",
            status_code=500,
        )

    ### FORMAT RESPONSE TO OPENAI FORMAT ###
    if response_body is None:
        raise Exception("Error in response object format")

    if model_response is None:
        model_response = ImageResponse()

    # Collect every artifact. Previously only the last one was kept, as a bare
    # dict, and an empty "artifacts" list raised UnboundLocalError.
    image_list: List = [
        {"url": artifact["base64"]} for artifact in response_body["artifacts"]
    ]

    model_response.data = image_list
    return model_response
|
||||
|
|
|
@ -43,7 +43,7 @@ class AsyncCustomHTTPTransport(httpx.AsyncHTTPTransport):
|
|||
request=request,
|
||||
)
|
||||
|
||||
time.sleep(int(response.headers.get("retry-after")) or 10)
|
||||
await asyncio.sleep(int(response.headers.get("retry-after") or 10))
|
||||
response = await super().handle_async_request(request)
|
||||
await response.aread()
|
||||
|
||||
|
@ -95,7 +95,6 @@ class CustomHTTPTransport(httpx.HTTPTransport):
|
|||
request.method = "GET"
|
||||
response = super().handle_request(request)
|
||||
response.read()
|
||||
|
||||
timeout_secs: int = 120
|
||||
start_time = time.time()
|
||||
while response.json()["status"] not in ["succeeded", "failed"]:
|
||||
|
@ -112,11 +111,9 @@ class CustomHTTPTransport(httpx.HTTPTransport):
|
|||
content=json.dumps(timeout).encode("utf-8"),
|
||||
request=request,
|
||||
)
|
||||
|
||||
time.sleep(int(response.headers.get("retry-after")) or 10)
|
||||
time.sleep(int(response.headers.get("retry-after", None) or 10))
|
||||
response = super().handle_request(request)
|
||||
response.read()
|
||||
|
||||
if response.json()["status"] == "failed":
|
||||
error_data = response.json()
|
||||
return httpx.Response(
|
||||
|
|
|
@ -120,9 +120,7 @@ def completion(
|
|||
|
||||
## Load Config
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop(
|
||||
"stream", None
|
||||
) # palm does not support streaming, so we handle this by fake streaming in main.py
|
||||
stream = inference_params.pop("stream", None)
|
||||
config = litellm.GeminiConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if (
|
||||
|
@ -139,10 +137,18 @@ def completion(
|
|||
## COMPLETION CALL
|
||||
try:
|
||||
_model = genai.GenerativeModel(f"models/{model}")
|
||||
response = _model.generate_content(
|
||||
contents=prompt,
|
||||
generation_config=genai.types.GenerationConfig(**inference_params),
|
||||
)
|
||||
if stream != True:
|
||||
response = _model.generate_content(
|
||||
contents=prompt,
|
||||
generation_config=genai.types.GenerationConfig(**inference_params),
|
||||
)
|
||||
else:
|
||||
response = _model.generate_content(
|
||||
contents=prompt,
|
||||
generation_config=genai.types.GenerationConfig(**inference_params),
|
||||
stream=True,
|
||||
)
|
||||
return response
|
||||
except Exception as e:
|
||||
raise GeminiError(
|
||||
message=str(e),
|
||||
|
@ -177,16 +183,20 @@ def completion(
|
|||
|
||||
try:
|
||||
completion_response = model_response["choices"][0]["message"].get("content")
|
||||
if completion_response is None:
|
||||
if completion_response is None:
|
||||
raise Exception
|
||||
except:
|
||||
original_response = f"response: {response}"
|
||||
if hasattr(response, "candidates"):
|
||||
if hasattr(response, "candidates"):
|
||||
original_response = f"response: {response.candidates}"
|
||||
if "SAFETY" in original_response:
|
||||
original_response += "\nThe candidate content was flagged for safety reasons."
|
||||
if "SAFETY" in original_response:
|
||||
original_response += (
|
||||
"\nThe candidate content was flagged for safety reasons."
|
||||
)
|
||||
elif "RECITATION" in original_response:
|
||||
original_response += "\nThe candidate content was flagged for recitation reasons."
|
||||
original_response += (
|
||||
"\nThe candidate content was flagged for recitation reasons."
|
||||
)
|
||||
raise GeminiError(
|
||||
status_code=400,
|
||||
message=f"No response received. Original response - {original_response}",
|
||||
|
|
|
@ -145,8 +145,8 @@ def get_ollama_response(
|
|||
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
optional_params["stream"] = optional_params.get("stream", False)
|
||||
data = {"model": model, "prompt": prompt, **optional_params}
|
||||
stream = optional_params.pop("stream", False)
|
||||
data = {"model": model, "prompt": prompt, "options": optional_params}
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=None,
|
||||
|
@ -159,7 +159,7 @@ def get_ollama_response(
|
|||
},
|
||||
)
|
||||
if acompletion is True:
|
||||
if optional_params.get("stream", False) == True:
|
||||
if stream == True:
|
||||
response = ollama_async_streaming(
|
||||
url=url,
|
||||
data=data,
|
||||
|
@ -176,7 +176,7 @@ def get_ollama_response(
|
|||
logging_obj=logging_obj,
|
||||
)
|
||||
return response
|
||||
elif optional_params.get("stream", False) == True:
|
||||
elif stream == True:
|
||||
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
|
||||
|
||||
response = requests.post(url=f"{url}", json=data, timeout=litellm.request_timeout)
|
||||
|
@ -254,7 +254,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
|
|||
) as response:
|
||||
if response.status_code != 200:
|
||||
raise OllamaError(
|
||||
status_code=response.status_code, message=response.text
|
||||
status_code=response.status_code, message=await response.aread()
|
||||
)
|
||||
|
||||
streamwrapper = litellm.CustomStreamWrapper(
|
||||
|
|
|
@ -145,8 +145,8 @@ def get_ollama_response(
|
|||
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
optional_params["stream"] = optional_params.get("stream", False)
|
||||
data = {"model": model, "messages": messages, **optional_params}
|
||||
stream = optional_params.pop("stream", False)
|
||||
data = {"model": model, "messages": messages, "options": optional_params}
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=None,
|
||||
|
@ -159,7 +159,7 @@ def get_ollama_response(
|
|||
},
|
||||
)
|
||||
if acompletion is True:
|
||||
if optional_params.get("stream", False) == True:
|
||||
if stream == True:
|
||||
response = ollama_async_streaming(
|
||||
url=url,
|
||||
data=data,
|
||||
|
@ -176,7 +176,7 @@ def get_ollama_response(
|
|||
logging_obj=logging_obj,
|
||||
)
|
||||
return response
|
||||
elif optional_params.get("stream", False) == True:
|
||||
elif stream == True:
|
||||
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
|
||||
|
||||
response = requests.post(
|
||||
|
@ -220,8 +220,10 @@ def get_ollama_response(
|
|||
model_response["choices"][0]["message"] = response_json["message"]
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + model
|
||||
prompt_tokens = response_json["prompt_eval_count"] # type: ignore
|
||||
completion_tokens = response_json["eval_count"]
|
||||
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
|
||||
completion_tokens = response_json.get(
|
||||
"eval_count", litellm.token_counter(text=response_json["message"])
|
||||
)
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
@ -320,8 +322,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
|||
model_response["choices"][0]["message"] = response_json["message"]
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + data["model"]
|
||||
prompt_tokens = response_json["prompt_eval_count"] # type: ignore
|
||||
completion_tokens = response_json["eval_count"]
|
||||
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
|
||||
completion_tokens = response_json.get(
|
||||
"eval_count", litellm.token_counter(text=response_json["message"])
|
||||
)
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Optional, Union, Any
|
||||
import types, time, json
|
||||
import types, time, json, traceback
|
||||
import httpx
|
||||
from .base import BaseLLM
|
||||
from litellm.utils import (
|
||||
|
@ -221,6 +221,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
headers: Optional[dict] = None,
|
||||
custom_prompt_dict: dict = {},
|
||||
client=None,
|
||||
organization: Optional[str] = None,
|
||||
):
|
||||
super().completion()
|
||||
exception_mapping_worked = False
|
||||
|
@ -254,6 +255,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
timeout=timeout,
|
||||
client=client,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
return self.acompletion(
|
||||
|
@ -266,6 +268,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
timeout=timeout,
|
||||
client=client,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
elif optional_params.get("stream", False):
|
||||
return self.streaming(
|
||||
|
@ -278,6 +281,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
timeout=timeout,
|
||||
client=client,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
if not isinstance(max_retries, int):
|
||||
|
@ -291,6 +295,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
http_client=litellm.client_session,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
openai_client = client
|
||||
|
@ -349,7 +354,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
if hasattr(e, "status_code"):
|
||||
raise OpenAIError(status_code=e.status_code, message=str(e))
|
||||
else:
|
||||
raise OpenAIError(status_code=500, message=str(e))
|
||||
raise OpenAIError(status_code=500, message=traceback.format_exc())
|
||||
|
||||
async def acompletion(
|
||||
self,
|
||||
|
@ -358,6 +363,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
timeout: float,
|
||||
api_key: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
organization: Optional[str] = None,
|
||||
client=None,
|
||||
max_retries=None,
|
||||
logging_obj=None,
|
||||
|
@ -372,6 +378,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
http_client=litellm.aclient_session,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
openai_aclient = client
|
||||
|
@ -412,6 +419,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model: str,
|
||||
api_key: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
organization: Optional[str] = None,
|
||||
client=None,
|
||||
max_retries=None,
|
||||
headers=None,
|
||||
|
@ -423,6 +431,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
http_client=litellm.client_session,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
openai_client = client
|
||||
|
@ -454,6 +463,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model: str,
|
||||
api_key: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
organization: Optional[str] = None,
|
||||
client=None,
|
||||
max_retries=None,
|
||||
headers=None,
|
||||
|
@ -467,6 +477,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
http_client=litellm.aclient_session,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
openai_aclient = client
|
||||
|
@ -706,19 +717,34 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
|
||||
## COMPLETION CALL
|
||||
response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
|
||||
response = response.model_dump() # type: ignore
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input,
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data},
|
||||
original_response=response,
|
||||
)
|
||||
# return response
|
||||
return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation") # type: ignore
|
||||
return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
|
||||
except OpenAIError as e:
|
||||
exception_mapping_worked = True
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data},
|
||||
original_response=str(e),
|
||||
)
|
||||
raise e
|
||||
except Exception as e:
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data},
|
||||
original_response=str(e),
|
||||
)
|
||||
if hasattr(e, "status_code"):
|
||||
raise OpenAIError(status_code=e.status_code, message=str(e))
|
||||
else:
|
||||
|
@ -733,14 +759,22 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
messages: Optional[list] = None,
|
||||
input: Optional[list] = None,
|
||||
prompt: Optional[str] = None,
|
||||
organization: Optional[str] = None,
|
||||
):
|
||||
client = AsyncOpenAI(api_key=api_key, timeout=timeout)
|
||||
client = AsyncOpenAI(
|
||||
api_key=api_key, timeout=timeout, organization=organization
|
||||
)
|
||||
if model is None and mode != "image_generation":
|
||||
raise Exception("model is not set")
|
||||
|
||||
completion = None
|
||||
|
||||
if mode == "completion":
|
||||
completion = await client.completions.with_raw_response.create(
|
||||
model=model, # type: ignore
|
||||
prompt=prompt, # type: ignore
|
||||
)
|
||||
elif mode == "chat":
|
||||
if messages is None:
|
||||
raise Exception("messages is not set")
|
||||
completion = await client.chat.completions.with_raw_response.create(
|
||||
|
@ -889,7 +923,7 @@ class OpenAITextCompletion(BaseLLM):
|
|||
headers=headers,
|
||||
model_response=model_response,
|
||||
model=model,
|
||||
timeout=timeout
|
||||
timeout=timeout,
|
||||
)
|
||||
else:
|
||||
return self.acompletion(api_base=api_base, data=data, headers=headers, model_response=model_response, prompt=prompt, api_key=api_key, logging_obj=logging_obj, model=model, timeout=timeout) # type: ignore
|
||||
|
@ -901,14 +935,11 @@ class OpenAITextCompletion(BaseLLM):
|
|||
headers=headers,
|
||||
model_response=model_response,
|
||||
model=model,
|
||||
timeout=timeout
|
||||
timeout=timeout,
|
||||
)
|
||||
else:
|
||||
response = httpx.post(
|
||||
url=f"{api_base}",
|
||||
json=data,
|
||||
headers=headers,
|
||||
timeout=timeout
|
||||
url=f"{api_base}", json=data, headers=headers, timeout=timeout
|
||||
)
|
||||
if response.status_code != 200:
|
||||
raise OpenAIError(
|
||||
|
@ -944,7 +975,7 @@ class OpenAITextCompletion(BaseLLM):
|
|||
prompt: str,
|
||||
api_key: str,
|
||||
model: str,
|
||||
timeout: float
|
||||
timeout: float,
|
||||
):
|
||||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||||
try:
|
||||
|
@ -986,7 +1017,7 @@ class OpenAITextCompletion(BaseLLM):
|
|||
headers: dict,
|
||||
model_response: ModelResponse,
|
||||
model: str,
|
||||
timeout: float
|
||||
timeout: float,
|
||||
):
|
||||
with httpx.stream(
|
||||
url=f"{api_base}",
|
||||
|
@ -1017,7 +1048,7 @@ class OpenAITextCompletion(BaseLLM):
|
|||
headers: dict,
|
||||
model_response: ModelResponse,
|
||||
model: str,
|
||||
timeout: float
|
||||
timeout: float,
|
||||
):
|
||||
client = httpx.AsyncClient()
|
||||
async with client.stream(
|
||||
|
|
|
@ -99,12 +99,16 @@ def ollama_pt(
|
|||
|
||||
|
||||
def mistral_instruct_pt(messages):
|
||||
# Following the Mistral example's https://huggingface.co/docs/transformers/main/chat_templating
|
||||
prompt = custom_prompt(
|
||||
initial_prompt_value="<s>",
|
||||
role_dict={
|
||||
"system": {"pre_message": "[INST]", "post_message": "[/INST]"},
|
||||
"user": {"pre_message": "[INST]", "post_message": "[/INST]"},
|
||||
"assistant": {"pre_message": "[INST]", "post_message": "[/INST]"},
|
||||
"system": {
|
||||
"pre_message": "[INST] \n",
|
||||
"post_message": " [/INST]\n",
|
||||
},
|
||||
"user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
|
||||
"assistant": {"pre_message": " ", "post_message": " "},
|
||||
},
|
||||
final_prompt_value="</s>",
|
||||
messages=messages,
|
||||
|
@ -372,6 +376,7 @@ def anthropic_pt(
|
|||
You can "put words in Claude's mouth" by ending with an assistant message.
|
||||
See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
|
||||
"""
|
||||
|
||||
class AnthropicConstants(Enum):
|
||||
HUMAN_PROMPT = "\n\nHuman: "
|
||||
AI_PROMPT = "\n\nAssistant: "
|
||||
|
@ -394,32 +399,35 @@ def anthropic_pt(
|
|||
prompt += f"{AnthropicConstants.AI_PROMPT.value}"
|
||||
return prompt
|
||||
|
||||
|
||||
|
||||
def _load_image_from_url(image_url):
|
||||
try:
|
||||
from PIL import Image
|
||||
except:
|
||||
raise Exception("gemini image conversion failed please run `pip install Pillow`")
|
||||
raise Exception(
|
||||
"gemini image conversion failed please run `pip install Pillow`"
|
||||
)
|
||||
from io import BytesIO
|
||||
|
||||
try:
|
||||
# Send a GET request to the image URL
|
||||
response = requests.get(image_url)
|
||||
response.raise_for_status() # Raise an exception for HTTP errors
|
||||
|
||||
# Check the response's content type to ensure it is an image
|
||||
content_type = response.headers.get('content-type')
|
||||
if not content_type or 'image' not in content_type:
|
||||
raise ValueError(f"URL does not point to a valid image (content-type: {content_type})")
|
||||
content_type = response.headers.get("content-type")
|
||||
if not content_type or "image" not in content_type:
|
||||
raise ValueError(
|
||||
f"URL does not point to a valid image (content-type: {content_type})"
|
||||
)
|
||||
|
||||
# Load the image from the response content
|
||||
return Image.open(BytesIO(response.content))
|
||||
|
||||
|
||||
except requests.RequestException as e:
|
||||
print(f"Request failed: {e}")
|
||||
except UnidentifiedImageError:
|
||||
print("Cannot identify image file (it may not be a supported image format or might be corrupted).")
|
||||
except ValueError as e:
|
||||
print(e)
|
||||
raise Exception(f"Request failed: {e}")
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
||||
|
||||
def _gemini_vision_convert_messages(messages: list):
|
||||
|
@ -437,10 +445,11 @@ def _gemini_vision_convert_messages(messages: list):
|
|||
try:
|
||||
from PIL import Image
|
||||
except:
|
||||
raise Exception("gemini image conversion failed please run `pip install Pillow`")
|
||||
raise Exception(
|
||||
"gemini image conversion failed please run `pip install Pillow`"
|
||||
)
|
||||
|
||||
try:
|
||||
|
||||
# given messages for gpt-4 vision, convert them for gemini
|
||||
# https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
|
||||
prompt = ""
|
||||
|
@ -589,7 +598,7 @@ def prompt_factory(
|
|||
if custom_llm_provider == "ollama":
|
||||
return ollama_pt(model=model, messages=messages)
|
||||
elif custom_llm_provider == "anthropic":
|
||||
if any(_ in model for _ in ["claude-2.1","claude-v2:1"]):
|
||||
if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
|
||||
return claude_2_1_pt(messages=messages)
|
||||
else:
|
||||
return anthropic_pt(messages=messages)
|
||||
|
|
|
@ -25,6 +25,46 @@ class SagemakerError(Exception):
|
|||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
|
||||
import io
|
||||
import json
|
||||
|
||||
|
||||
class TokenIterator:
    """
    Iterator that turns a streamed endpoint response into per-token dicts.

    `stream` is iterated lazily; every event is expected to look like
    `{"PayloadPart": {"Bytes": b"..."}}`. The raw bytes are appended to an
    in-memory buffer and consumed one newline-terminated line at a time, so an
    event line may be split across several payload parts.

    Each complete line is parsed as JSON (after removing an optional leading
    `data:` marker) and yields `{"text": <token text>, "is_finished": <bool>}`.
    `is_finished` becomes True on the line that carries a non-null
    `generated_text` field.

    When the underlying stream is exhausted before a finishing line was seen,
    the iterator yields the sentinel string "data: [DONE]" exactly once and
    stops on the following call.
    """

    # Literal marker that may precede the JSON payload on each line.
    _DATA_PREFIX = "data:"

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()  # accumulates every payload part received so far
        self.read_pos = 0  # offset of the first byte not yet consumed as a line
        self.end_of_data = False  # True once the final token / sentinel was emitted

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next parsed token dict, or the "data: [DONE]" sentinel.

        Raises:
            StopIteration: once the stream is exhausted and the end of data
                has already been signalled.
            json.JSONDecodeError / KeyError: if a line is not valid JSON or
                lacks the `token.text` field.
        """
        try:
            while True:
                self.buffer.seek(self.read_pos)
                line = self.buffer.readline()
                if line and line[-1] == ord("\n"):
                    # Advance by exactly what was consumed. Blank separator
                    # lines are skipped below instead of being assumed to
                    # follow every event.
                    self.read_pos += len(line)
                    full_line = line[:-1].decode("utf-8").strip()
                    if not full_line:
                        continue
                    # Remove the literal prefix only; str.lstrip("data:") would
                    # strip any run of the characters d, a, t and ':' instead.
                    if full_line.startswith(self._DATA_PREFIX):
                        full_line = full_line[len(self._DATA_PREFIX) :]
                    line_data = json.loads(full_line)

                    response_obj = {"text": "", "is_finished": False}
                    if line_data.get("generated_text", None) is not None:
                        self.end_of_data = True
                        response_obj["is_finished"] = True
                    response_obj["text"] = line_data["token"]["text"]
                    return response_obj

                # No complete line buffered yet: pull the next payload part and
                # append it to the end of the buffer.
                chunk = next(self.byte_iterator)
                self.buffer.seek(0, io.SEEK_END)
                self.buffer.write(chunk["PayloadPart"]["Bytes"])
        except StopIteration as e:
            if self.end_of_data:
                raise e  # Re-raise StopIteration
            # Stream ended without a finishing line: emit the sentinel once.
            self.end_of_data = True
            return "data: [DONE]"
|
||||
|
||||
|
||||
class SagemakerConfig:
|
||||
"""
|
||||
Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
|
||||
|
@ -121,7 +161,6 @@ def completion(
|
|||
|
||||
# pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
|
||||
inference_params = deepcopy(optional_params)
|
||||
inference_params.pop("stream", None)
|
||||
|
||||
## Load Config
|
||||
config = litellm.SagemakerConfig.get_config()
|
||||
|
@ -152,6 +191,28 @@ def completion(
|
|||
hf_model_name or model
|
||||
) # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt)
|
||||
prompt = prompt_factory(model=hf_model_name, messages=messages)
|
||||
stream = inference_params.pop("stream", None)
|
||||
if stream == True:
|
||||
data = json.dumps(
|
||||
{"inputs": prompt, "parameters": inference_params, "stream": True}
|
||||
).encode("utf-8")
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_endpoint_with_response_stream(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
response = client.invoke_endpoint_with_response_stream(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
|
||||
return response["Body"]
|
||||
|
||||
data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
|
||||
"utf-8"
|
||||
|
@ -184,7 +245,15 @@ def completion(
|
|||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
except Exception as e:
|
||||
raise SagemakerError(status_code=500, message=f"{str(e)}")
|
||||
status_code = (
|
||||
getattr(e, "response", {})
|
||||
.get("ResponseMetadata", {})
|
||||
.get("HTTPStatusCode", 500)
|
||||
)
|
||||
error_message = (
|
||||
getattr(e, "response", {}).get("Error", {}).get("Message", str(e))
|
||||
)
|
||||
raise SagemakerError(status_code=status_code, message=error_message)
|
||||
|
||||
response = response["Body"].read().decode("utf8")
|
||||
## LOGGING
|
||||
|
@ -358,7 +427,15 @@ def embedding(
|
|||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
except Exception as e:
|
||||
raise SagemakerError(status_code=500, message=f"{str(e)}")
|
||||
status_code = (
|
||||
getattr(e, "response", {})
|
||||
.get("ResponseMetadata", {})
|
||||
.get("HTTPStatusCode", 500)
|
||||
)
|
||||
error_message = (
|
||||
getattr(e, "response", {}).get("Error", {}).get("Message", str(e))
|
||||
)
|
||||
raise SagemakerError(status_code=status_code, message=error_message)
|
||||
|
||||
response = json.loads(response["Body"].read().decode("utf8"))
|
||||
## LOGGING
|
||||
|
@ -368,7 +445,7 @@ def embedding(
|
|||
original_response=response,
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
|
||||
|
||||
print_verbose(f"raw model_response: {response}")
|
||||
if "embedding" not in response:
|
||||
raise SagemakerError(status_code=500, message="embedding not found in response")
|
||||
|
|
0
litellm/llms/tokenizers/__init__.py
Normal file
|
@ -3,7 +3,7 @@ import json
|
|||
from enum import Enum
|
||||
import requests
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
from typing import Callable, Optional, Union
|
||||
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
|
||||
import litellm, uuid
|
||||
import httpx
|
||||
|
@ -75,6 +75,41 @@ class VertexAIConfig:
|
|||
}
|
||||
|
||||
|
||||
import asyncio
|
||||
|
||||
|
||||
class TextStreamer:
    """
    Fake streaming iterator for Vertex AI Model Garden calls.

    Wraps an already complete response text and hands it out one
    whitespace-separated word at a time. The object is its own iterator for
    both `for` and `async for`; the two protocols share a single cursor.
    """

    def __init__(self, text):
        # Words are the streaming unit; splitting on whitespace drops the
        # original spacing between them.
        self.text = text.split()
        self.index = 0

    def __iter__(self):
        return self

    def __next__(self):
        # Guard first: nothing left to hand out.
        if self.index >= len(self.text):
            raise StopIteration
        word = self.text[self.index]
        self.index += 1
        return word

    def __aiter__(self):
        return self

    async def __anext__(self):
        # Async twin of __next__; signals exhaustion with StopAsyncIteration.
        if self.index >= len(self.text):
            raise StopAsyncIteration
        word = self.text[self.index]
        self.index += 1
        return word
|
||||
|
||||
|
||||
def _get_image_bytes_from_url(image_url: str) -> bytes:
|
||||
try:
|
||||
response = requests.get(image_url)
|
||||
|
@ -216,6 +251,14 @@ def completion(
|
|||
status_code=400,
|
||||
message="vertexai import failed please run `pip install google-cloud-aiplatform`",
|
||||
)
|
||||
|
||||
if not (
|
||||
hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
|
||||
):
|
||||
raise VertexAIError(
|
||||
status_code=400,
|
||||
message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
|
||||
)
|
||||
try:
|
||||
from vertexai.preview.language_models import (
|
||||
ChatModel,
|
||||
|
@ -228,9 +271,17 @@ def completion(
|
|||
Part,
|
||||
GenerationConfig,
|
||||
)
|
||||
from google.cloud import aiplatform
|
||||
from google.protobuf import json_format # type: ignore
|
||||
from google.protobuf.struct_pb2 import Value # type: ignore
|
||||
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
|
||||
import google.auth
|
||||
|
||||
vertexai.init(project=vertex_project, location=vertex_location)
|
||||
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
|
||||
creds, _ = google.auth.default(quota_project_id=vertex_project)
|
||||
vertexai.init(
|
||||
project=vertex_project, location=vertex_location, credentials=creds
|
||||
)
|
||||
|
||||
## Load Config
|
||||
config = litellm.VertexAIConfig.get_config()
|
||||
|
@ -264,6 +315,11 @@ def completion(
|
|||
|
||||
request_str = ""
|
||||
response_obj = None
|
||||
async_client = None
|
||||
instances = None
|
||||
client_options = {
|
||||
"api_endpoint": f"{vertex_location}-aiplatform.googleapis.com"
|
||||
}
|
||||
if (
|
||||
model in litellm.vertex_language_models
|
||||
or model in litellm.vertex_vision_models
|
||||
|
@ -283,39 +339,51 @@ def completion(
|
|||
llm_model = CodeGenerationModel.from_pretrained(model)
|
||||
mode = "text"
|
||||
request_str += f"llm_model = CodeGenerationModel.from_pretrained({model})\n"
|
||||
else: # vertex_code_llm_models
|
||||
elif model in litellm.vertex_code_chat_models: # vertex_code_llm_models
|
||||
llm_model = CodeChatModel.from_pretrained(model)
|
||||
mode = "chat"
|
||||
request_str += f"llm_model = CodeChatModel.from_pretrained({model})\n"
|
||||
else: # assume vertex model garden
|
||||
client = aiplatform.gapic.PredictionServiceClient(
|
||||
client_options=client_options
|
||||
)
|
||||
|
||||
if acompletion == True: # [TODO] expand support to vertex ai chat + text models
|
||||
instances = [optional_params]
|
||||
instances[0]["prompt"] = prompt
|
||||
instances = [
|
||||
json_format.ParseDict(instance_dict, Value())
|
||||
for instance_dict in instances
|
||||
]
|
||||
llm_model = client.endpoint_path(
|
||||
project=vertex_project, location=vertex_location, endpoint=model
|
||||
)
|
||||
|
||||
mode = "custom"
|
||||
request_str += f"llm_model = client.endpoint_path(project={vertex_project}, location={vertex_location}, endpoint={model})\n"
|
||||
|
||||
if acompletion == True:
|
||||
data = {
|
||||
"llm_model": llm_model,
|
||||
"mode": mode,
|
||||
"prompt": prompt,
|
||||
"logging_obj": logging_obj,
|
||||
"request_str": request_str,
|
||||
"model": model,
|
||||
"model_response": model_response,
|
||||
"encoding": encoding,
|
||||
"messages": messages,
|
||||
"print_verbose": print_verbose,
|
||||
"client_options": client_options,
|
||||
"instances": instances,
|
||||
"vertex_location": vertex_location,
|
||||
"vertex_project": vertex_project,
|
||||
**optional_params,
|
||||
}
|
||||
if optional_params.get("stream", False) is True:
|
||||
# async streaming
|
||||
return async_streaming(
|
||||
llm_model=llm_model,
|
||||
mode=mode,
|
||||
prompt=prompt,
|
||||
logging_obj=logging_obj,
|
||||
request_str=request_str,
|
||||
model=model,
|
||||
model_response=model_response,
|
||||
messages=messages,
|
||||
print_verbose=print_verbose,
|
||||
**optional_params,
|
||||
)
|
||||
return async_completion(
|
||||
llm_model=llm_model,
|
||||
mode=mode,
|
||||
prompt=prompt,
|
||||
logging_obj=logging_obj,
|
||||
request_str=request_str,
|
||||
model=model,
|
||||
model_response=model_response,
|
||||
encoding=encoding,
|
||||
messages=messages,
|
||||
print_verbose=print_verbose,
|
||||
**optional_params,
|
||||
)
|
||||
return async_streaming(**data)
|
||||
|
||||
return async_completion(**data)
|
||||
|
||||
if mode == "vision":
|
||||
print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
|
||||
|
@ -460,7 +528,36 @@ def completion(
|
|||
},
|
||||
)
|
||||
completion_response = llm_model.predict(prompt, **optional_params).text
|
||||
elif mode == "custom":
|
||||
"""
|
||||
Vertex AI Model Garden
|
||||
"""
|
||||
request_str += (
|
||||
f"client.predict(endpoint={llm_model}, instances={instances})\n"
|
||||
)
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key=None,
|
||||
additional_args={
|
||||
"complete_input_dict": optional_params,
|
||||
"request_str": request_str,
|
||||
},
|
||||
)
|
||||
|
||||
response = client.predict(
|
||||
endpoint=llm_model,
|
||||
instances=instances,
|
||||
).predictions
|
||||
completion_response = response[0]
|
||||
if (
|
||||
isinstance(completion_response, str)
|
||||
and "\nOutput:\n" in completion_response
|
||||
):
|
||||
completion_response = completion_response.split("\nOutput:\n", 1)[1]
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
response = TextStreamer(completion_response)
|
||||
return response
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt, api_key=None, original_response=completion_response
|
||||
|
@ -528,6 +625,10 @@ async def async_completion(
|
|||
encoding=None,
|
||||
messages=None,
|
||||
print_verbose=None,
|
||||
client_options=None,
|
||||
instances=None,
|
||||
vertex_project=None,
|
||||
vertex_location=None,
|
||||
**optional_params,
|
||||
):
|
||||
"""
|
||||
|
@ -616,7 +717,43 @@ async def async_completion(
|
|||
)
|
||||
response_obj = await llm_model.predict_async(prompt, **optional_params)
|
||||
completion_response = response_obj.text
|
||||
elif mode == "custom":
|
||||
"""
|
||||
Vertex AI Model Garden
|
||||
"""
|
||||
from google.cloud import aiplatform
|
||||
|
||||
async_client = aiplatform.gapic.PredictionServiceAsyncClient(
|
||||
client_options=client_options
|
||||
)
|
||||
llm_model = async_client.endpoint_path(
|
||||
project=vertex_project, location=vertex_location, endpoint=model
|
||||
)
|
||||
|
||||
request_str += (
|
||||
f"client.predict(endpoint={llm_model}, instances={instances})\n"
|
||||
)
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key=None,
|
||||
additional_args={
|
||||
"complete_input_dict": optional_params,
|
||||
"request_str": request_str,
|
||||
},
|
||||
)
|
||||
|
||||
response_obj = await async_client.predict(
|
||||
endpoint=llm_model,
|
||||
instances=instances,
|
||||
)
|
||||
response = response_obj.predictions
|
||||
completion_response = response[0]
|
||||
if (
|
||||
isinstance(completion_response, str)
|
||||
and "\nOutput:\n" in completion_response
|
||||
):
|
||||
completion_response = completion_response.split("\nOutput:\n", 1)[1]
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt, api_key=None, original_response=completion_response
|
||||
|
@ -646,14 +783,12 @@ async def async_completion(
|
|||
# init prompt tokens
|
||||
# this block attempts to get usage from response_obj if it exists, if not it uses the litellm token counter
|
||||
prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
|
||||
if response_obj is not None:
|
||||
if hasattr(response_obj, "usage_metadata") and hasattr(
|
||||
response_obj.usage_metadata, "prompt_token_count"
|
||||
):
|
||||
prompt_tokens = response_obj.usage_metadata.prompt_token_count
|
||||
completion_tokens = (
|
||||
response_obj.usage_metadata.candidates_token_count
|
||||
)
|
||||
if response_obj is not None and (
|
||||
hasattr(response_obj, "usage_metadata")
|
||||
and hasattr(response_obj.usage_metadata, "prompt_token_count")
|
||||
):
|
||||
prompt_tokens = response_obj.usage_metadata.prompt_token_count
|
||||
completion_tokens = response_obj.usage_metadata.candidates_token_count
|
||||
else:
|
||||
prompt_tokens = len(encoding.encode(prompt))
|
||||
completion_tokens = len(
|
||||
|
@ -682,8 +817,13 @@ async def async_streaming(
|
|||
model_response: ModelResponse,
|
||||
logging_obj=None,
|
||||
request_str=None,
|
||||
encoding=None,
|
||||
messages=None,
|
||||
print_verbose=None,
|
||||
client_options=None,
|
||||
instances=None,
|
||||
vertex_project=None,
|
||||
vertex_location=None,
|
||||
**optional_params,
|
||||
):
|
||||
"""
|
||||
|
@ -752,17 +892,198 @@ async def async_streaming(
|
|||
},
|
||||
)
|
||||
response = llm_model.predict_streaming_async(prompt, **optional_params)
|
||||
elif mode == "custom":
|
||||
from google.cloud import aiplatform
|
||||
|
||||
async_client = aiplatform.gapic.PredictionServiceAsyncClient(
|
||||
client_options=client_options
|
||||
)
|
||||
llm_model = async_client.endpoint_path(
|
||||
project=vertex_project, location=vertex_location, endpoint=model
|
||||
)
|
||||
|
||||
request_str += f"client.predict(endpoint={llm_model}, instances={instances})\n"
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key=None,
|
||||
additional_args={
|
||||
"complete_input_dict": optional_params,
|
||||
"request_str": request_str,
|
||||
},
|
||||
)
|
||||
|
||||
response_obj = await async_client.predict(
|
||||
endpoint=llm_model,
|
||||
instances=instances,
|
||||
)
|
||||
response = response_obj.predictions
|
||||
completion_response = response[0]
|
||||
if (
|
||||
isinstance(completion_response, str)
|
||||
and "\nOutput:\n" in completion_response
|
||||
):
|
||||
completion_response = completion_response.split("\nOutput:\n", 1)[1]
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
response = TextStreamer(completion_response)
|
||||
streamwrapper = CustomStreamWrapper(
|
||||
completion_stream=response,
|
||||
model=model,
|
||||
custom_llm_provider="vertex_ai",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
async for transformed_chunk in streamwrapper:
|
||||
yield transformed_chunk
|
||||
return streamwrapper
|
||||
|
||||
|
||||
def embedding(
    model: str,
    input: Union[list, str],
    api_key: Optional[str] = None,
    logging_obj=None,
    model_response=None,
    optional_params=None,
    encoding=None,
    vertex_project=None,
    vertex_location=None,
    aembedding=False,
):
    """
    Run a Vertex AI text-embedding call and fill `model_response` with an
    OpenAI-style embedding payload.

    Steps: import the SDK, initialise it with credentials bound to
    `vertex_project`, load `model` via `TextEmbeddingModel.from_pretrained`,
    call `get_embeddings`, then populate `model_response` ("object", "data",
    "model" and `.usage`).

    When `aembedding == True` the model is still loaded here, but the call
    itself is delegated to `async_embedding` and its un-awaited result is
    returned.

    Raises:
        VertexAIError: 400 if `vertexai` cannot be imported, 401 if
            credential loading / `vertexai.init` fails, 422 if the model
            cannot be loaded, 500 if the embedding request fails.
    """
    # logic for parsing in - calling - parsing out model embedding calls
    try:
        import vertexai
    except:
        raise VertexAIError(
            status_code=400,
            message="vertexai import failed please run `pip install google-cloud-aiplatform`",
        )

    from vertexai.language_models import TextEmbeddingModel
    import google.auth

    ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
    try:
        creds, _ = google.auth.default(quota_project_id=vertex_project)
        vertexai.init(
            project=vertex_project, location=vertex_location, credentials=creds
        )
    except Exception as init_error:
        raise VertexAIError(status_code=401, message=str(init_error))

    # A single string is treated as a batch of one.
    if isinstance(input, str):
        input = [input]

    try:
        llm_model = TextEmbeddingModel.from_pretrained(model)
    except Exception as load_error:
        raise VertexAIError(status_code=422, message=str(load_error))

    if aembedding == True:
        return async_embedding(
            model=model,
            client=llm_model,
            input=input,
            logging_obj=logging_obj,
            model_response=model_response,
            optional_params=optional_params,
            encoding=encoding,
        )

    request_str = f"""embeddings = llm_model.get_embeddings({input})"""
    ## LOGGING PRE-CALL
    pre_call_args = {
        "complete_input_dict": optional_params,
        "request_str": request_str,
    }
    logging_obj.pre_call(input=input, api_key=None, additional_args=pre_call_args)

    try:
        embeddings = llm_model.get_embeddings(input)
    except Exception as call_error:
        raise VertexAIError(status_code=500, message=str(call_error))

    ## LOGGING POST-CALL
    logging_obj.post_call(input=input, api_key=None, original_response=embeddings)

    ## Populate OpenAI compliant dictionary
    embedding_response = [
        {"object": "embedding", "index": position, "embedding": item.values}
        for position, item in enumerate(embeddings)
    ]
    model_response["object"] = "list"
    model_response["data"] = embedding_response
    model_response["model"] = model

    # Usage is counted over all inputs concatenated without a separator.
    input_tokens = len(encoding.encode("".join(input)))
    model_response.usage = Usage(
        prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
    )

    return model_response
|
||||
|
||||
|
||||
async def async_embedding(
    model: str,
    input: Union[list, str],
    logging_obj=None,
    model_response=None,
    optional_params=None,
    encoding=None,
    client=None,
):
    """
    Async embedding implementation.

    Awaits `client.get_embeddings_async(input)`, logs the call before and
    after through `logging_obj`, and fills `model_response` with an
    OpenAI-style payload ("object", "data", "model" and `.usage`).

    Raises:
        VertexAIError: with status 500 if the embedding request fails.
    """
    request_str = f"""embeddings = llm_model.get_embeddings({input})"""
    ## LOGGING PRE-CALL
    pre_call_args = {
        "complete_input_dict": optional_params,
        "request_str": request_str,
    }
    logging_obj.pre_call(input=input, api_key=None, additional_args=pre_call_args)

    try:
        embeddings = await client.get_embeddings_async(input)
    except Exception as call_error:
        raise VertexAIError(status_code=500, message=str(call_error))

    ## LOGGING POST-CALL
    logging_obj.post_call(input=input, api_key=None, original_response=embeddings)

    ## Populate OpenAI compliant dictionary
    embedding_response = [
        {"object": "embedding", "index": position, "embedding": item.values}
        for position, item in enumerate(embeddings)
    ]
    model_response["object"] = "list"
    model_response["data"] = embedding_response
    model_response["model"] = model

    # Usage is counted over all inputs concatenated without a separator.
    input_tokens = len(encoding.encode("".join(input)))
    model_response.usage = Usage(
        prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
    )

    return model_response
|
||||
|
|
253
litellm/main.py
|
@ -15,7 +15,7 @@ import dotenv, traceback, random, asyncio, time, contextvars
|
|||
from copy import deepcopy
|
||||
import httpx
|
||||
import litellm
|
||||
|
||||
from ._logging import verbose_logger
|
||||
from litellm import ( # type: ignore
|
||||
client,
|
||||
exception_type,
|
||||
|
@ -83,6 +83,7 @@ from litellm.utils import (
|
|||
TextCompletionResponse,
|
||||
TextChoices,
|
||||
EmbeddingResponse,
|
||||
ImageResponse,
|
||||
read_config_args,
|
||||
Choices,
|
||||
Message,
|
||||
|
@ -275,14 +276,10 @@ async def acompletion(
|
|||
else:
|
||||
# Call the synchronous function using run_in_executor
|
||||
response = await loop.run_in_executor(None, func_with_context) # type: ignore
|
||||
# if kwargs.get("stream", False): # return an async generator
|
||||
# return _async_streaming(
|
||||
# response=response,
|
||||
# model=model,
|
||||
# custom_llm_provider=custom_llm_provider,
|
||||
# args=args,
|
||||
# )
|
||||
# else:
|
||||
if isinstance(response, CustomStreamWrapper):
|
||||
response.set_logging_event_loop(
|
||||
loop=loop
|
||||
) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
|
||||
return response
|
||||
except Exception as e:
|
||||
custom_llm_provider = custom_llm_provider or "openai"
|
||||
|
@ -345,6 +342,18 @@ def mock_completion(
|
|||
model_response["choices"][0]["message"]["content"] = mock_response
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = model
|
||||
|
||||
model_response.usage = Usage(
|
||||
prompt_tokens=10, completion_tokens=20, total_tokens=30
|
||||
)
|
||||
|
||||
try:
|
||||
_, custom_llm_provider, _, _ = litellm.utils.get_llm_provider(model=model)
|
||||
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
||||
except:
|
||||
# dont let setting a hidden param block a mock_respose
|
||||
pass
|
||||
|
||||
return model_response
|
||||
|
||||
except:
|
||||
|
@ -444,9 +453,12 @@ def completion(
|
|||
num_retries = kwargs.get("num_retries", None) ## deprecated
|
||||
max_retries = kwargs.get("max_retries", None)
|
||||
context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
|
||||
organization = kwargs.get("organization", None)
|
||||
### CUSTOM MODEL COST ###
|
||||
input_cost_per_token = kwargs.get("input_cost_per_token", None)
|
||||
output_cost_per_token = kwargs.get("output_cost_per_token", None)
|
||||
input_cost_per_second = kwargs.get("input_cost_per_second", None)
|
||||
output_cost_per_second = kwargs.get("output_cost_per_second", None)
|
||||
### CUSTOM PROMPT TEMPLATE ###
|
||||
initial_prompt_value = kwargs.get("initial_prompt_value", None)
|
||||
roles = kwargs.get("roles", None)
|
||||
|
@ -524,6 +536,8 @@ def completion(
|
|||
"tpm",
|
||||
"input_cost_per_token",
|
||||
"output_cost_per_token",
|
||||
"input_cost_per_second",
|
||||
"output_cost_per_second",
|
||||
"hf_model_name",
|
||||
"model_info",
|
||||
"proxy_server_request",
|
||||
|
@ -536,10 +550,6 @@ def completion(
|
|||
non_default_params = {
|
||||
k: v for k, v in kwargs.items() if k not in default_params
|
||||
} # model-specific params - pass them straight to the model/provider
|
||||
if mock_response:
|
||||
return mock_completion(
|
||||
model, messages, stream=stream, mock_response=mock_response
|
||||
)
|
||||
if timeout is None:
|
||||
timeout = (
|
||||
kwargs.get("request_timeout", None) or 600
|
||||
|
@ -577,15 +587,45 @@ def completion(
|
|||
api_base=api_base,
|
||||
api_key=api_key,
|
||||
)
|
||||
if model_response is not None and hasattr(model_response, "_hidden_params"):
|
||||
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
||||
model_response._hidden_params["region_name"] = kwargs.get(
|
||||
"aws_region_name", None
|
||||
) # support region-based pricing for bedrock
|
||||
|
||||
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
|
||||
if input_cost_per_token is not None and output_cost_per_token is not None:
|
||||
print_verbose(f"Registering model={model} in model cost map")
|
||||
litellm.register_model(
|
||||
{
|
||||
f"{custom_llm_provider}/{model}": {
|
||||
"input_cost_per_token": input_cost_per_token,
|
||||
"output_cost_per_token": output_cost_per_token,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
},
|
||||
model: {
|
||||
"input_cost_per_token": input_cost_per_token,
|
||||
"output_cost_per_token": output_cost_per_token,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
}
|
||||
},
|
||||
}
|
||||
)
|
||||
elif (
|
||||
input_cost_per_second is not None
|
||||
): # time based pricing just needs cost in place
|
||||
output_cost_per_second = output_cost_per_second or 0.0
|
||||
litellm.register_model(
|
||||
{
|
||||
f"{custom_llm_provider}/{model}": {
|
||||
"input_cost_per_second": input_cost_per_second,
|
||||
"output_cost_per_second": output_cost_per_second,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
},
|
||||
model: {
|
||||
"input_cost_per_second": input_cost_per_second,
|
||||
"output_cost_per_second": output_cost_per_second,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
},
|
||||
}
|
||||
)
|
||||
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
|
||||
|
@ -674,6 +714,10 @@ def completion(
|
|||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
if mock_response:
|
||||
return mock_completion(
|
||||
model, messages, stream=stream, mock_response=mock_response
|
||||
)
|
||||
if custom_llm_provider == "azure":
|
||||
# azure configs
|
||||
api_type = get_secret("AZURE_API_TYPE") or "azure"
|
||||
|
@ -692,9 +736,9 @@ def completion(
|
|||
or get_secret("AZURE_API_KEY")
|
||||
)
|
||||
|
||||
azure_ad_token = optional_params.pop("azure_ad_token", None) or get_secret(
|
||||
"AZURE_AD_TOKEN"
|
||||
)
|
||||
azure_ad_token = optional_params.get("extra_body", {}).pop(
|
||||
"azure_ad_token", None
|
||||
) or get_secret("AZURE_AD_TOKEN")
|
||||
|
||||
headers = headers or litellm.headers
|
||||
|
||||
|
@ -758,7 +802,8 @@ def completion(
|
|||
or "https://api.openai.com/v1"
|
||||
)
|
||||
openai.organization = (
|
||||
litellm.organization
|
||||
organization
|
||||
or litellm.organization
|
||||
or get_secret("OPENAI_ORGANIZATION")
|
||||
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
|
||||
)
|
||||
|
@ -798,6 +843,7 @@ def completion(
|
|||
timeout=timeout,
|
||||
custom_prompt_dict=custom_prompt_dict,
|
||||
client=client, # pass AsyncOpenAI, OpenAI client
|
||||
organization=organization,
|
||||
)
|
||||
except Exception as e:
|
||||
## LOGGING - log the original exception returned
|
||||
|
@ -967,6 +1013,7 @@ def completion(
|
|||
encoding=encoding, # for calculating input/output tokens
|
||||
api_key=api_key,
|
||||
logging_obj=logging,
|
||||
headers=headers,
|
||||
)
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
# don't try to access stream object,
|
||||
|
@ -1376,11 +1423,29 @@ def completion(
|
|||
acompletion=acompletion,
|
||||
custom_prompt_dict=custom_prompt_dict,
|
||||
)
|
||||
if (
|
||||
"stream" in optional_params
|
||||
and optional_params["stream"] == True
|
||||
and acompletion == False
|
||||
):
|
||||
response = CustomStreamWrapper(
|
||||
iter(model_response),
|
||||
model,
|
||||
custom_llm_provider="gemini",
|
||||
logging_obj=logging,
|
||||
)
|
||||
return response
|
||||
response = model_response
|
||||
elif custom_llm_provider == "vertex_ai":
|
||||
vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT")
|
||||
vertex_ai_location = litellm.vertex_location or get_secret(
|
||||
"VERTEXAI_LOCATION"
|
||||
vertex_ai_project = (
|
||||
optional_params.pop("vertex_ai_project", None)
|
||||
or litellm.vertex_project
|
||||
or get_secret("VERTEXAI_PROJECT")
|
||||
)
|
||||
vertex_ai_location = (
|
||||
optional_params.pop("vertex_ai_location", None)
|
||||
or litellm.vertex_location
|
||||
or get_secret("VERTEXAI_LOCATION")
|
||||
)
|
||||
|
||||
model_response = vertex_ai.completion(
|
||||
|
@ -1471,19 +1536,22 @@ def completion(
|
|||
if (
|
||||
"stream" in optional_params and optional_params["stream"] == True
|
||||
): ## [BETA]
|
||||
# sagemaker does not support streaming as of now so we're faking streaming:
|
||||
# https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611
|
||||
# "SageMaker is currently not supporting streaming responses."
|
||||
|
||||
# fake streaming for sagemaker
|
||||
print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
|
||||
resp_string = model_response["choices"][0]["message"]["content"]
|
||||
from .llms.sagemaker import TokenIterator
|
||||
|
||||
tokenIterator = TokenIterator(model_response)
|
||||
response = CustomStreamWrapper(
|
||||
resp_string,
|
||||
model,
|
||||
completion_stream=tokenIterator,
|
||||
model=model,
|
||||
custom_llm_provider="sagemaker",
|
||||
logging_obj=logging,
|
||||
)
|
||||
## LOGGING
|
||||
logging.post_call(
|
||||
input=messages,
|
||||
api_key=None,
|
||||
original_response=response,
|
||||
)
|
||||
return response
|
||||
|
||||
## RESPONSE OBJECT
|
||||
|
@ -2146,6 +2214,7 @@ async def aembedding(*args, **kwargs):
|
|||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "ollama"
|
||||
or custom_llm_provider == "vertex_ai"
|
||||
): # currently implemented aiohttp calls for just azure and openai, soon all.
|
||||
# Await normally
|
||||
init_response = await loop.run_in_executor(None, func_with_context)
|
||||
|
@ -2158,6 +2227,8 @@ async def aembedding(*args, **kwargs):
|
|||
else:
|
||||
# Call the synchronous function using run_in_executor
|
||||
response = await loop.run_in_executor(None, func_with_context)
|
||||
if response is not None and hasattr(response, "_hidden_params"):
|
||||
response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
||||
return response
|
||||
except Exception as e:
|
||||
custom_llm_provider = custom_llm_provider or "openai"
|
||||
|
@ -2174,6 +2245,7 @@ def embedding(
|
|||
model,
|
||||
input=[],
|
||||
# Optional params
|
||||
dimensions: Optional[int] = None,
|
||||
timeout=600, # default to 10 minutes
|
||||
# set api_base, api_version, api_key
|
||||
api_base: Optional[str] = None,
|
||||
|
@ -2194,6 +2266,7 @@ def embedding(
|
|||
Parameters:
|
||||
- model: The embedding model to use.
|
||||
- input: The input for which embeddings are to be generated.
|
||||
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
|
||||
- timeout: The timeout value for the API call, default 10 mins
|
||||
- litellm_call_id: The call ID for litellm logging.
|
||||
- litellm_logging_obj: The litellm logging object.
|
||||
|
@ -2220,8 +2293,14 @@ def embedding(
|
|||
encoding_format = kwargs.get("encoding_format", None)
|
||||
proxy_server_request = kwargs.get("proxy_server_request", None)
|
||||
aembedding = kwargs.get("aembedding", None)
|
||||
### CUSTOM MODEL COST ###
|
||||
input_cost_per_token = kwargs.get("input_cost_per_token", None)
|
||||
output_cost_per_token = kwargs.get("output_cost_per_token", None)
|
||||
input_cost_per_second = kwargs.get("input_cost_per_second", None)
|
||||
output_cost_per_second = kwargs.get("output_cost_per_second", None)
|
||||
openai_params = [
|
||||
"user",
|
||||
"dimensions",
|
||||
"request_timeout",
|
||||
"api_base",
|
||||
"api_version",
|
||||
|
@ -2268,6 +2347,8 @@ def embedding(
|
|||
"tpm",
|
||||
"input_cost_per_token",
|
||||
"output_cost_per_token",
|
||||
"input_cost_per_second",
|
||||
"output_cost_per_second",
|
||||
"hf_model_name",
|
||||
"proxy_server_request",
|
||||
"model_info",
|
||||
|
@ -2288,11 +2369,35 @@ def embedding(
|
|||
api_key=api_key,
|
||||
)
|
||||
optional_params = get_optional_params_embeddings(
|
||||
model=model,
|
||||
user=user,
|
||||
dimensions=dimensions,
|
||||
encoding_format=encoding_format,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
**non_default_params,
|
||||
)
|
||||
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
|
||||
if input_cost_per_token is not None and output_cost_per_token is not None:
|
||||
litellm.register_model(
|
||||
{
|
||||
model: {
|
||||
"input_cost_per_token": input_cost_per_token,
|
||||
"output_cost_per_token": output_cost_per_token,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
}
|
||||
}
|
||||
)
|
||||
if input_cost_per_second is not None: # time based pricing just needs cost in place
|
||||
output_cost_per_second = output_cost_per_second or 0.0
|
||||
litellm.register_model(
|
||||
{
|
||||
model: {
|
||||
"input_cost_per_second": input_cost_per_second,
|
||||
"output_cost_per_second": output_cost_per_second,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
}
|
||||
}
|
||||
)
|
||||
try:
|
||||
response = None
|
||||
logging = litellm_logging_obj
|
||||
|
@ -2385,7 +2490,7 @@ def embedding(
|
|||
client=client,
|
||||
aembedding=aembedding,
|
||||
)
|
||||
elif model in litellm.cohere_embedding_models:
|
||||
elif custom_llm_provider == "cohere":
|
||||
cohere_key = (
|
||||
api_key
|
||||
or litellm.cohere_key
|
||||
|
@ -2427,6 +2532,29 @@ def embedding(
|
|||
optional_params=optional_params,
|
||||
model_response=EmbeddingResponse(),
|
||||
)
|
||||
elif custom_llm_provider == "vertex_ai":
|
||||
vertex_ai_project = (
|
||||
optional_params.pop("vertex_ai_project", None)
|
||||
or litellm.vertex_project
|
||||
or get_secret("VERTEXAI_PROJECT")
|
||||
)
|
||||
vertex_ai_location = (
|
||||
optional_params.pop("vertex_ai_location", None)
|
||||
or litellm.vertex_location
|
||||
or get_secret("VERTEXAI_LOCATION")
|
||||
)
|
||||
|
||||
response = vertex_ai.embedding(
|
||||
model=model,
|
||||
input=input,
|
||||
encoding=encoding,
|
||||
logging_obj=logging,
|
||||
optional_params=optional_params,
|
||||
model_response=EmbeddingResponse(),
|
||||
vertex_project=vertex_ai_project,
|
||||
vertex_location=vertex_ai_location,
|
||||
aembedding=aembedding,
|
||||
)
|
||||
elif custom_llm_provider == "oobabooga":
|
||||
response = oobabooga.embedding(
|
||||
model=model,
|
||||
|
@ -2513,6 +2641,8 @@ def embedding(
|
|||
else:
|
||||
args = locals()
|
||||
raise ValueError(f"No valid embedding model args passed in - {args}")
|
||||
if response is not None and hasattr(response, "_hidden_params"):
|
||||
response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
||||
return response
|
||||
except Exception as e:
|
||||
## LOGGING
|
||||
|
@ -2523,9 +2653,7 @@ def embedding(
|
|||
)
|
||||
## Map to OpenAI Exception
|
||||
raise exception_type(
|
||||
model=model,
|
||||
original_exception=e,
|
||||
custom_llm_provider="azure" if azure == True else None,
|
||||
model=model, original_exception=e, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
|
||||
|
||||
|
@ -2914,6 +3042,7 @@ def image_generation(
|
|||
else:
|
||||
model = "dall-e-2"
|
||||
custom_llm_provider = "openai" # default to dall-e-2 on openai
|
||||
model_response._hidden_params["model"] = model
|
||||
openai_params = [
|
||||
"user",
|
||||
"request_timeout",
|
||||
|
@ -2987,7 +3116,7 @@ def image_generation(
|
|||
custom_llm_provider=custom_llm_provider,
|
||||
**non_default_params,
|
||||
)
|
||||
logging = litellm_logging_obj
|
||||
logging: Logging = litellm_logging_obj
|
||||
logging.update_environment_variables(
|
||||
model=model,
|
||||
user=user,
|
||||
|
@ -3051,7 +3180,18 @@ def image_generation(
|
|||
model_response=model_response,
|
||||
aimg_generation=aimg_generation,
|
||||
)
|
||||
|
||||
elif custom_llm_provider == "bedrock":
|
||||
if model is None:
|
||||
raise Exception("Model needs to be set for bedrock")
|
||||
model_response = bedrock.image_generation(
|
||||
model=model,
|
||||
prompt=prompt,
|
||||
timeout=timeout,
|
||||
logging_obj=litellm_logging_obj,
|
||||
optional_params=optional_params,
|
||||
model_response=model_response,
|
||||
aimg_generation=aimg_generation,
|
||||
)
|
||||
return model_response
|
||||
except Exception as e:
|
||||
## Map to OpenAI Exception
|
||||
|
@ -3068,7 +3208,9 @@ def image_generation(
|
|||
|
||||
async def ahealth_check(
|
||||
model_params: dict,
|
||||
mode: Optional[Literal["completion", "embedding", "image_generation"]] = None,
|
||||
mode: Optional[
|
||||
Literal["completion", "embedding", "image_generation", "chat"]
|
||||
] = None,
|
||||
prompt: Optional[str] = None,
|
||||
input: Optional[List] = None,
|
||||
default_timeout: float = 6000,
|
||||
|
@ -3085,8 +3227,11 @@ async def ahealth_check(
|
|||
if model is None:
|
||||
raise Exception("model not set")
|
||||
|
||||
if model in litellm.model_cost and mode is None:
|
||||
mode = litellm.model_cost[model]["mode"]
|
||||
|
||||
model, custom_llm_provider, _, _ = get_llm_provider(model=model)
|
||||
mode = mode or "completion" # default to completion calls
|
||||
mode = mode or "chat" # default to chat completion calls
|
||||
|
||||
if custom_llm_provider == "azure":
|
||||
api_key = (
|
||||
|
@ -3126,8 +3271,12 @@ async def ahealth_check(
|
|||
prompt=prompt,
|
||||
input=input,
|
||||
)
|
||||
elif custom_llm_provider == "openai":
|
||||
elif (
|
||||
custom_llm_provider == "openai"
|
||||
or custom_llm_provider == "text-completion-openai"
|
||||
):
|
||||
api_key = model_params.get("api_key") or get_secret("OPENAI_API_KEY")
|
||||
organization = model_params.get("organization")
|
||||
|
||||
timeout = (
|
||||
model_params.get("timeout")
|
||||
|
@ -3145,8 +3294,12 @@ async def ahealth_check(
|
|||
mode=mode,
|
||||
prompt=prompt,
|
||||
input=input,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
model_params["cache"] = {
|
||||
"no-cache": True
|
||||
} # don't used cached responses for making health check calls
|
||||
if mode == "embedding":
|
||||
model_params.pop("messages", None)
|
||||
model_params["input"] = input
|
||||
|
@ -3169,6 +3322,7 @@ async def ahealth_check(
|
|||
## Set verbose to true -> ```litellm.set_verbose = True```
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
verbose_logger.debug(print_statement)
|
||||
if litellm.set_verbose:
|
||||
print(print_statement) # noqa
|
||||
except:
|
||||
|
@ -3256,7 +3410,23 @@ def stream_chunk_builder_text_completion(chunks: list, messages: Optional[List]
|
|||
return response
|
||||
|
||||
|
||||
def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
|
||||
def stream_chunk_builder(
|
||||
chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
|
||||
):
|
||||
model_response = litellm.ModelResponse()
|
||||
### SORT CHUNKS BASED ON CREATED ORDER ##
|
||||
print_verbose("Goes into checking if chunk has hiddden created at param")
|
||||
if chunks[0]._hidden_params.get("created_at", None):
|
||||
print_verbose("Chunks have a created at hidden param")
|
||||
# Sort chunks based on created_at in ascending order
|
||||
chunks = sorted(
|
||||
chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
|
||||
)
|
||||
print_verbose("Chunks sorted")
|
||||
|
||||
# set hidden params from chunk to model_response
|
||||
if model_response is not None and hasattr(model_response, "_hidden_params"):
|
||||
model_response._hidden_params = chunks[0].get("_hidden_params", {})
|
||||
id = chunks[0]["id"]
|
||||
object = chunks[0]["object"]
|
||||
created = chunks[0]["created"]
|
||||
|
@ -3427,5 +3597,8 @@ def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
|
|||
response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
|
||||
)
|
||||
return convert_to_model_response_object(
|
||||
response_object=response, model_response_object=litellm.ModelResponse()
|
||||
response_object=response,
|
||||
model_response_object=model_response,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
|
1
litellm/proxy/_experimental/out/404.html
Normal file
|
@ -0,0 +1 @@
|
|||
self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-9a890acb1e81c3fc.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
|
|
@ -0,0 +1 @@
|
|||
self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()
|
|
@ -0,0 +1 @@
|
|||
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[165],{3155:function(e,t,n){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_not-found",function(){return n(4032)}])},4032:function(e,t,n){"use strict";Object.defineProperty(t,"__esModule",{value:!0}),Object.defineProperty(t,"default",{enumerable:!0,get:function(){return i}}),n(6921);let o=n(3827);n(4090);let r={error:{fontFamily:'system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji"',height:"100vh",textAlign:"center",display:"flex",flexDirection:"column",alignItems:"center",justifyContent:"center"},desc:{display:"inline-block"},h1:{display:"inline-block",margin:"0 20px 0 0",padding:"0 23px 0 0",fontSize:24,fontWeight:500,verticalAlign:"top",lineHeight:"49px"},h2:{fontSize:14,fontWeight:400,lineHeight:"49px",margin:0}};function i(){return(0,o.jsxs)(o.Fragment,{children:[(0,o.jsx)("title",{children:"404: This page could not be found."}),(0,o.jsx)("div",{style:r.error,children:(0,o.jsxs)("div",{children:[(0,o.jsx)("style",{dangerouslySetInnerHTML:{__html:"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}),(0,o.jsx)("h1",{className:"next-error-h1",style:r.h1,children:"404"}),(0,o.jsx)("div",{style:r.desc,children:(0,o.jsx)("h2",{style:r.h2,children:"This page could not be found."})})]})})]})}("function"==typeof t.default||"object"==typeof t.default&&null!==t.default)&&void 0===t.default.__esModule&&(Object.defineProperty(t.default,"__esModule",{value:!0}),Object.assign(t.default,t),e.exports=t.default)}},function(e){e.O(0,[971,69,744],function(){return e(e.s=3155)}),_N_E=e.O()}]);
|
|
@ -0,0 +1 @@
|
|||
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{7421:function(n,e,t){Promise.resolve().then(t.t.bind(t,9646,23)),Promise.resolve().then(t.t.bind(t,3385,23))},3385:function(){},9646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=7421)}),_N_E=n.O()}]);
|
|
@ -0,0 +1 @@
|
|||
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{2028:function(e,n,t){Promise.resolve().then(t.t.bind(t,7690,23)),Promise.resolve().then(t.t.bind(t,8955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,1902,23)),Promise.resolve().then(t.t.bind(t,1778,23)),Promise.resolve().then(t.t.bind(t,7831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(5317),n(2028)}),_N_E=e.O()}]);
|
|
@ -0,0 +1 @@
|
|||
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[888],{1597:function(n,_,u){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_app",function(){return u(7174)}])}},function(n){var _=function(_){return n(n.s=_)};n.O(0,[774,179],function(){return _(1597),_(4546)}),_N_E=n.O()}]);
|
|
@ -0,0 +1 @@
|
|||
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[820],{1981:function(n,_,u){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_error",function(){return u(5103)}])}},function(n){n.O(0,[888,774,179],function(){return n(n.s=1981)}),_N_E=n.O()}]);
|
|
@ -0,0 +1 @@
|
|||
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function s(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={exports:{}},r=!0;try{a[e](n,n.exports,s),r=!1}finally{r&&delete l[e]}return n.exports}s.m=a,e=[],s.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(s.O).every(function(e){return s.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},s.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return s.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},s.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);s.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},s.d(o,u),o},s.d=function(e,t){for(var n in t)s.o(t,n)&&!s.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},s.f={},s.e=function(e){return Promise.all(Object.keys(s.f).reduce(function(t,n){return s.f[n](e,t),t},[]))},s.u=function(e){},s.miniCssF=function(e){return"static/css/7384ba6288e79f81.css"},s.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),s.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",s.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var 
l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,s.nc&&i.setAttribute("nonce",s.nc),i.setAttribute("data-webpack",o+n),i.src=s.tu(e)),r[e]=[t];var d=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(d.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=d.bind(null,i.onerror),i.onload=d.bind(null,i.onload),c&&document.head.appendChild(i)},s.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},s.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},s.tu=function(e){return s.tt().createScriptURL(e)},s.p="/ui/_next/",i={272:0},s.f.j=function(e,t){var n=s.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=s.p+s.u(e),u=Error();s.l(o,function(t){if(s.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},s.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)s.o(u,n)&&(s.m[n]=u[n]);if(c)var a=c(s)}for(e&&e(t);f<o.length;f++)r=o[f],s.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return s.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
|
BIN
litellm/proxy/_experimental/out/favicon.ico
Normal file
After Width: | Height: | Size: 25 KiB |
1
litellm/proxy/_experimental/out/index.html
Normal file
8
litellm/proxy/_experimental/out/index.txt
Normal file
|
@ -0,0 +1,8 @@
|
|||
2:"$Sreact.suspense"
|
||||
3:I[5250,["448","static/chunks/448-cd38799829cf7b57.js","931","static/chunks/app/page-e5227a95293777d5.js"],""]
|
||||
4:I[3239,["448","static/chunks/448-cd38799829cf7b57.js","931","static/chunks/app/page-e5227a95293777d5.js"],""]
|
||||
5:I[5613,[],""]
|
||||
6:I[1778,[],""]
|
||||
0:["B4oAVsVV35eL3Y1bPepKW",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$2",null,{"fallback":["$","div",null,{"children":"Loading..."}],"children":["$","div",null,{"className":"flex min-h-screen flex-col items-center","children":[["$","nav",null,{"className":"left-0 right-0 top-0 flex justify-between items-center h-12","children":[["$","div",null,{"className":"text-left mx-4 my-2 absolute top-0 left-0","children":["$","div",null,{"className":"flex flex-col items-center","children":["$","$L3",null,{"href":"/","children":["$","button",null,{"className":"text-gray-800 text-2xl px-4 py-1 rounded text-center","children":"🚅 LiteLLM"}]}]}]}],["$","div",null,{"className":"text-right mx-4 my-2 absolute top-0 right-0","children":[["$","a",null,{"href":"https://docs.litellm.ai/docs/","target":"_blank","rel":"noopener noreferrer","children":["$","button",null,{"className":"border border-gray-800 rounded-lg text-gray-800 text-xl px-4 py-1 rounded p-1 mr-2 text-center","children":"Docs"}]}],["$","a",null,{"href":"https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version","target":"_blank","rel":"noopener noreferrer","children":["$","button",null,{"className":"border border-gray-800 rounded-lg text-gray-800 text-xl px-4 py-1 rounded p-1 text-center","children":"Schedule Demo"}]}]]}]]}],["$","$L4",null,{}]]}]}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L5",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L6",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be 
found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/7384ba6288e79f81.css","precedence":"next","crossOrigin":""}]],"$L7"]]]]
|
||||
7:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"Create Next App"}],["$","meta","3",{"name":"description","content":"Generated by create next app"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
|
||||
1:null
|