From 63b4e11bfdf1c4976194a795fb640509bc123f88 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Fri, 16 Aug 2024 11:47:13 -0700 Subject: [PATCH 1/2] docs sagemaker - add example using with proxy --- .../docs/providers/aws_sagemaker.md | 300 ++++++++++++++++-- docs/my-website/docs/providers/bedrock.md | 2 +- 2 files changed, 277 insertions(+), 25 deletions(-) diff --git a/docs/my-website/docs/providers/aws_sagemaker.md b/docs/my-website/docs/providers/aws_sagemaker.md index 2b65709e8..c6004e83c 100644 --- a/docs/my-website/docs/providers/aws_sagemaker.md +++ b/docs/my-website/docs/providers/aws_sagemaker.md @@ -1,10 +1,18 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem' + # AWS Sagemaker LiteLLM supports All Sagemaker Huggingface Jumpstart Models +:::tip + +**We support ALL Sagemaker models, just set `model=sagemaker/` as a prefix when sending litellm requests** + +::: + + ### API KEYS ```python -!pip install boto3 - os.environ["AWS_ACCESS_KEY_ID"] = "" os.environ["AWS_SECRET_ACCESS_KEY"] = "" os.environ["AWS_REGION_NAME"] = "" @@ -27,6 +35,263 @@ response = completion( ) ``` +### Usage - Streaming +Sagemaker currently does not support streaming - LiteLLM fakes streaming by returning chunks of the response string + +```python +import os +from litellm import completion + +os.environ["AWS_ACCESS_KEY_ID"] = "" +os.environ["AWS_SECRET_ACCESS_KEY"] = "" +os.environ["AWS_REGION_NAME"] = "" + +response = completion( + model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", + messages=[{ "content": "Hello, how are you?","role": "user"}], + temperature=0.2, + max_tokens=80, + stream=True, + ) +for chunk in response: + print(chunk) +``` + + +## **LiteLLM Proxy Usage** + +Here's how to call Sagemaker with the LiteLLM Proxy Server + +### 1. Setup config.yaml + +```yaml +model_list: + - model_name: jumpstart-model + litellm_params: + model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614 + aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY + aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME +``` + +All possible auth params: + +``` +aws_access_key_id: Optional[str], +aws_secret_access_key: Optional[str], +aws_session_token: Optional[str], +aws_region_name: Optional[str], +aws_session_name: Optional[str], +aws_profile_name: Optional[str], +aws_role_name: Optional[str], +aws_web_identity_token: Optional[str], +``` + +### 2. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` +### 3. 
Test it + + + + + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data ' { + "model": "jumpstart-model", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] + } +' +``` + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create(model="jumpstart-model", messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } +]) + +print(response) + +``` + + + +```python +from langchain.chat_models import ChatOpenAI +from langchain.prompts.chat import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, + SystemMessagePromptTemplate, +) +from langchain.schema import HumanMessage, SystemMessage + +chat = ChatOpenAI( + openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy + model = "jumpstart-model", + temperature=0.1 +) + +messages = [ + SystemMessage( + content="You are a helpful assistant that im using to make a test request to." + ), + HumanMessage( + content="test from litellm. tell me why it's amazing in 1 sentence" + ), +] +response = chat(messages) + +print(response) +``` + + + +## Set temperature, top p, etc. + + + + +```python +import os +from litellm import completion + +os.environ["AWS_ACCESS_KEY_ID"] = "" +os.environ["AWS_SECRET_ACCESS_KEY"] = "" +os.environ["AWS_REGION_NAME"] = "" + +response = completion( + model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614", + messages=[{ "content": "Hello, how are you?","role": "user"}], + temperature=0.7, + top_p=1 +) +``` + + + +**Set on yaml** + +```yaml +model_list: + - model_name: jumpstart-model + litellm_params: + model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614 + temperature: + top_p: +``` + +**Set on request** + +```python + +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.create(model="jumpstart-model", messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } +], +temperature=0.7, +top_p=1 +) + +print(response) + +``` + + + + +## Pass provider-specific params + +If you pass a non-openai param to litellm, we'll assume it's provider-specific and send it as a kwarg in the request body. 
[See more](../completion/input.md#provider-specific-params)


```python
import os
from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

response = completion(
    model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
    messages=[{ "content": "Hello, how are you?","role": "user"}],
    top_k=1 # 👈 PROVIDER-SPECIFIC PARAM
)
```


**Set on yaml**

```yaml
model_list:
  - model_name: jumpstart-model
    litellm_params:
      model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614
      top_k: 1 # 👈 PROVIDER-SPECIFIC PARAM
```

**Set on request**

```python

import openai
client = openai.OpenAI(
    api_key="anything",
    base_url="http://0.0.0.0:4000"
)

# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="jumpstart-model", messages = [
    {
        "role": "user",
        "content": "this is a test request, write a short poem"
    }
],
temperature=0.7,
extra_body={
    "top_k": 1 # 👈 PROVIDER-SPECIFIC PARAM
}
)

print(response)

```


### Passing Inference Component Name

If you have multiple models on an endpoint, you'll need to specify the individual model names; do this via `model_id`.

@@ -85,29 +350,16 @@ response = completion(

You can also pass in your own [custom prompt template](../completion/prompt_formatting.md#format-prompt-yourself)

-### Usage - Streaming
-Sagemaker currently does not support streaming - LiteLLM fakes streaming by returning chunks of the response string
-
-```python
-import os
-from litellm import completion
-
-os.environ["AWS_ACCESS_KEY_ID"] = ""
-os.environ["AWS_SECRET_ACCESS_KEY"] = ""
-os.environ["AWS_REGION_NAME"] = ""
-
-response = completion(
-    model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
-    messages=[{ "content": "Hello, how are you?","role": "user"}],
-    temperature=0.2,
-    max_tokens=80,
-    stream=True,
-  )
-for chunk in response:
-    print(chunk)
-```

### Completion Models
+
+
+:::tip
+
+**We support ALL Sagemaker models, just set `model=sagemaker/` as a prefix when sending litellm requests**
+
+:::
+
Here's an example of using a sagemaker model with LiteLLM

| Model Name | Function Call |
@@ -120,7 +372,7 @@ Here's an example of using a sagemaker model with LiteLLM
| Meta Llama 2 70B | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| Meta Llama 2 70B (Chat/Fine-tuned) | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b-b-f', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |

-### Embedding Models
+## Embedding Models

LiteLLM supports all Sagemaker Jumpstart Huggingface Embedding models. 
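As a quick sketch of the shape of such a call with the LiteLLM SDK (the deployment name below is a placeholder, not a value from this change), embeddings go through `litellm.embedding` with the same AWS credentials:

```python
import os
from litellm import embedding

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

# <your-embedding-endpoint> is a placeholder - use the name of your own
# Sagemaker Jumpstart embedding deployment
response = embedding(
    model="sagemaker/<your-embedding-endpoint>",
    input=["good morning from litellm"],
)
print(response)
```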
Here's how to call it: diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md index 907dfc233..a1a056d41 100644 --- a/docs/my-website/docs/providers/bedrock.md +++ b/docs/my-website/docs/providers/bedrock.md @@ -36,7 +36,7 @@ response = completion( ) ``` -## OpenAI Proxy Usage +## LiteLLM Proxy Usage Here's how to call Anthropic with the LiteLLM Proxy Server From dcd8ff44df002a7fd47386ba8c0c738478551769 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Fri, 16 Aug 2024 12:04:35 -0700 Subject: [PATCH 2/2] docs add example on setting temp=0 for sagemaker --- .../docs/providers/aws_sagemaker.md | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/docs/my-website/docs/providers/aws_sagemaker.md b/docs/my-website/docs/providers/aws_sagemaker.md index c6004e83c..0f8a55261 100644 --- a/docs/my-website/docs/providers/aws_sagemaker.md +++ b/docs/my-website/docs/providers/aws_sagemaker.md @@ -227,6 +227,70 @@ print(response) +## **Allow setting temperature=0** for Sagemaker + +By default when `temperature=0` is sent in requests to LiteLLM, LiteLLM rounds up to `temperature=0.1` since Sagemaker fails most requests when `temperature=0` + +If you want to send `temperature=0` for your model here's how to set it up (Since Sagemaker can host any kind of model, some models allow zero temperature) + + + + +```python +import os +from litellm import completion + +os.environ["AWS_ACCESS_KEY_ID"] = "" +os.environ["AWS_SECRET_ACCESS_KEY"] = "" +os.environ["AWS_REGION_NAME"] = "" + +response = completion( + model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614", + messages=[{ "content": "Hello, how are you?","role": "user"}], + temperature=0, + aws_sagemaker_allow_zero_temp=True, +) +``` + + + +**Set `aws_sagemaker_allow_zero_temp` on yaml** + +```yaml +model_list: + - model_name: jumpstart-model + litellm_params: + model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614 + aws_sagemaker_allow_zero_temp: true +``` + +**Set `temperature=0` on request** + +```python + +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.create(model="jumpstart-model", messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } +], +temperature=0, +) + +print(response) + +``` + + + + ## Pass provider-specific params If you pass a non-openai param to litellm, we'll assume it's provider-specific and send it as a kwarg in the request body. [See more](../completion/input.md#provider-specific-params)
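The inference-component section earlier notes that endpoints hosting multiple models need `model_id`; a minimal sketch of that call with the LiteLLM SDK (both names below are placeholders, not values from this change):

```python
import os
from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

# Both names below are placeholders - substitute your own endpoint and
# inference component; model_id routes the call to one model on the endpoint
response = completion(
    model="sagemaker/<your-multi-model-endpoint>",
    model_id="<your-inference-component-name>",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(response)
```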
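And since LiteLLM fakes streaming for Sagemaker by chunking the response string (per the streaming section), a streaming request through the proxy is an ordinary OpenAI-style streaming call; a sketch that assumes the `jumpstart-model` alias and a proxy running on http://0.0.0.0:4000 as configured above:

```python
import openai

client = openai.OpenAI(
    api_key="anything",              # mirrors the examples above; use a real key if your proxy requires one
    base_url="http://0.0.0.0:4000"   # the LiteLLM proxy started earlier
)

# stream=True returns an iterator of OpenAI-style chunks; LiteLLM produces the
# chunks from the Sagemaker response string
stream = client.chat.completions.create(
    model="jumpstart-model",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
    stream=True,
)

for chunk in stream:
    # content can be None on the final chunk, so guard before printing
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```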