docs(routing.md): add weight-based shuffling to docs

2024-08-30 08:24:12 -07:00 · 2024-08-30 08:24:12 -07:00 · 7f1531006c
commit 7f1531006c
parent 94db4ec830
2 changed files with 103 additions and 4 deletions
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@ -315,6 +315,33 @@ router_settings:
 If neither `rpm` nor `tpm` is provided, it picks a deployment at random.
 You can also set a `weight` param to specify how often each deployment should get picked relative to the others.
 <Tabs>
 <TabItem value="rpm" label="RPM-based shuffling">
 ##### **LiteLLM Proxy Config.yaml**
 ```yaml
 model_list:
   - model_name: gpt-3.5-turbo
     litellm_params:
       model: azure/chatgpt-v-2
       api_key: os.environ/AZURE_API_KEY
       api_version: os.environ/AZURE_API_VERSION
       api_base: os.environ/AZURE_API_BASE
       rpm: 900
   - model_name: gpt-3.5-turbo
     litellm_params:
       model: azure/chatgpt-functioncalling
       api_key: os.environ/AZURE_API_KEY
       api_version: os.environ/AZURE_API_VERSION
       api_base: os.environ/AZURE_API_BASE
       rpm: 10
 ```
 ##### **Python SDK**
 ```python
 from litellm import Router
 import asyncio
 import os
@ -337,12 +364,68 @@ model_list = [{ # list of model deployments
 		"api_base": os.getenv("AZURE_API_BASE"),
 		"rpm": 10,
 	}
 },]
 # init router
 router = Router(model_list=model_list, routing_strategy="simple-shuffle")
 async def router_acompletion():
 	response = await router.acompletion(
 		model="gpt-3.5-turbo", 
 		messages=[{"role": "user", "content": "Hey, how's it going?"}]
 	)
 	print(response)
 	return response
 asyncio.run(router_acompletion())
 ```
 </TabItem>
 <TabItem value="weight" label="Weight-based shuffling">
 ##### **LiteLLM Proxy Config.yaml**
 ```yaml
 model_list:
   - model_name: gpt-3.5-turbo
     litellm_params:
       model: azure/chatgpt-v-2
       api_key: os.environ/AZURE_API_KEY
       api_version: os.environ/AZURE_API_VERSION
       api_base: os.environ/AZURE_API_BASE
       weight: 9
   - model_name: gpt-3.5-turbo
     litellm_params:
       model: azure/chatgpt-functioncalling
       api_key: os.environ/AZURE_API_KEY
       api_version: os.environ/AZURE_API_VERSION
       api_base: os.environ/AZURE_API_BASE
       weight: 1
 ```
 ##### **Python SDK**
 ```python
 from litellm import Router
 import asyncio
 import os
 model_list = [{
 	"model_name": "gpt-3.5-turbo", # model alias 
 	"litellm_params": { 
 		"model": "azure/chatgpt-v-2", # actual model name
 		"api_key": os.getenv("AZURE_API_KEY"),
 		"api_version": os.getenv("AZURE_API_VERSION"),
 		"api_base": os.getenv("AZURE_API_BASE"),
 		"weight": 9, # pick this 90% of the time
 	}
 }, {
    "model_name": "gpt-3.5-turbo", 
-	"litellm_params": { # params for litellm completion/embedding call 
+	"litellm_params": { 
-		"model": "gpt-3.5-turbo", 
+		"model": "azure/chatgpt-functioncalling", 
-		"api_key": os.getenv("OPENAI_API_KEY"),
+		"api_key": os.getenv("AZURE_API_KEY"),
-		"rpm": 10,
+		"api_version": os.getenv("AZURE_API_VERSION"),
 		"api_base": os.getenv("AZURE_API_BASE"),
 		"weight": 1,
 	}
 }]
@ -358,6 +441,10 @@ async def router_acompletion():
 asyncio.run(router_acompletion())
 ```
 </TabItem>
 </Tabs>
 </TabItem>
 <TabItem value="usage-based" label="Rate-Limit Aware">
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@ -305,6 +305,18 @@
        "supports_function_calling": true,
        "source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing"
    },
    "ft:gpt-4o-2024-08-06": {
        "max_tokens": 16384,
        "max_input_tokens": 128000,
        "max_output_tokens": 16384,
        "input_cost_per_token": 0.00000375,
        "output_cost_per_token": 0.000015,
        "litellm_provider": "openai",
        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true,
        "supports_vision": true
    },
    "ft:gpt-4o-mini-2024-07-18": {
        "max_tokens": 16384,
        "max_input_tokens": 128000,