diff --git a/.github/workflows/ghcr_deploy.yml b/.github/workflows/ghcr_deploy.yml index f8064cf47..8c8060a92 100644 --- a/.github/workflows/ghcr_deploy.yml +++ b/.github/workflows/ghcr_deploy.yml @@ -194,6 +194,8 @@ jobs: platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 build-and-push-helm-chart: + if: github.event.inputs.release_type != 'dev' + needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database] runs-on: ubuntu-latest steps: - name: Checkout repository @@ -211,9 +213,17 @@ jobs: - name: lowercase github.repository_owner run: | echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV} + - name: Get LiteLLM Latest Tag id: current_app_tag - uses: WyriHaximus/github-action-get-previous-tag@v1.3.0 + shell: bash + run: | + LATEST_TAG=$(git describe --tags --exclude "*dev*" --abbrev=0) + if [ -z "${LATEST_TAG}" ]; then + echo "latest_tag=latest" | tee -a $GITHUB_OUTPUT + else + echo "latest_tag=${LATEST_TAG}" | tee -a $GITHUB_OUTPUT + fi - name: Get last published chart version id: current_version @@ -241,7 +251,7 @@ jobs: name: ${{ env.CHART_NAME }} repository: ${{ env.REPO_OWNER }} tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }} - app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }} + app_version: ${{ steps.current_app_tag.outputs.latest_tag }} path: deploy/charts/${{ env.CHART_NAME }} registry: ${{ env.REGISTRY }} registry_username: ${{ github.actor }} diff --git a/docs/my-website/docs/pass_through/langfuse.md b/docs/my-website/docs/pass_through/langfuse.md new file mode 100644 index 000000000..8987842f7 --- /dev/null +++ b/docs/my-website/docs/pass_through/langfuse.md @@ -0,0 +1,132 @@ +# Langfuse Endpoints (Pass-Through) + +Pass-through endpoints for Langfuse - call langfuse endpoints with LiteLLM Virtual Key. + +Just replace `https://us.cloud.langfuse.com` with `LITELLM_PROXY_BASE_URL/langfuse` 🚀 + +#### **Example Usage** +```python +from langfuse import Langfuse + +langfuse = Langfuse( + host="http://localhost:4000/langfuse", # your litellm proxy endpoint + public_key="anything", # no key required since this is a pass through + secret_key="LITELLM_VIRTUAL_KEY", # no key required since this is a pass through +) + +print("sending langfuse trace request") +trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough") +print("flushing langfuse request") +langfuse.flush() + +print("flushed langfuse request") +``` + +Supports **ALL** Langfuse Endpoints. + +[**See All Langfuse Endpoints**](https://api.reference.langfuse.com/) + +## Quick Start + +Let's log a trace to Langfuse. + +1. Add Langfuse Public/Private keys to environment + +```bash +export LANGFUSE_PUBLIC_KEY="" +export LANGFUSE_PRIVATE_KEY="" +``` + +2. Start LiteLLM Proxy + +```bash +litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +3. Test it! + +Let's log a trace to Langfuse! 
+ +```python +from langfuse import Langfuse + +langfuse = Langfuse( + host="http://localhost:4000/langfuse", # your litellm proxy endpoint + public_key="anything", # no key required since this is a pass through + secret_key="anything", # no key required since this is a pass through +) + +print("sending langfuse trace request") +trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough") +print("flushing langfuse request") +langfuse.flush() + +print("flushed langfuse request") +``` + + +## Advanced - Use with Virtual Keys + +Pre-requisites +- [Setup proxy with DB](../proxy/virtual_keys.md#setup) + +Use this to avoid giving developers the raw Langfuse API keys, while still letting them call Langfuse endpoints. + +### Usage + +1. Setup environment + +```bash +export DATABASE_URL="" +export LITELLM_MASTER_KEY="" +export LANGFUSE_PUBLIC_KEY="" +export LANGFUSE_PRIVATE_KEY="" +``` + +```bash +litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +2. Generate virtual key + +```bash +curl -X POST 'http://0.0.0.0:4000/key/generate' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{}' +``` + +Expected Response + +```bash +{ + ... + "key": "sk-1234ewknldferwedojwojw" +} +``` + +3. Test it! + + +```python +from langfuse import Langfuse + +langfuse = Langfuse( + host="http://localhost:4000/langfuse", # your litellm proxy endpoint + public_key="anything", # no key required since this is a pass through + secret_key="sk-1234ewknldferwedojwojw", # your litellm virtual key +) + +print("sending langfuse trace request") +trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough") +print("flushing langfuse request") +langfuse.flush() + +print("flushed langfuse request") +``` + +## [Advanced - Log to separate langfuse projects (by key/team)](../proxy/team_logging.md) \ No newline at end of file diff --git a/docs/my-website/docs/proxy/team_logging.md b/docs/my-website/docs/proxy/team_logging.md index e36cb8f66..ef4ebe591 100644 --- a/docs/my-website/docs/proxy/team_logging.md +++ b/docs/my-website/docs/proxy/team_logging.md @@ -207,7 +207,7 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \ -H 'Content-Type: application/json' \ -d '{ "metadata": { - "logging": { + "logging": [{ "callback_name": "langfuse", # 'otel', 'langfuse', 'lunary' "callback_type": "success" # set, if required by integration - future improvement, have logging tools work for success + failure by default "callback_vars": { @@ -215,7 +215,7 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \ "langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY", # [RECOMMENDED] reference key in proxy environment "langfuse_host": "https://cloud.langfuse.com" } - } + }] } }' diff --git a/docs/my-website/docs/tutorials/litellm_proxy_aporia.md b/docs/my-website/docs/tutorials/litellm_proxy_aporia.md index 480c411c0..1fea3037f 100644 --- a/docs/my-website/docs/tutorials/litellm_proxy_aporia.md +++ b/docs/my-website/docs/tutorials/litellm_proxy_aporia.md @@ -61,7 +61,7 @@ guardrails: - `pre_call` Run **before** LLM call, on **input** - `post_call` Run **after** LLM call, on **input & output** -- `during_call` Run **during** LLM call, on **input** +- `during_call` Run **during** LLM call, on **input** Same as `pre_call` but runs in parallel with the LLM call. The response is not returned until the guardrail check completes ## 3. Start LiteLLM Gateway @@ -72,6 +72,8 @@ litellm --config config.yaml --detailed_debug ## 4.
Test request +**[Langchain, OpenAI SDK Usage Examples](../proxy/user_keys##request-format)** + @@ -134,12 +136,10 @@ curl -i http://localhost:4000/v1/chat/completions \ -## Advanced -### Control Guardrails per Project (API Key) +## 5. Control Guardrails per Project (API Key) -Use this to control what guardrails run per project. In this tutorial we only want the following guardrails to run for 1 project -- `pre_call_guardrails`: ["aporia-pre-guard"] -- `post_call_guardrails`: ["aporia-post-guard"] +Use this to control what guardrails run per project. In this tutorial we only want the following guardrails to run for 1 project (API Key) +- `guardrails`: ["aporia-pre-guard", "aporia-post-guard"] **Step 1** Create Key with guardrail settings @@ -151,8 +151,7 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \ -H 'Authorization: Bearer sk-1234' \ -H 'Content-Type: application/json' \ -D '{ - "pre_call_guardrails": ["aporia-pre-guard"], - "post_call_guardrails": ["aporia"] + "guardrails": ["aporia-pre-guard", "aporia-post-guard"] } }' ``` @@ -166,8 +165,7 @@ curl --location 'http://0.0.0.0:4000/key/update' \ --header 'Content-Type: application/json' \ --data '{ "key": "sk-jNm1Zar7XfNdZXp49Z1kSQ", - "pre_call_guardrails": ["aporia"], - "post_call_guardrails": ["aporia"] + "guardrails": ["aporia-pre-guard", "aporia-post-guard"] } }' ``` diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 6501ebd75..1dcaf008e 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -195,7 +195,8 @@ const sidebars = { "pass_through/vertex_ai", "pass_through/google_ai_studio", "pass_through/cohere", - "pass_through/bedrock" + "pass_through/bedrock", + "pass_through/langfuse" ], }, "scheduler", diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index 12b60c2c5..9f62bab20 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -509,16 +509,16 @@ async def ollama_acompletion( async def ollama_aembeddings( api_base: str, model: str, - prompts: list, + prompts: List[str], model_response: litellm.EmbeddingResponse, optional_params: dict, logging_obj=None, encoding=None, ): - if api_base.endswith("/api/embeddings"): + if api_base.endswith("/api/embed"): url = api_base else: - url = f"{api_base}/api/embeddings" + url = f"{api_base}/api/embed" ## Load Config config = litellm.OllamaConfig.get_config() @@ -528,64 +528,53 @@ async def ollama_aembeddings( ): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in optional_params[k] = v - input_data: Dict[str, Any] = {"model": model} + data: Dict[str, Any] = {"model": model, "input": prompts} special_optional_params = ["truncate", "options", "keep_alive"] for k, v in optional_params.items(): if k in special_optional_params: - input_data[k] = v + data[k] = v else: # Ensure "options" is a dictionary before updating it - input_data.setdefault("options", {}) - if isinstance(input_data["options"], dict): - input_data["options"].update({k: v}) + data.setdefault("options", {}) + if isinstance(data["options"], dict): + data["options"].update({k: v}) total_input_tokens = 0 output_data = [] timeout = aiohttp.ClientTimeout(total=litellm.request_timeout) # 10 minutes async with aiohttp.ClientSession(timeout=timeout) as session: - for idx, prompt in enumerate(prompts): - data = deepcopy(input_data) - data["prompt"] = prompt - ## LOGGING - logging_obj.pre_call( - input=None, - api_key=None, - additional_args={ - "api_base": url, - "complete_input_dict": data, - "headers": {}, - }, - ) + ## 
LOGGING + logging_obj.pre_call( + input=None, + api_key=None, + additional_args={ + "api_base": url, + "complete_input_dict": data, + "headers": {}, + }, + ) - response = await session.post(url, json=data) - if response.status != 200: - text = await response.text() - raise OllamaError(status_code=response.status, message=text) + response = await session.post(url, json=data) - ## LOGGING - logging_obj.post_call( - input=prompt, - api_key="", - original_response=response.text, - additional_args={ - "headers": None, - "api_base": api_base, - }, - ) + if response.status != 200: + text = await response.text() + raise OllamaError(status_code=response.status, message=text) - response_json = await response.json() - embeddings: list[float] = response_json["embedding"] - output_data.append( - {"object": "embedding", "index": idx, "embedding": embeddings} - ) + response_json = await response.json() - input_tokens = len(encoding.encode(prompt)) - total_input_tokens += input_tokens + embeddings: List[List[float]] = response_json["embeddings"] + for idx, emb in enumerate(embeddings): + output_data.append({"object": "embedding", "index": idx, "embedding": emb}) + + input_tokens = response_json.get("prompt_eval_count") or len( + encoding.encode("".join(prompt for prompt in prompts)) + ) + total_input_tokens += input_tokens model_response.object = "list" model_response.data = output_data - model_response.model = model + model_response.model = "ollama/" + model setattr( model_response, "usage", diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index 7ec21e8bb..c9e691c00 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -2195,7 +2195,7 @@ def _convert_to_bedrock_tool_call_invoke( def _convert_to_bedrock_tool_call_result( message: dict, -) -> BedrockMessageBlock: +) -> BedrockContentBlock: """ OpenAI message with a tool result looks like: { @@ -2247,7 +2247,7 @@ def _convert_to_bedrock_tool_call_result( ) content_block = BedrockContentBlock(toolResult=tool_result) - return BedrockMessageBlock(role="user", content=[content_block]) + return content_block def _bedrock_converse_messages_pt( @@ -2289,6 +2289,12 @@ def _bedrock_converse_messages_pt( msg_i += 1 + ## MERGE CONSECUTIVE TOOL CALL MESSAGES ## + while msg_i < len(messages) and messages[msg_i]["role"] == "tool": + tool_call_result = _convert_to_bedrock_tool_call_result(messages[msg_i]) + + user_content.append(tool_call_result) + msg_i += 1 if user_content: contents.append(BedrockMessageBlock(role="user", content=user_content)) assistant_content: List[BedrockContentBlock] = [] @@ -2332,11 +2338,6 @@ def _bedrock_converse_messages_pt( BedrockMessageBlock(role="assistant", content=assistant_content) ) - ## APPEND TOOL CALL MESSAGES ## - if msg_i < len(messages) and messages[msg_i]["role"] == "tool": - tool_call_result = _convert_to_bedrock_tool_call_result(messages[msg_i]) - contents.append(tool_call_result) - msg_i += 1 if msg_i == init_msg_i: # prevent infinite loops raise litellm.BadRequestError( message=BAD_MESSAGE_ERROR_STR + f"passed in {messages[msg_i]}", diff --git a/litellm/llms/text_completion_codestral.py b/litellm/llms/text_completion_codestral.py index a6865b953..9dbe3bb37 100644 --- a/litellm/llms/text_completion_codestral.py +++ b/litellm/llms/text_completion_codestral.py @@ -365,6 +365,7 @@ class CodestralTextCompletion(BaseLLM): stream = optional_params.pop("stream", False) data = { + "model": model, "prompt": prompt, **optional_params, } 
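Note on the `ollama_aembeddings` rewrite above: the handler now calls Ollama's batched `/api/embed` route once with the full `input` list, instead of looping over `/api/embeddings` per prompt, and reads back a single `embeddings` array plus a batch-level `prompt_eval_count`. A minimal sketch of the request/response shape the new code relies on (the host and model name here are placeholder assumptions, not part of this diff):

```python
import asyncio

import aiohttp


async def embed_batch(api_base: str, model: str, prompts: list) -> list:
    """Call Ollama's batched /api/embed endpoint (payload shape as used in the diff above)."""
    async with aiohttp.ClientSession() as session:
        response = await session.post(
            f"{api_base}/api/embed",
            json={"model": model, "input": prompts},  # one request for the whole batch
        )
        if response.status != 200:
            raise RuntimeError(await response.text())
        body = await response.json()
        # One embedding per input prompt; token usage is reported once for the whole batch.
        print("prompt_eval_count:", body.get("prompt_eval_count"))
        return body["embeddings"]


# Example usage (assumes a local Ollama server with the model already pulled):
# asyncio.run(embed_batch("http://localhost:11434", "mistral", ["hello world", "goodbye world"]))
```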
diff --git a/litellm/main.py b/litellm/main.py index 12f8cceb5..f2c6df306 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -253,7 +253,7 @@ async def acompletion( logit_bias: Optional[dict] = None, user: Optional[str] = None, # openai v1.0+ new params - response_format: Optional[dict] = None, + response_format: Optional[Union[dict, Type[BaseModel]]] = None, seed: Optional[int] = None, tools: Optional[List] = None, tool_choice: Optional[str] = None, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 521a034d6..f72e37278 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,6 +1,4 @@ model_list: - - model_name: gpt-3.5-turbo + - model_name: ollama/mistral litellm_params: - model: azure/chatgpt-v-2 - api_key: os.environ/AZURE_API_KEY - api_base: os.environ/AZURE_API_BASE \ No newline at end of file + model: ollama/mistral diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index 4f9d39d77..75934ee1f 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -587,6 +587,7 @@ class GenerateKeyRequest(GenerateRequestBase): send_invite_email: Optional[bool] = None model_rpm_limit: Optional[dict] = None model_tpm_limit: Optional[dict] = None + guardrails: Optional[List[str]] = None class GenerateKeyResponse(GenerateKeyRequest): diff --git a/litellm/proxy/auth/user_api_key_auth.py b/litellm/proxy/auth/user_api_key_auth.py index 8d2c728f5..58a2aa164 100644 --- a/litellm/proxy/auth/user_api_key_auth.py +++ b/litellm/proxy/auth/user_api_key_auth.py @@ -1269,8 +1269,9 @@ def _get_user_role( def _get_request_ip_address( request: Request, use_x_forwarded_for: Optional[bool] = False -) -> str: +) -> Optional[str]: + client_ip = None if use_x_forwarded_for is True and "x-forwarded-for" in request.headers: client_ip = request.headers["x-forwarded-for"] elif request.client is not None: diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index ff5b6dc86..54f217194 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -331,13 +331,33 @@ async def add_litellm_data_to_request( # Guardrails move_guardrails_to_metadata( - data=data, _metadata_variable_name=_metadata_variable_name + data=data, + _metadata_variable_name=_metadata_variable_name, + user_api_key_dict=user_api_key_dict, ) return data -def move_guardrails_to_metadata(data: dict, _metadata_variable_name: str): +def move_guardrails_to_metadata( + data: dict, + _metadata_variable_name: str, + user_api_key_dict: UserAPIKeyAuth, +): + """ + Helper to add guardrails from request to metadata + + - If guardrails set on API Key metadata then sets guardrails on request metadata + - If guardrails not set on API key, then checks request metadata + + """ + if user_api_key_dict.metadata: + if "guardrails" in user_api_key_dict.metadata: + data[_metadata_variable_name]["guardrails"] = user_api_key_dict.metadata[ + "guardrails" + ] + return + if "guardrails" in data: data[_metadata_variable_name]["guardrails"] = data["guardrails"] del data["guardrails"] diff --git a/litellm/proxy/management_endpoints/key_management_endpoints.py b/litellm/proxy/management_endpoints/key_management_endpoints.py index 79e2dcc2d..1758b416d 100644 --- a/litellm/proxy/management_endpoints/key_management_endpoints.py +++ b/litellm/proxy/management_endpoints/key_management_endpoints.py @@ -66,6 +66,7 @@ async def generate_key_fn( - budget_duration: Optional[str] - Budget is reset at the end
of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d"). - max_parallel_requests: Optional[int] - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x. - metadata: Optional[dict] - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" } + - guardrails: Optional[List[str]] - List of active guardrails for the key - permissions: Optional[dict] - key-specific permissions. Currently just used for turning off pii masking (if connected). Example - {"pii": false} - model_max_budget: Optional[dict] - key-specific model budget in USD. Example - {"text-davinci-002": 0.5, "gpt-3.5-turbo": 0.5}. IF null or {} then no model specific budget. - model_rpm_limit: Optional[dict] - key-specific model rpm limit. Example - {"text-davinci-002": 1000, "gpt-3.5-turbo": 1000}. IF null or {} then no model specific rpm limit. @@ -321,11 +322,12 @@ async def update_key_fn( detail={"error": f"Team not found, passed team_id={data.team_id}"}, ) + _metadata_fields = ["model_rpm_limit", "model_tpm_limit", "guardrails"] # get non default values for key non_default_values = {} for k, v in data_json.items(): # this field gets stored in metadata - if key == "model_rpm_limit" or key == "model_tpm_limit": + if key in _metadata_fields: continue if v is not None and v not in ( [], @@ -366,6 +368,14 @@ async def update_key_fn( non_default_values["metadata"] = _metadata non_default_values.pop("model_rpm_limit", None) + if data.guardrails: + _metadata = existing_key_row.metadata or {} + _metadata["guardrails"] = data.guardrails + + # update values that will be written to the DB + non_default_values["metadata"] = _metadata + non_default_values.pop("guardrails", None) + response = await prisma_client.update_data( token=key, data={**non_default_values, "token": key} ) @@ -734,6 +744,7 @@ async def generate_key_helper_fn( model_max_budget: Optional[dict] = {}, model_rpm_limit: Optional[dict] = {}, model_tpm_limit: Optional[dict] = {}, + guardrails: Optional[list] = None, teams: Optional[list] = None, organization_id: Optional[str] = None, table_name: Optional[Literal["key", "user"]] = None, @@ -783,6 +794,9 @@ async def generate_key_helper_fn( if model_tpm_limit is not None: metadata = metadata or {} metadata["model_tpm_limit"] = model_tpm_limit + if guardrails is not None: + metadata = metadata or {} + metadata["guardrails"] = guardrails metadata_json = json.dumps(metadata) model_max_budget_json = json.dumps(model_max_budget) diff --git a/litellm/proxy/pass_through_endpoints/pass_through_endpoints.py b/litellm/proxy/pass_through_endpoints/pass_through_endpoints.py index bc16c3555..b50fbb0c5 100644 --- a/litellm/proxy/pass_through_endpoints/pass_through_endpoints.py +++ b/litellm/proxy/pass_through_endpoints/pass_through_endpoints.py @@ -360,24 +360,22 @@ async def pass_through_request( # combine url with query params for logging - # requested_query_params = query_params or request.query_params.__dict__ - # requested_query_params_str = "&".join( - # f"{k}={v}" for k, v in requested_query_params.items() - # ) + requested_query_params = query_params or request.query_params.__dict__ + requested_query_params_str = "&".join( + f"{k}={v}" for k, v in requested_query_params.items() + ) - requested_query_params = None - - # if "?" 
in str(url): - # logging_url = str(url) + "&" + requested_query_params_str - # else: - # logging_url = str(url) + "?" + requested_query_params_str + if "?" in str(url): + logging_url = str(url) + "&" + requested_query_params_str + else: + logging_url = str(url) + "?" + requested_query_params_str logging_obj.pre_call( input=[{"role": "user", "content": "no-message-pass-through-endpoint"}], api_key="", additional_args={ "complete_input_dict": _parsed_body, - "api_base": str(url), + "api_base": str(logging_url), "headers": headers, }, ) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 0fbf10a2b..12069d5e8 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -2350,7 +2350,8 @@ async def initialize( config=None, ): global user_model, user_api_base, user_debug, user_detailed_debug, user_user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers, experimental, llm_model_list, llm_router, general_settings, master_key, user_custom_auth, prisma_client - generate_feedback_box() + if os.getenv("LITELLM_DONT_SHOW_FEEDBACK_BOX", "").lower() != "true": + generate_feedback_box() user_model = model user_debug = debug if debug is True: # this needs to be first, so users can see Router init debugg @@ -8065,14 +8066,14 @@ async def login(request: Request): return redirect_response else: raise ProxyException( - message=f"Invalid credentials used to access UI. Passed in username: {username}, passed in password: {password}.\nNot valid credentials for {username}", + message=f"Invalid credentials used to access UI.\nNot valid credentials for {username}", type=ProxyErrorTypes.auth_error, param="invalid_credentials", code=status.HTTP_401_UNAUTHORIZED, ) else: raise ProxyException( - message=f"Invalid credentials used to access UI. 
Passed in username: {username}, passed in password: {password}.\nCheck 'UI_USERNAME', 'UI_PASSWORD' in .env file", + message="Invalid credentials used to access UI.\nCheck 'UI_USERNAME', 'UI_PASSWORD' in .env file", type=ProxyErrorTypes.auth_error, param="invalid_credentials", code=status.HTTP_401_UNAUTHORIZED, diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 31bfa9332..9e2066511 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.llms.prompt_templates.factory import anthropic_messages_pt -# litellm.num_retries =3 +# litellm.num_retries=3 litellm.cache = None litellm.success_callback = [] user_message = "Write a short poem about the sky" diff --git a/litellm/tests/test_function_calling.py b/litellm/tests/test_function_calling.py index 6e4e9d3e8..5f97dbf87 100644 --- a/litellm/tests/test_function_calling.py +++ b/litellm/tests/test_function_calling.py @@ -1,18 +1,20 @@ -import sys, os +import os +import sys import traceback + from dotenv import load_dotenv load_dotenv() -import os, io +import io +import os sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import pytest + import litellm -from litellm import embedding, completion, completion_cost, Timeout -from litellm import RateLimitError -import pytest +from litellm import RateLimitError, Timeout, completion, completion_cost, embedding litellm.num_retries = 0 litellm.cache = None @@ -41,7 +43,14 @@ def get_current_weather(location, unit="fahrenheit"): # In production, this could be your backend API or an external API @pytest.mark.parametrize( - "model", ["gpt-3.5-turbo-1106", "mistral/mistral-large-latest"] + "model", + [ + "gpt-3.5-turbo-1106", + "mistral/mistral-large-latest", + "claude-3-haiku-20240307", + "gemini/gemini-1.5-pro", + "anthropic.claude-3-sonnet-20240229-v1:0", + ], ) def test_parallel_function_call(model): try: @@ -124,7 +133,12 @@ def test_parallel_function_call(model): ) # extend conversation with function response print(f"messages: {messages}") second_response = litellm.completion( - model=model, messages=messages, temperature=0.2, seed=22 + model=model, + messages=messages, + temperature=0.2, + seed=22, + tools=tools, + drop_params=True, ) # get a new response from the model where it can see the function response print("second response\n", second_response) except Exception as e: diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py index 907262d48..2641edbb4 100644 --- a/litellm/tests/test_key_generate_prisma.py +++ b/litellm/tests/test_key_generate_prisma.py @@ -2770,6 +2770,60 @@ async def test_generate_key_with_model_tpm_limit(prisma_client): } +@pytest.mark.asyncio() +async def test_generate_key_with_guardrails(prisma_client): + print("prisma client=", prisma_client) + + setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client) + setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") + await litellm.proxy.proxy_server.prisma_client.connect() + request = GenerateKeyRequest( + guardrails=["aporia-pre-call"], + metadata={ + "team": "litellm-team3", + }, + ) + key = await generate_key_fn( + data=request, + user_api_key_dict=UserAPIKeyAuth( + user_role=LitellmUserRoles.PROXY_ADMIN, + api_key="sk-1234", + user_id="1234", + ), + ) + print("generated key=", 
key) + + generated_key = key.key + + # use generated key to auth in + result = await info_key_fn(key=generated_key) + print("result from info_key_fn", result) + assert result["key"] == generated_key + print("\n info for key=", result["info"]) + assert result["info"]["metadata"] == { + "team": "litellm-team3", + "guardrails": ["aporia-pre-call"], + } + + # Update model tpm_limit and rpm_limit + request = UpdateKeyRequest( + key=generated_key, + guardrails=["aporia-pre-call", "aporia-post-call"], + ) + _request = Request(scope={"type": "http"}) + _request._url = URL(url="/update/key") + + await update_key_fn(data=request, request=_request) + result = await info_key_fn(key=generated_key) + print("result from info_key_fn", result) + assert result["key"] == generated_key + print("\n info for key=", result["info"]) + assert result["info"]["metadata"] == { + "team": "litellm-team3", + "guardrails": ["aporia-pre-call", "aporia-post-call"], + } + + @pytest.mark.asyncio() async def test_team_access_groups(prisma_client): """ diff --git a/litellm/tests/test_ollama.py b/litellm/tests/test_ollama.py index 0d0b07672..de41e24b8 100644 --- a/litellm/tests/test_ollama.py +++ b/litellm/tests/test_ollama.py @@ -132,6 +132,7 @@ def test_ollama_aembeddings(mock_aembeddings): # test_ollama_aembeddings() +@pytest.mark.skip(reason="local only test") def test_ollama_chat_function_calling(): import json diff --git a/litellm/tests/test_prompt_factory.py b/litellm/tests/test_prompt_factory.py index 93e92a792..81339e831 100644 --- a/litellm/tests/test_prompt_factory.py +++ b/litellm/tests/test_prompt_factory.py @@ -313,3 +313,78 @@ def test_anthropic_cache_controls_pt(): assert msg["content"][0]["cache_control"] == {"type": "ephemeral"} print("translated_messages: ", translated_messages) + + +@pytest.mark.parametrize("provider", ["bedrock", "anthropic"]) +def test_bedrock_parallel_tool_calling_pt(provider): + """ + Make sure parallel tool call blocks are merged correctly - https://github.com/BerriAI/litellm/issues/5277 + """ + from litellm.llms.prompt_templates.factory import _bedrock_converse_messages_pt + from litellm.types.utils import ChatCompletionMessageToolCall, Function, Message + + messages = [ + { + "role": "user", + "content": "What's the weather like in San Francisco, Tokyo, and Paris? 
- give me 3 responses", + }, + Message( + content="Here are the current weather conditions for San Francisco, Tokyo, and Paris:", + role="assistant", + tool_calls=[ + ChatCompletionMessageToolCall( + index=1, + function=Function( + arguments='{"city": "New York"}', + name="get_current_weather", + ), + id="tooluse_XcqEBfm8R-2YVaPhDUHsPQ", + type="function", + ), + ChatCompletionMessageToolCall( + index=2, + function=Function( + arguments='{"city": "London"}', + name="get_current_weather", + ), + id="tooluse_VB9nk7UGRniVzGcaj6xrAQ", + type="function", + ), + ], + function_call=None, + ), + { + "tool_call_id": "tooluse_XcqEBfm8R-2YVaPhDUHsPQ", + "role": "tool", + "name": "get_current_weather", + "content": "25 degrees celsius.", + }, + { + "tool_call_id": "tooluse_VB9nk7UGRniVzGcaj6xrAQ", + "role": "tool", + "name": "get_current_weather", + "content": "28 degrees celsius.", + }, + ] + + if provider == "bedrock": + translated_messages = _bedrock_converse_messages_pt( + messages=messages, + model="anthropic.claude-3-sonnet-20240229-v1:0", + llm_provider="bedrock", + ) + else: + translated_messages = anthropic_messages_pt( + messages=messages, + model="claude-3-sonnet-20240229-v1:0", + llm_provider=provider, + ) + print(translated_messages) + + number_of_messages = len(translated_messages) + + # assert last 2 messages are not the same role + assert ( + translated_messages[number_of_messages - 1]["role"] + != translated_messages[number_of_messages - 2]["role"] + ) diff --git a/pyproject.toml b/pyproject.toml index 355a5ea3c..9498cc128 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.43.18" +version = "1.43.19" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.43.18" +version = "1.43.19" version_files = [ "pyproject.toml:^version" ] diff --git a/tests/otel_tests/test_guardrails.py b/tests/otel_tests/test_guardrails.py index c48a5ba79..7e9ff613a 100644 --- a/tests/otel_tests/test_guardrails.py +++ b/tests/otel_tests/test_guardrails.py @@ -22,10 +22,6 @@ async def chat_completion( data = { "model": model, "messages": messages, - "guardrails": [ - "aporia-post-guard", - "aporia-pre-guard", - ], # default guardrails for all tests } if guardrails is not None: @@ -41,7 +37,7 @@ async def chat_completion( print() if status != 200: - return response_text + raise Exception(response_text) # response headers response_headers = response.headers @@ -50,6 +46,29 @@ async def chat_completion( return await response.json(), response_headers +async def generate_key(session, guardrails): + url = "http://0.0.0.0:4000/key/generate" + headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"} + if guardrails: + data = { + "guardrails": guardrails, + } + else: + data = {} + + async with session.post(url, headers=headers, json=data) as response: + status = response.status + response_text = await response.text() + + print(response_text) + print() + + if status != 200: + raise Exception(f"Request did not return a 200 status code: {status}") + + return await response.json() + + @pytest.mark.asyncio async def test_llm_guard_triggered_safe_request(): """ @@ -62,6 +81,10 @@ async def test_llm_guard_triggered_safe_request(): "sk-1234", model="fake-openai-endpoint", messages=[{"role": "user", "content": f"Hello what's the weather"}], + guardrails=[ + 
"aporia-post-guard", + "aporia-pre-guard", + ], ) await asyncio.sleep(3) @@ -90,6 +113,10 @@ async def test_llm_guard_triggered(): messages=[ {"role": "user", "content": f"Hello my name is ishaan@berri.ai"} ], + guardrails=[ + "aporia-post-guard", + "aporia-pre-guard", + ], ) pytest.fail("Should have thrown an exception") except Exception as e: @@ -116,3 +143,54 @@ async def test_no_llm_guard_triggered(): print("response=", response, "response headers", headers) assert "x-litellm-applied-guardrails" not in headers + +@pytest.mark.asyncio +async def test_guardrails_with_api_key_controls(): + """ + - Make two API Keys + - Key 1 with no guardrails + - Key 2 with guardrails + - Request to Key 1 -> should be success with no guardrails + - Request to Key 2 -> should be error since guardrails are triggered + """ + async with aiohttp.ClientSession() as session: + key_with_guardrails = await generate_key( + session=session, + guardrails=[ + "aporia-post-guard", + "aporia-pre-guard", + ], + ) + + key_with_guardrails = key_with_guardrails["key"] + + key_without_guardrails = await generate_key(session=session, guardrails=None) + + key_without_guardrails = key_without_guardrails["key"] + + # test no guardrails triggered for key without guardrails + response, headers = await chat_completion( + session, + key_without_guardrails, + model="fake-openai-endpoint", + messages=[{"role": "user", "content": f"Hello what's the weather"}], + ) + await asyncio.sleep(3) + + print("response=", response, "response headers", headers) + assert "x-litellm-applied-guardrails" not in headers + + # test guardrails triggered for key with guardrails + try: + response, headers = await chat_completion( + session, + key_with_guardrails, + model="fake-openai-endpoint", + messages=[ + {"role": "user", "content": f"Hello my name is ishaan@berri.ai"} + ], + ) + pytest.fail("Should have thrown an exception") + except Exception as e: + print(e) + assert "Aporia detected and blocked PII" in str(e)