diff --git a/docs/my-website/docs/observability/langsmith_integration.md b/docs/my-website/docs/observability/langsmith_integration.md
index ca0421b15..78c7e3119 100644
--- a/docs/my-website/docs/observability/langsmith_integration.md
+++ b/docs/my-website/docs/observability/langsmith_integration.md
@@ -57,7 +57,7 @@
 os.environ["LANGSMITH_API_KEY"] = ""
 os.environ['OPENAI_API_KEY']=""
 # set langfuse as a callback, litellm will send the data to langfuse
-litellm.success_callback = ["langfuse"]
+litellm.success_callback = ["langsmith"]
 
 response = litellm.completion(
   model="gpt-3.5-turbo",
@@ -76,4 +76,4 @@
 - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
 - [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
 - Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
-- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
\ No newline at end of file
+- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index 30a1188fe..b9d89fd3e 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -25,27 +25,27 @@ class PrometheusLogger:
             self.litellm_llm_api_failed_requests_metric = Counter(
                 name="litellm_llm_api_failed_requests_metric",
                 documentation="Total number of failed LLM API calls via litellm",
-                labelnames=["end_user", "hashed_api_key", "model", "team"],
+                labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
             )
 
             self.litellm_requests_metric = Counter(
                 name="litellm_requests_metric",
                 documentation="Total number of LLM calls to litellm",
-                labelnames=["end_user", "hashed_api_key", "model", "team"],
+                labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
             )
 
             # Counter for spend
             self.litellm_spend_metric = Counter(
                 "litellm_spend_metric",
                 "Total spend on LLM requests",
-                labelnames=["end_user", "hashed_api_key", "model", "team"],
+                labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
             )
 
             # Counter for total_output_tokens
             self.litellm_tokens_metric = Counter(
                 "litellm_total_tokens",
                 "Total number of input + output tokens from LLM requests",
-                labelnames=["end_user", "hashed_api_key", "model", "team"],
+                labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
             )
         except Exception as e:
             print_verbose(f"Got exception on init prometheus client {str(e)}")
@@ -71,6 +71,9 @@ class PrometheusLogger:
             litellm_params = kwargs.get("litellm_params", {}) or {}
             proxy_server_request = litellm_params.get("proxy_server_request") or {}
             end_user_id = proxy_server_request.get("body", {}).get("user", None)
+            user_id = proxy_server_request.get("metadata", {}).get(
+                "user_api_key_user_id", None
+            )
             user_api_key = litellm_params.get("metadata", {}).get("user_api_key", None)
             user_api_team = litellm_params.get("metadata", {}).get(
                 "user_api_key_team_id", None
@@ -94,19 +97,19 @@ class PrometheusLogger:
                 user_api_key = hash_token(user_api_key)
 
             self.litellm_requests_metric.labels(
-                end_user_id, user_api_key, model, user_api_team
+                end_user_id, user_api_key, model, user_api_team, user_id
             ).inc()
             self.litellm_spend_metric.labels(
-                end_user_id, user_api_key, model, user_api_team
+                end_user_id, user_api_key, model, user_api_team, user_id
             ).inc(response_cost)
             self.litellm_tokens_metric.labels(
-                end_user_id, user_api_key, model, user_api_team
+                end_user_id, user_api_key, model, user_api_team, user_id
             ).inc(tokens_used)
 
             ### FAILURE INCREMENT ###
             if "exception" in kwargs:
                 self.litellm_llm_api_failed_requests_metric.labels(
-                    end_user_id, user_api_key, model, user_api_team
+                    end_user_id, user_api_key, model, user_api_team, user_id
                 ).inc()
         except Exception as e:
             traceback.print_exc()
diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py
index fc2d882af..62ab971f7 100644
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@@ -22,6 +22,35 @@ class VertexAIError(Exception):
         )  # Call the base class constructor with the parameters it needs
 
 
+class ExtendedGenerationConfig(dict):
+    """Extended parameters for the generation."""
+
+    def __init__(
+        self,
+        *,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        top_k: Optional[int] = None,
+        candidate_count: Optional[int] = None,
+        max_output_tokens: Optional[int] = None,
+        stop_sequences: Optional[List[str]] = None,
+        response_mime_type: Optional[str] = None,
+        frequency_penalty: Optional[float] = None,
+        presence_penalty: Optional[float] = None,
+    ):
+        super().__init__(
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            candidate_count=candidate_count,
+            max_output_tokens=max_output_tokens,
+            stop_sequences=stop_sequences,
+            response_mime_type=response_mime_type,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+        )
+
+
 class VertexAIConfig:
     """
     Reference: https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts
@@ -43,6 +72,10 @@ class VertexAIConfig:
 
     - `stop_sequences` (List[str]): The set of character sequences (up to 5) that will stop output generation. If specified, the API will stop at the first appearance of a stop sequence. The stop sequence will not be included as part of the response.
 
+    - `frequency_penalty` (float): This parameter is used to penalize the model from repeating the same output. The default value is 0.0.
+
+    - `presence_penalty` (float): This parameter is used to penalize the model from generating the same output as the input. The default value is 0.0.
+
     Note: Please make sure to modify the default parameters as required for your use case.
""" @@ -53,6 +86,8 @@ class VertexAIConfig: response_mime_type: Optional[str] = None candidate_count: Optional[int] = None stop_sequences: Optional[list] = None + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None def __init__( self, @@ -63,6 +98,8 @@ class VertexAIConfig: response_mime_type: Optional[str] = None, candidate_count: Optional[int] = None, stop_sequences: Optional[list] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, ) -> None: locals_ = locals() for key, value in locals_.items(): @@ -119,6 +156,10 @@ class VertexAIConfig: optional_params["max_output_tokens"] = value if param == "response_format" and value["type"] == "json_object": optional_params["response_mime_type"] = "application/json" + if param == "frequency_penalty": + optional_params["frequency_penalty"] = value + if param == "presence_penalty": + optional_params["presence_penalty"] = value if param == "tools" and isinstance(value, list): from vertexai.preview import generative_models @@ -363,42 +404,6 @@ def completion( from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore import google.auth # type: ignore - class ExtendedGenerationConfig(GenerationConfig): - """Extended parameters for the generation.""" - - def __init__( - self, - *, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - top_k: Optional[int] = None, - candidate_count: Optional[int] = None, - max_output_tokens: Optional[int] = None, - stop_sequences: Optional[List[str]] = None, - response_mime_type: Optional[str] = None, - ): - args_spec = inspect.getfullargspec(gapic_content_types.GenerationConfig) - - if "response_mime_type" in args_spec.args: - self._raw_generation_config = gapic_content_types.GenerationConfig( - temperature=temperature, - top_p=top_p, - top_k=top_k, - candidate_count=candidate_count, - max_output_tokens=max_output_tokens, - stop_sequences=stop_sequences, - response_mime_type=response_mime_type, - ) - else: - self._raw_generation_config = gapic_content_types.GenerationConfig( - temperature=temperature, - top_p=top_p, - top_k=top_k, - candidate_count=candidate_count, - max_output_tokens=max_output_tokens, - stop_sequences=stop_sequences, - ) - ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744 print_verbose( f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}" @@ -550,12 +555,12 @@ def completion( model_response = llm_model.generate_content( contents=content, - generation_config=ExtendedGenerationConfig(**optional_params), + generation_config=optional_params, safety_settings=safety_settings, stream=True, tools=tools, ) - optional_params["stream"] = True + return model_response request_str += f"response = llm_model.generate_content({content})\n" @@ -572,7 +577,7 @@ def completion( ## LLM Call response = llm_model.generate_content( contents=content, - generation_config=ExtendedGenerationConfig(**optional_params), + generation_config=optional_params, safety_settings=safety_settings, tools=tools, ) @@ -627,7 +632,7 @@ def completion( }, ) model_response = chat.send_message_streaming(prompt, **optional_params) - optional_params["stream"] = True + return model_response request_str += f"chat.send_message({prompt}, **{optional_params}).text\n" @@ -659,7 +664,7 @@ def completion( }, ) model_response = llm_model.predict_streaming(prompt, **optional_params) - optional_params["stream"] = 
+
                 return model_response
 
             request_str += f"llm_model.predict({prompt}, **{optional_params}).text\n"
@@ -811,45 +816,6 @@ async def async_completion(
     Add support for acompletion calls for gemini-pro
     """
     try:
-        from vertexai.preview.generative_models import GenerationConfig
-        from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore
-
-        class ExtendedGenerationConfig(GenerationConfig):
-            """Extended parameters for the generation."""
-
-            def __init__(
-                self,
-                *,
-                temperature: Optional[float] = None,
-                top_p: Optional[float] = None,
-                top_k: Optional[int] = None,
-                candidate_count: Optional[int] = None,
-                max_output_tokens: Optional[int] = None,
-                stop_sequences: Optional[List[str]] = None,
-                response_mime_type: Optional[str] = None,
-            ):
-                args_spec = inspect.getfullargspec(gapic_content_types.GenerationConfig)
-
-                if "response_mime_type" in args_spec.args:
-                    self._raw_generation_config = gapic_content_types.GenerationConfig(
-                        temperature=temperature,
-                        top_p=top_p,
-                        top_k=top_k,
-                        candidate_count=candidate_count,
-                        max_output_tokens=max_output_tokens,
-                        stop_sequences=stop_sequences,
-                        response_mime_type=response_mime_type,
-                    )
-                else:
-                    self._raw_generation_config = gapic_content_types.GenerationConfig(
-                        temperature=temperature,
-                        top_p=top_p,
-                        top_k=top_k,
-                        candidate_count=candidate_count,
-                        max_output_tokens=max_output_tokens,
-                        stop_sequences=stop_sequences,
-                    )
-
         if mode == "vision":
             print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
             print_verbose(f"\nProcessing input messages = {messages}")
@@ -872,7 +838,7 @@ async def async_completion(
             ## LLM Call
             response = await llm_model._generate_content_async(
                 contents=content,
-                generation_config=ExtendedGenerationConfig(**optional_params),
+                generation_config=optional_params,
                 tools=tools,
             )
 
@@ -1056,45 +1022,6 @@ async def async_streaming(
     """
     Add support for async streaming calls for gemini-pro
     """
-    from vertexai.preview.generative_models import GenerationConfig
-    from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore
-
-    class ExtendedGenerationConfig(GenerationConfig):
-        """Extended parameters for the generation."""
-
-        def __init__(
-            self,
-            *,
-            temperature: Optional[float] = None,
-            top_p: Optional[float] = None,
-            top_k: Optional[int] = None,
-            candidate_count: Optional[int] = None,
-            max_output_tokens: Optional[int] = None,
-            stop_sequences: Optional[List[str]] = None,
-            response_mime_type: Optional[str] = None,
-        ):
-            args_spec = inspect.getfullargspec(gapic_content_types.GenerationConfig)
-
-            if "response_mime_type" in args_spec.args:
-                self._raw_generation_config = gapic_content_types.GenerationConfig(
-                    temperature=temperature,
-                    top_p=top_p,
-                    top_k=top_k,
-                    candidate_count=candidate_count,
-                    max_output_tokens=max_output_tokens,
-                    stop_sequences=stop_sequences,
-                    response_mime_type=response_mime_type,
-                )
-            else:
-                self._raw_generation_config = gapic_content_types.GenerationConfig(
-                    temperature=temperature,
-                    top_p=top_p,
-                    top_k=top_k,
-                    candidate_count=candidate_count,
-                    max_output_tokens=max_output_tokens,
-                    stop_sequences=stop_sequences,
-                )
-
     if mode == "vision":
         stream = optional_params.pop("stream")
         tools = optional_params.pop("tools", None)
@@ -1115,11 +1042,10 @@ async def async_streaming(
         response = await llm_model._generate_content_streaming_async(
             contents=content,
-            generation_config=ExtendedGenerationConfig(**optional_params),
+            generation_config=optional_params,
             tools=tools,
         )
-        optional_params["stream"] = True
optional_params["tools"] = tools + elif mode == "chat": chat = llm_model.start_chat() optional_params.pop( @@ -1138,7 +1064,7 @@ async def async_streaming( }, ) response = chat.send_message_streaming_async(prompt, **optional_params) - optional_params["stream"] = True + elif mode == "text": optional_params.pop( "stream", None diff --git a/litellm/llms/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_anthropic.py index 34709e0c5..9bce746dd 100644 --- a/litellm/llms/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_anthropic.py @@ -123,7 +123,7 @@ class VertexAIAnthropicConfig: """ -- Run client init +- Run client init - Support async completion, streaming """ @@ -236,19 +236,17 @@ def completion( if client is None: if vertex_credentials is not None and isinstance(vertex_credentials, str): import google.oauth2.service_account + + json_obj = json.loads(vertex_credentials) + creds = ( google.oauth2.service_account.Credentials.from_service_account_info( - json.loads(vertex_credentials), + json_obj, scopes=["https://www.googleapis.com/auth/cloud-platform"], ) ) ### CHECK IF ACCESS access_token = refresh_auth(credentials=creds) - else: - import google.auth - creds, _ = google.auth.default() - ### CHECK IF ACCESS - access_token = refresh_auth(credentials=creds) vertex_ai_client = AnthropicVertex( project_id=vertex_project, diff --git a/litellm/main.py b/litellm/main.py index 8f357b834..e04d50973 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -12,7 +12,6 @@ from typing import Any, Literal, Union, BinaryIO from functools import partial import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy - import httpx import litellm from ._logging import verbose_logger @@ -1685,13 +1684,14 @@ def completion( or optional_params.pop("vertex_ai_credentials", None) or get_secret("VERTEXAI_CREDENTIALS") ) + new_params = deepcopy(optional_params) if "claude-3" in model: model_response = vertex_ai_anthropic.completion( model=model, messages=messages, model_response=model_response, print_verbose=print_verbose, - optional_params=optional_params, + optional_params=new_params, litellm_params=litellm_params, logger_fn=logger_fn, encoding=encoding, @@ -1707,7 +1707,7 @@ def completion( messages=messages, model_response=model_response, print_verbose=print_verbose, - optional_params=optional_params, + optional_params=new_params, litellm_params=litellm_params, logger_fn=logger_fn, encoding=encoding, diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 113f9413f..c2c172bfe 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1535,6 +1535,13 @@ "litellm_provider": "openrouter", "mode": "chat" }, + "openrouter/meta-llama/llama-3-70b-instruct": { + "max_tokens": 8192, + "input_cost_per_token": 0.0000008, + "output_cost_per_token": 0.0000008, + "litellm_provider": "openrouter", + "mode": "chat" + }, "j2-ultra": { "max_tokens": 8192, "max_input_tokens": 8192, diff --git a/litellm/proxy/_experimental/out/404.html b/litellm/proxy/_experimental/out/404.html index bb4c1890d..67bfb14c9 100644 --- a/litellm/proxy/_experimental/out/404.html +++ b/litellm/proxy/_experimental/out/404.html @@ -1 +1 @@ -