move baseten to a REST endpoint call
commit 6e30b234ac
parent 725611aa58
10 changed files with 173 additions and 33 deletions
litellm/__init__.py
@@ -21,6 +21,7 @@ huggingface_key: Optional[str] = None
 vertex_project: Optional[str] = None
 vertex_location: Optional[str] = None
 togetherai_api_key: Optional[str] = None
+baseten_key: Optional[str] = None
 caching = False
 caching_with_models = False # if you want the caching key to be model + prompt
 model_alias_map: Dict[str, str] = {}
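For context, the new module-level key can be supplied either directly on the litellm module or through the environment variable that completion() also checks (see the main.py hunk below). A minimal sketch; the key values are placeholders:

import os
import litellm

# Either set the key on the module...
litellm.baseten_key = "YOUR_BASETEN_API_KEY"  # placeholder value
# ...or export it as an environment variable, which completion() falls back to.
os.environ["BASETEN_API_KEY"] = "YOUR_BASETEN_API_KEY"  # placeholder value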
3 binary files changed (contents not shown)
litellm/integrations/litedebugger.py
@@ -29,7 +29,7 @@ class LiteDebugger:
         )

     def input_log_event(
-        self, model, messages, end_user, litellm_call_id, print_verbose
+        self, model, messages, end_user, litellm_call_id, print_verbose, litellm_params, optional_params
     ):
         try:
             print_verbose(
@@ -42,6 +42,8 @@ class LiteDebugger:
                 "status": "initiated",
                 "litellm_call_id": litellm_call_id,
                 "user_email": self.user_email,
+                "litellm_params": litellm_params,
+                "optional_params": optional_params
             }
             response = requests.post(
                 url=self.api_url,
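Roughly, the debugger's input log now forwards the call's litellm_params and optional_params alongside the existing fields. An illustrative payload shape; the variable name and the example values are assumptions, and only the keys visible in the hunk plus the two new ones are certain:

# Hypothetical payload posted by input_log_event after this change.
payload = {
    "status": "initiated",
    "litellm_call_id": "uuid-1234",           # placeholder
    "user_email": "user@example.com",         # placeholder
    "litellm_params": {"logger_fn": None},    # now forwarded from the Logging object
    "optional_params": {"temperature": 0.1},  # now forwarded from the Logging object
}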
litellm/llms/baseten.py (new file, 129 lines)
import os, json
from enum import Enum
import requests
import time
from typing import Callable
from litellm.utils import ModelResponse

class BasetenError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class BasetenLLM:
    def __init__(
        self, encoding, logging_obj, api_key=None
    ):
        self.encoding = encoding
        self.completion_url_fragment_1 = "https://app.baseten.co/models/"
        self.completion_url_fragment_2 = "/predict"
        self.api_key = api_key
        self.logging_obj = logging_obj
        self.validate_environment(api_key=api_key)

    def validate_environment(
        self, api_key
    ):  # set up the environment required to run the model
        # set the api key
        if self.api_key == None:
            raise ValueError(
                "Missing Baseten API Key - A call is being made to baseten but no key is set either in the environment variables or via params"
            )
        self.api_key = api_key
        self.headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "Authorization": "Api-Key " + self.api_key,
        }

    def completion(
        self,
        model: str,
        messages: list,
        model_response: ModelResponse,
        print_verbose: Callable,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
    ):  # logic for parsing in - calling - parsing out model completion calls
        model = model
        prompt = ""
        for message in messages:
            if "role" in message:
                if message["role"] == "user":
                    prompt += (
                        f"{message['content']}"
                    )
                else:
                    prompt += (
                        f"{message['content']}"
                    )
            else:
                prompt += f"{message['content']}"
        data = {
            "prompt": prompt,
            # "instruction": prompt, # some baseten models require the prompt to be passed in via the 'instruction' kwarg
            # **optional_params,
        }

        ## LOGGING
        self.logging_obj.pre_call(
            input=prompt,
            api_key=self.api_key,
            additional_args={"complete_input_dict": data},
        )
        ## COMPLETION CALL
        response = requests.post(
            self.completion_url_fragment_1 + model + self.completion_url_fragment_2, headers=self.headers, data=json.dumps(data)
        )
        if "stream" in optional_params and optional_params["stream"] == True:
            return response.iter_lines()
        else:
            ## LOGGING
            self.logging_obj.post_call(
                input=prompt,
                api_key=self.api_key,
                original_response=response.text,
                additional_args={"complete_input_dict": data},
            )
            print_verbose(f"raw model_response: {response.text}")
            ## RESPONSE OBJECT
            completion_response = response.json()
            if "error" in completion_response:
                raise BasetenError(
                    message=completion_response["error"],
                    status_code=response.status_code,
                )
            else:
                if "model_output" in completion_response:
                    if isinstance(completion_response["model_output"], str):
                        model_response["choices"][0]["message"]["content"] = completion_response["model_output"]
                    elif isinstance(completion_response["model_output"], dict) and "data" in completion_response["model_output"] and isinstance(completion_response["model_output"]["data"], list):
                        model_response["choices"][0]["message"]["content"] = completion_response["model_output"]["data"][0]
                else:
                    raise ValueError(f"Unable to parse response. Original response: {response.text}")

            ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
            prompt_tokens = len(
                self.encoding.encode(prompt)
            )
            completion_tokens = len(
                self.encoding.encode(model_response["choices"][0]["message"]["content"])
            )

            model_response["created"] = time.time()
            model_response["model"] = model
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            }
            return model_response

    def embedding(
        self,
    ): # logic for parsing in - calling - parsing out model embedding calls
        pass
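For reference, the REST call that BasetenLLM.completion wraps boils down to a single requests.post against the deployed model's /predict endpoint, using the same URL fragments and auth header built above. A minimal standalone sketch; the model version ID and prompt are placeholders:

import os
import json
import requests

model_version_id = "YOUR_MODEL_VERSION_ID"  # placeholder Baseten deployment ID

headers = {
    "accept": "application/json",
    "content-type": "application/json",
    # Same auth scheme validate_environment() builds
    "Authorization": "Api-Key " + os.environ["BASETEN_API_KEY"],
}
data = {"prompt": "What's the meaning of life?"}

# completion_url_fragment_1 + model + completion_url_fragment_2
resp = requests.post(
    f"https://app.baseten.co/models/{model_version_id}/predict",
    headers=headers,
    data=json.dumps(data),
)
print(resp.json())  # expected to contain a "model_output" field on success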
litellm/main.py
@@ -21,6 +21,7 @@ from litellm.utils import (
 )
 from .llms.anthropic import AnthropicLLM
 from .llms.huggingface_restapi import HuggingfaceRestAPILLM
+from .llms.baseten import BasetenLLM
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
 
@@ -73,6 +74,7 @@ def completion(
     max_tokens=float("inf"),
     presence_penalty=0,
     frequency_penalty=0,
+    num_beams=1,
     logit_bias={},
     user="",
     deployment_id=None,
@@ -681,36 +683,31 @@ def completion(
         custom_llm_provider == "baseten"
         or litellm.api_base == "https://app.baseten.co"
     ):
-        import baseten
-        base_ten_key = get_secret("BASETEN_API_KEY")
-        baseten.login(base_ten_key)
-        prompt = " ".join([message["content"] for message in messages])
-        ## LOGGING
-        logging.pre_call(input=prompt, api_key=base_ten_key, model=model)
-        base_ten__model = baseten.deployed_model_version_id(model)
-        completion_response = base_ten__model.predict({"prompt": prompt})
-        if type(completion_response) == dict:
-            completion_response = completion_response["data"]
-            if type(completion_response) == dict:
-                completion_response = completion_response["generated_text"]
-        ## LOGGING
-        logging.post_call(
-            input=prompt,
-            api_key=base_ten_key,
-            original_response=completion_response,
-        )
-        ## RESPONSE OBJECT
-        model_response["choices"][0]["message"]["content"] = completion_response
-        model_response["created"] = time.time()
-        model_response["model"] = model
+        custom_llm_provider = "baseten"
+        baseten_key = (
+            api_key
+            or litellm.baseten_key
+            or os.environ.get("BASETEN_API_KEY")
+        )
+        baseten_client = BasetenLLM(
+            encoding=encoding, api_key=baseten_key, logging_obj=logging
+        )
+        model_response = baseten_client.completion(
+            model=model,
+            messages=messages,
+            model_response=model_response,
+            print_verbose=print_verbose,
+            optional_params=optional_params,
+            litellm_params=litellm_params,
+            logger_fn=logger_fn,
+        )
+        if "stream" in optional_params and optional_params["stream"] == True:
+            # don't try to access stream object,
+            response = CustomStreamWrapper(
+                model_response, model, custom_llm_provider="huggingface"
+            )
+            return response
         response = model_response
 
     elif custom_llm_provider == "petals" or (
         litellm.api_base and "chat.petals.dev" in litellm.api_base
     ):
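With the REST client wired in, a Baseten-hosted model is still reached through the usual completion() entry point. A hedged usage sketch: the model version ID and key are placeholders, and passing custom_llm_provider="baseten" relies on the routing condition shown in the hunk above.

import os
from litellm import completion

os.environ["BASETEN_API_KEY"] = "YOUR_BASETEN_API_KEY"  # placeholder

# "abc123" stands in for a real Baseten model version ID.
response = completion(
    model="abc123",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    custom_llm_provider="baseten",
    temperature=0.1,
    top_p=0.75,
)
print(response["choices"][0]["message"]["content"])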
@@ -13,4 +13,5 @@ from litellm import embedding, completion
 litellm.set_verbose = True
 
 # Test: Check if the alias created via LiteDebugger is mapped correctly
-print(completion("wizard-lm", messages=[{"role": "user", "content": "Hey, how's it going?"}]))
+{"top_p": 0.75, "prompt": "What's the meaning of life?", "num_beams": 4, "temperature": 0.1}
+print(completion("llama-7b", messages=[{"role": "user", "content": "Hey, how's it going?"}], top_p=0.1, temperature=0, num_beams=4, max_tokens=60))
litellm/utils.py
@@ -154,6 +154,7 @@ class Logging:
         self.optional_params = optional_params
         self.litellm_params = litellm_params
         self.logger_fn = litellm_params["logger_fn"]
+        print_verbose(f"self.optional_params: {self.optional_params}")
         self.model_call_details = {
             "model": model,
             "messages": messages,
@@ -214,6 +215,8 @@ class Logging:
                     end_user=litellm._thread_context.user,
                     litellm_call_id=self.
                     litellm_params["litellm_call_id"],
+                    litellm_params=self.model_call_details["litellm_params"],
+                    optional_params=self.model_call_details["optional_params"],
                     print_verbose=print_verbose,
                 )
             except Exception as e:
@@ -539,7 +542,7 @@ def get_litellm_params(
     return litellm_params
 
 
-def get_optional_params(
+def get_optional_params(  # use the openai defaults
     # 12 optional params
     functions=[],
     function_call="",
@@ -552,6 +555,7 @@ def get_optional_params(
     presence_penalty=0,
     frequency_penalty=0,
     logit_bias={},
+    num_beams=1,
     user="",
     deployment_id=None,
     model=None,
@@ -613,7 +617,13 @@ def get_optional_params(
         optional_params["temperature"] = temperature
         optional_params["top_p"] = top_p
         optional_params["top_k"] = top_k
+    elif custom_llm_provider == "baseten":
+        optional_params["temperature"] = temperature
+        optional_params["top_p"] = top_p
+        optional_params["top_k"] = top_k
+        optional_params["num_beams"] = num_beams
+        if max_tokens != float("inf"):
+            optional_params["max_new_tokens"] = max_tokens
     else: # assume passing in params for openai/azure openai
         if functions != []:
             optional_params["functions"] = functions
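The effect of the new baseten branch is easiest to see in isolation. A small standalone sketch that mirrors the branch's mapping; the function name and the default values are illustrative, not the real get_optional_params signature:

def map_baseten_params(temperature=1, top_p=1, top_k=40, num_beams=1, max_tokens=float("inf")):
    # Mirrors the elif custom_llm_provider == "baseten" branch above.
    optional_params = {
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "num_beams": num_beams,
    }
    if max_tokens != float("inf"):
        optional_params["max_new_tokens"] = max_tokens
    return optional_params

print(map_baseten_params(temperature=0, top_p=0.1, num_beams=4, max_tokens=60))
# {'temperature': 0, 'top_p': 0.1, 'top_k': 40, 'num_beams': 4, 'max_new_tokens': 60}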
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.478"
+version = "0.1.479"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"