diff --git a/litellm/proxy/api_log.json b/litellm/proxy/api_log.json
deleted file mode 100644
index 62020b025..000000000
--- a/litellm/proxy/api_log.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "20231014160921359878": {
-    "pre_api_call": {
-      "model": "codellama/CodeLlama-7b-Instruct-hf",
-      "messages": [
-        {
-          "role": "user",
-          "content": "hey"
-        }
-      ],
-      "optional_params": {
-        "temperature": 0.5,
-        "stream": true,
-        "max_new_tokens": 1024,
-        "details": true,
-        "return_full_text": false
-      },
-      "litellm_params": {
-        "return_async": false,
-        "api_key": null,
-        "force_timeout": 600,
-        "logger_fn": null,
-        "verbose": false,
-        "custom_llm_provider": "huggingface",
-        "api_base": "https://app.baseten.co/models/pP8JeaB/predict",
-        "litellm_call_id": "d75891a0-d567-470a-a6cd-137e698da092",
-        "model_alias_map": {},
-        "completion_call_id": null,
-        "metadata": null,
-        "stream_response": {}
-      },
-      "input": "[INST] hey [/INST]\n",
-      "api_key": "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",
-      "additional_args": {
-        "complete_input_dict": {
-          "inputs": "[INST] hey [/INST]\n",
-          "parameters": {
-            "temperature": 0.5,
-            "stream": true,
-            "max_new_tokens": 1024,
-            "details": true,
-            "return_full_text": false
-          },
-          "stream": true
-        },
-        "task": "text-generation-inference",
-        "headers": {
-          "Authorization": "Api-Key SQqH1uZg.SSN79Bq997k4TRdzW9HBCghx9KyL0EJA"
-        }
-      },
-      "log_event_type": "pre_api_call"
-    },
-    "post_api_call": {
-      "model": "codellama/CodeLlama-7b-Instruct-hf",
-      "messages": [
-        {
-          "role": "user",
-          "content": "hey"
-        }
-      ],
-      "optional_params": {
-        "temperature": 0.5,
-        "stream": true,
-        "max_new_tokens": 1024,
-        "details": true,
-        "return_full_text": false
-      },
-      "litellm_params": {
-        "return_async": false,
-        "api_key": null,
-        "force_timeout": 600,
-        "logger_fn": null,
-        "verbose": false,
-        "custom_llm_provider": "huggingface",
-        "api_base": "https://app.baseten.co/models/pP8JeaB/predict",
-        "litellm_call_id": "d75891a0-d567-470a-a6cd-137e698da092",
-        "model_alias_map": {},
-        "completion_call_id": null,
-        "metadata": null,
-        "stream_response": {}
-      },
-      "input": null,
-      "api_key": null,
-      "additional_args": {},
-      "log_event_type": "post_api_call",
-      "original_response": "",
-      "end_time": 
\ No newline at end of file
diff --git a/litellm/proxy/cost.log b/litellm/proxy/cost.log
deleted file mode 100644
index e69de29bb..000000000
diff --git a/litellm/proxy/costs.json b/litellm/proxy/costs.json
deleted file mode 100644
index 8211cec22..000000000
--- a/litellm/proxy/costs.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "Oct-12-2023": {
-    "claude-2": {
-      "cost": 0.02365918,
-      "num_requests": 1
-    }
-  }
-}
\ No newline at end of file
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index b4965dc64..a6cf52a6c 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -120,6 +120,7 @@ config_dir = appdirs.user_config_dir("litellm")
 user_config_path = os.getenv(
     "LITELLM_CONFIG_PATH", os.path.join(config_dir, config_filename)
 )
+experimental = False
 #### GLOBAL VARIABLES ####
 llm_router: Optional[litellm.Router] = None
 llm_model_list: Optional[list] = None
@@ -354,7 +355,7 @@ def initialize(
     save,
     config
 ):
-    global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers, llm_model_list, llm_router, server_settings
+    global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers, experimental, llm_model_list, llm_router, server_settings
     generate_feedback_box()
     user_model = model
     user_debug = debug
@@ -393,6 +394,8 @@ def initialize(
         dynamic_config["general"]["max_budget"] = max_budget
     if debug: # litellm-specific param
         litellm.set_verbose = True
+    if experimental:
+        pass
     if save:
         save_params_to_config(dynamic_config)
         with open(user_config_path) as f:
@@ -537,6 +540,22 @@ async def chat_completion(request: Request, model: Optional[str] = None):
         return {"error": error_msg}
 
 
+@router.post("/router/chat/completions")
+async def router_completion(request: Request):
+    try:
+        body = await request.body()
+        body_str = body.decode()
+        try:
+            data = ast.literal_eval(body_str)
+        except:
+            data = json.loads(body_str)
+        return {"data": data}
+    except Exception as e:
+        print(f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`")
+        error_traceback = traceback.format_exc()
+        error_msg = f"{str(e)}\n\n{error_traceback}"
+        return {"error": error_msg}
+
 @router.get("/ollama_logs")
 async def retrieve_server_log(request: Request):
     filepath = os.path.expanduser("~/.ollama/logs/server.log")
diff --git a/litellm/proxy/start.sh b/litellm/proxy/start.sh
index 586366108..44df50aaa 100755
--- a/litellm/proxy/start.sh
+++ b/litellm/proxy/start.sh
@@ -1,3 +1,2 @@
 #!/bin/bash
-python3 proxy_cli.py --config -f ../../secrets_template.toml
 python3 proxy_cli.py
\ No newline at end of file
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
new file mode 100644
index 000000000..348dd19f0
--- /dev/null
+++ b/litellm/proxy/utils.py
@@ -0,0 +1,98 @@
+# import threading, time, litellm
+# import concurrent.futures
+# """
+# v1:
+
+# 1. `--experimental_async` starts 2 background threads:
+#     - 1. to check the redis queue:
+#         - if job available
+#             - it dequeues as many jobs as healthy endpoints
+#             - calls llm api -> saves response in redis cache
+#     - 2. to check the llm apis:
+#         - check if endpoints are healthy (unhealthy = 4xx / 5xx call or >1min. queue)
+#         - which one is least busy
+# 2. /router/chat/completions: receives request -> adds to redis queue -> returns {run_id, started_at, request_obj}
+# 3. /router/chat/completions/runs/{run_id}: returns {status: _, [optional] response_obj: _}
+# """
+
+# def _start_health_check_thread():
+#     """
+#     Starts a separate thread to perform health checks periodically.
+#     """
+#     health_check_thread = threading.Thread(target=_perform_health_checks, daemon=True)
+#     health_check_thread.start()
+#     llm_call_thread = threading.Thread(target=_llm_call_thread, daemon=True)
+#     llm_call_thread.start()
+
+
+# def _llm_call_thread():
+#     """
+#     Periodically performs job checks on the redis queue.
+#     If available, make llm api calls.
+#     Write result to redis cache (1 min ttl)
+#     """
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#         while True:
+#             job_checks = _job_check()
+#             future_to_job = {executor.submit(_llm_api_call, job): job for job in job_checks}
+#             for future in concurrent.futures.as_completed(future_to_job):
+#                 job = future_to_job[future]
+#                 try:
+#                     result = future.result()
+#                 except Exception as exc:
+#                     print(f'{job} generated an exception: {exc}')
+#                 else:
+#                     _write_to_cache(job, result, ttl=1*60)
+#             time.sleep(1) # sleep 1 second to avoid overloading the server
+
+
+
+# def _perform_health_checks():
+#     """
+#     Periodically performs health checks on the servers.
+#     Updates the list of healthy servers accordingly.
+#     """
+#     while True:
+#         healthy_deployments = _health_check()
+#         # Adjust the time interval based on your needs
+#         time.sleep(15)
+
+# def _job_check():
+#     """
+#     Periodically performs job checks on the redis queue.
+#     Returns the list of available jobs - len(available_jobs) == len(healthy_endpoints),
+#     e.g. don't dequeue a gpt-3.5-turbo job if there's no healthy deployments left
+#     """
+#     pass
+
+# def _llm_api_call(**data):
+#     """
+#     Makes the litellm.completion() call with 3 retries
+#     """
+#     return litellm.completion(num_retries=3, **data)
+
+# def _write_to_cache():
+#     """
+#     Writes the result to a redis cache in the form (key:job_id, value: )
+#     """
+#     pass
+
+# def _health_check():
+#     """
+#     Performs a health check on the deployments
+#     Returns the list of healthy deployments
+#     """
+#     healthy_deployments = []
+#     for deployment in model_list:
+#         litellm_args = deployment["litellm_params"]
+#         try:
+#             start_time = time.time()
+#             litellm.completion(messages=[{"role": "user", "content": ""}], max_tokens=1, **litellm_args) # hit the server with a blank message to see how long it takes to respond
+#             end_time = time.time()
+#             response_time = end_time - start_time
+#             logging.debug(f"response_time: {response_time}")
+#             healthy_deployments.append((deployment, response_time))
+#             healthy_deployments.sort(key=lambda x: x[1])
+#         except Exception as e:
+#             pass
+#     return healthy_deployments