Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 10:44:24 +00:00)
moving proxy server to the top of repo
parent 8543d89418
commit 8ef47524bf
10 changed files with 0 additions and 0 deletions
proxy-server/utils.py (Normal file, 107 lines)
@@ -0,0 +1,107 @@
from litellm import completion
import os, dotenv
import json

dotenv.load_dotenv()

############### Advanced ##########################

########### streaming ############################
def generate_responses(response):
    for chunk in response:
        yield json.dumps({"response": chunk}) + "\n"

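# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file): one way
# generate_responses() could back a streaming HTTP endpoint. Flask, the route
# path, and the request shape are assumptions for this example only.
# ---------------------------------------------------------------------------
from flask import Flask, Response, request

app = Flask(__name__)

@app.route("/chat/completions", methods=["POST"])
def chat_completions():
    data = request.get_json()
    response = completion(**data)
    if data.get("stream"):
        # stream newline-delimited JSON chunks back to the client as they arrive
        return Response(generate_responses(response), mimetype="application/json")
    # non-streaming: return the full response as a single JSON payload
    return Response(json.dumps({"response": str(response)}), mimetype="application/json")
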
################ ERROR HANDLING #####################
# implement model fallbacks, cooldowns, and retries
# if a model fails, assume it was rate limited and let it cool down for 60s
def handle_error(data):
    import time

    # retry the completion() request with fallback models
    response = None
    start_time = time.time()
    rate_limited_models = set()
    model_expiration_times = {}
    fallback_strategy = ["gpt-3.5-turbo", "command-nightly", "claude-2"]
    while response is None and time.time() - start_time < 45:  # retry for up to 45s
        for model in fallback_strategy:
            try:
                if model in rate_limited_models:  # model is currently cooling down
                    if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]:
                        rate_limited_models.remove(model)  # 60s cooldown has passed; put model back in rotation
                    else:
                        continue  # still cooling down; skip this model
                print(f"calling model {model}")
                # swap the fallback model into the request without mutating the caller's dict
                response = completion(**{**data, "model": model})
                if response is not None:
                    return response
            except Exception:
                rate_limited_models.add(model)
                model_expiration_times[model] = time.time() + 60  # cool this model down for 60s
    return response

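# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file): falling back to
# handle_error() when the primary completion() call raises. The wrapper name
# and payload shape are arbitrary examples, not a fixed litellm schema.
# ---------------------------------------------------------------------------
def completion_with_fallbacks(data):
    try:
        return completion(**data)
    except Exception:
        # primary model failed; rotate through the fallback list with cooldowns
        return handle_error(data)
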
########### Pricing is tracked in Supabase ############


################ Caching ############################
import uuid

cache_collection = None

# Add a response to the cache
def add_cache(messages, model_response):
    global cache_collection
    if cache_collection is None:
        make_collection()

    user_question = message_to_user_question(messages)

    # Add the user question and model response to the cache
    cache_collection.add(
        documents=[user_question],
        metadatas=[{"model_response": str(model_response)}],
        ids=[str(uuid.uuid4())]
    )
    return

# Retrieve a response from the cache if similarity is above the threshold
def get_cache(messages, similarity_threshold):
    try:
        global cache_collection
        if cache_collection is None:
            make_collection()

        user_question = message_to_user_question(messages)

        # Query the cache for the user question
        results = cache_collection.query(
            query_texts=[user_question],
            n_results=1
        )

        if len(results['distances'][0]) == 0:
            return None  # cache is empty

        distance = results['distances'][0][0]
        sim = 1 - distance  # convert the returned distance into a similarity score

        if sim >= similarity_threshold:
            return results['metadatas'][0][0]["model_response"]  # return the cached response
        else:
            return None  # no cache hit
    except Exception as e:
        print("Error in get_cache", e)
        raise e

# Initialize the cache collection
def make_collection():
    import chromadb
    global cache_collection
    client = chromadb.Client()
    cache_collection = client.create_collection("llm_responses")


# HELPER: extract the user's question from the messages list
def message_to_user_question(messages):
    user_question = ""
    for message in messages:
        if message["role"] == "user":
            user_question += message["content"]
    return user_question
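
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file): a read-through
# cache around completion() built from the helpers above. The 0.8 similarity
# threshold and default model are arbitrary example values, not litellm defaults.
# ---------------------------------------------------------------------------
def cached_completion(messages, model="gpt-3.5-turbo"):
    cached_response = get_cache(messages, similarity_threshold=0.8)
    if cached_response is not None:
        return cached_response  # a semantically similar question was answered before
    response = completion(model=model, messages=messages)
    add_cache(messages, response)  # store for future similar questions
    return response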