refactor(openai.py): moving openai text completion calls to http

2025-04-27 11:43:54 +00:00 · 2023-11-08 18:39:56 -08:00 · 2023-11-08 18:39:56 -08:00 · e66373bd47
commit e66373bd47
parent 901b0e690e
6 changed files with 211 additions and 66 deletions
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@ -269,3 +269,132 @@ class OpenAIChatCompletion(BaseLLM):
            else: 
                import traceback
                raise OpenAIError(status_code=500, message=traceback.format_exc())
 class OpenAITextCompletion(BaseLLM):
    """HTTP client for an OpenAI-style text-completion (``/completions``) endpoint.

    Requests are sent through the ``requests.Session`` returned by
    ``self.create_client_session()`` (not defined in this class; presumably
    provided by ``BaseLLM`` -- confirm).
    """

    _client_session: requests.Session

    def __init__(self) -> None:
        super().__init__()
        self._client_session = self.create_client_session()

    def validate_environment(self, api_key):
        """Build the HTTP headers for a request.

        Args:
            api_key: Sent as a bearer token. When falsy, no ``Authorization``
                header is added.

        Returns:
            dict: A JSON ``content-type`` header plus the optional bearer token.
        """
        headers = {"content-type": "application/json"}
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
        return headers

    def convert_to_model_response_object(self, response_object: Optional[dict]=None, model_response_object: Optional[ModelResponse]=None):
        """Copy a decoded ``/completions`` payload onto ``model_response_object``.

        Each entry of ``response_object["choices"]`` becomes a ``Choices`` whose
        message carries the entry's ``"text"`` with role ``"assistant"``.
        ``usage``, ``id`` and ``model`` are copied when present, and the raw
        payload is stored under ``_hidden_params["original_response"]`` so that
        ``litellm.text_completion()`` callers can get the original response.

        Args:
            response_object: Decoded JSON body of the HTTP response.
            model_response_object: Response object to populate in place.

        Returns:
            The populated ``model_response_object``.

        Raises:
            OpenAIError: status 500 if either argument is ``None`` or the
                payload does not have the expected structure.
        """
        if response_object is None or model_response_object is None:
            raise OpenAIError(status_code=500, message="Error in response object format")
        try:
            model_response_object.choices = [
                Choices(
                    finish_reason=raw_choice["finish_reason"],
                    index=idx,
                    message=Message(content=raw_choice["text"], role="assistant"),
                )
                for idx, raw_choice in enumerate(response_object["choices"])
            ]
            if "usage" in response_object:
                model_response_object.usage = response_object["usage"]
            if "id" in response_object:
                model_response_object.id = response_object["id"]
            if "model" in response_object:
                model_response_object.model = response_object["model"]
            # track original response, if users make a litellm.text_completion() request, we can return the original response
            model_response_object._hidden_params["original_response"] = response_object
            return model_response_object
        except Exception as err:
            # The error must be raised, not merely constructed; otherwise the
            # caller silently receives None for a malformed payload.
            raise OpenAIError(status_code=500, message="Invalid response object.") from err

    def completion(self,
                   model: Optional[str]=None,
                   messages: Optional[list]=None,
                   model_response: Optional[ModelResponse]=None,
                   print_verbose: Optional[Callable]=None,
                   api_key: Optional[str]=None,
                   api_base: Optional[str]=None,
                   logging_obj=None,
                   optional_params=None,
                   litellm_params=None,
                   logger_fn=None,
                   headers: Optional[dict]=None):
        """POST a text-completion request to ``{api_base}/completions``.

        Args:
            model: Model name placed in the request body. Required.
            messages: Chat-style messages. If ``messages[0]["content"]`` is a
                list it is sent as the prompt unchanged; otherwise every
                message's ``"content"`` is joined with single spaces. Required.
            model_response: Object populated for non-streaming calls.
            print_verbose: Accepted for interface parity; not used here.
            api_key: Used to build headers when ``headers`` is ``None`` and
                forwarded to the logging hooks.
            api_base: Base URL; ``/completions`` is appended to it.
            logging_obj: Object exposing ``pre_call`` / ``post_call``. Logging
                is skipped when it is ``None``.
            optional_params: Extra request-body fields (for example
                ``stream``). ``None`` is treated as an empty dict.
            litellm_params: Accepted for interface parity; not used here.
            logger_fn: Accepted for interface parity; not used here.
            headers: Pre-built headers; built from ``api_key`` when ``None``.

        Returns:
            For streaming requests, the iterator from ``response.iter_lines()``;
            otherwise the populated ``model_response``.

        Raises:
            OpenAIError: 422 when ``model`` or ``messages`` is missing, the
                upstream status code when the response is not 200, and 500
                (with the formatted traceback as message) for any other failure.
        """
        super().completion()
        try:
            if headers is None:
                headers = self.validate_environment(api_key=api_key)
            if model is None or messages is None:
                raise OpenAIError(status_code=422, message="Missing model or messages")
            # The parameter defaults to None; normalise once so the unpacking
            # and the stream lookup below cannot fail on it.
            optional_params = optional_params or {}
            api_base = f"{api_base}/completions"
            if len(messages) > 0 and "content" in messages[0] and isinstance(messages[0]["content"], list):
                # Note: internal logic - for enabling litellm.text_completion()
                # text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content']
                # https://platform.openai.com/docs/api-reference/completions/create
                prompt = messages[0]["content"]
            else:
                prompt = " ".join(message["content"] for message in messages)  # type: ignore
            data = {
                "model": model,
                "prompt": prompt,
                **optional_params,
            }
            ## LOGGING
            if logging_obj is not None:
                logging_obj.pre_call(
                    input=messages,
                    api_key=api_key,
                    additional_args={"headers": headers, "api_base": api_base, "data": data},
                )
            # Kept as an equality test so truthy-but-not-True values behave as before.
            stream = optional_params.get("stream", False) == True  # noqa: E712
            response = self._client_session.post(
                url=api_base,
                json=data,
                headers=headers,
                stream=stream,
            )
            if response.status_code != 200:
                raise OpenAIError(status_code=response.status_code, message=response.text)
            if stream:
                ## RESPONSE OBJECT
                # Raw lines are handed back; parsing is left to the caller.
                return response.iter_lines()
            ## LOGGING
            if logging_obj is not None:
                logging_obj.post_call(
                    input=prompt,
                    api_key=api_key,
                    original_response=response,
                    additional_args={
                        "headers": headers,
                        "api_base": api_base,
                    },
                )
            ## RESPONSE OBJECT
            return self.convert_to_model_response_object(response_object=response.json(), model_response_object=model_response)
        except OpenAIError:
            # Already in the shape callers expect; propagate untouched.
            raise
        except Exception:
            import traceback
            raise OpenAIError(status_code=500, message=traceback.format_exc())
--- a/litellm/main.py
+++ b/litellm/main.py
@ -49,7 +49,7 @@ from .llms import (
    palm,
    vertex_ai,
    maritalk)
-from .llms.openai import OpenAIChatCompletion
+from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
 from .llms.azure import AzureChatCompletion
 from .llms.prompt_templates.factory import prompt_factory, custom_prompt, function_call_prompt
 import tiktoken
@ -73,6 +73,7 @@ from litellm.utils import (
 ####### ENVIRONMENT VARIABLES ###################
 dotenv.load_dotenv()  # Loading env variables using dotenv
 openai_chat_completions = OpenAIChatCompletion()
 openai_text_completions = OpenAITextCompletion()
 azure_chat_completions = AzureChatCompletion()
 ####### COMPLETION ENDPOINTS ################
@ -498,14 +499,8 @@ def completion(
            )
        elif (
            custom_llm_provider == "text-completion-openai"
            or model in litellm.open_ai_text_completion_models
            or "ft:babbage-002" in model
            or "ft:davinci-002" in model  # support for finetuned completion models
            # NOTE: Do NOT add custom_llm_provider == "openai". 
            # this will break hosted vllm/proxy calls. 
            # see: https://docs.litellm.ai/docs/providers/vllm#calling-hosted-vllm-server. 
            # VLLM expects requests to call openai.ChatCompletion we need those requests to always 
            # call openai.ChatCompletion
        ):
            # print("calling custom openai provider")
            openai.api_type = "openai"
@ -558,43 +553,22 @@ def completion(
                },
            )
            ## COMPLETION CALL
-            response = openai.Completion.create(
+            model_response = openai_text_completions.completion(
-                model=model, 
+                model=model,
-                prompt=prompt,
+                messages=messages,
-                headers=headers,
+                model_response=model_response,
-                api_key = api_key,
+                print_verbose=print_verbose,
                api_base=api_base,
                **optional_params
            )
            if "stream" in optional_params and optional_params["stream"] == True:
                response = CustomStreamWrapper(response, model, custom_llm_provider="text-completion-openai", logging_obj=logging)
                return response
            ## LOGGING
            logging.post_call(
                input=prompt,
                api_key=api_key,
-                original_response=response,
+                api_base=api_base,
-                additional_args={
+                logging_obj=logging,
-                    "openai_organization": litellm.organization,
+                optional_params=optional_params,
-                    "headers": headers,
+                litellm_params=litellm_params,
-                    "api_base": openai.api_base,
+                logger_fn=logger_fn
                    "api_type": openai.api_type,
                },
            )
-            ## RESPONSE OBJECT
+            
-            model_response._hidden_params["original_response"] = response # track original response, if users make a litellm.text_completion() request, we can return the original response
+            if "stream" in optional_params and optional_params["stream"] == True:
-            choices_list = []
+                response = CustomStreamWrapper(model_response, model, custom_llm_provider="text-completion-openai", logging_obj=logging)
-            for idx, item in enumerate(response["choices"]):
+                return response
                if len(item["text"]) > 0: 
                    message_obj = Message(content=item["text"])
                else: 
                    message_obj = Message(content=None)
                choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj)
                choices_list.append(choice_obj)
            model_response["choices"] = choices_list
            model_response["created"] = response.get("created", time.time())
            model_response["model"] = model
            model_response["usage"] = response.get("usage", 0)
            response = model_response
        elif (
            "replicate" in model or 
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@ -391,11 +391,12 @@ def test_completion_openai():
 def test_completion_text_openai():
    """Smoke-test ``completion`` with the ``gpt-3.5-turbo-instruct`` model.

    Verbose mode is switched on, the response is printed, and any exception
    raised along the way is reported through ``pytest.fail``.
    """
    try:
        litellm.set_verbose = True
        result = completion(
            model="gpt-3.5-turbo-instruct",
            messages=messages,
        )
        print(result)
    except Exception as err:
        pytest.fail(f"Error occurred: {err}")
-# test_completion_text_openai()
+test_completion_text_openai()
 def test_completion_openai_with_optional_params():
    try:
--- a/litellm/tests/test_exceptions.py
+++ b/litellm/tests/test_exceptions.py
@ -62,7 +62,7 @@ def test_context_window_with_fallbacks(model):
 # for model in litellm.models_by_provider["bedrock"]:
 #     test_context_window(model=model)
-# test_context_window(model="gpt-3.5-turbo")
+# test_context_window(model="gpt-3.5-turbo-instruct")
 # test_context_window_with_fallbacks(model="command-nightly")
 # Test 2: InvalidAuth Errors
@pytest.mark.parametrize("model", models)
@ -70,7 +70,7 @@ def invalid_auth(model):  # set the model key to an invalid key, depending on th
    messages = [{"content": "Hello, how are you?", "role": "user"}]
    temporary_key = None
    try:
-        if model == "gpt-3.5-turbo":
+        if model == "gpt-3.5-turbo" or model == "gpt-3.5-turbo-instruct":
            temporary_key = os.environ["OPENAI_API_KEY"]
            os.environ["OPENAI_API_KEY"] = "bad-key"
        elif model == "bedrock/anthropic.claude-v2":
@ -158,7 +158,7 @@ def invalid_auth(model):  # set the model key to an invalid key, depending on th
 # for model in litellm.models_by_provider["bedrock"]:
 #     invalid_auth(model=model)
-# invalid_auth(model="gpt-3.5-turbo")
+# invalid_auth(model="gpt-3.5-turbo-instruct")
 # Test 3: Invalid Request Error 
@pytest.mark.parametrize("model", models)
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@ -916,7 +916,31 @@ def test_openai_chat_completion_call():
        print(f"error occurred: {traceback.format_exc()}")
        pass
-test_openai_chat_completion_call()
+# test_openai_chat_completion_call()
 def test_openai_text_completion_call():
    """Stream a completion from ``gpt-3.5-turbo-instruct`` and require non-empty text.

    Every streamed chunk is passed through ``streaming_format_tests``, which
    returns the chunk's text and a "finished" flag; iteration stops at the
    first finished chunk. An all-whitespace result is treated as an error.

    NOTE: failures are printed rather than re-raised, so this test does not
    fail on errors. That mirrors the neighbouring streaming test; confirm
    whether it should fail hard instead.
    """
    try:
        litellm.set_verbose = True
        response = completion(
            model="gpt-3.5-turbo-instruct", messages=messages, stream=True
        )
        pieces = []
        for idx, chunk in enumerate(response):
            chunk, finished = streaming_format_tests(idx, chunk)
            pieces.append(chunk)
            if finished:
                break
        complete_response = "".join(pieces)
        if not complete_response.strip():
            raise Exception("Empty response received")
        print(f"complete response: {complete_response}")
    except Exception:
        # Narrower than a bare except, so Ctrl-C / SystemExit still stop the run.
        print(f"error occurred: {traceback.format_exc()}")
 test_openai_text_completion_call()
 # # test on together ai completion call - starcoder
 def test_together_ai_completion_call_starcoder():
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -2890,19 +2890,19 @@ def exception_type(
                exception_type = type(original_exception).__name__
            else:
                exception_type = ""
-            if custom_llm_provider == "openai":
+            if custom_llm_provider == "openai" or custom_llm_provider == "text-completion-openai":
                if "This model's maximum context length is" in error_str:
                    exception_mapping_worked = True
                    raise ContextWindowExceededError(
-                        message=f"AzureException - {original_exception.message}",
+                        message=f"OpenAIException - {original_exception.message}",
-                        llm_provider="azure",
+                        llm_provider="openai",
                        model=model
                    )
                elif "invalid_request_error" in error_str:
                    exception_mapping_worked = True
                    raise InvalidRequestError(
-                        message=f"AzureException - {original_exception.message}",
+                        message=f"OpenAIException - {original_exception.message}",
-                        llm_provider="azure",
+                        llm_provider="openai",
                        model=model
                    )
                elif hasattr(original_exception, "status_code"):
@ -4013,16 +4013,33 @@ class CustomStreamWrapper:
            else:
                return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
-        except:
+        except Exception as e:
            traceback.print_exc()
-            pass
+            raise e
    def handle_openai_text_completion_chunk(self, chunk):
        """Parse one raw line of a streamed text-completion response.

        Args:
            chunk: A bytes line from the stream; it is decoded as UTF-8.

        Returns:
            dict: ``{"text", "is_finished", "finish_reason"}``. Lines that do
            not start with ``data:`` (for example blank separator lines) and
            the ``data: [DONE]`` terminator yield empty text with
            ``is_finished`` False.

        Raises:
            ValueError: when a non-``data:`` line contains ``"error"``.
            Exception: any parsing failure is printed and re-raised.
        """
        try:
            str_line = chunk.decode("utf-8")  # Convert bytes to string
            text = ""
            is_finished = False
            finish_reason = None
            if str_line.startswith("data:"):
                payload = str_line[len("data:"):].strip()
                if payload == "[DONE]":
                    # End-of-stream sentinel is not JSON; treat it as an empty chunk.
                    return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
                first_choice = json.loads(payload)["choices"][0]
                text = first_choice.get("text", "")
                print_verbose(f"delta content: {text}")
                if first_choice.get("finish_reason", None):
                    is_finished = True
                    finish_reason = first_choice["finish_reason"]
                return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
            elif "error" in str_line:
                raise ValueError(f"Unable to parse response. Original response: {str_line}")
            else:
                return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
        except Exception:
            traceback.print_exc()
            raise
    def handle_baseten_chunk(self, chunk):
        try:
@ -4146,9 +4163,6 @@ class CustomStreamWrapper:
                    completion_obj["content"] = response_obj["text"]
                    if response_obj["is_finished"]: 
                        model_response.choices[0].finish_reason = response_obj["finish_reason"]
                elif self.custom_llm_provider and self.custom_llm_provider == "text-completion-openai":
                    chunk = next(self.completion_stream)
                    completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk)
                elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud":
                    try: 
                        chunk = next(self.completion_stream)
@ -4235,12 +4249,15 @@ class CustomStreamWrapper:
                    print_verbose(f"completion obj content: {completion_obj['content']}")
                    if response_obj["is_finished"]: 
                        model_response.choices[0].finish_reason = response_obj["finish_reason"]
-                else: # openai chat/azure models
+                elif self.custom_llm_provider == "text-completion-openai":
                    chunk = next(self.completion_stream)
-                    model_response = chunk
+                    response_obj = self.handle_openai_text_completion_chunk(chunk)
-                    # LOGGING
+                    completion_obj["content"] = response_obj["text"]
-                    threading.Thread(target=self.logging_obj.success_handler, args=(model_response,)).start()
+                    print_verbose(f"completion obj content: {completion_obj['content']}")
-                    return model_response
+                    if response_obj["is_finished"]: 
                        model_response.choices[0].finish_reason = response_obj["finish_reason"]
                else: # openai chat/azure models
                    raise Exception("Unmapped Model Error")
                model_response.model = self.model
                if len(completion_obj["content"]) > 0: # cannot set content of an OpenAI Object to be an empty string