Add integration with MLflow Tracing (#6147)

* Add MLflow logger

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>

* Streaming handling

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>

* lint

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>

* address comments and fix issues

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>

* address comments and fix issues

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>

* Move logger construction code

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>

* Add docs

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>

* async handlers

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>

* new picture

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>

---------

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>
Yuki Watanabe, 2024-11-14 00:00:41 +09:00 (committed by GitHub)
commit 82f405adcb (parent 1e097bbfbe)
7 changed files with 402 additions and 1 deletion


@@ -113,7 +113,7 @@ for part in response:
## Logging Observability ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
-LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, DynamoDB, s3 Buckets, Helicone, Promptlayer, Traceloop, Athina, Slack
+LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, DynamoDB, s3 Buckets, Helicone, Promptlayer, Traceloop, Athina, Slack, MLflow
```python
from litellm import completion


@@ -0,0 +1,108 @@
# MLflow
## What is MLflow?
**MLflow** is an end-to-end open source MLOps platform for [experiment tracking](https://www.mlflow.org/docs/latest/tracking.html), [model management](https://www.mlflow.org/docs/latest/models.html), [evaluation](https://www.mlflow.org/docs/latest/llms/llm-evaluate/index.html), [observability (tracing)](https://www.mlflow.org/docs/latest/llms/tracing/index.html), and [deployment](https://www.mlflow.org/docs/latest/deployment/index.html). MLflow empowers teams to collaboratively develop and refine LLM applications efficiently.
MLflow's integration with LiteLLM provides advanced observability compatible with OpenTelemetry.
<Image img={require('../../img/mlflow_tracing.png')} />
## Getting Started
Install MLflow:
```shell
pip install mlflow
```
To enable LiteLLM tracing:
```python
import mlflow
mlflow.litellm.autolog()
# Alternatively, you can set the callback manually in LiteLLM
# litellm.callbacks = ["mlflow"]
```
Since MLflow is open-source, no sign-up or API key is needed to log traces!
```python
import litellm
import os
# Set your LLM provider's API key
os.environ["OPENAI_API_KEY"] = ""
# Call LiteLLM as usual
response = litellm.completion(
model="gpt-4o-mini",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
Open the MLflow UI and go to the `Traces` tab to view logged traces:
```bash
mlflow ui
```
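If you log to a remote MLflow Tracking Server rather than local files, you can optionally point MLflow at it and choose an experiment before making LiteLLM calls. This is a small optional sketch that is not part of the change above; the URI and experiment name are placeholders:

```python
import mlflow

# Optional setup (placeholders): send traces to a running tracking server
# and group them under a named experiment.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("litellm-tracing-demo")
```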
## Exporting Traces to OpenTelemetry collectors
MLflow traces are compatible with OpenTelemetry. You can export traces to any OpenTelemetry collector (e.g., Jaeger, Zipkin, Datadog, New Relic) by setting the endpoint URL in the environment variables.
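As a likely prerequisite for OTLP export (an assumption based on standard OpenTelemetry setup, not something stated in this change), the OTLP exporter package should be installed alongside MLflow:

```shell
pip install opentelemetry-exporter-otlp
```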
```python
# Set the endpoint of the OpenTelemetry Collector
os.environ["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"] = "http://localhost:4317/v1/traces"
# Optionally, set the service name to group traces
os.environ["OTEL_SERVICE_NAME"] = "<your-service-name>"
```
See [MLflow documentation](https://mlflow.org/docs/latest/llms/tracing/index.html#using-opentelemetry-collector-for-exporting-traces) for more details.
## Combine LiteLLM Trace with Your Application Trace
LiteLLM is often part of larger LLM applications, such as agentic applications. MLflow Tracing allows you to instrument custom Python code, which can then be combined with LiteLLM traces.
```python
import litellm
import mlflow
from mlflow.entities import SpanType

# Enable LiteLLM tracing
mlflow.litellm.autolog()


class CustomAgent:
    # Use @mlflow.trace to instrument Python functions.
    @mlflow.trace(span_type=SpanType.AGENT)
    def run(self, query: str):
        # do something

        while i < self.max_turns:
            response = litellm.completion(
                model="gpt-4o-mini",
                messages=messages,
            )

            action = self.get_action(response)
            ...

    @mlflow.trace
    def get_action(self, llm_response):
        ...
```
This approach generates a unified trace, combining your custom Python code with LiteLLM calls.
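For illustration, a hypothetical run of the sketch above (assuming the elided pieces such as `messages` and `max_turns` are filled in) would produce one trace whose root span is `CustomAgent.run`, with the nested `litellm-completion` spans underneath:

```python
# Hypothetical usage of the CustomAgent sketch above.
agent = CustomAgent()
agent.run("What is MLflow Tracing?")  # appears as a single trace in the MLflow UI
```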
## Support
* For advanced usage and integrations of tracing, visit the [MLflow Tracing documentation](https://mlflow.org/docs/latest/llms/tracing/index.html).
* For any questions or issues with this integration, please [submit an issue](https://github.com/mlflow/mlflow/issues/new/choose) on our [GitHub](https://github.com/mlflow/mlflow) repository!

New binary file added: the `mlflow_tracing.png` screenshot referenced in the docs above (361 KiB, not shown).


@@ -57,6 +57,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
    "gcs_bucket",
    "opik",
    "argilla",
    "mlflow",
]
logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
_known_custom_logger_compatible_callbacks: List = list(


@@ -0,0 +1,246 @@
import json
import threading
from typing import Optional

from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_logger


class MlflowLogger(CustomLogger):
    def __init__(self):
        from mlflow.tracking import MlflowClient

        self._client = MlflowClient()

        self._stream_id_to_span = {}
        self._lock = threading.Lock()  # lock for _stream_id_to_span

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        self._handle_success(kwargs, response_obj, start_time, end_time)

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        self._handle_success(kwargs, response_obj, start_time, end_time)

    def _handle_success(self, kwargs, response_obj, start_time, end_time):
        """
        Log the success event as an MLflow span.
        Note that this method is called asynchronously in the background thread.
        """
        from mlflow.entities import SpanStatusCode

        try:
            verbose_logger.debug(f"MLflow logging start for success event")

            if kwargs.get("stream"):
                self._handle_stream_event(kwargs, response_obj, start_time, end_time)
            else:
                span = self._start_span_or_trace(kwargs, start_time)
                end_time_ns = int(end_time.timestamp() * 1e9)
                self._end_span_or_trace(
                    span=span,
                    outputs=response_obj,
                    status=SpanStatusCode.OK,
                    end_time_ns=end_time_ns,
                )
        except Exception:
            verbose_logger.debug(f"MLflow Logging Error", stack_info=True)

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        self._handle_failure(kwargs, response_obj, start_time, end_time)

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        self._handle_failure(kwargs, response_obj, start_time, end_time)

    def _handle_failure(self, kwargs, response_obj, start_time, end_time):
        """
        Log the failure event as an MLflow span.
        Note that this method is called *synchronously* unlike the success handler.
        """
        from mlflow.entities import SpanEvent, SpanStatusCode

        try:
            span = self._start_span_or_trace(kwargs, start_time)
            end_time_ns = int(end_time.timestamp() * 1e9)

            # Record exception info as event
            if exception := kwargs.get("exception"):
                span.add_event(SpanEvent.from_exception(exception))

            self._end_span_or_trace(
                span=span,
                outputs=response_obj,
                status=SpanStatusCode.ERROR,
                end_time_ns=end_time_ns,
            )
        except Exception as e:
            verbose_logger.debug(f"MLflow Logging Error", stack_info=True)

    def _handle_stream_event(self, kwargs, response_obj, start_time, end_time):
        """
        Handle the success event for a streaming response. For streaming calls,
        log_success_event handle is triggered for every chunk of the stream.

        We create a single span for the entire stream request as follows:

        1. For the first chunk, start a new span and store it in the map.
        2. For subsequent chunks, add the chunk as an event to the span.
        3. For the final chunk, end the span and remove the span from the map.
        """
        from mlflow.entities import SpanStatusCode

        litellm_call_id = kwargs.get("litellm_call_id")

        if litellm_call_id not in self._stream_id_to_span:
            with self._lock:
                # Check again after acquiring lock
                if litellm_call_id not in self._stream_id_to_span:
                    # Start a new span for the first chunk of the stream
                    span = self._start_span_or_trace(kwargs, start_time)
                    self._stream_id_to_span[litellm_call_id] = span

        # Add chunk as event to the span
        span = self._stream_id_to_span[litellm_call_id]
        self._add_chunk_events(span, response_obj)

        # If this is the final chunk, end the span. The final chunk
        # has complete_streaming_response that gathers the full response.
        if final_response := kwargs.get("complete_streaming_response"):
            end_time_ns = int(end_time.timestamp() * 1e9)
            self._end_span_or_trace(
                span=span,
                outputs=final_response,
                status=SpanStatusCode.OK,
                end_time_ns=end_time_ns,
            )

            # Remove the stream_id from the map
            with self._lock:
                self._stream_id_to_span.pop(litellm_call_id)

    def _add_chunk_events(self, span, response_obj):
        from mlflow.entities import SpanEvent

        try:
            for choice in response_obj.choices:
                span.add_event(
                    SpanEvent(
                        name="streaming_chunk",
                        attributes={"delta": json.dumps(choice.delta.model_dump())},
                    )
                )
        except Exception:
            verbose_logger.debug("Error adding chunk events to span", stack_info=True)

    def _construct_input(self, kwargs):
        """Construct span inputs with optional parameters"""
        inputs = {"messages": kwargs.get("messages")}
        for key in ["functions", "tools", "stream", "tool_choice", "user"]:
            if value := kwargs.get("optional_params", {}).pop(key, None):
                inputs[key] = value
        return inputs

    def _extract_attributes(self, kwargs):
        """
        Extract span attributes from kwargs.

        With the latest version of litellm, the standard_logging_object contains
        canonical information for logging. If it is not present, we extract
        subset of attributes from other kwargs.
        """
        attributes = {
            "litellm_call_id": kwargs.get("litellm_call_id"),
            "call_type": kwargs.get("call_type"),
            "model": kwargs.get("model"),
        }
        standard_obj = kwargs.get("standard_logging_object")
        if standard_obj:
            attributes.update(
                {
                    "api_base": standard_obj.get("api_base"),
                    "cache_hit": standard_obj.get("cache_hit"),
                    "usage": {
                        "completion_tokens": standard_obj.get("completion_tokens"),
                        "prompt_tokens": standard_obj.get("prompt_tokens"),
                        "total_tokens": standard_obj.get("total_tokens"),
                    },
                    "raw_llm_response": standard_obj.get("response"),
                    "response_cost": standard_obj.get("response_cost"),
                    "saved_cache_cost": standard_obj.get("saved_cache_cost"),
                }
            )
        else:
            litellm_params = kwargs.get("litellm_params", {})
            attributes.update(
                {
                    "model": kwargs.get("model"),
                    "cache_hit": kwargs.get("cache_hit"),
                    "custom_llm_provider": kwargs.get("custom_llm_provider"),
                    "api_base": litellm_params.get("api_base"),
                    "response_cost": kwargs.get("response_cost"),
                }
            )
        return attributes

    def _get_span_type(self, call_type: Optional[str]) -> str:
        from mlflow.entities import SpanType

        if call_type in ["completion", "acompletion"]:
            return SpanType.LLM
        elif call_type == "embeddings":
            return SpanType.EMBEDDING
        else:
            return SpanType.LLM

    def _start_span_or_trace(self, kwargs, start_time):
        """
        Start an MLflow span or a trace.

        If there is an active span, we start a new span as a child of
        that span. Otherwise, we start a new trace.
        """
        import mlflow

        call_type = kwargs.get("call_type", "completion")
        span_name = f"litellm-{call_type}"
        span_type = self._get_span_type(call_type)
        start_time_ns = int(start_time.timestamp() * 1e9)

        inputs = self._construct_input(kwargs)
        attributes = self._extract_attributes(kwargs)

        if active_span := mlflow.get_current_active_span():
            return self._client.start_span(
                name=span_name,
                request_id=active_span.request_id,
                parent_id=active_span.span_id,
                span_type=span_type,
                inputs=inputs,
                attributes=attributes,
                start_time_ns=start_time_ns,
            )
        else:
            return self._client.start_trace(
                name=span_name,
                span_type=span_type,
                inputs=inputs,
                attributes=attributes,
                start_time_ns=start_time_ns,
            )

    def _end_span_or_trace(self, span, outputs, end_time_ns, status):
        """End an MLflow span or a trace."""
        if span.parent_id is None:
            self._client.end_trace(
                request_id=span.request_id,
                outputs=outputs,
                status=status,
                end_time_ns=end_time_ns,
            )
        else:
            self._client.end_span(
                request_id=span.request_id,
                span_id=span.span_id,
                outputs=outputs,
                status=status,
                end_time_ns=end_time_ns,
            )


@@ -28,6 +28,7 @@ from litellm.caching.caching_handler import LLMCachingHandler
from litellm.cost_calculator import _select_model_name_for_cost_calc
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.mlflow import MlflowLogger
from litellm.litellm_core_utils.redact_messages import (
    redact_message_input_output_from_custom_logger,
    redact_message_input_output_from_logging,
@@ -554,6 +555,7 @@ class Logging:
                        message=f"Model Call Details pre-call: {details_to_log}",
                        level="info",
                    )

                elif isinstance(callback, CustomLogger):  # custom logger class
                    callback.log_pre_api_call(
                        model=self.model,
@@ -1249,6 +1251,7 @@ class Logging:
                        end_time=end_time,
                        print_verbose=print_verbose,
                    )

                if (
                    callback == "openmeter"
                    and self.model_call_details.get("litellm_params", {}).get(
@@ -2338,6 +2341,14 @@ def _init_custom_logger_compatible_class(  # noqa: PLR0915
        _in_memory_loggers.append(_otel_logger)
        return _otel_logger  # type: ignore
    elif logging_integration == "mlflow":
        for callback in _in_memory_loggers:
            if isinstance(callback, MlflowLogger):
                return callback  # type: ignore

        _mlflow_logger = MlflowLogger()
        _in_memory_loggers.append(_mlflow_logger)
        return _mlflow_logger  # type: ignore


def get_custom_logger_compatible_class(
    logging_integration: litellm._custom_logger_compatible_callbacks_literal,
@@ -2439,6 +2450,12 @@ def get_custom_logger_compatible_class(
                and callback.callback_name == "langtrace"
            ):
                return callback
    elif logging_integration == "mlflow":
        for callback in _in_memory_loggers:
            if isinstance(callback, MlflowLogger):
                return callback

    return None


@@ -0,0 +1,29 @@
import pytest

import litellm


def test_mlflow_logging():
    litellm.success_callback = ["mlflow"]
    litellm.failure_callback = ["mlflow"]

    litellm.completion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "what llm are u"}],
        max_tokens=10,
        temperature=0.2,
        user="test-user",
    )


@pytest.mark.asyncio()
async def test_async_mlflow_logging():
    litellm.success_callback = ["mlflow"]
    litellm.failure_callback = ["mlflow"]

    await litellm.acompletion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "hi test from local arize"}],
        mock_response="hello",
        temperature=0.1,
        user="OTEL_USER",
    )