# litellm-mirror/tests/code_coverage_tests/check_data_replace_usage.py

import os
import re
import ast
from pathlib import Path


class DataReplaceVisitor(ast.NodeVisitor):
    """Collect ``<expr>.replace(<str literal>, ...)`` calls that mention ``data:``.

    A call is recorded when it is an attribute call named ``replace``, has at
    least two positional arguments, and its first argument is a string
    constant containing the substring ``data:``. Each finding is appended to
    ``self.issues`` as a dict with the keys ``file``, ``line``, ``col`` and
    ``text``.
    """

    def __init__(self):
        # Findings accumulated over every tree this visitor walks.
        self.issues = []
        # Label stored in the "file" key of each finding; see set_file().
        self.current_file = None

    def set_file(self, filename):
        """Set the file name attached to findings recorded from now on."""
        self.current_file = filename

    @staticmethod
    def _is_data_replace(call):
        """Return True when ``call`` looks like ``x.replace("...data:...", ...)``."""
        callee = call.func
        if not (isinstance(callee, ast.Attribute) and callee.attr == "replace"):
            return False
        if len(call.args) < 2:
            return False
        needle = call.args[0]
        return (
            isinstance(needle, ast.Constant)
            and isinstance(needle.value, str)
            and "data:" in needle.value
        )

    def visit_Call(self, node):
        """Record ``node`` if it matches, then keep walking into its children."""
        if self._is_data_replace(node):
            finding = {
                "file": self.current_file,
                "line": node.lineno,
                "col": node.col_offset,
                "text": f'Found .replace("data:", ...) at line {node.lineno}',
            }
            self.issues.append(finding)

        # Nested calls (arguments, chained receivers) are visited after the
        # enclosing call has been examined.
        self.generic_visit(node)


def check_file_with_ast(file_path):
    """Check a Python file for .replace("data:", ...) using AST parsing.

    Args:
        file_path: Path of the Python source file to inspect.

    Returns:
        A list of issue dicts (keys ``file``, ``line``, ``col``, ``text``).
        When the file cannot be parsed, a single placeholder issue with
        ``line`` and ``col`` set to 0 is returned instead.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read raw bytes and let ast.parse() do the decoding: it honours a BOM or
    # a PEP 263 encoding declaration, and reports undecodable input as a
    # SyntaxError instead of a UnicodeDecodeError escaping from f.read().
    with open(file_path, "rb") as f:
        source = f.read()

    try:
        tree = ast.parse(source, filename=file_path)
    except (SyntaxError, ValueError):
        # ValueError covers sources containing null bytes on interpreters
        # that do not report them as SyntaxError.
        return [
            {
                "file": file_path,
                "line": 0,
                "col": 0,
                "text": "Syntax error in file, could not parse",
            }
        ]

    visitor = DataReplaceVisitor()
    visitor.set_file(file_path)
    visitor.visit(tree)
    return visitor.issues


def check_file_with_regex(file_path):
    """Scan a text file line by line for a literal ``.replace("data:"`` call.

    The pattern accepts single or double quotes and optional whitespace after
    the opening parenthesis. Bytes that are not valid UTF-8 are ignored while
    reading.

    Args:
        file_path: Path of the file to scan.

    Returns:
        A list of issue dicts (keys ``file``, ``line``, ``col``, ``text``),
        one per match, with 1-based line numbers and 0-based columns.
    """
    pattern = re.compile(r'\.replace\(\s*[\'"]data:[\'"]')
    with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
        return [
            {
                "file": file_path,
                "line": line_no,
                "col": hit.start(),
                "text": f'Found .replace("data:", ...) at line {line_no}',
            }
            for line_no, content in enumerate(handle, start=1)
            for hit in pattern.finditer(content)
        ]


def scan_directory(base_dir):
    """Scan a directory recursively for files containing .replace("data:", ...).

    Python files are checked through the AST; ``.js``, ``.ts``, ``.jsx``,
    ``.tsx``, ``.md`` and ``.ipynb`` files are checked with a regex. Files
    with any other suffix are ignored.

    Args:
        base_dir: Root directory of the scan.

    Returns:
        A flat list with the issue dicts reported for every checked file.
    """
    skipped_dirs = {".git", "__pycache__", ".venv", "node_modules"}
    regex_suffixes = (".js", ".ts", ".jsx", ".tsx", ".md", ".ipynb")
    all_issues = []

    for root, dirs, files in os.walk(base_dir):
        # Prune unwanted directories in place so os.walk never descends into
        # them. Names are compared exactly: a substring test on the full path
        # would also drop unrelated entries such as ".github" or
        # "notes.gitlab.md".
        dirs[:] = [d for d in dirs if d not in skipped_dirs]

        for file in files:
            print("checking file: ", file)
            file_path = os.path.join(root, file)

            # For Python files, use AST for more accurate parsing
            if file.endswith(".py"):
                issues = check_file_with_ast(file_path)
            # For other files that might contain code, use regex
            elif file.endswith(regex_suffixes):
                issues = check_file_with_regex(file_path)
            else:
                continue

            all_issues.extend(issues)

    return all_issues


def main(base_dir="./litellm"):
    """Scan ``base_dir`` and fail when any .replace("data:", ...) usage is found.

    Args:
        base_dir: Directory to scan. The default, ``./litellm``, assumes the
            script is started from the project root; pass another path
            (for example ``../../litellm``) when running it from elsewhere.

    Raises:
        Exception: If the scan reports at least one issue, so the check fails.
    """
    print(f"Scanning for .replace('data:', ...) usage in {base_dir}")
    issues = scan_directory(base_dir)

    if not issues:
        print("✅ No instances of .replace('data:', ...) found.")
        return

    print(f"\n⚠️ Found {len(issues)} instances of .replace('data:', ...):")
    for issue in issues:
        print(f"{issue['file']}:{issue['line']} - {issue['text']}")

    # Fail the test if issues are found
    raise Exception(
        f"Found {len(issues)} instances of .replace('data:', ...) which may be unsafe. Use litellm.CustomStreamWrapper._strip_sse_data_from_chunk instead."
    )


# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()