diff --git a/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb b/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb index 5b2f5794d..83915104c 100644 --- a/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb +++ b/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb @@ -576,6 +576,371 @@ " print(\"\\n\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Evaluator-Optimizer Workflow\n", + "\n", + "In the evaluator-optimizer workflow, one LLM call generates a response while another provider evaluation and feedback in a loop. " + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "class GeneratorOutputSchema(BaseModel):\n", + " thoughts: str\n", + " response: str\n", + "\n", + "generator_agent_config = AgentConfig({\n", + " **base_agent_config,\n", + " \"instructions\": \"\"\"Your goal is to complete the task based on . If there are feedback \n", + " from your previous generations, you should reflect on them to improve your solution\n", + "\n", + " Output your answer concisely in the following JSON format:\n", + " {{\n", + " \"thoughts\": \"\",\n", + " \"response\": \"\"\n", + " }}\n", + " \"\"\",\n", + " \"response_format\": {\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": GeneratorOutputSchema.model_json_schema()\n", + " }\n", + "})\n", + "\n", + "class EvaluatorOutputSchema(BaseModel):\n", + " evaluation: str\n", + " feedback: str\n", + "\n", + "evaluator_agent_config = AgentConfig({\n", + " **base_agent_config,\n", + " \"instructions\": \"\"\"Evaluate this following code implementation for:\n", + " 1. code correctness\n", + " 2. time complexity\n", + " 3. style and best practices\n", + "\n", + " You should be evaluating only and not attemping to solve the task.\n", + " Only output \"PASS\" if all criteria are met and you have no further suggestions for improvements.\n", + " Output your evaluation concisely in the following JSON format.\n", + " {{\n", + " \"evaluation\": \"\",\n", + " \"feedback\": \"What needs improvement and why.\"\n", + " }}\n", + "\n", + " The evaluation enum output should be one of the following:\n", + " - PASS\n", + " - NEEDS_IMPROVEMENT\n", + " - FAIL\n", + " \"\"\",\n", + " \"response_format\": {\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": EvaluatorOutputSchema.model_json_schema()\n", + " }\n", + "})\n", + "\n", + "generator_agent = Agent(client, generator_agent_config)\n", + "evaluator_agent = Agent(client, evaluator_agent_config)\n", + "generator_session_id = generator_agent.create_session(session_name=f\"generator_agent_{uuid.uuid4()}\")\n", + "evaluator_session_id = evaluator_agent.create_session(session_name=f\"evaluator_agent_{uuid.uuid4()}\")\n", + "\n", + "def generate_and_evaluate_loop(user_input):\n", + " # Step 1: Generate a response\n", + " generator_response = generator_agent.create_turn(\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": user_input}\n", + " ],\n", + " session_id=generator_session_id,\n", + " stream=False,\n", + " )\n", + " generator_result = json.loads(generator_response.output_message.content)\n", + "\n", + " # Step 2: While evaluator does not return PASS, re-generate and re-evaluate\n", + " while True:\n", + " # Step 2.1: Evaluate the response\n", + " evaluator_response = evaluator_agent.create_turn(\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": generator_result[\"response\"]}\n", + " ],\n", + " session_id=evaluator_session_id,\n", + " stream=False,\n", + " )\n", + "\n", + " evaluator_result = json.loads(evaluator_response.output_message.content)\n", + "\n", + " # Step 2.2: If evaluator returns PASS, return the response\n", + " if evaluator_result[\"evaluation\"] == \"PASS\":\n", + " return generator_result\n", + "\n", + " # Step 2.3: If evaluator returns NEEDS_IMPROVEMENT | FAIL, attach the feedback and re-generate\n", + " generator_response = generator_agent.create_turn(\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": f\"{evaluator_result['feedback']}\"}\n", + " ],\n", + " session_id=generator_session_id,\n", + " stream=False,\n", + " )\n", + " generator_result = json.loads(generator_response.output_message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "```python\n", + "class MinStack:\n", + " def __init__(self):\n", + " self.main_stack = []\n", + " self.min_stack = []\n", + "\n", + " def push(self, x: int) -> None:\n", + " self.main_stack.append(x)\n", + " if not self.min_stack or x <= self.min_stack[-1]:\n", + " self.min_stack.append(x)\n", + "\n", + " def pop(self) -> None:\n", + " if self.main_stack:\n", + " if self.main_stack[-1] == self.min_stack[-1]:\n", + " self.min_stack.pop()\n", + " self.main_stack.pop()\n", + "\n", + " def getMin(self) -> int:\n", + " return self.min_stack[-1]\n", + "```\n" + ] + } + ], + "source": [ + "coding_task = \"\"\"\n", + "Implement a Stack with:\n", + "1. push(x)\n", + "2. pop()\n", + "3. getMin()\n", + "All operations should be O(1).\n", + "\"\"\"\n", + "\n", + "print(generate_and_evaluate_loop(coding_task)[\"response\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.1. Monitor into the loop's internal" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "'session_id': 'a2a3b149-0bf3-40a2-86d4-facf3f162014',\n",
+       "'session_name': 'generator_agent_e334542d-5c66-4136-94ce-f751c64eb9a5',\n",
+       "'started_at': datetime.datetime(2025, 3, 3, 11, 35, 49, 860141),\n",
+       "'turns': [\n",
+       "│   │   {\n",
+       "│   │   │   'input_messages': [\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'content': '\\nImplement a Stack with:\\n1. push(x)\\n2. pop()\\n3. getMin()\\nAll operations should be O(1).\\n',\n",
+       "│   │   │   │   │   'role': 'user',\n",
+       "│   │   │   │   │   'context': None\n",
+       "│   │   │   │   }\n",
+       "│   │   │   ],\n",
+       "│   │   │   'output_message': {\n",
+       "│   │   │   │   'content': '{\\n\"thoughts\": \"To implement a Stack with push, pop, and getMin operations all in O(1) time complexity, we need to use two stacks. One stack will be used to store the actual elements (main stack), and the other stack will be used to keep track of the minimum elements seen so far (min stack). When an element is pushed onto the main stack, we check if the min stack is empty or if the top element of the min stack is greater than or equal to the element being pushed. If either condition is true, we push the element onto the min stack as well. When popping an element from the main stack, we check if the top element of the main stack is equal to the top element of the min stack. If they are equal, we pop the element from the min stack as well. The getMin operation simply returns the top element of the min stack.\",\\n\"response\": \"```python\\\\nclass MinStack:\\\\n    def __init__(self):\\\\n        self.main_stack = []\\\\n        self.min_stack = []\\\\n\\\\n    def push(self, x: int) -> None:\\\\n        self.main_stack.append(x)\\\\n        if not self.min_stack or x <= self.min_stack[-1]:\\\\n            self.min_stack.append(x)\\\\n\\\\n    def pop(self) -> None:\\\\n        if self.main_stack:\\\\n            if self.main_stack[-1] == self.min_stack[-1]:\\\\n                self.min_stack.pop()\\\\n            self.main_stack.pop()\\\\n\\\\n    def getMin(self) -> int:\\\\n        return self.min_stack[-1]\\\\n```\"\\n}',\n",
+       "│   │   │   │   'role': 'assistant',\n",
+       "│   │   │   │   'stop_reason': 'end_of_turn',\n",
+       "│   │   │   │   'tool_calls': []\n",
+       "│   │   │   },\n",
+       "│   │   │   'session_id': 'a2a3b149-0bf3-40a2-86d4-facf3f162014',\n",
+       "│   │   │   'started_at': datetime.datetime(2025, 3, 3, 11, 35, 51, 801415, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=57600))),\n",
+       "│   │   │   'steps': [\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'model_response': {\n",
+       "│   │   │   │   │   │   'content': '{\\n\"thoughts\": \"To implement a Stack with push, pop, and getMin operations all in O(1) time complexity, we need to use two stacks. One stack will be used to store the actual elements (main stack), and the other stack will be used to keep track of the minimum elements seen so far (min stack). When an element is pushed onto the main stack, we check if the min stack is empty or if the top element of the min stack is greater than or equal to the element being pushed. If either condition is true, we push the element onto the min stack as well. When popping an element from the main stack, we check if the top element of the main stack is equal to the top element of the min stack. If they are equal, we pop the element from the min stack as well. The getMin operation simply returns the top element of the min stack.\",\\n\"response\": \"```python\\\\nclass MinStack:\\\\n    def __init__(self):\\\\n        self.main_stack = []\\\\n        self.min_stack = []\\\\n\\\\n    def push(self, x: int) -> None:\\\\n        self.main_stack.append(x)\\\\n        if not self.min_stack or x <= self.min_stack[-1]:\\\\n            self.min_stack.append(x)\\\\n\\\\n    def pop(self) -> None:\\\\n        if self.main_stack:\\\\n            if self.main_stack[-1] == self.min_stack[-1]:\\\\n                self.min_stack.pop()\\\\n            self.main_stack.pop()\\\\n\\\\n    def getMin(self) -> int:\\\\n        return self.min_stack[-1]\\\\n```\"\\n}',\n",
+       "│   │   │   │   │   │   'role': 'assistant',\n",
+       "│   │   │   │   │   │   'stop_reason': 'end_of_turn',\n",
+       "│   │   │   │   │   │   'tool_calls': []\n",
+       "│   │   │   │   │   },\n",
+       "│   │   │   │   │   'step_id': '4c4e54a6-c3e3-4d30-8da7-10003c59bfc7',\n",
+       "│   │   │   │   │   'step_type': 'inference',\n",
+       "│   │   │   │   │   'turn_id': '73ece739-af65-4c0b-97c9-d2fbb0b84234',\n",
+       "│   │   │   │   │   'completed_at': datetime.datetime(2025, 3, 3, 11, 35, 55, 346289, tzinfo=TzInfo(-08:00)),\n",
+       "│   │   │   │   │   'started_at': datetime.datetime(2025, 3, 3, 11, 35, 51, 812800, tzinfo=TzInfo(-08:00))\n",
+       "│   │   │   │   }\n",
+       "│   │   │   ],\n",
+       "│   │   │   'turn_id': '73ece739-af65-4c0b-97c9-d2fbb0b84234',\n",
+       "│   │   │   'completed_at': datetime.datetime(2025, 3, 3, 11, 35, 55, 364553, tzinfo=TzInfo(-08:00)),\n",
+       "│   │   │   'output_attachments': []\n",
+       "│   │   }\n",
+       "]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[32m'session_id'\u001b[0m: \u001b[32m'a2a3b149-0bf3-40a2-86d4-facf3f162014'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[32m'session_name'\u001b[0m: \u001b[32m'generator_agent_e334542d-5c66-4136-94ce-f751c64eb9a5'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[32m'started_at'\u001b[0m: \u001b[1;35mdatetime.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m11\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m49\u001b[0m, \u001b[1;36m860141\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[32m'turns'\u001b[0m: \u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'input_messages'\u001b[0m: \u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'content'\u001b[0m: \u001b[32m'\\nImplement a Stack with:\\n1. push\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n2. pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n3. getMin\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nAll operations should be O\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'role'\u001b[0m: \u001b[32m'user'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'context'\u001b[0m: \u001b[3;35mNone\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'output_message'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[32m'content'\u001b[0m: \u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n\"thoughts\": \"To implement a Stack with push, pop, and getMin operations all in O\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m time complexity, we need to use two stacks. One stack will be used to store the actual elements \u001b[0m\u001b[32m(\u001b[0m\u001b[32mmain stack\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and the other stack will be used to keep track of the minimum elements seen so far \u001b[0m\u001b[32m(\u001b[0m\u001b[32mmin stack\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. When an element is pushed onto the main stack, we check if the min stack is empty or if the top element of the min stack is greater than or equal to the element being pushed. If either condition is true, we push the element onto the min stack as well. When popping an element from the main stack, we check if the top element of the main stack is equal to the top element of the min stack. If they are equal, we pop the element from the min stack as well. The getMin operation simply returns the top element of the min stack.\",\\n\"response\": \"```python\\\\nclass MinStack:\\\\n def __init__\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:\\\\n self.main_stack = \u001b[0m\u001b[32m[\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\\\n self.min_stack = \u001b[0m\u001b[32m[\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\\\n\\\\n def push\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself, x: int\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -> None:\\\\n self.main_stack.append\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\\\n if not self.min_stack or x \u001b[0m\u001b[32m<\u001b[0m\u001b[32m= self.min_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m:\\\\n self.min_stack.append\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\\\n\\\\n def pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -> None:\\\\n if self.main_stack:\\\\n if self.main_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m == self.min_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m:\\\\n self.min_stack.pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\\\n self.main_stack.pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\\\n\\\\n def getMin\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -> int:\\\\n return self.min_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\\\n```\"\\n\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[32m'role'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'assistant'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[32m'stop_reason'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'end_of_turn'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[32m'tool_calls'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m[\u001b[0m\u001b[1;39m]\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'session_id'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'a2a3b149-0bf3-40a2-86d4-facf3f162014'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'started_at'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;35mdatetime.datetime\u001b[0m\u001b[1;39m(\u001b[0m\u001b[1;36m2025\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m3\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m3\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m11\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m35\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m51\u001b[0m\u001b[39m, \u001b[0m\u001b[1;36m801415\u001b[0m\u001b[39m, \u001b[0m\u001b[33mtzinfo\u001b[0m\u001b[39m=\u001b[0m\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.timezone\u001b[0m\u001b[1;39m(\u001b[0m\u001b[1;35mdatetime.timedelta\u001b[0m\u001b[1;39m(\u001b[0m\u001b[33mdays\u001b[0m\u001b[39m=\u001b[0m\u001b[1;36m-1\u001b[0m\u001b[39m, \u001b[0m\u001b[33mseconds\u001b[0m\u001b[39m=\u001b[0m\u001b[1;36m57600\u001b[0m\u001b[1;39m)\u001b[0m\u001b[1;39m)\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'steps'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1;39m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'model_response'\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'content'\u001b[0m\u001b[39m: \u001b[0m\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\\n\"thoughts\": \"To implement a Stack with push, pop, and getMin operations all in O\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m time complexity, we need to use two stacks. One stack will be used to store the actual elements \u001b[0m\u001b[32m(\u001b[0m\u001b[32mmain stack\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and the other stack will be used to keep track of the minimum elements seen so far \u001b[0m\u001b[32m(\u001b[0m\u001b[32mmin stack\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. When an element is pushed onto the main stack, we check if the min stack is empty or if the top element of the min stack is greater than or equal to the element being pushed. If either condition is true, we push the element onto the min stack as well. When popping an element from the main stack, we check if the top element of the main stack is equal to the top element of the min stack. If they are equal, we pop the element from the min stack as well. The getMin operation simply returns the top element of the min stack.\",\\n\"response\": \"```python\\\\nclass MinStack:\\\\n def __init__\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:\\\\n self.main_stack = \u001b[0m\u001b[32m[\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\\\n self.min_stack = \u001b[0m\u001b[32m[\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\\\n\\\\n def push\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself, x: int\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -> None:\\\\n self.main_stack.append\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\\\n if not self.min_stack or x <= self.min_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m:\\\\n self.min_stack.append\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\\\n\\\\n def pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -> None:\\\\n if self.main_stack:\\\\n if self.main_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m == self.min_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m:\\\\n self.min_stack.pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\\\n self.main_stack.pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\\\n\\\\n def getMin\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -\u001b[0m\u001b[32m>\u001b[0m\u001b[32m int:\\\\n return self.min_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\\\n```\"\\n\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'role'\u001b[0m: \u001b[32m'assistant'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'stop_reason'\u001b[0m: \u001b[32m'end_of_turn'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'tool_calls'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'step_id'\u001b[0m: \u001b[32m'4c4e54a6-c3e3-4d30-8da7-10003c59bfc7'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'step_type'\u001b[0m: \u001b[32m'inference'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'turn_id'\u001b[0m: \u001b[32m'73ece739-af65-4c0b-97c9-d2fbb0b84234'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'completed_at'\u001b[0m: \u001b[1;35mdatetime.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m11\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m55\u001b[0m, \u001b[1;36m346289\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'started_at'\u001b[0m: \u001b[1;35mdatetime.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m11\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m51\u001b[0m, \u001b[1;36m812800\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'turn_id'\u001b[0m: \u001b[32m'73ece739-af65-4c0b-97c9-d2fbb0b84234'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'completed_at'\u001b[0m: \u001b[1;35mdatetime.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m11\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m55\u001b[0m, \u001b[1;36m364553\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'output_attachments'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
{\n",
+       "'session_id': '2beb59a8-c81d-4655-ab8e-cd0b6c6d83d0',\n",
+       "'session_name': 'evaluator_agent_0deb09c5-1204-49c6-8e91-51f73d883195',\n",
+       "'started_at': datetime.datetime(2025, 3, 3, 11, 35, 49, 863796),\n",
+       "'turns': [\n",
+       "│   │   {\n",
+       "│   │   │   'input_messages': [\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'content': '```python\\nclass MinStack:\\n    def __init__(self):\\n        self.main_stack = []\\n        self.min_stack = []\\n\\n    def push(self, x: int) -> None:\\n        self.main_stack.append(x)\\n        if not self.min_stack or x <= self.min_stack[-1]:\\n            self.min_stack.append(x)\\n\\n    def pop(self) -> None:\\n        if self.main_stack:\\n            if self.main_stack[-1] == self.min_stack[-1]:\\n                self.min_stack.pop()\\n            self.main_stack.pop()\\n\\n    def getMin(self) -> int:\\n        return self.min_stack[-1]\\n```',\n",
+       "│   │   │   │   │   'role': 'user',\n",
+       "│   │   │   │   │   'context': None\n",
+       "│   │   │   │   }\n",
+       "│   │   │   ],\n",
+       "│   │   │   'output_message': {\n",
+       "│   │   │   │   'content': '{\"evaluation\": \"PASS\", \"feedback\": \"The provided code is correct, efficient, and well-structured. It correctly implements a MinStack with O(1) time complexity for push, pop, and getMin operations. The use of two stacks to keep track of the minimum element is a good approach. The code also follows best practices, with clear and concise method names, and proper handling of edge cases.\"}',\n",
+       "│   │   │   │   'role': 'assistant',\n",
+       "│   │   │   │   'stop_reason': 'end_of_turn',\n",
+       "│   │   │   │   'tool_calls': []\n",
+       "│   │   │   },\n",
+       "│   │   │   'session_id': '2beb59a8-c81d-4655-ab8e-cd0b6c6d83d0',\n",
+       "│   │   │   'started_at': datetime.datetime(2025, 3, 3, 11, 35, 55, 387165, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=57600))),\n",
+       "│   │   │   'steps': [\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'model_response': {\n",
+       "│   │   │   │   │   │   'content': '{\"evaluation\": \"PASS\", \"feedback\": \"The provided code is correct, efficient, and well-structured. It correctly implements a MinStack with O(1) time complexity for push, pop, and getMin operations. The use of two stacks to keep track of the minimum element is a good approach. The code also follows best practices, with clear and concise method names, and proper handling of edge cases.\"}',\n",
+       "│   │   │   │   │   │   'role': 'assistant',\n",
+       "│   │   │   │   │   │   'stop_reason': 'end_of_turn',\n",
+       "│   │   │   │   │   │   'tool_calls': []\n",
+       "│   │   │   │   │   },\n",
+       "│   │   │   │   │   'step_id': '01fccf0e-bc87-450e-9673-7a222d8b2044',\n",
+       "│   │   │   │   │   'step_type': 'inference',\n",
+       "│   │   │   │   │   'turn_id': 'cb4310bf-e31f-476f-9ca2-18f5dcfd16c9',\n",
+       "│   │   │   │   │   'completed_at': datetime.datetime(2025, 3, 3, 11, 35, 57, 294525, tzinfo=TzInfo(-08:00)),\n",
+       "│   │   │   │   │   'started_at': datetime.datetime(2025, 3, 3, 11, 35, 55, 398588, tzinfo=TzInfo(-08:00))\n",
+       "│   │   │   │   }\n",
+       "│   │   │   ],\n",
+       "│   │   │   'turn_id': 'cb4310bf-e31f-476f-9ca2-18f5dcfd16c9',\n",
+       "│   │   │   'completed_at': datetime.datetime(2025, 3, 3, 11, 35, 57, 306549, tzinfo=TzInfo(-08:00)),\n",
+       "│   │   │   'output_attachments': []\n",
+       "│   │   }\n",
+       "]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[32m'session_id'\u001b[0m: \u001b[32m'2beb59a8-c81d-4655-ab8e-cd0b6c6d83d0'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[32m'session_name'\u001b[0m: \u001b[32m'evaluator_agent_0deb09c5-1204-49c6-8e91-51f73d883195'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[32m'started_at'\u001b[0m: \u001b[1;35mdatetime.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m11\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m49\u001b[0m, \u001b[1;36m863796\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[32m'turns'\u001b[0m: \u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'input_messages'\u001b[0m: \u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'content'\u001b[0m: \u001b[32m'```python\\nclass MinStack:\\n def __init__\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:\\n self.main_stack = \u001b[0m\u001b[32m[\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n self.min_stack = \u001b[0m\u001b[32m[\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n def push\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself, x: int\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -> None:\\n self.main_stack.append\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n if not self.min_stack or x \u001b[0m\u001b[32m<\u001b[0m\u001b[32m= self.min_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m:\\n self.min_stack.append\u001b[0m\u001b[32m(\u001b[0m\u001b[32mx\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n def pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -> None:\\n if self.main_stack:\\n if self.main_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m == self.min_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m:\\n self.min_stack.pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n self.main_stack.pop\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n def getMin\u001b[0m\u001b[32m(\u001b[0m\u001b[32mself\u001b[0m\u001b[32m)\u001b[0m\u001b[32m -\u001b[0m\u001b[32m>\u001b[0m\u001b[32m int:\\n return self.min_stack\u001b[0m\u001b[32m[\u001b[0m\u001b[32m-1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n```'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'role'\u001b[0m: \u001b[32m'user'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'context'\u001b[0m: \u001b[3;35mNone\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'output_message'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[32m'content'\u001b[0m: \u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"evaluation\": \"PASS\", \"feedback\": \"The provided code is correct, efficient, and well-structured. It correctly implements a MinStack with O\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m time complexity for push, pop, and getMin operations. The use of two stacks to keep track of the minimum element is a good approach. The code also follows best practices, with clear and concise method names, and proper handling of edge cases.\"\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[32m'role'\u001b[0m: \u001b[32m'assistant'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[32m'stop_reason'\u001b[0m: \u001b[32m'end_of_turn'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[32m'tool_calls'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'session_id'\u001b[0m: \u001b[32m'2beb59a8-c81d-4655-ab8e-cd0b6c6d83d0'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'started_at'\u001b[0m: \u001b[1;35mdatetime.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m11\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m55\u001b[0m, \u001b[1;36m387165\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.timezone\u001b[0m\u001b[1m(\u001b[0m\u001b[1;35mdatetime.timedelta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdays\u001b[0m=\u001b[1;36m-1\u001b[0m, \u001b[33mseconds\u001b[0m=\u001b[1;36m57600\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'steps'\u001b[0m: \u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'model_response'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'content'\u001b[0m: \u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"evaluation\": \"PASS\", \"feedback\": \"The provided code is correct, efficient, and well-structured. It correctly implements a MinStack with O\u001b[0m\u001b[32m(\u001b[0m\u001b[32m1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m time complexity for push, pop, and getMin operations. The use of two stacks to keep track of the minimum element is a good approach. The code also follows best practices, with clear and concise method names, and proper handling of edge cases.\"\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'role'\u001b[0m: \u001b[32m'assistant'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'stop_reason'\u001b[0m: \u001b[32m'end_of_turn'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'tool_calls'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'step_id'\u001b[0m: \u001b[32m'01fccf0e-bc87-450e-9673-7a222d8b2044'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'step_type'\u001b[0m: \u001b[32m'inference'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'turn_id'\u001b[0m: \u001b[32m'cb4310bf-e31f-476f-9ca2-18f5dcfd16c9'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'completed_at'\u001b[0m: \u001b[1;35mdatetime.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m11\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m57\u001b[0m, \u001b[1;36m294525\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'started_at'\u001b[0m: \u001b[1;35mdatetime.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m11\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m55\u001b[0m, \u001b[1;36m398588\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'turn_id'\u001b[0m: \u001b[32m'cb4310bf-e31f-476f-9ca2-18f5dcfd16c9'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'completed_at'\u001b[0m: \u001b[1;35mdatetime.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m11\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m57\u001b[0m, \u001b[1;36m306549\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'output_attachments'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "generator_agent_session = client.agents.session.retrieve(session_id=generator_session_id, agent_id=generator_agent.agent_id)\n", + "pprint(generator_agent_session.to_dict())\n", + "\n", + "evaluator_agent_session = client.agents.session.retrieve(session_id=evaluator_session_id, agent_id=evaluator_agent.agent_id)\n", + "pprint(evaluator_agent_session.to_dict())" + ] + }, { "cell_type": "code", "execution_count": null,