From cd79fe99be0e14346d3042bf80938553856a71f7 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 18:20:51 +0200 Subject: [PATCH 01/55] Refactor `ReportGenerator` and integrate custom logs handler Introduced `CustomLogsHandler` to manage log handling and WebSocket integration in `ReportGenerator`. Simplified and restructured report generation logic for improved maintainability. Removed obsolete methods and enhanced overall readability with cleaner code structure. --- .../gptresearch/deepresearch.py | 114 +++++------------- 1 file changed, 27 insertions(+), 87 deletions(-) diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py index b128433..e693f45 100644 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ b/src/phoenix_technologies/gptresearch/deepresearch.py @@ -1,107 +1,47 @@ from gpt_researcher import GPTResearcher +from typing import Dict, Any, AsyncGenerator +class CustomLogsHandler: + """A custom Logs handler class to handle JSON data.""" + def __init__(self): + self.logs = [] # Initialize logs to store data + + async def send_json(self, data: Dict[str, Any]) -> AsyncGenerator[str, Any]: + """Send JSON data and log it.""" + yield f"My custom Log: {data}" + class ReportGenerator: - """ - A class to handle the generation of research-based reports. - - This class integrates with GPTResearcher to conduct research, retrieve results, - and format them into consumable chunks for asynchronous streaming. - """ - def __init__(self, query: str, report_type: str): """ - Initializes the ReportGenerator instance with a query and report type. - - :param query: The main topic or question for research. - :param report_type: The type of report to generate. + Initializes the ReportGenerator with a query and report type. 
""" self.query = query self.report_type = report_type - self.researcher = GPTResearcher(query, report_type) - self._chunks = None # Placeholder for report chunks - self._index = 0 # Iterator index for streaming + # Initialize researcher with a custom WebSocket + self.custom_logs_handler = CustomLogsHandler() - def __aiter__(self): + self.researcher = GPTResearcher(query, report_type, websocket=self.custom_logs_handler) + + async def generate_report(self): """ - Makes the ReportGenerator instance asynchronously iterable. - - :return: Self instance for iteration. + Conducts research and generates the report along with additional information. """ - return self + # Conduct research + research_result = await self.researcher.conduct_research() + report = await self.researcher.write_report() - async def __anext__(self): - """ - Defines the logic for asynchronous iteration over report chunks. + # Retrieve additional information + research_context = self.researcher.get_research_context() + research_costs = self.researcher.get_costs() + research_images = self.researcher.get_research_images() + research_sources = self.researcher.get_research_sources() - :return: The next chunk of the report. - :raises StopAsyncIteration: Raised when all chunks are yielded. - """ - if self._chunks is None: - # Generate report chunks on first access - self._chunks = await self._generate_report_chunks() - - if self._index >= len(self._chunks): - # End iteration once all chunks are exhausted - raise StopAsyncIteration - - chunk = self._chunks[self._index] - self._index += 1 - return chunk - - async def _generate_report_chunks(self): - """ - Conducts research using the researcher and generates the report in chunks. - - :return: A list of report chunks. 
- """ - try: - # Asynchronous research and report generation - research_result = await self.researcher.conduct_research() - report = await self.researcher.write_report() - - # Retrieve additional research details - research_context = self.researcher.get_research_context() or {} - research_costs = self.researcher.get_costs() or {} - research_images = self.researcher.get_research_images() or [] - research_sources = self.researcher.get_research_sources() or [] - - # Construct the complete research response - full_report = { - "report": report, - "context": research_context, - "costs": research_costs, - "images": research_images, - "sources": research_sources, - } - - # Generate chunks for streaming - return self._split_into_chunks(full_report) - - except Exception as e: - # Handle potential errors during research and report generation - raise RuntimeError(f"Error generating report chunks: {e}") - - def _split_into_chunks(self, report): - """ - Splits the report dictionary into smaller chunks for easier streaming. - - :param report: A dictionary containing the full report data. - :return: A list of formatted text chunks. - """ - if not report: - raise ValueError("Cannot split an empty or None report into chunks.") - - chunks = [] - for key, value in report.items(): - chunks.append(f"{key}: {value}") - return chunks + return self.custom_logs_handler def get_query_details(self): """ - Retrieves the details of the query and report type. - - :return: A dictionary containing the query and report type. + Returns details of the query and report type. """ return { "query": self.query, From 14429fc6f7edf7d1ade2dc704a8f7872d2ae13ac Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 18:35:30 +0200 Subject: [PATCH 02/55] Refactor logging and improve async iteration in report generation Refactored `CustomLogsHandler` to accept an optional logs list and modified its method to append data instead of yielding. 
Enhanced `ReportGenerator` with proper asynchronous iterator implementation, enabling smooth log handling and retrieval. Simplified variable naming for better readability. --- src/main.py | 6 ++-- .../gptresearch/deepresearch.py | 31 +++++++++++++++---- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/main.py b/src/main.py index 95db1a8..3bbc6cf 100644 --- a/src/main.py +++ b/src/main.py @@ -36,9 +36,9 @@ async def get_report_endpoint(request: ReportRequest): async def generate_report(): try: # Call the asynchronous get_report function - report_generator = ReportGenerator(request.query, request.report_type) - async for chunk in report_generator: - yield chunk + generator = ReportGenerator(request.query, request.report_type) + async for log in generator: + yield log except Exception as e: yield f"Error: {str(e)}" diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py index e693f45..add36e3 100644 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ b/src/phoenix_technologies/gptresearch/deepresearch.py @@ -1,15 +1,18 @@ from gpt_researcher import GPTResearcher -from typing import Dict, Any, AsyncGenerator +from typing import Dict, Any, AsyncGenerator, Coroutine class CustomLogsHandler: """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs = [] # Initialize logs to store data + def __init__(self, logs=None): + if logs is None: + logs = [] + self.logs = logs # Initialize logs to store data - async def send_json(self, data: Dict[str, Any]) -> AsyncGenerator[str, Any]: + async def send_json(self, data: Dict[str, Any]) -> None: """Send JSON data and log it.""" - yield f"My custom Log: {data}" + self.logs.append(data) # Append data to logs + print(f"My custom Log: {data}") # For demonstration, print the log class ReportGenerator: def __init__(self, query: str, report_type: str): @@ -19,7 +22,8 @@ class ReportGenerator: self.query = query 
self.report_type = report_type # Initialize researcher with a custom WebSocket - self.custom_logs_handler = CustomLogsHandler() + self.logs = [] + self.custom_logs_handler = CustomLogsHandler(self.logs) self.researcher = GPTResearcher(query, report_type, websocket=self.custom_logs_handler) @@ -47,3 +51,18 @@ class ReportGenerator: "query": self.query, "report_type": self.report_type } + + # Implementing the asynchronous iterator protocol + def __aiter__(self): + """Initialize and return the asynchronous iterator.""" + self._log_index = 0 # Iterator index + return self + + async def __anext__(self): + """Return the next log asynchronously.""" + if self._log_index < len(self.logs): + log = self.logs[self._log_index] + self._log_index += 1 + return log # Return the next log + else: + raise StopAsyncIteration # Stop when logs are exhausted From ade4511a8718ee090981f36c28d9e3d4da2b900c Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 18:39:33 +0200 Subject: [PATCH 03/55] Add report generation step to the main async process The `generate_report` method is now explicitly called within the main process, ensuring the researcher's report generation is initiated properly. Updated its return type annotation for clarity in `deepresearch.py`. 
--- src/main.py | 1 + src/phoenix_technologies/gptresearch/deepresearch.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 3bbc6cf..b223552 100644 --- a/src/main.py +++ b/src/main.py @@ -37,6 +37,7 @@ async def get_report_endpoint(request: ReportRequest): try: # Call the asynchronous get_report function generator = ReportGenerator(request.query, request.report_type) + await generator.generate_report() async for log in generator: yield log except Exception as e: diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py index add36e3..d8e0adc 100644 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ b/src/phoenix_technologies/gptresearch/deepresearch.py @@ -27,7 +27,7 @@ class ReportGenerator: self.researcher = GPTResearcher(query, report_type, websocket=self.custom_logs_handler) - async def generate_report(self): + async def generate_report(self) -> CustomLogsHandler: """ Conducts research and generates the report along with additional information. """ From 327758f00f4f5c5bd8d0dcd66a1d3f49262140fd Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 18:42:40 +0200 Subject: [PATCH 04/55] Refactor async logging logic in report generation. Removed unused log yielding in `main.py` to simplify the flow. Updated `send_json` in `deepresearch.py` to use an async generator for more flexible streaming of log data. 
--- src/main.py | 2 -- src/phoenix_technologies/gptresearch/deepresearch.py | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main.py b/src/main.py index b223552..f147b19 100644 --- a/src/main.py +++ b/src/main.py @@ -38,8 +38,6 @@ async def get_report_endpoint(request: ReportRequest): # Call the asynchronous get_report function generator = ReportGenerator(request.query, request.report_type) await generator.generate_report() - async for log in generator: - yield log except Exception as e: yield f"Error: {str(e)}" diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py index d8e0adc..3724c99 100644 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ b/src/phoenix_technologies/gptresearch/deepresearch.py @@ -9,10 +9,11 @@ class CustomLogsHandler: logs = [] self.logs = logs # Initialize logs to store data - async def send_json(self, data: Dict[str, Any]) -> None: + async def send_json(self, data: Dict[str, Any]) -> AsyncGenerator[str, Any]: """Send JSON data and log it.""" self.logs.append(data) # Append data to logs - print(f"My custom Log: {data}") # For demonstration, print the log + print(f"My custom Log: {data}") + yield f"My custom Log: {data}" class ReportGenerator: def __init__(self, query: str, report_type: str): From 1dbf774d55dcfc27c70e2f9e209c7ab61b80d824 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 18:46:48 +0200 Subject: [PATCH 05/55] Refactor async methods to adjust return types. Changed `send_json` to return `None` instead of yielding and updated log retrieval to use `yield` in place of `return`. These modifications align function behavior with intended asynchronous use cases and improve consistency. 
--- src/phoenix_technologies/gptresearch/deepresearch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py index 3724c99..f10497e 100644 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ b/src/phoenix_technologies/gptresearch/deepresearch.py @@ -9,11 +9,10 @@ class CustomLogsHandler: logs = [] self.logs = logs # Initialize logs to store data - async def send_json(self, data: Dict[str, Any]) -> AsyncGenerator[str, Any]: + async def send_json(self, data: Dict[str, Any]) -> None: """Send JSON data and log it.""" self.logs.append(data) # Append data to logs print(f"My custom Log: {data}") - yield f"My custom Log: {data}" class ReportGenerator: def __init__(self, query: str, report_type: str): @@ -64,6 +63,6 @@ class ReportGenerator: if self._log_index < len(self.logs): log = self.logs[self._log_index] self._log_index += 1 - return log # Return the next log + yield log # Return the next log else: raise StopAsyncIteration # Stop when logs are exhausted From 936eb6f39454e3119b00dca7d4acabb006b00d86 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 18:56:41 +0200 Subject: [PATCH 06/55] Refactor report generation and logging handlers. Introduced `CustomLogsHandler` to streamline log management and updated `ReportGenerator` to simplify report generation output. Removed unused asynchronous iterator methods, improving code clarity and reducing complexity. 
--- src/main.py | 3 ++- .../gptresearch/deepresearch.py | 23 +++---------------- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/src/main.py b/src/main.py index f147b19..41ac4aa 100644 --- a/src/main.py +++ b/src/main.py @@ -37,7 +37,8 @@ async def get_report_endpoint(request: ReportRequest): try: # Call the asynchronous get_report function generator = ReportGenerator(request.query, request.report_type) - await generator.generate_report() + custom_logs_handler = await generator.generate_report() + yield "Report generation completed successfully!\n" except Exception as e: yield f"Error: {str(e)}" diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py index f10497e..4a21a1e 100644 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ b/src/phoenix_technologies/gptresearch/deepresearch.py @@ -4,15 +4,13 @@ from typing import Dict, Any, AsyncGenerator, Coroutine class CustomLogsHandler: """A custom Logs handler class to handle JSON data.""" - def __init__(self, logs=None): - if logs is None: - logs = [] - self.logs = logs # Initialize logs to store data + def __init__(self): + self.logs = [] # Initialize logs to store data async def send_json(self, data: Dict[str, Any]) -> None: """Send JSON data and log it.""" self.logs.append(data) # Append data to logs - print(f"My custom Log: {data}") + print(f"My custom Log: {data}") # For demonstration, print the log class ReportGenerator: def __init__(self, query: str, report_type: str): @@ -51,18 +49,3 @@ class ReportGenerator: "query": self.query, "report_type": self.report_type } - - # Implementing the asynchronous iterator protocol - def __aiter__(self): - """Initialize and return the asynchronous iterator.""" - self._log_index = 0 # Iterator index - return self - - async def __anext__(self): - """Return the next log asynchronously.""" - if self._log_index < len(self.logs): - log = self.logs[self._log_index] - self._log_index += 1 - 
yield log # Return the next log - else: - raise StopAsyncIteration # Stop when logs are exhausted From ffeb2527b35b56397bddb1070e9dd1ac11d077aa Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 18:58:40 +0200 Subject: [PATCH 07/55] Refactor log handling in deepresearch module Removed the unnecessary `logs` initialization and streamlined the `CustomLogsHandler` setup. This simplifies the code and avoids redundant log storage. --- src/phoenix_technologies/gptresearch/deepresearch.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py index 4a21a1e..919811c 100644 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ b/src/phoenix_technologies/gptresearch/deepresearch.py @@ -20,8 +20,7 @@ class ReportGenerator: self.query = query self.report_type = report_type # Initialize researcher with a custom WebSocket - self.logs = [] - self.custom_logs_handler = CustomLogsHandler(self.logs) + self.custom_logs_handler = CustomLogsHandler() self.researcher = GPTResearcher(query, report_type, websocket=self.custom_logs_handler) From ae4e81906e239d6fbca4052b8942dd324555e00d Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 19:26:18 +0200 Subject: [PATCH 08/55] Add progress messages to report generation process This change introduces status updates to indicate the start and successful completion of the report generation. These progress messages improve user feedback during the asynchronous operation. 
--- src/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main.py b/src/main.py index 41ac4aa..d32018f 100644 --- a/src/main.py +++ b/src/main.py @@ -36,6 +36,7 @@ async def get_report_endpoint(request: ReportRequest): async def generate_report(): try: # Call the asynchronous get_report function + yield "Report generation started...\n" generator = ReportGenerator(request.query, request.report_type) custom_logs_handler = await generator.generate_report() yield "Report generation completed successfully!\n" From cb5fe35d24ded2d727f2d75a2d237a2a6a7ad2c9 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 19:47:34 +0200 Subject: [PATCH 09/55] Refactor report generation flow to include progress streaming Modified the report generation process to stream logs incrementally during execution. Introduced new methods `init` and `is_complete` to enhance modularity and track completion status. Adjusted generator logic to better manage and yield logs in real-time. --- src/main.py | 17 ++++++++++++++++- .../gptresearch/deepresearch.py | 12 +++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/main.py b/src/main.py index d32018f..1905ff4 100644 --- a/src/main.py +++ b/src/main.py @@ -38,8 +38,23 @@ async def get_report_endpoint(request: ReportRequest): # Call the asynchronous get_report function yield "Report generation started...\n" generator = ReportGenerator(request.query, request.report_type) - custom_logs_handler = await generator.generate_report() + custom_logs_handler = generator.init() + generator.generate_report() yield "Report generation completed successfully!\n" + index = 0 + while not generator.is_complete(): + # If there are more logs to send, yield them + if index < len(custom_logs_handler.logs): + log_entry = custom_logs_handler.logs[index] + index += 1 + yield f"{log_entry}\n" # Convert logs to string for streaming + else: + # Wait briefly to avoid aggressive looping + await asyncio.sleep(0.1) + # Stop if 
processing is complete and no more logs remain + if generator.researcher.is_complete: + break + except Exception as e: yield f"Error: {str(e)}" diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py index 919811c..77ad753 100644 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ b/src/phoenix_technologies/gptresearch/deepresearch.py @@ -21,10 +21,17 @@ class ReportGenerator: self.report_type = report_type # Initialize researcher with a custom WebSocket self.custom_logs_handler = CustomLogsHandler() + self.complete = False self.researcher = GPTResearcher(query, report_type, websocket=self.custom_logs_handler) - async def generate_report(self) -> CustomLogsHandler: + def init(self) -> CustomLogsHandler: + return self.custom_logs_handler + + def is_complete(self): + return self.complete + + async def generate_report(self) -> None: """ Conducts research and generates the report along with additional information. """ @@ -37,8 +44,7 @@ class ReportGenerator: research_costs = self.researcher.get_costs() research_images = self.researcher.get_research_images() research_sources = self.researcher.get_research_sources() - - return self.custom_logs_handler + self.complete = True def get_query_details(self): """ From b16305e369f70ff5a18391b28a7544387e020c74 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 19:51:38 +0200 Subject: [PATCH 10/55] Fix missing parentheses in is_complete method call The is_complete method was incorrectly referenced without parentheses, causing a potential runtime error. This fix ensures proper method invocation and prevents the loop from behaving incorrectly. 
--- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 1905ff4..02ca60f 100644 --- a/src/main.py +++ b/src/main.py @@ -52,7 +52,7 @@ async def get_report_endpoint(request: ReportRequest): # Wait briefly to avoid aggressive looping await asyncio.sleep(0.1) # Stop if processing is complete and no more logs remain - if generator.researcher.is_complete: + if generator.researcher.is_complete(): break except Exception as e: From 79be94afd2c863c0a42a09e9e8baf6c247dbc469 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 19:53:59 +0200 Subject: [PATCH 11/55] Refactor generator completion checks for consistency. Replaced `.is_complete()` method calls with direct `.complete` attribute access to streamline the code. Removed the redundant `is_complete()` method from `deepresearch.py` to reduce unnecessary indirection. This simplifies the logic and improves readability. --- src/main.py | 4 ++-- src/phoenix_technologies/gptresearch/deepresearch.py | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/main.py b/src/main.py index 02ca60f..0b432c8 100644 --- a/src/main.py +++ b/src/main.py @@ -42,7 +42,7 @@ async def get_report_endpoint(request: ReportRequest): generator.generate_report() yield "Report generation completed successfully!\n" index = 0 - while not generator.is_complete(): + while not generator.complete: # If there are more logs to send, yield them if index < len(custom_logs_handler.logs): log_entry = custom_logs_handler.logs[index] @@ -52,7 +52,7 @@ async def get_report_endpoint(request: ReportRequest): # Wait briefly to avoid aggressive looping await asyncio.sleep(0.1) # Stop if processing is complete and no more logs remain - if generator.researcher.is_complete(): + if generator.researcher.complete: break except Exception as e: diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py index 77ad753..61f1aec 
100644 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ b/src/phoenix_technologies/gptresearch/deepresearch.py @@ -28,9 +28,6 @@ class ReportGenerator: def init(self) -> CustomLogsHandler: return self.custom_logs_handler - def is_complete(self): - return self.complete - async def generate_report(self) -> None: """ Conducts research and generates the report along with additional information. From 6ee47f02c06e37bfe4d76485fbd61b0b42f1ce08 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 19:57:01 +0200 Subject: [PATCH 12/55] Fix incorrect attribute access for generator completion check Replaced `generator.researcher.complete` with the correct `generator.complete`. This resolves a potential logic error and ensures the completion condition is properly evaluated. --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 0b432c8..aa7b992 100644 --- a/src/main.py +++ b/src/main.py @@ -52,7 +52,7 @@ async def get_report_endpoint(request: ReportRequest): # Wait briefly to avoid aggressive looping await asyncio.sleep(0.1) # Stop if processing is complete and no more logs remain - if generator.researcher.complete: + if generator.complete: break except Exception as e: From 1472e277bc7afef8278faf2af2837b58a9c48fba Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 22:07:25 +0200 Subject: [PATCH 13/55] Refactor report generation with async streaming and logging Enhanced report generation by integrating `CustomLogsHandler` and separating tasks for generating reports and streaming logs. Replaced the previous implementation with a unified async generator to handle concurrent execution and improve error handling. Updated module exports to include the new `CustomLogsHandler` component. 
--- src/main.py | 48 ++++++++++++++++++++-------- src/phoenix_technologies/__init__.py | 4 +-- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/src/main.py b/src/main.py index aa7b992..fa46b69 100644 --- a/src/main.py +++ b/src/main.py @@ -1,6 +1,6 @@ from fastapi import FastAPI, HTTPException, Request, Depends from pydantic import BaseModel -from phoenix_technologies import ReportGenerator +from phoenix_technologies import ReportGenerator, CustomLogsHandler from fastapi.responses import StreamingResponse import os import asyncio @@ -33,14 +33,20 @@ async def get_report_endpoint(request: ReportRequest): Expose the `get_report` function as a POST API endpoint, with a streaming response. """ - async def generate_report(): + # Initialize the ReportGenerator and CustomLogsHandler + generator = ReportGenerator(request.query, request.report_type) + custom_logs_handler = generator.init() + + # Define a coroutine to run `generate_report` in a separate thread + async def generate_report_thread(generator: ReportGenerator): + try: + await asyncio.to_thread(generator.generate_report) + except Exception as e: + print(f"Error during report generation: {str(e)}") + + # Define a coroutine for streaming logs + async def get_logs_thread(generator: ReportGenerator, custom_logs_handler: CustomLogsHandler): try: - # Call the asynchronous get_report function - yield "Report generation started...\n" - generator = ReportGenerator(request.query, request.report_type) - custom_logs_handler = generator.init() - generator.generate_report() - yield "Report generation completed successfully!\n" index = 0 while not generator.complete: # If there are more logs to send, yield them @@ -51,13 +57,27 @@ async def get_report_endpoint(request: ReportRequest): else: # Wait briefly to avoid aggressive looping await asyncio.sleep(0.1) - # Stop if processing is complete and no more logs remain - if generator.complete: - break + except Exception as e: + print(f"Error while fetching logs: 
{str(e)}") + yield f"Error: {str(e)}" + # Define an asynchronous generator to stream output + async def combined_stream(): + try: + # Run both tasks concurrently + task1 = asyncio.create_task(generate_report_thread(generator)) + task2 = asyncio.create_task(get_logs_thread(generator, custom_logs_handler)) + + # Wait for logs and stream output + async for log_entry in get_logs_thread(generator, custom_logs_handler): + yield log_entry + + # Wait for both tasks to finish + await asyncio.gather(task1, task2) + + yield "\nReport generation completed successfully!\n" except Exception as e: yield f"Error: {str(e)}" - # Return streaming response - return StreamingResponse(generate_report(), media_type="text/plain") - + # Return the combined async generator as a streaming response + return StreamingResponse(combined_stream(), media_type="text/plain") diff --git a/src/phoenix_technologies/__init__.py b/src/phoenix_technologies/__init__.py index 142e2fc..d7e2062 100644 --- a/src/phoenix_technologies/__init__.py +++ b/src/phoenix_technologies/__init__.py @@ -1,4 +1,4 @@ # phoenix-technologies/__init__.py -from .gptresearch.deepresearch import ReportGenerator +from .gptresearch.deepresearch import ReportGenerator, CustomLogsHandler -__all__ = ["ReportGenerator"] \ No newline at end of file +__all__ = ["ReportGenerator", "CustomLogsHandler"] \ No newline at end of file From e2b4ba5a7db4c54934972742acc6fe4eb587ba11 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Fri, 25 Apr 2025 22:11:19 +0200 Subject: [PATCH 14/55] Refactor log streaming and report generation logic Simplified the logic for log streaming by consolidating into a single async generator (`log_stream`). Removed redundant tasks and streamlined report generation to improve code readability and maintainability. 
--- src/main.py | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/src/main.py b/src/main.py index fa46b69..0f897d4 100644 --- a/src/main.py +++ b/src/main.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI, HTTPException, Request, Depends +from fastapi import FastAPI, HTTPException, Request, Depends from pydantic import BaseModel from phoenix_technologies import ReportGenerator, CustomLogsHandler from fastapi.responses import StreamingResponse @@ -8,17 +8,21 @@ import asyncio # FastAPI app instance app = FastAPI() + # Define a request body structure using Pydantic class ReportRequest(BaseModel): query: str report_type: str + # Define a dependency to validate the API Key def verify_api_key(request: Request): # Define the API key from the environment variables expected_api_key = os.getenv("API_KEY", None) if not expected_api_key: - raise HTTPException(status_code=500, detail="API key is not configured on the server.") + raise HTTPException( + status_code=500, detail="API key is not configured on the server." 
+ ) # Get the API key from the request headers provided_api_key = request.headers.get("X-API-KEY", None) @@ -27,6 +31,7 @@ def verify_api_key(request: Request): if not provided_api_key or provided_api_key != expected_api_key: raise HTTPException(status_code=403, detail="Invalid or missing API key.") + @app.post("/get_report", dependencies=[Depends(verify_api_key)]) async def get_report_endpoint(request: ReportRequest): """ @@ -40,12 +45,13 @@ async def get_report_endpoint(request: ReportRequest): # Define a coroutine to run `generate_report` in a separate thread async def generate_report_thread(generator: ReportGenerator): try: + # Run blocking code in a thread pool await asyncio.to_thread(generator.generate_report) except Exception as e: print(f"Error during report generation: {str(e)}") - # Define a coroutine for streaming logs - async def get_logs_thread(generator: ReportGenerator, custom_logs_handler: CustomLogsHandler): + # Define an asynchronous generator for streaming logs + async def log_stream(): try: index = 0 while not generator.complete: @@ -57,27 +63,15 @@ async def get_report_endpoint(request: ReportRequest): else: # Wait briefly to avoid aggressive looping await asyncio.sleep(0.1) + + # After completion, include a success message + yield "\nReport generation completed successfully!\n" except Exception as e: print(f"Error while fetching logs: {str(e)}") yield f"Error: {str(e)}" - # Define an asynchronous generator to stream output - async def combined_stream(): - try: - # Run both tasks concurrently - task1 = asyncio.create_task(generate_report_thread(generator)) - task2 = asyncio.create_task(get_logs_thread(generator, custom_logs_handler)) + # Run the report generation task concurrently with the log streaming + asyncio.create_task(generate_report_thread(generator)) - # Wait for logs and stream output - async for log_entry in get_logs_thread(generator, custom_logs_handler): - yield log_entry - - # Wait for both tasks to finish - await 
asyncio.gather(task1, task2) - - yield "\nReport generation completed successfully!\n" - except Exception as e: - yield f"Error: {str(e)}" - - # Return the combined async generator as a streaming response - return StreamingResponse(combined_stream(), media_type="text/plain") + # Return the `log_stream` async generator as a streaming response + return StreamingResponse(log_stream(), media_type="text/plain") From 73e929ca00ee5ef8546188372b5cf396fb0f3886 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 26 Apr 2025 08:38:02 +0200 Subject: [PATCH 15/55] Refactor report generation with simplified log streaming Replaced custom log handler and async report generation logic with a simplified fake data streamer for the StreamingResponse. Added uvicorn server startup code for direct script execution. --- src/main.py | 48 +++++++++++++----------------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/src/main.py b/src/main.py index 0f897d4..be5b167 100644 --- a/src/main.py +++ b/src/main.py @@ -1,19 +1,23 @@ +import uvicorn from fastapi import FastAPI, HTTPException, Request, Depends from pydantic import BaseModel from phoenix_technologies import ReportGenerator, CustomLogsHandler from fastapi.responses import StreamingResponse +from typing import Dict, Any, AsyncGenerator, Coroutine, Generator import os import asyncio +import time # FastAPI app instance app = FastAPI() - # Define a request body structure using Pydantic class ReportRequest(BaseModel): query: str report_type: str +# Shared log array using asyncio.Queue +log_queue = asyncio.Queue() # Define a dependency to validate the API Key def verify_api_key(request: Request): @@ -38,40 +42,14 @@ async def get_report_endpoint(request: ReportRequest): Expose the `get_report` function as a POST API endpoint, with a streaming response. 
""" - # Initialize the ReportGenerator and CustomLogsHandler - generator = ReportGenerator(request.query, request.report_type) - custom_logs_handler = generator.init() + def fake_data_streamer(): + for i in range(5): + yield f"My custom Log: {i}" + time.sleep(5) - # Define a coroutine to run `generate_report` in a separate thread - async def generate_report_thread(generator: ReportGenerator): - try: - # Run blocking code in a thread pool - await asyncio.to_thread(generator.generate_report) - except Exception as e: - print(f"Error during report generation: {str(e)}") + # Return streaming response + return StreamingResponse(fake_data_streamer(), media_type="text/plain") - # Define an asynchronous generator for streaming logs - async def log_stream(): - try: - index = 0 - while not generator.complete: - # If there are more logs to send, yield them - if index < len(custom_logs_handler.logs): - log_entry = custom_logs_handler.logs[index] - index += 1 - yield f"{log_entry}\n" # Convert logs to string for streaming - else: - # Wait briefly to avoid aggressive looping - await asyncio.sleep(0.1) - # After completion, include a success message - yield "\nReport generation completed successfully!\n" - except Exception as e: - print(f"Error while fetching logs: {str(e)}") - yield f"Error: {str(e)}" - - # Run the report generation task concurrently with the log streaming - asyncio.create_task(generate_report_thread(generator)) - - # Return the `log_stream` async generator as a streaming response - return StreamingResponse(log_stream(), media_type="text/plain") +if __name__ == "__main__": + uvicorn.run(app='main:app', host="127.0.0.1", port=8000) \ No newline at end of file From 44b91b937545f0f6c8cabe44c988f514a889259d Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 26 Apr 2025 17:54:43 +0200 Subject: [PATCH 16/55] Refactor codebase to implement MCP server for GPT Researcher Replaced FastAPI app with an MCP server implementation, enhancing flexibility and modularity for 
research operations. Deprecated `phoenix_technologies` package, updated server logic, added utility functions, and revised dependencies in `requirements.txt`. Updated Dockerfile and README to align with the new architecture. --- Dockerfile | 2 +- README.md | 223 ++++----------- requirements.txt | 17 +- src/__init__.py | 8 + src/main.py | 55 ---- src/phoenix_technologies/__init__.py | 4 - .../gptresearch/__init__.py | 0 .../gptresearch/deepresearch.py | 53 ---- src/server.py | 261 ++++++++++++++++++ src/utils.py | 139 ++++++++++ 10 files changed, 481 insertions(+), 281 deletions(-) delete mode 100644 src/main.py delete mode 100644 src/phoenix_technologies/__init__.py delete mode 100644 src/phoenix_technologies/gptresearch/__init__.py delete mode 100644 src/phoenix_technologies/gptresearch/deepresearch.py create mode 100644 src/server.py create mode 100644 src/utils.py diff --git a/Dockerfile b/Dockerfile index 51f00ad..b6228aa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,4 +22,4 @@ COPY src/ /app/ EXPOSE 8000 # Set the default command to run the app with `uvicorn` -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file +CMD ["python", "server.py"] \ No newline at end of file diff --git a/README.md b/README.md index 9f0a65c..7ec22f7 100644 --- a/README.md +++ b/README.md @@ -1,175 +1,72 @@ -# README for FastAPI-Based Report GPT Generation Service - -## Overview - -This repository contains the implementation of a **FastAPI**-based service designed to generate research reports. The service processes user-provided queries and report types, performing advanced research powered by `GPTResearcher` and responding with comprehensive results, including details, cost, context, images, and other associated metadata. - +# Project Overview +## Description +This project is a server-side application built with Python that facilitates research-related operations. 
It provides functionalities to manage researchers, handle resources, process queries, and generate in-depth research reports. The application features reusable utility functions to streamline responses, handle exceptions gracefully, and format data for client consumption. A `Dockerfile` is provided for easy containerization and deployment. ## Features +### Server Functionality +The main server functionalities are defined in `server.py`, which includes: +- **research_resource**: Management of research resources. +- **deep_research**: Conducts detailed research operations. +- **write_report**: Creates comprehensive reports based on researched data. +- **get_research_sources**: Retrieves information sources for research. +- **get_research_context**: Provides contextual information tied to research. +- **research_query**: Handles incoming research-related queries. +- **run_server**: Initializes and runs the server. -- **RESTful API** to handle user queries and generate reports. -- **Streaming responses** to deliver research output in chunks. -- **Secure API access** with API Key authentication. -- Completely containerized setup with Docker. -- Built with modular design for easier scalability and maintenance. +### Utility Functions +The `utils.py` file provides additional support, including: +- **Response Handling**: + - `create_error_response` + - `create_success_response` ---- +- **Error & Exception Management**: + - `handle_exception` -## System Architecture +- **Data Operations**: + - `get_researcher_by_id` + - `format_sources_for_response` + - `format_context_with_sources` + - `store_research_results` + - `create_research_prompt` -### Core Components +### Docker Support +The included `Dockerfile` allows for simple containerized deployment: +- Uses a lightweight Python 3.13 image. +- Installs required dependencies from `requirements.txt`. +- Configures the application to run via `server.py` on port `8000` using `CMD ["python", "server.py"]`. -1. 
**FastAPI App (`main.py`)**: - - Hosts the API endpoints. - - Handles API Key authentication for secure use. - - Accepts user inputs (query and report type) and generates a chunked streaming response. +## Setup and Usage +### Prerequisites +- Python 3.13 or later. +- `pip` for dependency management. +- Docker (optional, for containerized deployment). -2. **Research Logic (`deepresearch.py`)**: - - Encapsulates research and report generation. - - Utilizes `GPTResearcher` to conduct research, generate reports, and retrieve extended data like images, contexts, or costs. - -3. **Docker Integration**: - - The application is containerized with a well-defined `Dockerfile`. - - Includes dependency installation, environment setup, and FastAPI server configuration for rapid deployment. - ---- - -## Prerequisites - -Before running the application, ensure the following are installed on your system: - -- **Docker**: Version 24.0+ -- **Python**: Version 3.13+ -- **pip**: Pre-installed Python package manager. - ---- - -## Running the Application Locally - -### Cloning the Repository - -Clone the repository to a directory of your choice: - -```shell script -git clone https://git.kvant.cloud/phoenix/gpt-researcher.git -cd gpt-researcher +### Installation +1. Clone this repository. +2. Install dependencies: +``` bash + pip install -r requirements.txt ``` - -### Environment Variable Configuration - -Create a `.env` file in the root of the project and define: - +1. Run the application: +``` bash + python server.py ``` -API_KEY=your_api_key # Replace "your_api_key" with your desired key -OPENAI_BASE_URL= -OPENAI_API_KEY= -EMBEDDING= -FAST_LLM= -SMART_LLM= -STRATEGIC_LLM= -OPENAI_API_VERSION= -SERPER_API_KEY= -RETRIEVER=serper +### Using Docker +Build and run the application as a Docker container: +1. Build the Docker image: +``` bash + docker build -t research-app . ``` - -### Installing Dependencies - -Install the required Python modules based on the generated `requirements.txt`. 
- -```shell script -pip install --no-cache-dir -r requirements.txt +1. Run the Docker container: +``` bash + docker run -p 8000:8000 research-app ``` - -### Running the App - -Run the FastAPI app locally: - -```shell script -uvicorn main:app --host 0.0.0.0 --port 8000 +The application will be accessible at `http://localhost:8000`. +## Folder Structure +``` +|-- src/ + |-- server.py # Main server logic + |-- utils.py # Reusable utility functions +|-- Dockerfile # Containerization setup +|-- requirements.txt # Dependencies file +|-- README.md # Documentation (this file) ``` - -After running, your app will be available at `http://127.0.0.1:8000`. - ---- - -## Using Docker for Deployment - -### Building the Docker Image - -Build the Docker image using the **Dockerfile** provided: - -```shell script -docker build -t fastapi-report-service . -``` - -### Running the Docker Container - -Spin up a container and map FastAPI's default port, `8000`: - -```shell script -docker run --env-file .env -p 8000:8000 fastapi-report-service -``` - ---- - -## API Usage - -### 1. **`/get_report`** - -- **Method**: `POST` -- **Description**: Generates a report based on user input. -- **Headers**: - - `X-API-KEY`: API Key for authentication. -- **Request Body** (`JSON`): - -```json -{ - "query": "Research on AI in healthcare", - "report_type": "research_report|resource_report|outline_report|custom_report|detailed_report|subtopic_report|deep" -} -``` - -- **Streaming Response**: Research and report are provided in chunks. - ---- - -## Code Structure - -``` -├── Dockerfile # Configuration for Dockerizing the application -├── requirements.txt # Python dependencies list -├── main.py # FastAPI server entry point -├── deepresearch.py # Research-related logic and GPTResearcher integration -└── src/ # Other project files and assets -``` - ---- - -## Features Under the Hood - -1. **Authentication**: - - An API key mechanism ensures that only authorized users can access endpoints. - -2. 
**Streaming Response**: - - Large research reports are sent incrementally using `StreamingResponse` for better experience and efficiency. - -3. **Modular Research Logic**: - - Research and generation tasks are handled by a dedicated class (`ReportGenerator`), making the application extensible. - ---- - -## Future Enhancements - -- **Asynchronous Enhancements**: - - Improve async handling for long-running queries. - -- **Database Integration**: - - Save request history for auditing and reference purposes. - -- **Web Interface**: - - A user-friendly web application for interacting with the API. - ---- - -## Contributing - -Contributions are welcome! Feel free to fork the repository, make updates, and submit a pull request. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 06e6b07..1f2c348 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,12 @@ -fastapi -uvicorn -pydantic -gpt-researcher -asyncio +# GPT Researcher dependencies +gpt-researcher>=0.12.16 +python-dotenv + +# MCP dependencies +mcp>=1.6.0 +fastapi>=0.103.1 +uvicorn>=0.23.2 +pydantic>=2.3.0 + +# Utility dependencies +loguru>=0.7.0 \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py index e69de29..f372bcb 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -0,0 +1,8 @@ +""" +GPT Researcher MCP Server + +This module provides an MCP server implementation for GPT Researcher, +allowing AI assistants to perform web research and generate reports via the MCP protocol. 
+""" + +__version__ = "0.1.0" \ No newline at end of file diff --git a/src/main.py b/src/main.py deleted file mode 100644 index be5b167..0000000 --- a/src/main.py +++ /dev/null @@ -1,55 +0,0 @@ -import uvicorn -from fastapi import FastAPI, HTTPException, Request, Depends -from pydantic import BaseModel -from phoenix_technologies import ReportGenerator, CustomLogsHandler -from fastapi.responses import StreamingResponse -from typing import Dict, Any, AsyncGenerator, Coroutine, Generator -import os -import asyncio -import time - -# FastAPI app instance -app = FastAPI() - -# Define a request body structure using Pydantic -class ReportRequest(BaseModel): - query: str - report_type: str - -# Shared log array using asyncio.Queue -log_queue = asyncio.Queue() - -# Define a dependency to validate the API Key -def verify_api_key(request: Request): - # Define the API key from the environment variables - expected_api_key = os.getenv("API_KEY", None) - if not expected_api_key: - raise HTTPException( - status_code=500, detail="API key is not configured on the server." - ) - - # Get the API key from the request headers - provided_api_key = request.headers.get("X-API-KEY", None) - - # Check if the API key is correct - if not provided_api_key or provided_api_key != expected_api_key: - raise HTTPException(status_code=403, detail="Invalid or missing API key.") - - -@app.post("/get_report", dependencies=[Depends(verify_api_key)]) -async def get_report_endpoint(request: ReportRequest): - """ - Expose the `get_report` function as a POST API endpoint, with a streaming response. 
- """ - - def fake_data_streamer(): - for i in range(5): - yield f"My custom Log: {i}" - time.sleep(5) - - # Return streaming response - return StreamingResponse(fake_data_streamer(), media_type="text/plain") - - -if __name__ == "__main__": - uvicorn.run(app='main:app', host="127.0.0.1", port=8000) \ No newline at end of file diff --git a/src/phoenix_technologies/__init__.py b/src/phoenix_technologies/__init__.py deleted file mode 100644 index d7e2062..0000000 --- a/src/phoenix_technologies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# phoenix-technologies/__init__.py -from .gptresearch.deepresearch import ReportGenerator, CustomLogsHandler - -__all__ = ["ReportGenerator", "CustomLogsHandler"] \ No newline at end of file diff --git a/src/phoenix_technologies/gptresearch/__init__.py b/src/phoenix_technologies/gptresearch/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/phoenix_technologies/gptresearch/deepresearch.py b/src/phoenix_technologies/gptresearch/deepresearch.py deleted file mode 100644 index 61f1aec..0000000 --- a/src/phoenix_technologies/gptresearch/deepresearch.py +++ /dev/null @@ -1,53 +0,0 @@ -from gpt_researcher import GPTResearcher -from typing import Dict, Any, AsyncGenerator, Coroutine - - -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs = [] # Initialize logs to store data - - async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it.""" - self.logs.append(data) # Append data to logs - print(f"My custom Log: {data}") # For demonstration, print the log - -class ReportGenerator: - def __init__(self, query: str, report_type: str): - """ - Initializes the ReportGenerator with a query and report type. 
- """ - self.query = query - self.report_type = report_type - # Initialize researcher with a custom WebSocket - self.custom_logs_handler = CustomLogsHandler() - self.complete = False - - self.researcher = GPTResearcher(query, report_type, websocket=self.custom_logs_handler) - - def init(self) -> CustomLogsHandler: - return self.custom_logs_handler - - async def generate_report(self) -> None: - """ - Conducts research and generates the report along with additional information. - """ - # Conduct research - research_result = await self.researcher.conduct_research() - report = await self.researcher.write_report() - - # Retrieve additional information - research_context = self.researcher.get_research_context() - research_costs = self.researcher.get_costs() - research_images = self.researcher.get_research_images() - research_sources = self.researcher.get_research_sources() - self.complete = True - - def get_query_details(self): - """ - Returns details of the query and report type. - """ - return { - "query": self.query, - "report_type": self.report_type - } diff --git a/src/server.py b/src/server.py new file mode 100644 index 0000000..47914b2 --- /dev/null +++ b/src/server.py @@ -0,0 +1,261 @@ +""" +GPT Researcher MCP Server + +This script implements an MCP server for GPT Researcher, allowing AI assistants +to conduct web research and generate reports via the MCP protocol. 
+""" + +import os +import sys +import uuid +import logging +from typing import Dict, Any, Optional +from dotenv import load_dotenv +from mcp.server.fastmcp import FastMCP +from gpt_researcher import GPTResearcher + +# Load environment variables +load_dotenv() + +from utils import ( + research_store, + create_success_response, + handle_exception, + get_researcher_by_id, + format_sources_for_response, + format_context_with_sources, + store_research_results, + create_research_prompt +) + +logging.basicConfig( + level=logging.INFO, + format='[%(asctime)s][%(levelname)s] - %(message)s', +) + +logger = logging.getLogger(__name__) + +# Initialize FastMCP server +mcp = FastMCP("GPT Researcher") + +# Initialize researchers dictionary +if not hasattr(mcp, "researchers"): + mcp.researchers = {} + + +@mcp.resource("research://{topic}") +async def research_resource(topic: str) -> str: + """ + Provide research context for a given topic directly as a resource. + + This allows LLMs to access web-sourced information without explicit function calls. 
+ + Args: + topic: The research topic or query + + Returns: + String containing the research context with source information + """ + # Check if we've already researched this topic + if topic in research_store: + logger.info(f"Returning cached research for topic: {topic}") + return research_store[topic]["context"] + + # If not, conduct the research + logger.info(f"Conducting new research for resource on topic: {topic}") + + # Initialize GPT Researcher + researcher = GPTResearcher(topic) + + try: + # Conduct the research + await researcher.conduct_research() + + # Get the context and sources + context = researcher.get_research_context() + sources = researcher.get_research_sources() + source_urls = researcher.get_source_urls() + + # Format with sources included + formatted_context = format_context_with_sources(topic, context, sources) + + # Store for future use + store_research_results(topic, context, sources, source_urls, formatted_context) + + return formatted_context + except Exception as e: + return f"Error conducting research on '{topic}': {str(e)}" + + +@mcp.tool() +async def deep_research(query: str) -> Dict[str, Any]: + """ + Conduct a deep web research on a given query using GPT Researcher. + Use this tool when you need time-sensitive, real-time information like stock prices, news, people, specific knowledge, etc. + You must include citations that back your responses when using this tool. 
+ + Args: + query: The research query or topic + + Returns: + Dict containing research status, ID, and the actual research context and sources + that can be used directly by LLMs for context enrichment + """ + logger.info(f"Conducting research on query: {query}...") + + # Generate a unique ID for this research session + research_id = str(uuid.uuid4()) + + # Initialize GPT Researcher + researcher = GPTResearcher(query) + + # Start research + try: + await researcher.conduct_research() + mcp.researchers[research_id] = researcher + logger.info(f"Research completed for ID: {research_id}") + + # Get the research context and sources + context = researcher.get_research_context() + sources = researcher.get_research_sources() + source_urls = researcher.get_source_urls() + + # Store in the research store for the resource API + store_research_results(query, context, sources, source_urls) + + return create_success_response({ + "research_id": research_id, + "query": query, + "source_count": len(sources), + "context": context, + "sources": format_sources_for_response(sources), + "source_urls": source_urls + }) + except Exception as e: + return handle_exception(e, "Research") + + +@mcp.tool() +async def write_report(research_id: str, custom_prompt: Optional[str] = None) -> Dict[str, Any]: + """ + Generate a report based on previously conducted research. 
+ + Args: + research_id: The ID of the research session from conduct_research + custom_prompt: Optional custom prompt for report generation + + Returns: + Dict containing the report content and metadata + """ + success, researcher, error = get_researcher_by_id(mcp.researchers, research_id) + if not success: + return error + + logger.info(f"Generating report for research ID: {research_id}") + + try: + # Generate report + report = await researcher.write_report(custom_prompt=custom_prompt) + + # Get additional information + sources = researcher.get_research_sources() + costs = researcher.get_costs() + + return create_success_response({ + "report": report, + "source_count": len(sources), + "costs": costs + }) + except Exception as e: + return handle_exception(e, "Report generation") + + +@mcp.tool() +async def get_research_sources(research_id: str) -> Dict[str, Any]: + """ + Get the sources used in the research. + + Args: + research_id: The ID of the research session + + Returns: + Dict containing the research sources + """ + success, researcher, error = get_researcher_by_id(mcp.researchers, research_id) + if not success: + return error + + sources = researcher.get_research_sources() + source_urls = researcher.get_source_urls() + + return create_success_response({ + "sources": format_sources_for_response(sources), + "source_urls": source_urls + }) + + +@mcp.tool() +async def get_research_context(research_id: str) -> Dict[str, Any]: + """ + Get the full context of the research. 
+ + Args: + research_id: The ID of the research session + + Returns: + Dict containing the research context + """ + success, researcher, error = get_researcher_by_id(mcp.researchers, research_id) + if not success: + return error + + context = researcher.get_research_context() + + return create_success_response({ + "context": context + }) + + +@mcp.prompt() +def research_query(topic: str, goal: str, report_format: str = "research_report") -> str: + """ + Create a research query prompt for GPT Researcher. + + Args: + topic: The topic to research + goal: The goal or specific question to answer + report_format: The format of the report to generate + + Returns: + A formatted prompt for research + """ + return create_research_prompt(topic, goal, report_format) + + +def run_server(): + """Run the MCP server using FastMCP's built-in event loop handling.""" + # Check if API keys are set + if not os.getenv("OPENAI_API_KEY"): + logger.error("OPENAI_API_KEY not found. Please set it in your .env file.") + return + + # Add startup message + logger.info("Starting GPT Researcher MCP Server...") + print("🚀 GPT Researcher MCP Server starting... Check researcher_mcp_server.log for details") + + # Let FastMCP handle the event loop + try: + mcp.run("sse") + # Note: If we reach here, the server has stopped + logger.info("MCP Server has stopped") + except Exception as e: + logger.error(f"Error running MCP server: {str(e)}") + print(f"❌ MCP Server error: {str(e)}") + return + + print("✅ MCP Server stopped") + + +if __name__ == "__main__": + # Use the non-async approach to avoid asyncio nesting issues + run_server() \ No newline at end of file diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..8ad7d12 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,139 @@ +""" +GPT Researcher MCP Server Utilities + +This module provides utility functions and helpers for the GPT Researcher MCP Server. 
+""" + +import sys +from typing import Dict, List, Optional, Tuple, Any +from loguru import logger + +# Configure logging for console only (no file logging) +logger.configure(handlers=[{"sink": sys.stderr, "level": "INFO"}]) + +# Research store to track ongoing research topics and contexts +research_store = {} + +# API Response Utilities +def create_error_response(message: str) -> Dict[str, Any]: + """Create a standardized error response""" + return {"status": "error", "message": message} + + +def create_success_response(data: Dict[str, Any]) -> Dict[str, Any]: + """Create a standardized success response""" + return {"status": "success", **data} + + +def handle_exception(e: Exception, operation: str) -> Dict[str, Any]: + """Handle exceptions in a consistent way""" + error_message = str(e) + logger.error(f"{operation} failed: {error_message}") + return create_error_response(error_message) + + +def get_researcher_by_id(researchers_dict: Dict, research_id: str) -> Tuple[bool, Any, Dict[str, Any]]: + """ + Helper function to retrieve a researcher by ID. + + Args: + researchers_dict: Dictionary of research objects + research_id: The ID of the research session + + Returns: + Tuple containing (success, researcher_object, error_response) + """ + if not researchers_dict or research_id not in researchers_dict: + return False, None, create_error_response("Research ID not found. Please conduct research first.") + return True, researchers_dict[research_id], {} + + +def format_sources_for_response(sources: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Format source information for API responses. 
+ + Args: + sources: List of source dictionaries + + Returns: + Formatted source list for API responses + """ + return [ + { + "title": source.get("title", "Unknown"), + "url": source.get("url", ""), + "content_length": len(source.get("content", "")) + } + for source in sources + ] + + +def format_context_with_sources(topic: str, context: str, sources: List[Dict[str, Any]]) -> str: + """ + Format research context with sources for display. + + Args: + topic: Research topic + context: Research context + sources: List of sources + + Returns: + Formatted context string with sources + """ + formatted_context = f"## Research: {topic}\n\n{context}\n\n" + formatted_context += "## Sources:\n" + for i, source in enumerate(sources): + formatted_context += f"{i+1}. {source.get('title', 'Unknown')}: {source.get('url', '')}\n" + return formatted_context + + +def store_research_results(topic: str, context: str, sources: List[Dict[str, Any]], + source_urls: List[str], formatted_context: Optional[str] = None): + """ + Store research results in the research store. + + Args: + topic: Research topic + context: Research context + sources: List of sources + source_urls: List of source URLs + formatted_context: Optional pre-formatted context + """ + research_store[topic] = { + "context": formatted_context or context, + "sources": sources, + "source_urls": source_urls + } + + +def create_research_prompt(topic: str, goal: str, report_format: str = "research_report") -> str: + """ + Create a research query prompt for GPT Researcher. + + Args: + topic: The topic to research + goal: The goal or specific question to answer + report_format: The format of the report to generate + + Returns: + A formatted prompt for research + """ + return f""" + Please research the following topic: {topic} + + Goal: {goal} + + You have two methods to access web-sourced information: + + 1. 
Use the "research://{topic}" resource to directly access context about this topic if it exists + or if you want to get straight to the information without tracking a research ID. + + 2. Use the conduct_research tool to perform new research and get a research_id for later use. + This tool also returns the context directly in its response, which you can use immediately. + + After getting context, you can: + - Use it directly in your response + - Use the write_report tool with a custom prompt to generate a structured {report_format} + + You can also use get_research_sources to view additional details about the information sources. + """ \ No newline at end of file From 626345b65eff04de7afd2a01ba8bfda98326141c Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 26 Apr 2025 18:11:21 +0200 Subject: [PATCH 17/55] Add report_type parameter to GPTResearcher initialization This update enables specifying the report type during researcher initialization. It introduces "deep" as the default report type, enhancing flexibility for future use cases. --- src/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.py b/src/server.py index 47914b2..7914e43 100644 --- a/src/server.py +++ b/src/server.py @@ -107,7 +107,7 @@ async def deep_research(query: str) -> Dict[str, Any]: research_id = str(uuid.uuid4()) # Initialize GPT Researcher - researcher = GPTResearcher(query) + researcher = GPTResearcher(query, report_type="deep") # Start research try: From 7e5a6db0f6a941537ecf69368488c6a57d2b769e Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 26 Apr 2025 18:14:41 +0200 Subject: [PATCH 18/55] Set research type dynamically via environment variable Added `RESEARCH_TYPE` environment variable to configure the default research type for GPTResearcher. Updated the initialization of GPTResearcher to use this dynamic configuration, improving flexibility and adaptability. 
--- src/server.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/server.py b/src/server.py index 7914e43..fa8d222 100644 --- a/src/server.py +++ b/src/server.py @@ -37,6 +37,7 @@ logger = logging.getLogger(__name__) # Initialize FastMCP server mcp = FastMCP("GPT Researcher") +research_type = os.getenv("RESEARCH_TYPE", "deep") # Initialize researchers dictionary if not hasattr(mcp, "researchers"): @@ -65,7 +66,7 @@ async def research_resource(topic: str) -> str: logger.info(f"Conducting new research for resource on topic: {topic}") # Initialize GPT Researcher - researcher = GPTResearcher(topic) + researcher = GPTResearcher(topic, report_type=research_type) try: # Conduct the research @@ -107,7 +108,7 @@ async def deep_research(query: str) -> Dict[str, Any]: research_id = str(uuid.uuid4()) # Initialize GPT Researcher - researcher = GPTResearcher(query, report_type="deep") + researcher = GPTResearcher(query, report_type=research_type) # Start research try: From 7f7a6083f8aa5812707532a440de82b826890039 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 26 Apr 2025 18:46:11 +0200 Subject: [PATCH 19/55] Add quick_search function and remove citation note in deep_research Introduced the quick_search function for faster, snippet-based searches and adjusted mcp.run to remove the "sse" argument. Additionally, modified the deep_research docstring to remove the citation requirement note for simpler usage documentation. 
--- src/server.py | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/src/server.py b/src/server.py index fa8d222..d2b75c0 100644 --- a/src/server.py +++ b/src/server.py @@ -9,7 +9,7 @@ import os import sys import uuid import logging -from typing import Dict, Any, Optional +from typing import Dict, Any, Optional, List from dotenv import load_dotenv from mcp.server.fastmcp import FastMCP from gpt_researcher import GPTResearcher @@ -91,9 +91,8 @@ async def research_resource(topic: str) -> str: @mcp.tool() async def deep_research(query: str) -> Dict[str, Any]: """ - Conduct a deep web research on a given query using GPT Researcher. + Conduct a web deep research on a given query using GPT Researcher. Use this tool when you need time-sensitive, real-time information like stock prices, news, people, specific knowledge, etc. - You must include citations that back your responses when using this tool. Args: query: The research query or topic @@ -136,6 +135,43 @@ async def deep_research(query: str) -> Dict[str, Any]: return handle_exception(e, "Research") +@mcp.tool() +async def quick_search(query: str) -> Dict[str, Any]: + """ + Perform a quick web search on a given query and return search results with snippets. + This optimizes for speed over quality and is useful when an LLM doesn't need in-depth + information on a topic. 
+ + Args: + query: The search query + + Returns: + Dict containing search results and snippets + """ + logger.info(f"Performing quick search on query: {query}...") + + # Generate a unique ID for this search session + search_id = str(uuid.uuid4()) + + # Initialize GPT Researcher + researcher = GPTResearcher(query, report_type=research_type) + + try: + # Perform quick search + search_results = await researcher.quick_search(query=query) + mcp.researchers[search_id] = researcher + logger.info(f"Quick search completed for ID: {search_id}") + + return create_success_response({ + "search_id": search_id, + "query": query, + "result_count": len(search_results) if search_results else 0, + "search_results": search_results + }) + except Exception as e: + return handle_exception(e, "Quick search") + + @mcp.tool() async def write_report(research_id: str, custom_prompt: Optional[str] = None) -> Dict[str, Any]: """ @@ -246,7 +282,7 @@ def run_server(): # Let FastMCP handle the event loop try: - mcp.run("sse") + mcp.run() # Note: If we reach here, the server has stopped logger.info("MCP Server has stopped") except Exception as e: From 0892ecdc8e33420648e70c68c3dffb0276df1b3d Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 26 Apr 2025 18:48:51 +0200 Subject: [PATCH 20/55] Use "sse" mode in MCP run method Updated the `mcp.run` call to include the "sse" mode, ensuring proper handling of server events. This change improves the server's event loop management and prepares it for SSE-specific operations. 
--- src/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.py b/src/server.py index d2b75c0..3de1191 100644 --- a/src/server.py +++ b/src/server.py @@ -282,7 +282,7 @@ def run_server(): # Let FastMCP handle the event loop try: - mcp.run() + mcp.run("sse") # Note: If we reach here, the server has stopped logger.info("MCP Server has stopped") except Exception as e: From 51eecd2830208dd4ae89da9830f17db331e263bf Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 26 Apr 2025 19:28:43 +0200 Subject: [PATCH 21/55] Add CustomLogsHandler for JSON logging in GPTResearcher Introduced a CustomLogsHandler class to handle and log JSON data during research or search operations. Updated GPTResearcher initialization to include the CustomLogsHandler for improved logging and debugging. --- src/server.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/server.py b/src/server.py index 3de1191..0e7e169 100644 --- a/src/server.py +++ b/src/server.py @@ -44,6 +44,16 @@ if not hasattr(mcp, "researchers"): mcp.researchers = {} +class CustomLogsHandler: + """A custom Logs handler class to handle JSON data.""" + def __init__(self): + self.logs = [] # Initialize logs to store data + + async def send_json(self, data: Dict[str, Any]) -> None: + """Send JSON data and log it.""" + self.logs.append(data) # Append data to logs + print(f"MCP Log: {data}") # For demonstration, print the log + @mcp.resource("research://{topic}") async def research_resource(topic: str) -> str: """ @@ -64,9 +74,10 @@ async def research_resource(topic: str) -> str: # If not, conduct the research logger.info(f"Conducting new research for resource on topic: {topic}") + custom_logs_handler = CustomLogsHandler() # Initialize GPT Researcher - researcher = GPTResearcher(topic, report_type=research_type) + researcher = GPTResearcher(query=topic, report_type=research_type, websocket=custom_logs_handler) try: # Conduct the research @@ -105,9 +116,10 
@@ async def deep_research(query: str) -> Dict[str, Any]: # Generate a unique ID for this research session research_id = str(uuid.uuid4()) + custom_logs_handler = CustomLogsHandler() # Initialize GPT Researcher - researcher = GPTResearcher(query, report_type=research_type) + researcher = GPTResearcher(query=query, report_type=research_type, websocket=custom_logs_handler) # Start research try: @@ -152,9 +164,9 @@ async def quick_search(query: str) -> Dict[str, Any]: # Generate a unique ID for this search session search_id = str(uuid.uuid4()) - + custom_logs_handler = CustomLogsHandler() # Initialize GPT Researcher - researcher = GPTResearcher(query, report_type=research_type) + researcher = GPTResearcher(query=query, report_type=research_type, websocket=custom_logs_handler) try: # Perform quick search From 6bab336883131638766fa9545ae20e0d9fc3a1f5 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 26 Apr 2025 19:48:06 +0200 Subject: [PATCH 22/55] Update function docstrings for clarity and focus Simplified the descriptions in `deep_research` and `quick_search` docstrings to improve readability and eliminate redundant details. This ensures concise and focused explanations of each function's purpose. --- src/server.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/server.py b/src/server.py index 0e7e169..ad14203 100644 --- a/src/server.py +++ b/src/server.py @@ -103,7 +103,7 @@ async def research_resource(topic: str) -> str: async def deep_research(query: str) -> Dict[str, Any]: """ Conduct a web deep research on a given query using GPT Researcher. - Use this tool when you need time-sensitive, real-time information like stock prices, news, people, specific knowledge, etc. + Use this tool when you need a deep research on a topic. 
Args: query: The research query or topic @@ -151,8 +151,7 @@ async def deep_research(query: str) -> Dict[str, Any]: async def quick_search(query: str) -> Dict[str, Any]: """ Perform a quick web search on a given query and return search results with snippets. - This optimizes for speed over quality and is useful when an LLM doesn't need in-depth - information on a topic. + Use this tool when you need a quick research on a topic. Args: query: The search query From eec1b34517d9fa98b49ef5613260839ec4ec1e4c Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 3 May 2025 11:03:15 +0200 Subject: [PATCH 23/55] Refactor module structure and improve Docker configuration Reorganized module directories under `phoenix_technologies` for better namespace clarity and maintainability. Updated the Dockerfile to use an environment variable for the application entry point, enhancing flexibility in deployment. Additionally, revamped the README to reflect the new structure and provide clearer project documentation. 
--- Dockerfile | 5 +- README.md | 163 +++++++++++------- src/__init__.py | 8 - src/phoenix_technologies/__init__.py | 0 .../gpt_researcher/__init__.py | 8 + .../gpt_researcher}/server.py | 0 .../gpt_researcher}/utils.py | 0 7 files changed, 117 insertions(+), 67 deletions(-) create mode 100644 src/phoenix_technologies/__init__.py create mode 100644 src/phoenix_technologies/gpt_researcher/__init__.py rename src/{ => phoenix_technologies/gpt_researcher}/server.py (100%) rename src/{ => phoenix_technologies/gpt_researcher}/utils.py (100%) diff --git a/Dockerfile b/Dockerfile index b6228aa..6b0b52c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,9 @@ FROM python:3.13-slim # Set environment variable for Python ENV PYTHONDONTWRITEBYTECODE 1 ENV PYTHONUNBUFFERED 1 +# Use an environment variable to define the script path +ENV APP_ENTRYPOINT "/app/mcp/gpt_researcher/server.py" + # Set working directory within the container WORKDIR /app @@ -22,4 +25,4 @@ COPY src/ /app/ EXPOSE 8000 # Set the default command to run the app with `uvicorn` -CMD ["python", "server.py"] \ No newline at end of file +CMD ["python", "$APP_ENTRYPOINT"] \ No newline at end of file diff --git a/README.md b/README.md index 7ec22f7..6b40cd8 100644 --- a/README.md +++ b/README.md @@ -1,72 +1,119 @@ -# Project Overview -## Description -This project is a server-side application built with Python that facilitates research-related operations. It provides functionalities to manage researchers, handle resources, process queries, and generate in-depth research reports. The application features reusable utility functions to streamline responses, handle exceptions gracefully, and format data for client consumption. A `Dockerfile` is provided for easy containerization and deployment. +# GPT Researcher +## Project Description +**GPT Researcher** is a Python-based tool designed to assist researchers and knowledge seekers in generating detailed research insights efficiently and accurately. 
This project combines powerful machine learning-backed methodologies with intuitive features for processing and analyzing research data. +The project revolves around managing research queries, obtaining resourceful insights, and enabling the creation of detailed research outputs. It provides flexibility for both quick general searches and deep explorations of specific topics. Additionally, it comes with a robust system for managing logs, exceptions, and user-defined custom responses. +The application is containerized with Docker, making it easier to deploy in any environment. It is built to utilize modular components for research management, formatting responses, and enhancing the overall research experience. ## Features -### Server Functionality -The main server functionalities are defined in `server.py`, which includes: -- **research_resource**: Management of research resources. -- **deep_research**: Conducts detailed research operations. -- **write_report**: Creates comprehensive reports based on researched data. -- **get_research_sources**: Retrieves information sources for research. -- **get_research_context**: Provides contextual information tied to research. -- **research_query**: Handles incoming research-related queries. -- **run_server**: Initializes and runs the server. +### Key Highlights: +1. **Research Operations** + - Perform deep-dives into specific topics through a systematic research mechanism. + - Quickly search for general information in a fast and lightweight fashion (`quick_search`). + - Retrieve necessary sources for research with flexibility. -### Utility Functions -The `utils.py` file provides additional support, including: -- **Response Handling**: - - `create_error_response` - - `create_success_response` +2. **Resource Management** + - Organize and store research results for later analysis. + - Format research sources and context to create user-friendly reports. + - Generate customizable research prompts for optimized research requests. 
-- **Error & Exception Management**: - - `handle_exception` +3. **Logging and Error Handling** + - Employ a custom logging handler (`CustomLogsHandler`) to manage logs effectively. + - Provide user-friendly error responses with flexible exception handling. -- **Data Operations**: - - `get_researcher_by_id` - - `format_sources_for_response` - - `format_context_with_sources` - - `store_research_results` - - `create_research_prompt` +4. **Report Generation** + - Automate the creation of research reports in reusable templates. + - Aggregate research resources and contexts while ensuring professional quality formatting. -### Docker Support -The included `Dockerfile` allows for simple containerized deployment: -- Uses a lightweight Python 3.13 image. -- Installs required dependencies from `requirements.txt`. -- Configures the application to run via `server.py` on port `8000` using `CMD ["python", "server.py"]`. +5. **Web Server Support** + - Run a backend server with the `run_server` function for handling client-side operations and queries. -## Setup and Usage +6. **Containerized Deployment** + - Fully Dockerized to allow for fast setup and consistent deployment across environments. + +## Installation +To get the project up and running locally, follow these steps: ### Prerequisites -- Python 3.13 or later. -- `pip` for dependency management. -- Docker (optional, for containerized deployment). +- Python 3.13+ +- Docker Engine (for containerized deployments) +- `pip` for managing necessary Python packages -### Installation -1. Clone this repository. -2. Install dependencies: -``` bash +### Steps: +1. **Clone the Repository**: +``` sh + git clone + cd gpt_researcher +``` +1. **Install Dependencies**: If you are working outside of Docker: +``` sh pip install -r requirements.txt ``` -1. Run the application: -``` bash - python server.py +1. 
**Run the Application**: +``` sh + python src/mcp/gpt_researcher/server.py ``` -### Using Docker -Build and run the application as a Docker container: -1. Build the Docker image: -``` bash - docker build -t research-app . +1. **Using Docker**: Build and run the Docker container: +``` sh + docker build -t gpt-researcher . + docker run -p 8000:8000 gpt-researcher ``` -1. Run the Docker container: -``` bash - docker run -p 8000:8000 research-app +The app will be available at `http://localhost:8000`. +## File Structure +### Overview: +``` plaintext +src/ +├── mcp/ +│ ├── gpt_researcher/ +│ │ ├── server.py # Main entrypoint for research workflow and server +│ │ └── utils.py # Utility functions for research formatting and storage +Dockerfile # Defines the container runtime environment +requirements.txt # Python dependencies for the project ``` -The application will be accessible at `http://localhost:8000`. -## Folder Structure -``` -|-- src/ - |-- server.py # Main server logic - |-- utils.py # Reusable utility functions -|-- Dockerfile # Containerization setup -|-- requirements.txt # Dependencies file -|-- README.md # Documentation (this file) +### Key Components: +1. **`server.py` **: + - Implements the main research operations: + - `deep_research`, `quick_search`, and `write_report`. + + - Contains functions to retrieve research sources, manage contexts, and execute research prompts. + +2. **`utils.py` **: + - Provides utilities for: + - Managing research storage. + - Formatting sources and context for readable outputs. + - Generating responses to different types of user interactions. + +3. **`Dockerfile` **: + - A configuration file for containerizing the application using the official Python 3.13-slim image. + +## Usage Instructions +### Available Endpoints +Once the server is running successfully, you can interact with the application by sending API requests to the available endpoints. Below are the core functionalities: +1. 
**Quick Research**: + - Quickly get general insights based on a query. + +2. **Deep Research**: + - Run comprehensive research on a specific topic. + +3. **Custom Research Reports**: + - Combine sources and analysis to create detailed user reports. + +4. **Access Research Context**: + - Retrieve the context and sources behind a result for deeper understanding. + +## Contributing +We follow a **code of conduct** that expects all contributors to maintain a respectful and inclusive environment. +### Contribution Guidelines: +1. Fork the repository and make your changes locally. +2. Create a branch for your changes. +``` sh + git checkout -b feature-name ``` +1. Make your changes and test them thoroughly. +2. Submit a pull request with a clear description of the changes. + +## Code of Conduct +As contributors and maintainers of this project, we pledge to respect all members of the community by fostering a positive and inclusive environment. Instances of abuse, harassment, or unacceptable behavior will not be tolerated. +For details, please refer to the [Code of Conduct](CODE_OF_CONDUCT.md). +## License +This project is licensed under the **MIT License**. For more information, refer to the LICENSE file in the repository. +## Feedback and Support +For any queries, suggestions, or feedback, feel free to open an issue in this repository. You can also contact the maintainers directly via email or GitHub. +Feel free to expand or modify this README as the project evolves. diff --git a/src/__init__.py b/src/__init__.py index f372bcb..e69de29 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,8 +0,0 @@ -""" -GPT Researcher MCP Server - -This module provides an MCP server implementation for GPT Researcher, -allowing AI assistants to perform web research and generate reports via the MCP protocol. 
-""" - -__version__ = "0.1.0" \ No newline at end of file diff --git a/src/phoenix_technologies/__init__.py b/src/phoenix_technologies/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/phoenix_technologies/gpt_researcher/__init__.py b/src/phoenix_technologies/gpt_researcher/__init__.py new file mode 100644 index 0000000..f372bcb --- /dev/null +++ b/src/phoenix_technologies/gpt_researcher/__init__.py @@ -0,0 +1,8 @@ +""" +GPT Researcher MCP Server + +This module provides an MCP server implementation for GPT Researcher, +allowing AI assistants to perform web research and generate reports via the MCP protocol. +""" + +__version__ = "0.1.0" \ No newline at end of file diff --git a/src/server.py b/src/phoenix_technologies/gpt_researcher/server.py similarity index 100% rename from src/server.py rename to src/phoenix_technologies/gpt_researcher/server.py diff --git a/src/utils.py b/src/phoenix_technologies/gpt_researcher/utils.py similarity index 100% rename from src/utils.py rename to src/phoenix_technologies/gpt_researcher/utils.py From 47c036a973df49b1f5da74fc7dafcdb1b47def5d Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 3 May 2025 11:10:56 +0200 Subject: [PATCH 24/55] Remove unused research resource and related utilities Eliminated the `research://{topic}` resource API, associated utilities, and the `research_store`. These components were redundant due to existing alternatives using the `conduct_research` tool. This cleanup reduces complexity and improves maintainability. 
--- .../gpt_researcher/server.py | 50 ------------------- .../gpt_researcher/utils.py | 45 +---------------- 2 files changed, 1 insertion(+), 94 deletions(-) diff --git a/src/phoenix_technologies/gpt_researcher/server.py b/src/phoenix_technologies/gpt_researcher/server.py index ad14203..fbfa63f 100644 --- a/src/phoenix_technologies/gpt_researcher/server.py +++ b/src/phoenix_technologies/gpt_researcher/server.py @@ -18,13 +18,10 @@ from gpt_researcher import GPTResearcher load_dotenv() from utils import ( - research_store, create_success_response, handle_exception, get_researcher_by_id, format_sources_for_response, - format_context_with_sources, - store_research_results, create_research_prompt ) @@ -54,50 +51,6 @@ class CustomLogsHandler: self.logs.append(data) # Append data to logs print(f"MCP Log: {data}") # For demonstration, print the log -@mcp.resource("research://{topic}") -async def research_resource(topic: str) -> str: - """ - Provide research context for a given topic directly as a resource. - - This allows LLMs to access web-sourced information without explicit function calls. 
- - Args: - topic: The research topic or query - - Returns: - String containing the research context with source information - """ - # Check if we've already researched this topic - if topic in research_store: - logger.info(f"Returning cached research for topic: {topic}") - return research_store[topic]["context"] - - # If not, conduct the research - logger.info(f"Conducting new research for resource on topic: {topic}") - custom_logs_handler = CustomLogsHandler() - - # Initialize GPT Researcher - researcher = GPTResearcher(query=topic, report_type=research_type, websocket=custom_logs_handler) - - try: - # Conduct the research - await researcher.conduct_research() - - # Get the context and sources - context = researcher.get_research_context() - sources = researcher.get_research_sources() - source_urls = researcher.get_source_urls() - - # Format with sources included - formatted_context = format_context_with_sources(topic, context, sources) - - # Store for future use - store_research_results(topic, context, sources, source_urls, formatted_context) - - return formatted_context - except Exception as e: - return f"Error conducting research on '{topic}': {str(e)}" - @mcp.tool() async def deep_research(query: str) -> Dict[str, Any]: @@ -132,9 +85,6 @@ async def deep_research(query: str) -> Dict[str, Any]: sources = researcher.get_research_sources() source_urls = researcher.get_source_urls() - # Store in the research store for the resource API - store_research_results(query, context, sources, source_urls) - return create_success_response({ "research_id": research_id, "query": query, diff --git a/src/phoenix_technologies/gpt_researcher/utils.py b/src/phoenix_technologies/gpt_researcher/utils.py index 8ad7d12..283d22d 100644 --- a/src/phoenix_technologies/gpt_researcher/utils.py +++ b/src/phoenix_technologies/gpt_researcher/utils.py @@ -11,8 +11,6 @@ from loguru import logger # Configure logging for console only (no file logging) logger.configure(handlers=[{"sink": 
sys.stderr, "level": "INFO"}]) -# Research store to track ongoing research topics and contexts -research_store = {} # API Response Utilities def create_error_response(message: str) -> Dict[str, Any]: @@ -68,44 +66,6 @@ def format_sources_for_response(sources: List[Dict[str, Any]]) -> List[Dict[str, ] -def format_context_with_sources(topic: str, context: str, sources: List[Dict[str, Any]]) -> str: - """ - Format research context with sources for display. - - Args: - topic: Research topic - context: Research context - sources: List of sources - - Returns: - Formatted context string with sources - """ - formatted_context = f"## Research: {topic}\n\n{context}\n\n" - formatted_context += "## Sources:\n" - for i, source in enumerate(sources): - formatted_context += f"{i+1}. {source.get('title', 'Unknown')}: {source.get('url', '')}\n" - return formatted_context - - -def store_research_results(topic: str, context: str, sources: List[Dict[str, Any]], - source_urls: List[str], formatted_context: Optional[str] = None): - """ - Store research results in the research store. - - Args: - topic: Research topic - context: Research context - sources: List of sources - source_urls: List of source URLs - formatted_context: Optional pre-formatted context - """ - research_store[topic] = { - "context": formatted_context or context, - "sources": sources, - "source_urls": source_urls - } - - def create_research_prompt(topic: str, goal: str, report_format: str = "research_report") -> str: """ Create a research query prompt for GPT Researcher. @@ -125,10 +85,7 @@ def create_research_prompt(topic: str, goal: str, report_format: str = "research You have two methods to access web-sourced information: - 1. Use the "research://{topic}" resource to directly access context about this topic if it exists - or if you want to get straight to the information without tracking a research ID. - - 2. Use the conduct_research tool to perform new research and get a research_id for later use. 
+ Use the conduct_research tool to perform new research and get a research_id for later use. This tool also returns the context directly in its response, which you can use immediately. After getting context, you can: From 3b75d04f32b3a01fbc46dd98761a5191d8fb65a0 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 31 May 2025 17:08:46 +0200 Subject: [PATCH 25/55] Add SMD MCP Server implementation and utilities Implemented the SMD MCP Server with functionality for AI-driven research and article retrieval via the MCP protocol. Added utilities for prompt creation and enhanced dependency configuration. This serves as a foundation for handling SMD-related queries and data processing. --- requirements.txt | 5 +- src/phoenix_technologies/smd/__init__.py | 8 ++ src/phoenix_technologies/smd/server.py | 121 +++++++++++++++++++++++ src/phoenix_technologies/smd/utils.py | 49 +++++++++ 4 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 src/phoenix_technologies/smd/__init__.py create mode 100644 src/phoenix_technologies/smd/server.py create mode 100644 src/phoenix_technologies/smd/utils.py diff --git a/requirements.txt b/requirements.txt index 1f2c348..65bcfd1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # GPT Researcher dependencies gpt-researcher>=0.12.16 -python-dotenv +python-dotenv~=1.1.0 # MCP dependencies mcp>=1.6.0 @@ -9,4 +9,5 @@ uvicorn>=0.23.2 pydantic>=2.3.0 # Utility dependencies -loguru>=0.7.0 \ No newline at end of file +loguru>=0.7.0 +requests~=2.32.3 \ No newline at end of file diff --git a/src/phoenix_technologies/smd/__init__.py b/src/phoenix_technologies/smd/__init__.py new file mode 100644 index 0000000..33295d9 --- /dev/null +++ b/src/phoenix_technologies/smd/__init__.py @@ -0,0 +1,8 @@ +""" +SMD MCP Server + +This module provides an MCP server implementation for SMD, +allowing AI assistants to perform searches and generate reports via the MCP protocol. 
+""" + +__version__ = "0.1.0" \ No newline at end of file diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py new file mode 100644 index 0000000..566c6b1 --- /dev/null +++ b/src/phoenix_technologies/smd/server.py @@ -0,0 +1,121 @@ +""" +SMD Researcher MCP Server + +This script implements an MCP server for SMD Researcher, allowing AI assistants +to conduct research and generate reports via the MCP protocol. +""" + +import os +import logging +import requests +from dotenv import load_dotenv +from mcp.server.fastmcp import FastMCP + +# Load environment variables +load_dotenv() + +from utils import ( + create_research_prompt +) + +logging.basicConfig( + level=logging.INFO, + format='[%(asctime)s][%(levelname)s] - %(message)s', +) + +logger = logging.getLogger(__name__) + +# Initialize FastMCP server +mcp = FastMCP("SMD Researcher") + +@mcp.tool() +async def smd_detail_article(article_id: str) -> dict: + """ + Get the Details of an article found by the SMD Research. + Use this tool when you need the full content of an article. + + Args: + article_id: The ID of the article to get the details for + + Returns: + String containing the details of the article + """ + url = f"https://api.smd.ch/api/documents/{article_id}" + headers = { + "authorization": f"Bearer {os.getenv('SWISSDOX_BEARER_TOKEN', '')}", + "content-type": "application/json", + } + payload = {"filters": [], "pagination": {"pageSize": 1, "currentPage": 1}} + response = requests.post(url, headers=headers, json=payload) + if response.status_code == 200: + return response.json() + else: + return { + "message": response.text + } + +@mcp.tool() +async def smd_research(query: dict) -> dict: + """ + Execute a deep search on a given query using SMD Researcher. + Use this tool when you need research on a topic. 
+ + Args: + query: The SMD search request payload (filters, sort, pagination) as a dictionary + + Returns: + Dictionary containing the search results, or an error message if the request fails + """ + url = "https://api.swissdox.ch/api/documents/search" + headers = { + "authorization": f"Bearer {os.getenv('SWISSDOX_BEARER_TOKEN', '')}", + "content-type": "application/json", + } + response = requests.post(url, headers=headers, json=query) + if response.status_code == 200: + return response.json() + else: + return { + "message": response.text + } + + +@mcp.prompt() +def research_query(search_query: str = "Bundesrat", date_from: str = "2024-05-30T22:00:00.000Z", date_to: str = "2025-05-31T21:59:59.999Z") -> dict: + """ + Create a research query prompt for SMD Researcher. + + Args: + search_query: The SMD search query, there are Logical Operators available (AND, OR, NOT) and for an exact match use "+" before the word. For excluding use "-" before the word. For queries with multiple words use quotes. + date_from: The date to start research from, in the format YYYY-MM-DDTHH:MM:SS.SSSZ + date_to: The date to end research at, in the format YYYY-MM-DDTHH:MM:SS.SSSZ + + Returns: + A formatted prompt for research + """ + return create_research_prompt(search_query,date_from,date_to) + + +def run_server(): + """Run the MCP server using FastMCP's built-in event loop handling.""" + + # Add startup message + logger.info("Starting SMD Researcher MCP Server...") + print("🚀 SMD Researcher MCP Server starting... 
Check researcher_mcp_server.log for details") + + # Let FastMCP handle the event loop + try: + mcp.run("sse") + # Note: If we reach here, the server has stopped + logger.info("MCP Server has stopped") + except Exception as e: + logger.error(f"Error running MCP server: {str(e)}") + print(f"❌ MCP Server error: {str(e)}") + return + + print("✅ MCP Server stopped") + + +if __name__ == "__main__": + # Use the non-async approach to avoid asyncio nesting issues + run_server() \ No newline at end of file diff --git a/src/phoenix_technologies/smd/utils.py b/src/phoenix_technologies/smd/utils.py new file mode 100644 index 0000000..7c84f68 --- /dev/null +++ b/src/phoenix_technologies/smd/utils.py @@ -0,0 +1,49 @@ +""" +SMD Researcher MCP Server Utilities + +This module provides utility functions and helpers for the SMD Researcher MCP Server. +""" + +import sys +from loguru import logger + +# Configure logging for console only (no file logging) +logger.configure(handlers=[{"sink": sys.stderr, "level": "INFO"}]) + +def create_research_prompt(search_query: str = "Bundesrat", date_from: str = "2024-05-30T22:00:00.000Z", date_to: str = "2025-05-31T21:59:59.999Z") -> dict: + """ + Create a research query prompt for SMD Researcher. + + Args: + search_query: The SMD search query, there are Logical Operators available (AND, OR, NOT) and for an exact match use "+" before the word. For excluding use "-" before the word. For queries with multiple words use quotes. + date_from: The date to start research from, in the format YYYY-MM-DDTHH:MM:SS.SSSZ + date_to: The date to end research at, in the format YYYY-MM-DDTHH:MM:SS.SSSZ + + Returns: + A formatted prompt for research as a Python dictionary. 
+ """ + return { + "sort": { + "field": "score", + "direction": "desc" + }, + "filters": [ + { + "field": "datetime", + "value": [ + date_from, + date_to + ] + }, + { + "field": "query_text", + "value": [search_query] + } + ], + "exact": False, + "pagination": { + "pageSize": 10, + "currentPage": 1 + }, + "onlyResults": True + } From dc93de9b180a3a0d16e7b593303ea0d0df500892 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 31 May 2025 20:06:23 +0200 Subject: [PATCH 26/55] Use shell form for CMD in Dockerfile Switched CMD to use shell form instead of exec form for running the application. This simplifies the command syntax and ensures compatibility in environments where exec form parsing may cause issues. --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6b0b52c..d45dabe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,4 +25,4 @@ COPY src/ /app/ EXPOSE 8000 # Set the default command to run the app with `uvicorn` -CMD ["python", "$APP_ENTRYPOINT"] \ No newline at end of file +CMD python "$APP_ENTRYPOINT" \ No newline at end of file From a593cdcae8f7f4f747c188c70ee00b8a4acfea55 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 31 May 2025 20:28:10 +0200 Subject: [PATCH 27/55] Update APP_ENTRYPOINT in Dockerfile to new path Changed the APP_ENTRYPOINT environment variable to reflect the updated directory structure. This ensures the application points to the correct script path during execution. 
--- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d45dabe..fd7fdfa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM python:3.13-slim ENV PYTHONDONTWRITEBYTECODE 1 ENV PYTHONUNBUFFERED 1 # Use an environment variable to define the script path -ENV APP_ENTRYPOINT "/app/mcp/gpt_researcher/server.py" +ENV APP_ENTRYPOINT "/app/phoenix_technologies/gpt_researcher/server.py" # Set working directory within the container From 12503a4453bfec13d936533d5dfc100b027cc338 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 31 May 2025 21:16:40 +0200 Subject: [PATCH 28/55] Set FastMCP server to listen on all interfaces with port 8000 The `host` and `port` parameters were added to FastMCP initialization for both GPT Researcher and SMD servers. This ensures the servers are accessible on all network interfaces at port 8000. --- src/phoenix_technologies/gpt_researcher/server.py | 2 +- src/phoenix_technologies/smd/server.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phoenix_technologies/gpt_researcher/server.py b/src/phoenix_technologies/gpt_researcher/server.py index fbfa63f..95bca05 100644 --- a/src/phoenix_technologies/gpt_researcher/server.py +++ b/src/phoenix_technologies/gpt_researcher/server.py @@ -33,7 +33,7 @@ logging.basicConfig( logger = logging.getLogger(__name__) # Initialize FastMCP server -mcp = FastMCP("GPT Researcher") +mcp = FastMCP("GPT Researcher", host="0.0.0.0", port=8000) research_type = os.getenv("RESEARCH_TYPE", "deep") # Initialize researchers dictionary diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index 566c6b1..8da3638 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -26,7 +26,7 @@ logging.basicConfig( logger = logging.getLogger(__name__) # Initialize FastMCP server -mcp = FastMCP("SMD Researcher") +mcp = FastMCP("SMD Researcher", host="0.0.0.0", 
port=8000) @mcp.tool() async def smd_detail_article(article_id: str) -> dict: From ba48f443217a409e11dcd454a10e8900d0c673ad Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 31 May 2025 22:29:01 +0200 Subject: [PATCH 29/55] Set keep-alive timeout for FastMCP servers to 720 seconds Added the `timeout_keep_alive` parameter to FastMCP initialization in both GPT Researcher and SMD Researcher servers. This ensures connections remain active for longer, improving server reliability for long-lived client interactions. --- src/phoenix_technologies/gpt_researcher/server.py | 2 +- src/phoenix_technologies/smd/server.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phoenix_technologies/gpt_researcher/server.py b/src/phoenix_technologies/gpt_researcher/server.py index 95bca05..a96ee6e 100644 --- a/src/phoenix_technologies/gpt_researcher/server.py +++ b/src/phoenix_technologies/gpt_researcher/server.py @@ -33,7 +33,7 @@ logging.basicConfig( logger = logging.getLogger(__name__) # Initialize FastMCP server -mcp = FastMCP("GPT Researcher", host="0.0.0.0", port=8000) +mcp = FastMCP("GPT Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) research_type = os.getenv("RESEARCH_TYPE", "deep") # Initialize researchers dictionary diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index 8da3638..24de685 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -26,7 +26,7 @@ logging.basicConfig( logger = logging.getLogger(__name__) # Initialize FastMCP server -mcp = FastMCP("SMD Researcher", host="0.0.0.0", port=8000) +mcp = FastMCP("SMD Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) @mcp.tool() async def smd_detail_article(article_id: str) -> dict: From b1ad64cd75b8b90ac0ab58ce47f3d4b807208568 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 31 May 2025 23:54:51 +0200 Subject: [PATCH 30/55] Introduce research resource API and improve 
research caching Add a `research://{topic}` resource endpoint for direct access to research context, reducing redundant API calls. Introduced `research_store` for caching research results and modularized helper methods like `store_research_results` and `format_context_with_sources` for better reusability and clarity. Refactored existing researcher initialization for simplicity and improved comments to clarify intended usage. --- .../gpt_researcher/server.py | 70 ++++++++++++++----- .../gpt_researcher/utils.py | 45 +++++++++++- 2 files changed, 98 insertions(+), 17 deletions(-) diff --git a/src/phoenix_technologies/gpt_researcher/server.py b/src/phoenix_technologies/gpt_researcher/server.py index a96ee6e..2fdcd25 100644 --- a/src/phoenix_technologies/gpt_researcher/server.py +++ b/src/phoenix_technologies/gpt_researcher/server.py @@ -18,10 +18,13 @@ from gpt_researcher import GPTResearcher load_dotenv() from utils import ( + research_store, create_success_response, handle_exception, get_researcher_by_id, format_sources_for_response, + format_context_with_sources, + store_research_results, create_research_prompt ) @@ -33,30 +36,62 @@ logging.basicConfig( logger = logging.getLogger(__name__) # Initialize FastMCP server -mcp = FastMCP("GPT Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) -research_type = os.getenv("RESEARCH_TYPE", "deep") +mcp = FastMCP("GPT Researcher") # Initialize researchers dictionary if not hasattr(mcp, "researchers"): mcp.researchers = {} -class CustomLogsHandler: - """A custom Logs handler class to handle JSON data.""" - def __init__(self): - self.logs = [] # Initialize logs to store data +@mcp.resource("research://{topic}") +async def research_resource(topic: str) -> str: + """ + Provide research context for a given topic directly as a resource. 
- async def send_json(self, data: Dict[str, Any]) -> None: - """Send JSON data and log it.""" - self.logs.append(data) # Append data to logs - print(f"MCP Log: {data}") # For demonstration, print the log + This allows LLMs to access web-sourced information without explicit function calls. + + Args: + topic: The research topic or query + + Returns: + String containing the research context with source information + """ + # Check if we've already researched this topic + if topic in research_store: + logger.info(f"Returning cached research for topic: {topic}") + return research_store[topic]["context"] + + # If not, conduct the research + logger.info(f"Conducting new research for resource on topic: {topic}") + + # Initialize GPT Researcher + researcher = GPTResearcher(topic) + + try: + # Conduct the research + await researcher.conduct_research() + + # Get the context and sources + context = researcher.get_research_context() + sources = researcher.get_research_sources() + source_urls = researcher.get_source_urls() + + # Format with sources included + formatted_context = format_context_with_sources(topic, context, sources) + + # Store for future use + store_research_results(topic, context, sources, source_urls, formatted_context) + + return formatted_context + except Exception as e: + return f"Error conducting research on '{topic}': {str(e)}" @mcp.tool() async def deep_research(query: str) -> Dict[str, Any]: """ Conduct a web deep research on a given query using GPT Researcher. - Use this tool when you need a deep research on a topic. + Use this tool when you need time-sensitive, real-time information like stock prices, news, people, specific knowledge, etc. 
Args: query: The research query or topic @@ -69,10 +104,9 @@ async def deep_research(query: str) -> Dict[str, Any]: # Generate a unique ID for this research session research_id = str(uuid.uuid4()) - custom_logs_handler = CustomLogsHandler() # Initialize GPT Researcher - researcher = GPTResearcher(query=query, report_type=research_type, websocket=custom_logs_handler) + researcher = GPTResearcher(query) # Start research try: @@ -85,6 +119,9 @@ async def deep_research(query: str) -> Dict[str, Any]: sources = researcher.get_research_sources() source_urls = researcher.get_source_urls() + # Store in the research store for the resource API + store_research_results(query, context, sources, source_urls) + return create_success_response({ "research_id": research_id, "query": query, @@ -101,7 +138,8 @@ async def deep_research(query: str) -> Dict[str, Any]: async def quick_search(query: str) -> Dict[str, Any]: """ Perform a quick web search on a given query and return search results with snippets. - Use this tool when you need a quick research on a topic. + This optimizes for speed over quality and is useful when an LLM doesn't need in-depth + information on a topic. 
Args: query: The search query @@ -113,9 +151,9 @@ async def quick_search(query: str) -> Dict[str, Any]: # Generate a unique ID for this search session search_id = str(uuid.uuid4()) - custom_logs_handler = CustomLogsHandler() + # Initialize GPT Researcher - researcher = GPTResearcher(query=query, report_type=research_type, websocket=custom_logs_handler) + researcher = GPTResearcher(query) try: # Perform quick search diff --git a/src/phoenix_technologies/gpt_researcher/utils.py b/src/phoenix_technologies/gpt_researcher/utils.py index 283d22d..8ad7d12 100644 --- a/src/phoenix_technologies/gpt_researcher/utils.py +++ b/src/phoenix_technologies/gpt_researcher/utils.py @@ -11,6 +11,8 @@ from loguru import logger # Configure logging for console only (no file logging) logger.configure(handlers=[{"sink": sys.stderr, "level": "INFO"}]) +# Research store to track ongoing research topics and contexts +research_store = {} # API Response Utilities def create_error_response(message: str) -> Dict[str, Any]: @@ -66,6 +68,44 @@ def format_sources_for_response(sources: List[Dict[str, Any]]) -> List[Dict[str, ] +def format_context_with_sources(topic: str, context: str, sources: List[Dict[str, Any]]) -> str: + """ + Format research context with sources for display. + + Args: + topic: Research topic + context: Research context + sources: List of sources + + Returns: + Formatted context string with sources + """ + formatted_context = f"## Research: {topic}\n\n{context}\n\n" + formatted_context += "## Sources:\n" + for i, source in enumerate(sources): + formatted_context += f"{i+1}. {source.get('title', 'Unknown')}: {source.get('url', '')}\n" + return formatted_context + + +def store_research_results(topic: str, context: str, sources: List[Dict[str, Any]], + source_urls: List[str], formatted_context: Optional[str] = None): + """ + Store research results in the research store. 
+ + Args: + topic: Research topic + context: Research context + sources: List of sources + source_urls: List of source URLs + formatted_context: Optional pre-formatted context + """ + research_store[topic] = { + "context": formatted_context or context, + "sources": sources, + "source_urls": source_urls + } + + def create_research_prompt(topic: str, goal: str, report_format: str = "research_report") -> str: """ Create a research query prompt for GPT Researcher. @@ -85,7 +125,10 @@ def create_research_prompt(topic: str, goal: str, report_format: str = "research You have two methods to access web-sourced information: - Use the conduct_research tool to perform new research and get a research_id for later use. + 1. Use the "research://{topic}" resource to directly access context about this topic if it exists + or if you want to get straight to the information without tracking a research ID. + + 2. Use the conduct_research tool to perform new research and get a research_id for later use. This tool also returns the context directly in its response, which you can use immediately. After getting context, you can: From 11bfff7ff7bcaeee05e891a46952bb9f2cae9b2b Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 31 May 2025 23:59:42 +0200 Subject: [PATCH 31/55] Set FastMCP server to listen on all interfaces with a config. Updated the FastMCP initialization to explicitly bind to 0.0.0.0, set the port to 8000, and specify a keep-alive timeout of 720 seconds. This ensures the server is properly configured for broader accessibility and stability. 
--- src/phoenix_technologies/gpt_researcher/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phoenix_technologies/gpt_researcher/server.py b/src/phoenix_technologies/gpt_researcher/server.py index 2fdcd25..0627c87 100644 --- a/src/phoenix_technologies/gpt_researcher/server.py +++ b/src/phoenix_technologies/gpt_researcher/server.py @@ -36,7 +36,7 @@ logging.basicConfig( logger = logging.getLogger(__name__) # Initialize FastMCP server -mcp = FastMCP("GPT Researcher") +mcp = FastMCP("GPT Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) # Initialize researchers dictionary if not hasattr(mcp, "researchers"): From cda44df1a096426003582a89451ccf1bfe13319a Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sun, 1 Jun 2025 11:46:11 +0200 Subject: [PATCH 32/55] Refactor deep_research to support SSE-based streaming. Updated the `deep_research` function to use asynchronous generators and send Server-Sent Events (SSE) for real-time streaming updates. Added a new utility for formatting SSE events and improved research lifecycle visibility with intermediate progress steps and error handling. --- .../gpt_researcher/server.py | 94 ++++++++++++++----- 1 file changed, 72 insertions(+), 22 deletions(-) diff --git a/src/phoenix_technologies/gpt_researcher/server.py b/src/phoenix_technologies/gpt_researcher/server.py index 0627c87..59d08ab 100644 --- a/src/phoenix_technologies/gpt_researcher/server.py +++ b/src/phoenix_technologies/gpt_researcher/server.py @@ -6,10 +6,11 @@ to conduct web research and generate reports via the MCP protocol. 
""" import os -import sys +import json import uuid import logging -from typing import Dict, Any, Optional, List +from typing import Dict, Any, Optional, AsyncGenerator +import asyncio from dotenv import load_dotenv from mcp.server.fastmcp import FastMCP from gpt_researcher import GPTResearcher @@ -35,6 +36,11 @@ logging.basicConfig( logger = logging.getLogger(__name__) +def format_sse_event(event_name: str, data: Dict[str, Any]) -> str: + """Formatiert Daten als SSE Event String.""" + json_data = json.dumps(data) + return f"event: {event_name}\ndata: {json_data}\n\n" + # Initialize FastMCP server mcp = FastMCP("GPT Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) @@ -88,7 +94,7 @@ async def research_resource(topic: str) -> str: @mcp.tool() -async def deep_research(query: str) -> Dict[str, Any]: +async def deep_research(query: str) -> AsyncGenerator[str, None]: """ Conduct a web deep research on a given query using GPT Researcher. Use this tool when you need time-sensitive, real-time information like stock prices, news, people, specific knowledge, etc. 
@@ -100,39 +106,83 @@ async def deep_research(query: str) -> Dict[str, Any]: Dict containing research status, ID, and the actual research context and sources that can be used directly by LLMs for context enrichment """ - logger.info(f"Conducting research on query: {query}...") - - # Generate a unique ID for this research session + logger.info(f"Starting streaming deep research on query: {query}...") research_id = str(uuid.uuid4()) - # Initialize GPT Researcher - researcher = GPTResearcher(query) - - # Start research try: - await researcher.conduct_research() - mcp.researchers[research_id] = researcher - logger.info(f"Research completed for ID: {research_id}") + yield format_sse_event( + "tool_update", + {"research_id": research_id, "query": query, "status": "Research initiated"} + ) + + # Initialize GPT Researcher + # In neueren GPTResearcher Versionen wird der query beim Initialisieren übergeben + researcher = GPTResearcher(query=query, report_type="research_report", config_path=None) + # Alternativ, falls deine Version es anders handhabt: researcher = GPTResearcher(query) + + mcp.researchers[research_id] = researcher # Speichere früh, falls benötigt + + yield format_sse_event( + "tool_update", + {"research_id": research_id, "status": "GPT Researcher initialized. Starting information gathering."} + ) + await asyncio.sleep(0.1) # Kurze Pause, damit das Event sicher gesendet wird + + # Simuliertes Update: Beginn der Recherche + yield format_sse_event( + "tool_update", + {"research_id": research_id, "status": "Starting web crawling and source analysis...", "progress": 10} + ) + + # Der eigentliche langlaufende Prozess + await researcher.conduct_research() # Dieser Aufruf blockiert für die Dauer der Recherche + + # Simuliertes Update: Recherche abgeschlossen, beginne mit der Verarbeitung + yield format_sse_event( + "tool_update", + {"research_id": research_id, "status": "Web crawling finished. 
Processing and consolidating information...", "progress": 70} + ) + await asyncio.sleep(0.1) + + logger.info(f"Core research completed for ID: {research_id}. Fetching results...") # Get the research context and sources context = researcher.get_research_context() - sources = researcher.get_research_sources() - source_urls = researcher.get_source_urls() + sources = researcher.get_research_sources() # Dies sind strukturierte Source-Objekte + source_urls = researcher.get_source_urls() # Dies sind nur die URLs - # Store in the research store for the resource API - store_research_results(query, context, sources, source_urls) + # Store in the research store for the resource API (optional, je nach Logik) + # Überlege, ob formatted_context hier gebraucht wird, wenn der Client die Rohdaten bekommt + store_research_results(query, context, sources, source_urls, context) # Vereinfacht für den Store - return create_success_response({ + # Finale Daten vorbereiten + final_data_payload = { "research_id": research_id, "query": query, + "status": "Research completed successfully", "source_count": len(sources), "context": context, - "sources": format_sources_for_response(sources), + "sources": format_sources_for_response(sources), # Nutze deine Formatierungsfunktion "source_urls": source_urls - }) - except Exception as e: - return handle_exception(e, "Research") + } + # Sende das finale Ergebnis als 'tool_result' Event + yield format_sse_event("tool_result", final_data_payload) + logger.info(f"Sent final research result for ID: {research_id}") + + except Exception as e: + logger.error(f"Error during deep_research for query '{query}': {e}", exc_info=True) + # Sende ein Fehler-Event an den Client + error_payload = { + "research_id": research_id, # Kann None sein, wenn Fehler sehr früh auftritt + "query": query, + "status": "Error occurred", + "error_message": str(e), + "error_details": "Check server logs for more information." 
+ } + yield format_sse_event("tool_error", error_payload) # Eigener Event-Typ für Fehler + # Du könntest hier auch handle_exception verwenden, wenn es ein SSE-Event zurückgibt + # oder die Exception weiter werfen, wenn FastMCP das besser handhabt. @mcp.tool() async def quick_search(query: str) -> Dict[str, Any]: From d66bb1cd7a768a4da5da8ba62041515c04a2615e Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sun, 1 Jun 2025 12:19:21 +0200 Subject: [PATCH 33/55] Standardize SSE event types to "message" Replaced various custom SSE event types like "tool_update", "tool_result", and "tool_error" with the unified "message" event type. This simplifies event handling logic and ensures consistency across all server-sent event flows. --- src/phoenix_technologies/gpt_researcher/server.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/phoenix_technologies/gpt_researcher/server.py b/src/phoenix_technologies/gpt_researcher/server.py index 59d08ab..9c87921 100644 --- a/src/phoenix_technologies/gpt_researcher/server.py +++ b/src/phoenix_technologies/gpt_researcher/server.py @@ -111,7 +111,7 @@ async def deep_research(query: str) -> AsyncGenerator[str, None]: try: yield format_sse_event( - "tool_update", + "message", {"research_id": research_id, "query": query, "status": "Research initiated"} ) @@ -123,14 +123,14 @@ async def deep_research(query: str) -> AsyncGenerator[str, None]: mcp.researchers[research_id] = researcher # Speichere früh, falls benötigt yield format_sse_event( - "tool_update", + "message", {"research_id": research_id, "status": "GPT Researcher initialized. 
Starting information gathering."} ) await asyncio.sleep(0.1) # Kurze Pause, damit das Event sicher gesendet wird # Simuliertes Update: Beginn der Recherche yield format_sse_event( - "tool_update", + "message", {"research_id": research_id, "status": "Starting web crawling and source analysis...", "progress": 10} ) @@ -139,7 +139,7 @@ async def deep_research(query: str) -> AsyncGenerator[str, None]: # Simuliertes Update: Recherche abgeschlossen, beginne mit der Verarbeitung yield format_sse_event( - "tool_update", + "message", {"research_id": research_id, "status": "Web crawling finished. Processing and consolidating information...", "progress": 70} ) await asyncio.sleep(0.1) @@ -167,7 +167,7 @@ async def deep_research(query: str) -> AsyncGenerator[str, None]: } # Sende das finale Ergebnis als 'tool_result' Event - yield format_sse_event("tool_result", final_data_payload) + yield format_sse_event("message", final_data_payload) logger.info(f"Sent final research result for ID: {research_id}") except Exception as e: @@ -180,7 +180,7 @@ async def deep_research(query: str) -> AsyncGenerator[str, None]: "error_message": str(e), "error_details": "Check server logs for more information." } - yield format_sse_event("tool_error", error_payload) # Eigener Event-Typ für Fehler + yield format_sse_event("message", error_payload) # Eigener Event-Typ für Fehler # Du könntest hier auch handle_exception verwenden, wenn es ein SSE-Event zurückgibt # oder die Exception weiter werfen, wenn FastMCP das besser handhabt. From 1b60eb0ae6f45fdcb000a59db860574d3a0dbdba Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sun, 1 Jun 2025 12:56:34 +0200 Subject: [PATCH 34/55] Refactor `quick_search` to support SSE response streaming. Updated `quick_search` to return an `AsyncGenerator` for streaming server-sent events (SSE) during quick research execution. Enhanced error handling to provide detailed feedback and upgraded `gpt-researcher` dependency to version 0.13.3 for compatibility. 
--- requirements.txt | 2 +- .../gpt_researcher/server.py | 28 +++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 65bcfd1..bf0dfdc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # GPT Researcher dependencies -gpt-researcher>=0.12.16 +gpt-researcher>=0.13.3 python-dotenv~=1.1.0 # MCP dependencies diff --git a/src/phoenix_technologies/gpt_researcher/server.py b/src/phoenix_technologies/gpt_researcher/server.py index 9c87921..2589571 100644 --- a/src/phoenix_technologies/gpt_researcher/server.py +++ b/src/phoenix_technologies/gpt_researcher/server.py @@ -162,7 +162,7 @@ async def deep_research(query: str) -> AsyncGenerator[str, None]: "status": "Research completed successfully", "source_count": len(sources), "context": context, - "sources": format_sources_for_response(sources), # Nutze deine Formatierungsfunktion + "sources": format_sources_for_response(sources), "source_urls": source_urls } @@ -185,7 +185,7 @@ async def deep_research(query: str) -> AsyncGenerator[str, None]: # oder die Exception weiter werfen, wenn FastMCP das besser handhabt. @mcp.tool() -async def quick_search(query: str) -> Dict[str, Any]: +async def quick_search(query: str) -> AsyncGenerator[str, None]: """ Perform a quick web search on a given query and return search results with snippets. 
This optimizes for speed over quality and is useful when an LLM doesn't need in-depth @@ -206,19 +206,37 @@ async def quick_search(query: str) -> AsyncGenerator[str, None]: researcher = GPTResearcher(query) try: + yield format_sse_event( + "message", + {"research_id": search_id, "query": query, "status": "Research initiated"} + ) + # Perform quick search search_results = await researcher.quick_search(query=query) mcp.researchers[search_id] = researcher logger.info(f"Quick search completed for ID: {search_id}") - return create_success_response({ + final_data_payload = { "search_id": search_id, "query": query, "result_count": len(search_results) if search_results else 0, "search_results": search_results - }) + } + + # Sende das finale Ergebnis als 'tool_result' Event + yield format_sse_event("message", final_data_payload) + logger.info(f"Sent final research result for ID: {search_id}") except Exception as e: - return handle_exception(e, "Quick search") + logger.error(f"Error during quick_search for query '{query}': {e}", exc_info=True) + # Sende ein Fehler-Event an den Client + error_payload = { + "research_id": search_id, # Kann None sein, wenn Fehler sehr früh auftritt + "query": query, + "status": "Error occurred", + "error_message": str(e), + "error_details": "Check server logs for more information." + } + yield format_sse_event("message", error_payload) @mcp.tool() From 895671189e6a2b2a481faeba8539b2db6c655124 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sun, 1 Jun 2025 13:00:41 +0200 Subject: [PATCH 35/55] Refactor API methods to use AsyncGenerator for SSE. Updated `write_report` and `get_research_sources` to stream results via server-sent events (SSE) using `AsyncGenerator`. This allows incremental data delivery and better error reporting, improving client-server communication.
--- .../gpt_researcher/server.py | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/phoenix_technologies/gpt_researcher/server.py b/src/phoenix_technologies/gpt_researcher/server.py index 2589571..dba67b9 100644 --- a/src/phoenix_technologies/gpt_researcher/server.py +++ b/src/phoenix_technologies/gpt_researcher/server.py @@ -240,7 +240,7 @@ async def quick_search(query: str) -> AsyncGenerator[str, None]: @mcp.tool() -async def write_report(research_id: str, custom_prompt: Optional[str] = None) -> Dict[str, Any]: +async def write_report(research_id: str, custom_prompt: Optional[str] = None) -> AsyncGenerator[str, None]: """ Generate a report based on previously conducted research. @@ -253,7 +253,7 @@ async def write_report(research_id: str, custom_prompt: Optional[str] = None) -> """ success, researcher, error = get_researcher_by_id(mcp.researchers, research_id) if not success: - return error + yield format_sse_event("message", error) logger.info(f"Generating report for research ID: {research_id}") @@ -265,17 +265,29 @@ async def write_report(research_id: str, custom_prompt: Optional[str] = None) -> sources = researcher.get_research_sources() costs = researcher.get_costs() - return create_success_response({ + final_data_payload = { "report": report, "source_count": len(sources), "costs": costs - }) + } + + # Sende das finale Ergebnis als 'tool_result' Event + yield format_sse_event("message", final_data_payload) + logger.info(f"Sent final research result for ID: {research_id}") except Exception as e: - return handle_exception(e, "Report generation") + logger.error(f"Error during write_report for research ID '{research_id}': {e}", exc_info=True) + # Sende ein Fehler-Event an den Client + error_payload = { + "research_id": research_id, + "status": "Error occurred", + "error_message": str(e), + "error_details": "Check server logs for more information."
+ } + yield format_sse_event("message", error_payload) @mcp.tool() -async def get_research_sources(research_id: str) -> Dict[str, Any]: +async def get_research_sources(research_id: str) -> AsyncGenerator[str, None]: """ Get the sources used in the research. @@ -287,15 +299,19 @@ async def get_research_sources(research_id: str) -> Dict[str, Any]: """ success, researcher, error = get_researcher_by_id(mcp.researchers, research_id) if not success: - return error + yield format_sse_event("message", error) sources = researcher.get_research_sources() source_urls = researcher.get_source_urls() - return create_success_response({ + final_data_payload = { "sources": format_sources_for_response(sources), "source_urls": source_urls - }) + } + + # Sende das finale Ergebnis als 'tool_result' Event + yield format_sse_event("message", final_data_payload) + logger.info(f"Sent final research result for ID: {research_id}") @mcp.tool() From 60c441c817baa56775a3f35aeaef940dbe3fa7c5 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sun, 1 Jun 2025 13:31:42 +0200 Subject: [PATCH 36/55] Add GPT Researcher MCP server implementation Introduce an MCP server for GPT Researcher using FastMCP, enabling AI assistants to perform web research and generate reports via SSE. Includes async processing steps with real-time updates and a runnable server script. 
--- src/phoenix_technologies/test/__init__.py | 8 +++ src/phoenix_technologies/test/server.py | 83 +++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 src/phoenix_technologies/test/__init__.py create mode 100644 src/phoenix_technologies/test/server.py diff --git a/src/phoenix_technologies/test/__init__.py b/src/phoenix_technologies/test/__init__.py new file mode 100644 index 0000000..f372bcb --- /dev/null +++ b/src/phoenix_technologies/test/__init__.py @@ -0,0 +1,8 @@ +""" +GPT Researcher MCP Server + +This module provides an MCP server implementation for GPT Researcher, +allowing AI assistants to perform web research and generate reports via the MCP protocol. +""" + +__version__ = "0.1.0" \ No newline at end of file diff --git a/src/phoenix_technologies/test/server.py b/src/phoenix_technologies/test/server.py new file mode 100644 index 0000000..438d42e --- /dev/null +++ b/src/phoenix_technologies/test/server.py @@ -0,0 +1,83 @@ +""" +GPT Researcher MCP Server + +This script implements an MCP server for GPT Researcher, allowing AI assistants +to conduct web research and generate reports via the MCP protocol. +""" + +import os +import json +from typing import Dict, Any, AsyncGenerator +import asyncio +from mcp.server.fastmcp import FastMCP + +# Initialize FastMCP server +mcp = FastMCP("GPT Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) + +@mcp.tool() +async def updates(tool_input: Dict[str, Any]) -> AsyncGenerator[str, None]: + """ + Ein MCP-Tool, das Zwischenupdates über SSE streamt. + tool_input ist ein Dictionary, das die Eingabeparameter des Tools enthält. 
+ """ + print(f"Tool gestartet mit Input: {tool_input}") + + async def _process_step_1(): + await asyncio.sleep(1) + return {"status": "Step 1 complete", "details": "Initial processing done."} + + async def _process_step_2(): + await asyncio.sleep(2) + return {"status": "Step 2 in progress", "progress": 50} + + async def _process_step_3(): + await asyncio.sleep(1) + return {"status": "Step 2 complete", "progress": 100, "intermediate_result": "Partial data available"} + + async def _generate_final_output(): + await asyncio.sleep(1) + return {"final_data": "All steps finished successfully.", "summary": tool_input.get("summary_needed", False)} + + # Schritt 1 + update1 = await _process_step_1() + yield f"event: tool_update\ndata: {json.dumps(update1)}\n\n" + print("Update 1 gesendet") + + # Schritt 2 + update2 = await _process_step_2() + yield f"event: tool_update\ndata: {json.dumps(update2)}\n\n" + print("Update 2 gesendet") + + # Schritt 3 + update3 = await _process_step_3() + yield f"event: tool_update\ndata: {json.dumps(update3)}\n\n" + print("Update 3 gesendet") + + # Finale Ausgabe (kann auch als spezielles Event gesendet werden) + final_output = await _generate_final_output() + yield f"event: tool_result\ndata: {json.dumps(final_output)}\n\n" + print("Finales Ergebnis gesendet") + + # Optional: Signal für Stream-Ende, falls vom Client benötigt oder von FastMCP erwartet + # yield "event: stream_end\ndata: {}\n\n" + print("Tool-Ausführung beendet.") + +def run_server(): + """Run the MCP server using FastMCP's built-in event loop handling.""" + + # Add startup message + print("🚀 Test MCP Server starting... 
Check researcher_mcp_server.log for details") + + # Let FastMCP handle the event loop + try: + mcp.run("sse") + except Exception as e: + print(f"❌ MCP Server error: {str(e)}") + return + + print("✅ MCP Server stopped") + + +if __name__ == "__main__": + # Use the non-async approach to avoid asyncio nesting issues + run_server() \ No newline at end of file From bf695bef092e16a72d8ef0f55d2287b39905115a Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 19:14:11 +0200 Subject: [PATCH 37/55] Refactor SMD utilities and streamline research query usage Remove unnecessary SMD utilities and shift query creation logic directly into `smd_research` for simplicity. This eliminates duplicate functionality and reduces reliance on external utility files, consolidating responsibilities within server implementation. --- src/phoenix_technologies/smd/server.py | 54 +++++++++++++++----------- src/phoenix_technologies/smd/utils.py | 49 ----------------------- 2 files changed, 31 insertions(+), 72 deletions(-) delete mode 100644 src/phoenix_technologies/smd/utils.py diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index 24de685..f6639f9 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -14,10 +14,6 @@ from mcp.server.fastmcp import FastMCP # Load environment variables load_dotenv() -from utils import ( - create_research_prompt -) - logging.basicConfig( level=logging.INFO, format='[%(asctime)s][%(levelname)s] - %(message)s', @@ -55,17 +51,46 @@ async def smd_detail_article(article_id: str) -> dict: } @mcp.tool() -async def smd_research(query: dict) -> dict: +async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-05-30T22:00:00.000Z", date_to: str = "2025-05-31T21:59:59.999Z") -> dict: """ Execute a deep search on a given query using SMD Researcher. Use this tool when you need research on a topic. 
Args: - query: The research query or topic + search_query: The SMD search query, there are Logical Operators available (AND, OR, NOT) and for an exact match use "+" before the word. For excluding use "-" before the word. For queries with multiple words use quotes. + date_from: The date to start research from, in the format YYYY-MM-DDTHH:MM:SS.SSSZ + date_to: The date to end research at, in the format YYYY-MM-DDTHH:MM:SS.SSSZ Returns: String containing research status, ID, and the actual research context """ + + query = { + "sort": { + "field": "score", + "direction": "desc" + }, + "filters": [ + { + "field": "datetime", + "value": [ + date_from, + date_to + ] + }, + { + "field": "query_text", + "value": [search_query] + } + ], + "exact": False, + "pagination": { + "pageSize": 10, + "currentPage": 1 + }, + "onlyResults": True + } + url = "https://api.swissdox.ch/api/documents/search" headers = { "authorization": f"Bearer {os.getenv('SWISSDOX_BEARER_TOKEN', '')}", @@ -79,23 +104,6 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0 "message": response.text } - -@mcp.prompt() -def research_query(search_query: str = "Bundesrat", date_from: str = "2024-05-30T22:00:00.000Z", date_to: str = "2025-05-31T21:59:59.999Z") -> dict: - """ - Create a research query prompt for SMD Researcher. - - Args: - search_query: The SMD search query, there are Logical Operators available (AND, OR, NOT) and for a excact match use "+" before the word.
- date_from: The date to start research from, in the format YYYY-MM-DDTHH:MM:SS.SSSZ - date_to: The date to end research at, in the format YYYY-MM-DDTHH:MM:SS.SSSZ - - Returns: - A formatted prompt for research - """ - return create_research_prompt(search_query,date_from,date_to) - - def run_server(): """Run the MCP server using FastMCP's built-in event loop handling.""" diff --git a/src/phoenix_technologies/smd/utils.py b/src/phoenix_technologies/smd/utils.py deleted file mode 100644 index 7c84f68..0000000 --- a/src/phoenix_technologies/smd/utils.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -SMD Researcher MCP Server Utilities - -This module provides utility functions and helpers for the SMD Researcher MCP Server. -""" - -import sys -from loguru import logger - -# Configure logging for console only (no file logging) -logger.configure(handlers=[{"sink": sys.stderr, "level": "INFO"}]) - -def create_research_prompt(search_query: str = "Bundesrat", date_from: str = "2024-05-30T22:00:00.000Z", date_to: str = "2025-05-31T21:59:59.999Z") -> dict: - """ - Create a research query prompt for SMD Researcher. - - Args: - search_query: The SMD search query, there are Logical Operators available (AND, OR, NOT) and for a excact match use "+" before the word. For excluding use "-" before the word. For queries with multiple words use quotes. - date_from: The date to start research from, in the format YYYY-MM-DDTHH:MM:SS.SSSZ - date_to: The date to end research at, in the format YYYY-MM-DDTHH:MM:SS.SSSZ - - Returns: - A formatted prompt for research as a Python dictionary. 
- """ - return { - "sort": { - "field": "score", - "direction": "desc" - }, - "filters": [ - { - "field": "datetime", - "value": [ - date_from, - date_to - ] - }, - { - "field": "query_text", - "value": [search_query] - } - ], - "exact": False, - "pagination": { - "pageSize": 10, - "currentPage": 1 - }, - "onlyResults": True - } From a36990856c35eb61e32b42acfb48376b0c03705e Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 19:31:05 +0200 Subject: [PATCH 38/55] Update default `onlyResults` parameter to False The `onlyResults` parameter was changed from `True` to `False` in the API request configuration. This ensures more comprehensive data retrieval by including additional metadata in the API response. --- src/phoenix_technologies/smd/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index f6639f9..dc71dcb 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -88,7 +88,7 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0 "pageSize": 10, "currentPage": 1 }, - "onlyResults": True + "onlyResults": False } url = "https://api.swissdox.ch/api/documents/search" From 4793aca10ed0bb31c7c10ee88197b4f517db8469 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 19:35:33 +0200 Subject: [PATCH 39/55] Update docstring for smd_detail_article function Clarify the usage instructions and context for the smd_detail_article function. This ensures better understanding of its relationship with the smd_research tool. 
--- src/phoenix_technologies/smd/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index dc71dcb..22a0faa 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -27,8 +27,8 @@ mcp = FastMCP("SMD Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=72 @mcp.tool() async def smd_detail_article(article_id: str) -> dict: """ - Get the Details of an article found by the SMD Research. - Use this tool when you need the full content of an article. + Get the Details of an article found by the tool smd_research. + Use this tool after smd_research is executed to get all the details of the articles in the result. Args: article_id: The ID of the article to get the details for From d779b5bd981aa6815b4cd50192e6e9c6133d9cdf Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 19:50:27 +0200 Subject: [PATCH 40/55] Update SMD server to fix API URL and clarify docstring Revised the API endpoint URL from smd.ch to swissdox.ch for accuracy and updated the function docstring to clarify the purpose of the article_id parameter. These changes improve clarity and ensure API requests are routed correctly. --- src/phoenix_technologies/smd/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index 22a0faa..f1db6c4 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -27,7 +27,7 @@ mcp = FastMCP("SMD Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=72 @mcp.tool() async def smd_detail_article(article_id: str) -> dict: """ - Get the Details of an article found by the tool smd_research. + Get the Details of an article found by the tool smd_research based on the article_id. 
Use this tool after smd_research is executed to get all the details of the articles in the result. Args: @@ -36,7 +36,7 @@ async def smd_detail_article(article_id: str) -> dict: Returns: String containing the details of the article """ - url = f"https://api.smd.ch/api/documents/{article_id}" + url = f"https://api.swissdox.ch/api/documents/{article_id}" headers = { "authorization": f"Bearer {os.getenv('SWISSDOX_BEARER_TOKEN', '')}", "content-type": "application/json", From 1554d5e4abf54800cb28b51cdb95bfbeb2dd1788 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 20:06:17 +0200 Subject: [PATCH 41/55] Update article query logic and improve response structure Removed the unused `smd_detail_article` decorator and expanded the response structure to include detailed article data in nested JSON. Adjusted the pagination default `pageSize` from 10 to 25 to handle larger datasets effectively. --- src/phoenix_technologies/smd/server.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index f1db6c4..fa89fc0 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -24,7 +24,6 @@ logger = logging.getLogger(__name__) # Initialize FastMCP server mcp = FastMCP("SMD Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) -@mcp.tool() async def smd_detail_article(article_id: str) -> dict: """ Get the Details of an article found by the tool smd_research based on the article_id. 
@@ -85,7 +84,7 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0 ], "exact": False, "pagination": { - "pageSize": 10, + "pageSize": 25, "currentPage": 1 }, "onlyResults": False @@ -98,12 +97,30 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0 } response = requests.post(url, headers=headers, json=query) if response.status_code == 200: - return response.json() + result = response.json() + + # Artikel-Liste aus der `data`-Struktur der Antwort + articles = result.get("data", []) + + # Verschachtelte JSON-Resultate von `smd_detail_article` sammeln + detailed_articles = [] + for article in articles: + article_id = article.get("id") + if article_id: + detailed_result = smd_detail_article(article_id) + detailed_articles.append(detailed_result) + + # Gesamtantwort mit kombinierter JSON-Struktur + return { + "search_result": result, + "detailed_articles": detailed_articles + } else: return { "message": response.text } + def run_server(): """Run the MCP server using FastMCP's built-in event loop handling.""" From 8eda9031859db97e31197a724f33f6ac3a4817dd Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 20:17:59 +0200 Subject: [PATCH 42/55] Switch to aiohttp for async HTTP requests in server logic Replaced synchronous requests with aiohttp to enable non-blocking, async HTTP requests. Updated `smd_detail_article` and `smd_research` to utilize asyncio for parallel execution, significantly improving efficiency for handling multiple API calls. Added aiohttp and asyncio dependencies to requirements.txt. 
--- requirements.txt | 4 ++- src/phoenix_technologies/smd/server.py | 44 +++++++++++--------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/requirements.txt b/requirements.txt index bf0dfdc..486c004 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,6 @@ pydantic>=2.3.0 # Utility dependencies loguru>=0.7.0 -requests~=2.32.3 \ No newline at end of file +requests~=2.32.3 +aiohttp~=3.11.18 +asyncio~=3.4.3 \ No newline at end of file diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index fa89fc0..d04f544 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -7,6 +7,8 @@ to conduct research and generate reports via the MCP protocol. import os import logging +import aiohttp +import asyncio import requests from dotenv import load_dotenv from mcp.server.fastmcp import FastMCP @@ -24,30 +26,24 @@ logger = logging.getLogger(__name__) # Initialize FastMCP server mcp = FastMCP("SMD Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) -async def smd_detail_article(article_id: str) -> dict: - """ - Get the Details of an article found by the tool smd_research based on the article_id. - Use this tool after smd_research is executed to get all the details of the articles in the result. 
- - Args: - article_id: The ID of the article to get the details for - - Returns: - String containing the details of the article - """ +async def smd_detail_article(article_id): url = f"https://api.swissdox.ch/api/documents/{article_id}" headers = { "authorization": f"Bearer {os.getenv('SWISSDOX_BEARER_TOKEN', '')}", "content-type": "application/json", } payload = {"filters": [], "pagination": {"pageSize": 1, "currentPage": 1}} - response = requests.post(url, headers=headers, json=payload) - if response.status_code == 200: - return response.json() - else: - return { - "message": response.text - } + + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=headers, payload=payload) as response: + if response.status == 200: + return await response.json() # JSON asynchron lesen + else: + return { + "message": await response.text(), + "article_id": article_id + } + @mcp.tool() async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-05-30T22:00:00.000Z", date_to: str = "2025-05-31T21:59:59.999Z") -> dict: @@ -99,20 +95,16 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0 if response.status_code == 200: result = response.json() - # Artikel-Liste aus der `data`-Struktur der Antwort articles = result.get("data", []) - # Verschachtelte JSON-Resultate von `smd_detail_article` sammeln - detailed_articles = [] + tasks = [] for article in articles: article_id = article.get("id") if article_id: - detailed_result = smd_detail_article(article_id) - detailed_articles.append(detailed_result) - - # Gesamtantwort mit kombinierter JSON-Struktur + tasks.append(smd_detail_article(article_id)) + detailed_articles = await asyncio.gather(*tasks) return { - "search_result": result, + "original_result": result, "detailed_articles": detailed_articles } else: From 87d4884eea0e158f372b56eeae0d86fc2b9e1ba8 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 20:25:52 +0200 Subject: [PATCH 43/55] 
Switch HTTP method from GET to POST in server request. Updated the SMD server client to use POST instead of GET for sending requests. This change ensures the payload is sent as JSON in compliance with the server's expected input format. --- src/phoenix_technologies/smd/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index d04f544..d1cac9c 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -35,7 +35,7 @@ async def smd_detail_article(article_id): payload = {"filters": [], "pagination": {"pageSize": 1, "currentPage": 1}} async with aiohttp.ClientSession() as session: - async with session.get(url, headers=headers, payload=payload) as response: + async with session.post(url, headers=headers, json=payload) as response: if response.status == 200: return await response.json() # JSON asynchron lesen else: From 25000459cb7eae65e568e9b9eb93c158e07915aa Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 20:39:22 +0200 Subject: [PATCH 44/55] Set default page size from environment variable Replaced the hardcoded page size with a value fetched from the `SWISSDOX_PAGESIZE` environment variable, defaulting to 10 if not set. This allows greater flexibility and configurability for pagination settings. 
--- src/phoenix_technologies/smd/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py
index d1cac9c..9058965 100644
--- a/src/phoenix_technologies/smd/server.py
+++ b/src/phoenix_technologies/smd/server.py
@@ -80,7 +80,7 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0
         ],
         "exact": False,
         "pagination": {
-            "pageSize": 25,
+            "pageSize": int(os.getenv('SWISSDOX_PAGESIZE', '10')),
             "currentPage": 1
         },
         "onlyResults": False

From 3c1f8a2c254aa84dc3a0b2680741c3f8434922a6 Mon Sep 17 00:00:00 2001
From: ThomasTaroni
Date: Sat, 21 Jun 2025 20:52:00 +0200
Subject: [PATCH 45/55] Optimize query formulation instructions in docstring

Clarified that the SMD search query should be written in German and enriched with relevant contextual keywords. This ensures better alignment with language requirements and improves search accuracy.

---
 src/phoenix_technologies/smd/server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py
index 9058965..eecde2e 100644
--- a/src/phoenix_technologies/smd/server.py
+++ b/src/phoenix_technologies/smd/server.py
@@ -52,7 +52,7 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0
     Use this tool when you need research on a topic.

     Args:
-        search_query: The SMD search query, there are Logical Operators available (AND, OR, NOT) and for a excact match use "+" before the word. For excluding use "-" before the word. For queries with multiple words use quotes.
+        search_query: The SMD search query, there are Logical Operators available (AND, OR, NOT) and for an exact match use "+" before the word. For excluding use "-" before the word. For queries with multiple words use quotes. Formulate the Query in German. Enrich the query with the relevant context keywords of the topic.
date_from: The date to start research from, in the format YYYY-MM-DDTHH:MM:SS.SSSZ date_to: The date to end research at, in the format YYYY-MM-DDTHH:MM:SS.SSSZ From 1d5aeb1644f8bd8e4b644264c6489135c9e33043 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 21:18:32 +0200 Subject: [PATCH 46/55] Add article summarization via external API integration Introduced a new `summarize_to_words` function to summarize articles using an external API. Integrated it into `smd_detail_article` to return summarized article content instead of the full text. Updated header key capitalization for consistency. --- src/phoenix_technologies/smd/server.py | 35 +++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index eecde2e..eeae925 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -26,18 +26,47 @@ logger = logging.getLogger(__name__) # Initialize FastMCP server mcp = FastMCP("SMD Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) +async def summarize_to_words(article: dict, target_word_count: int = 1000) -> str: + url = f"https://maas.ai-2.kvant.cloud/engines/{os.getenv('SWISSDOX_SUMMARIZING_MODEL', '')}/chat/completions" + headers = { + "x-litellm-api-key": f"Bearer {os.getenv('SWISSDOX_SUMMARIZING_MODEL_APIKEY', '')}", + "Content-type": "application/json", + } + payload = { + "model": {os.getenv('SWISSDOX_SUMMARIZING_MODEL', '')}, + "messages": [ + { + "role": "text summarizer", + "content": f"You are summarizing the user input to a maximum of {target_word_count}" + }, + { + "role": "user", + "content": f"{str(article)}" + } + ] + } + + async with aiohttp.ClientSession() as session: + async with session.post(url, headers=headers, json=payload) as response: + if response.status == 200: + return await response.json() + else: + return await response.text() + async def 
smd_detail_article(article_id): url = f"https://api.swissdox.ch/api/documents/{article_id}" headers = { - "authorization": f"Bearer {os.getenv('SWISSDOX_BEARER_TOKEN', '')}", - "content-type": "application/json", + "Authorization": f"Bearer {os.getenv('SWISSDOX_BEARER_TOKEN', '')}", + "Content-type": "application/json", } payload = {"filters": [], "pagination": {"pageSize": 1, "currentPage": 1}} async with aiohttp.ClientSession() as session: async with session.post(url, headers=headers, json=payload) as response: if response.status == 200: - return await response.json() # JSON asynchron lesen + data = await response.json() + summarized_content = await summarize_to_words(data, target_word_count=10000) + return summarized_content else: return { "message": await response.text(), From 6f9f74dae0f7c289334e3225bb890fb274ecd1d2 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 21:35:54 +0200 Subject: [PATCH 47/55] Refactor `summarize_to_words` to accept `text` and `title`. Updated the `summarize_to_words` function to take `text` and `title` as separate parameters instead of a single `article` dictionary. Adjusted payload and function calls accordingly for better clarity and flexibility. 
--- src/phoenix_technologies/smd/server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index eeae925..b35ddfb 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) # Initialize FastMCP server mcp = FastMCP("SMD Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=720) -async def summarize_to_words(article: dict, target_word_count: int = 1000) -> str: +async def summarize_to_words(text: str, title: str, target_word_count: int = 1000) -> str: url = f"https://maas.ai-2.kvant.cloud/engines/{os.getenv('SWISSDOX_SUMMARIZING_MODEL', '')}/chat/completions" headers = { "x-litellm-api-key": f"Bearer {os.getenv('SWISSDOX_SUMMARIZING_MODEL_APIKEY', '')}", @@ -41,7 +41,7 @@ async def summarize_to_words(article: dict, target_word_count: int = 1000) -> st }, { "role": "user", - "content": f"{str(article)}" + "content": f"{title} - {text}" } ] } @@ -65,7 +65,7 @@ async def smd_detail_article(article_id): async with session.post(url, headers=headers, json=payload) as response: if response.status == 200: data = await response.json() - summarized_content = await summarize_to_words(data, target_word_count=10000) + summarized_content = await summarize_to_words(title=data.get("title"), text=data.get("text"), target_word_count=10000) return summarized_content else: return { From 0d532d147b6abe5f5f51e16254a23a2711447123 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 21:43:51 +0200 Subject: [PATCH 48/55] Simplify server response handling by returning raw data. Replaced the summarized content processing with direct data return. This improves efficiency by skipping the summarization step and ensures the full response data is available to the caller for further use. 
--- src/phoenix_technologies/smd/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index b35ddfb..5c5ca8a 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -65,8 +65,8 @@ async def smd_detail_article(article_id): async with session.post(url, headers=headers, json=payload) as response: if response.status == 200: data = await response.json() - summarized_content = await summarize_to_words(title=data.get("title"), text=data.get("text"), target_word_count=10000) - return summarized_content +# summarized_content = await summarize_to_words(title=data.get("title"), text=data.get("text"), target_word_count=10000) + return data else: return { "message": await response.text(), From 5799319aded8f16490462bd70ca1d72980224ebd Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 21:47:13 +0200 Subject: [PATCH 49/55] Update response handling and add summarized content return Modified the response handling to return raw text instead of JSON in one case and updated another to include summarized content with article ID. These changes ensure consistency and enhance the response structure for better client integration. 
--- src/phoenix_technologies/smd/server.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index 5c5ca8a..7d2896d 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -49,7 +49,7 @@ async def summarize_to_words(text: str, title: str, target_word_count: int = 100 async with aiohttp.ClientSession() as session: async with session.post(url, headers=headers, json=payload) as response: if response.status == 200: - return await response.json() + return await response.text() else: return await response.text() @@ -65,8 +65,11 @@ async def smd_detail_article(article_id): async with session.post(url, headers=headers, json=payload) as response: if response.status == 200: data = await response.json() -# summarized_content = await summarize_to_words(title=data.get("title"), text=data.get("text"), target_word_count=10000) - return data + summarized_content = await summarize_to_words(title=data.get("title"), text=data.get("text"), target_word_count=10000) + return { + "message": summarized_content, + "article_id": article_id + } else: return { "message": await response.text(), From cfc124915fb3b4884a3174347f0264da729490bb Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 21:54:29 +0200 Subject: [PATCH 50/55] Fix incorrect string formatting for model payload key Replaced curly brackets with direct reference to the environment variable in the payload model value. This ensures the correct model is passed dynamically from environment configurations. 
--- src/phoenix_technologies/smd/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index 7d2896d..6cbb8a6 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -33,7 +33,7 @@ async def summarize_to_words(text: str, title: str, target_word_count: int = 100 "Content-type": "application/json", } payload = { - "model": {os.getenv('SWISSDOX_SUMMARIZING_MODEL', '')}, + "model": os.getenv('SWISSDOX_SUMMARIZING_MODEL', ''), "messages": [ { "role": "text summarizer", From 4c554b14729f4e2ec22eae601a2bfce6a606637a Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 21:57:17 +0200 Subject: [PATCH 51/55] Fix incorrect API key header in summarization function The "Bearer" prefix was mistakenly included in the API key header. This change removes it to align with the expected format required by the external service. --- src/phoenix_technologies/smd/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index 6cbb8a6..d86704e 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -29,7 +29,7 @@ mcp = FastMCP("SMD Researcher", host="0.0.0.0", port=8000, timeout_keep_alive=72 async def summarize_to_words(text: str, title: str, target_word_count: int = 1000) -> str: url = f"https://maas.ai-2.kvant.cloud/engines/{os.getenv('SWISSDOX_SUMMARIZING_MODEL', '')}/chat/completions" headers = { - "x-litellm-api-key": f"Bearer {os.getenv('SWISSDOX_SUMMARIZING_MODEL_APIKEY', '')}", + "x-litellm-api-key": f"{os.getenv('SWISSDOX_SUMMARIZING_MODEL_APIKEY', '')}", "Content-type": "application/json", } payload = { From bbec6df3fdc84c77a98cd04bc8d62cf9c078e7cc Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 22:19:50 +0200 Subject: [PATCH 52/55] Refactor response handling to 
parse JSON content. Replace raw response text retrieval with JSON parsing to extract specific content. This ensures more precise data handling and aligns with expected response structure. --- src/phoenix_technologies/smd/server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index d86704e..45b89a8 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -49,7 +49,8 @@ async def summarize_to_words(text: str, title: str, target_word_count: int = 100 async with aiohttp.ClientSession() as session: async with session.post(url, headers=headers, json=payload) as response: if response.status == 200: - return await response.text() + data = await response.json() + return data.get("choices")[0].get("message").get("content") else: return await response.text() From e5a6c59f7ad17de6f117c081ed5d768c3926a18f Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Sat, 21 Jun 2025 22:22:21 +0200 Subject: [PATCH 53/55] Add execution time logging to smd_detail_article function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduced performance tracking in the smd_detail_article function by adding execution time logging. This helps in monitoring and optimizing the function’s runtime during API requests and processing. 
--- src/phoenix_technologies/smd/server.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py index 45b89a8..c5bc19d 100644 --- a/src/phoenix_technologies/smd/server.py +++ b/src/phoenix_technologies/smd/server.py @@ -10,6 +10,7 @@ import logging import aiohttp import asyncio import requests +import time from dotenv import load_dotenv from mcp.server.fastmcp import FastMCP @@ -55,6 +56,8 @@ async def summarize_to_words(text: str, title: str, target_word_count: int = 100 return await response.text() async def smd_detail_article(article_id): + logger.info("Starting smd_detail_article function.") + start_time = time.perf_counter() url = f"https://api.swissdox.ch/api/documents/{article_id}" headers = { "Authorization": f"Bearer {os.getenv('SWISSDOX_BEARER_TOKEN', '')}", @@ -67,6 +70,8 @@ async def smd_detail_article(article_id): if response.status == 200: data = await response.json() summarized_content = await summarize_to_words(title=data.get("title"), text=data.get("text"), target_word_count=10000) + execution_time = time.perf_counter() - start_time + logger.info(f"smd_detail_article executed in {execution_time:.2f} seconds.") return { "message": summarized_content, "article_id": article_id From cf8895957671082c7dce515a4ecdcbdfde26afb7 Mon Sep 17 00:00:00 2001 From: ThomasTaroni Date: Thu, 3 Jul 2025 23:33:12 +0200 Subject: [PATCH 54/55] Update query parameters and response handling in server.py Replaced hardcoded date values with environment variables for flexibility and added a new filter for newspapers. Enhanced response parsing by including facets for related persons and organizations. 
--- src/phoenix_technologies/smd/server.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py
index c5bc19d..025a144 100644
--- a/src/phoenix_technologies/smd/server.py
+++ b/src/phoenix_technologies/smd/server.py
@@ -107,8 +107,14 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0
             {
                 "field": "datetime",
                 "value": [
-                    date_from,
-                    date_to
+                    os.getenv('SWISSDOX_DATEFROM', '2020-12-31T23:00:00.000Z'),
+                    os.getenv('SWISSDOX_DATETO', '2023-12-31T22:59:00.000Z')
+                ]
+            },
+            {
+                "field": "newspaper",
+                "value": [
+                    os.getenv('SWISSDOX_NEWSPAPER', 'NZZ')
                 ]
             },
             {
@@ -134,6 +140,7 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0
         result = response.json()

         articles = result.get("data", [])
+        facets = result.get("facets", [])

         tasks = []
         for article in articles:
@@ -142,7 +149,8 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0
                 tasks.append(smd_detail_article(article_id))
             detailed_articles = await asyncio.gather(*tasks)
             return {
-                "original_result": result,
+                "related_persons": facets.get("persons", []),
+                "related_organizations": facets.get("organizations", []),
                 "detailed_articles": detailed_articles
             }
         else:

From db19c4f4f8ed5c592c880ea6778842841ec2b2df Mon Sep 17 00:00:00 2001
From: ThomasTaroni
Date: Thu, 24 Jul 2025 10:05:48 +0200
Subject: [PATCH 55/55] Add Logs

---
 src/phoenix_technologies/smd/server.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/phoenix_technologies/smd/server.py b/src/phoenix_technologies/smd/server.py
index 025a144..b42881c 100644
--- a/src/phoenix_technologies/smd/server.py
+++ b/src/phoenix_technologies/smd/server.py
@@ -72,6 +72,7 @@ async def smd_detail_article(article_id):
         summarized_content = await summarize_to_words(title=data.get("title"), text=data.get("text"), target_word_count=10000)
         execution_time = time.perf_counter() - start_time
         logger.info(f"smd_detail_article executed in {execution_time:.2f} seconds.")
+        logger.info(f"smd_article_summarization {summarized_content}")
         return {
             "message": summarized_content,
             "article_id": article_id
@@ -148,6 +149,7 @@ async def smd_research(search_query: str = "Bundesrat", date_from: str = "2024-0
             if article_id:
                 tasks.append(smd_detail_article(article_id))
             detailed_articles = await asyncio.gather(*tasks)
+            logger.info(f"detailed_articles {detailed_articles}")
             return {
                 "related_persons": facets.get("persons", []),
                 "related_organizations": facets.get("organizations", []),