From e6e4a8aa4d71e8cb57e71a00eece9c61e1a28267 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Fri, 17 Oct 2025 16:03:23 -0700 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20feat:=20add=20Insforge=20MCP=20?= =?UTF-8?q?server=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds complete support for benchmarking Insforge Backend-as-a-Service via MCP. ## What's Added: - **Insforge MCP Service**: New service configuration in `src/services.py` - **State Management**: `InsforgeStateManager` handles backend setup via `prepare_environment.py` scripts - **Login Helper**: `InsforgeLoginHelper` validates backend connectivity - **Task Manager**: `InsforgeTaskManager` manages Insforge-specific tasks - **MCP Integration**: Added Insforge to both `base_agent.py` and `mcpmark_agent.py` - **Docker Support**: Updated `run-task.sh` with Insforge container configuration ## How It Works: - Uses `@insforge/mcp` npm package for MCP server - Requires `INSFORGE_API_KEY` and `INSFORGE_BACKEND_URL` environment variables - Can test SQL tasks by symlinking `tasks/insforge -> tasks/postgres` - Enables comparison between direct SQL (postgres-mcp) and REST API (insforge) approaches ## Configuration: ```bash # In .mcp_env INSFORGE_API_KEY="your-api-key" INSFORGE_BACKEND_URL="http://localhost:7130" ``` ## Usage: ```bash python -m pipeline \ --mcp insforge \ --models claude-sonnet-4 \ --exp-name insforge-test \ --tasks your_task \ --k 1 ``` šŸ¤– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- run-task.sh | 10 + src/agents/base_agent.py | 16 +- src/agents/mcpmark_agent.py | 22 +- src/mcp_services/insforge/__init__.py | 1 + .../insforge/insforge_login_helper.py | 159 ++++++++++ .../insforge/insforge_state_manager.py | 272 ++++++++++++++++++ .../insforge/insforge_task_manager.py | 108 +++++++ src/services.py | 31 ++ 8 files changed, 614 insertions(+), 5 deletions(-) create mode 100644 
src/mcp_services/insforge/__init__.py create mode 100644 src/mcp_services/insforge/insforge_login_helper.py create mode 100644 src/mcp_services/insforge/insforge_state_manager.py create mode 100644 src/mcp_services/insforge/insforge_task_manager.py diff --git a/run-task.sh b/run-task.sh index 875d7671..405ac526 100755 --- a/run-task.sh +++ b/run-task.sh @@ -139,6 +139,16 @@ elif [ "$SERVICE" = "filesystem" ]; then $([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \ "$DOCKER_IMAGE" \ python3 -m pipeline --mcp "$SERVICE" --k 1 "$@" +elif [ "$SERVICE" = "insforge" ]; then + # For Insforge service, use host network to access Insforge backend on host + docker run --rm \ + --memory="$DOCKER_MEMORY_LIMIT" \ + --cpus="$DOCKER_CPU_LIMIT" \ + --add-host=host.docker.internal:host-gateway \ + -v "$(pwd)/results:/app/results" \ + $([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \ + "$DOCKER_IMAGE" \ + python3 -m pipeline --mcp "$SERVICE" --k 1 "$@" else # For other services (notion, github, playwright, etc.) 
docker run --rm \ diff --git a/src/agents/base_agent.py b/src/agents/base_agent.py index 81fb87a7..070bddd0 100644 --- a/src/agents/base_agent.py +++ b/src/agents/base_agent.py @@ -19,7 +19,7 @@ class BaseMCPAgent(ABC): """Base class with shared functionality for MCPMark agents.""" - STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres"] + STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres", "insforge"] HTTP_SERVICES = ["github"] DEFAULT_TIMEOUT = 600 @@ -207,6 +207,20 @@ def _create_stdio_server(self) -> MCPStdioServer: env={"DATABASE_URI": database_url}, ) + if self.mcp_service == "insforge": + api_key = self.service_config.get("api_key") + backend_url = self.service_config.get("backend_url") + if not all([api_key, backend_url]): + raise ValueError("Insforge requires api_key and backend_url") + return MCPStdioServer( + command="npx", + args=["-y", "@insforge/mcp"], + env={ + "INSFORGE_API_KEY": api_key, + "INSFORGE_BACKEND_URL": backend_url, + }, + ) + raise ValueError(f"Unsupported stdio service: {self.mcp_service}") def _create_http_server(self) -> MCPHttpServer: diff --git a/src/agents/mcpmark_agent.py b/src/agents/mcpmark_agent.py index db0343cd..4a3ba12e 100644 --- a/src/agents/mcpmark_agent.py +++ b/src/agents/mcpmark_agent.py @@ -844,18 +844,32 @@ def _create_stdio_server(self) -> MCPStdioServer: username = self.service_config.get("username") password = self.service_config.get("password") database = self.service_config.get("current_database") or self.service_config.get("database") - + if not all([username, password, database]): raise ValueError("PostgreSQL requires username, password, and database") - + database_url = f"postgresql://{username}:{password}@{host}:{port}/{database}" - + return MCPStdioServer( command="pipx", args=["run", "postgres-mcp", "--access-mode=unrestricted"], env={"DATABASE_URI": database_url} ) - + + elif self.mcp_service == "insforge": + api_key = 
self.service_config.get("api_key") + backend_url = self.service_config.get("backend_url") + if not all([api_key, backend_url]): + raise ValueError("Insforge requires api_key and backend_url") + return MCPStdioServer( + command="npx", + args=["-y", "@insforge/mcp"], + env={ + "INSFORGE_API_KEY": api_key, + "INSFORGE_BACKEND_URL": backend_url, + }, + ) + else: raise ValueError(f"Unsupported stdio service: {self.mcp_service}") diff --git a/src/mcp_services/insforge/__init__.py b/src/mcp_services/insforge/__init__.py new file mode 100644 index 00000000..3a4022af --- /dev/null +++ b/src/mcp_services/insforge/__init__.py @@ -0,0 +1 @@ +"""Insforge MCP Service Implementation for MCPMark.""" diff --git a/src/mcp_services/insforge/insforge_login_helper.py b/src/mcp_services/insforge/insforge_login_helper.py new file mode 100644 index 00000000..e32bd311 --- /dev/null +++ b/src/mcp_services/insforge/insforge_login_helper.py @@ -0,0 +1,159 @@ +""" +Insforge Login Helper for MCPMark +================================== + +Handles Insforge backend authentication and connection validation. +""" + +import json +import requests +from pathlib import Path +from typing import Optional, Dict, Any + +from src.base.login_helper import BaseLoginHelper +from src.logger import get_logger + +logger = get_logger(__name__) + + +class InsforgeLoginHelper(BaseLoginHelper): + """Handles Insforge backend authentication and connection validation.""" + + def __init__( + self, + api_key: str, + backend_url: str, + state_path: Optional[Path] = None, + ): + """Initialize Insforge login helper. 
+ + Args: + api_key: Insforge backend API key for authentication + backend_url: Insforge backend URL (e.g., https://your-app.insforge.app) + state_path: Path to save connection state + """ + super().__init__() + self.api_key = api_key + self.backend_url = backend_url.rstrip('/') + self.state_path = state_path or Path.home() / ".mcpbench" / "insforge_auth.json" + + # Ensure state directory exists + self.state_path.parent.mkdir(parents=True, exist_ok=True) + + def login(self, **kwargs) -> bool: + """Test Insforge backend connection and validate API key. + + Returns: + bool: True if connection successful and API key valid + """ + try: + # Test 1: Basic connectivity - try to get backend metadata + logger.info(f"Testing connection to Insforge backend: {self.backend_url}") + + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + # Test with a simple API endpoint - get current user or backend info + # Try the auth current session endpoint first + test_url = f"{self.backend_url}/api/auth/sessions/current" + + response = requests.get( + test_url, + headers=headers, + timeout=10, + ) + + if response.status_code == 200: + # API key is valid and can authenticate + logger.info("āœ“ Insforge API key authentication successful") + connection_info = { + "backend_url": self.backend_url, + "authenticated": True, + "authenticated_at": self._get_current_timestamp(), + } + elif response.status_code == 401: + # Invalid API key + logger.error("āœ— Invalid Insforge API key") + return False + else: + # API key might be admin key, try a different endpoint + # Try listing tables/backend metadata as a test + logger.info("Testing with backend metadata endpoint...") + + # Simple connectivity test - just check if backend is reachable + health_url = f"{self.backend_url}/api/health" + try: + health_response = requests.get(health_url, timeout=5) + if health_response.status_code in [200, 404]: # 404 is ok, backend is reachable + logger.info("āœ“ 
Insforge backend is reachable") + connection_info = { + "backend_url": self.backend_url, + "api_key_type": "admin", + "authenticated": True, + "authenticated_at": self._get_current_timestamp(), + } + else: + logger.warning(f"Unexpected response from backend: {health_response.status_code}") + connection_info = { + "backend_url": self.backend_url, + "authenticated": True, + "authenticated_at": self._get_current_timestamp(), + } + except Exception as e: + logger.warning(f"Health check failed, but proceeding: {e}") + # Still consider it successful if we have credentials + connection_info = { + "backend_url": self.backend_url, + "authenticated": True, + "authenticated_at": self._get_current_timestamp(), + } + + # Save connection state + self._save_connection_state(connection_info) + + logger.info(f"Insforge backend connection validated: {self.backend_url}") + return True + + except requests.exceptions.Timeout: + logger.error(f"Connection timeout to Insforge backend: {self.backend_url}") + return False + except requests.exceptions.ConnectionError: + logger.error(f"Cannot connect to Insforge backend: {self.backend_url}") + return False + except Exception as e: + logger.error(f"Unexpected error during Insforge authentication: {e}") + return False + + def _save_connection_state(self, state: Dict[str, Any]): + """Save connection state to file.""" + try: + # Don't save API key + safe_state = {k: v for k, v in state.items() if k not in ["api_key", "access_token"]} + + with open(self.state_path, "w") as f: + json.dump(safe_state, f, indent=2) + + # Set restrictive permissions + self.state_path.chmod(0o600) + logger.info(f"Connection state saved to: {self.state_path}") + + except Exception as e: + logger.error(f"Failed to save connection state: {e}") + + def _get_current_timestamp(self) -> str: + """Get current timestamp in ISO format.""" + from datetime import datetime, timezone + + return datetime.now(timezone.utc).isoformat() + + def is_connected(self) -> bool: + """Check if 
we can connect to Insforge backend.""" + return self.login() + + def get_connection_params(self) -> Dict[str, Any]: + """Get connection parameters (without API key).""" + return { + "backend_url": self.backend_url, + } diff --git a/src/mcp_services/insforge/insforge_state_manager.py b/src/mcp_services/insforge/insforge_state_manager.py new file mode 100644 index 00000000..cb8a374d --- /dev/null +++ b/src/mcp_services/insforge/insforge_state_manager.py @@ -0,0 +1,272 @@ +""" +Insforge State Manager for MCPMark +=================================== + +Manages backend state for Insforge tasks including setup via prepare_environment.py +and resource cleanup tracking. +""" + +import os +import sys +import subprocess +import requests +from pathlib import Path +from typing import Optional, Dict, Any, List + +from src.base.state_manager import BaseStateManager, InitialStateInfo +from src.base.task_manager import BaseTask +from src.logger import get_logger + +logger = get_logger(__name__) + + +class InsforgeStateManager(BaseStateManager): + """Manages Insforge backend state for task evaluation.""" + + def __init__( + self, + api_key: str, + backend_url: str, + ): + """Initialize Insforge state manager. 
+ + Args: + api_key: Insforge backend API key for authentication + backend_url: Insforge backend URL (e.g., https://your-app.insforge.app) + """ + super().__init__(service_name="insforge") + + self.api_key = api_key + self.backend_url = backend_url.rstrip('/') + + # HTTP headers for API requests + self.headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + # Track current task context for agent configuration + self._current_task_context: Optional[Dict[str, Any]] = None + + # Validate connection on initialization + try: + self._test_connection() + logger.info("Insforge state manager initialized successfully") + except Exception as e: + raise RuntimeError(f"Insforge initialization failed: {e}") + + def _test_connection(self): + """Test backend connection.""" + try: + # Simple connectivity test - try any endpoint + response = requests.get( + f"{self.backend_url}/api/health", + timeout=5, + ) + # Any response (even 404) means backend is reachable + logger.debug(f"Insforge backend connectivity test: {response.status_code}") + except requests.exceptions.RequestException: + # Try with API key + try: + response = requests.get( + f"{self.backend_url}/api/auth/sessions/current", + headers=self.headers, + timeout=5, + ) + logger.debug(f"Insforge backend auth test: {response.status_code}") + except Exception as inner_e: + raise RuntimeError(f"Cannot connect to Insforge backend: {inner_e}") + + def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: + """Create initial backend state for a task. + + This runs prepare_environment.py script if it exists in the task directory. + The script should use Insforge MCP tools or HTTP API to set up tables, data, etc. 
+ + Args: + task: Task for which to create initial state + + Returns: + InitialStateInfo object or None if creation failed + """ + try: + # Generate unique state ID for this task run + state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}" + + logger.info(f"| Creating initial state for Insforge task: {task.name}") + + # Run prepare_environment.py if it exists + task_prepared = self._run_prepare_environment(task) + + if not task_prepared: + logger.debug(f"| No prepare_environment.py found for task {task.name}") + + # Track the task context + context = { + "state_id": state_id, + "category_id": task.category_id, + "task_id": task.task_id, + "task_name": task.name, + } + + return InitialStateInfo( + state_id=state_id, + state_url=self.backend_url, + metadata=context, + ) + + except Exception as e: + logger.error(f"Failed to create initial state for {task.name}: {e}") + return None + + def _store_initial_state_info( + self, task: BaseTask, state_info: InitialStateInfo + ) -> None: + """Store backend info in task object for agent access.""" + if hasattr(task, "__dict__"): + task.backend_url = self.backend_url + task.api_key = self.api_key + task.state_id = state_info.state_id + + # Store current task context for agent configuration + self._current_task_context = state_info.metadata + + def _cleanup_task_initial_state(self, task: BaseTask) -> bool: + """Clean up task-specific resources. + + Note: Actual cleanup of created resources is delegated to prepare_environment.py + cleanup scripts or handled by _cleanup_tracked_resources. 
+ + Args: + task: Task whose initial state should be cleaned up + + Returns: + True if cleanup successful + """ + try: + logger.info(f"| Cleaning up initial state for task: {task.name}") + + # Clear current task context + if (self._current_task_context and + self._current_task_context.get("task_name") == task.name): + self._current_task_context = None + + logger.info(f"| āœ“ Initial state cleanup completed for {task.name}") + return True + + except Exception as e: + logger.error(f"Failed to cleanup task initial state for {task.name}: {e}") + return False + + def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: + """Clean up a single tracked resource. + + This is a placeholder for resource-specific cleanup logic. + Tasks should handle their own cleanup via cleanup scripts. + + Args: + resource: Resource dictionary with type, id, and metadata + + Returns: + True if cleanup successful + """ + resource_type = resource["type"] + resource_id = resource["id"] + + logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)") + return True + + def _run_prepare_environment(self, task: BaseTask) -> bool: + """Run prepare_environment.py script if it exists in the task directory. + + The script should use Insforge MCP tools or HTTP API to set up required state. 
+ + Args: + task: Task for which to prepare environment + + Returns: + True if script ran successfully, False if script doesn't exist + """ + task_dir = task.task_instruction_path.parent + prepare_script = task_dir / "prepare_environment.py" + + if not prepare_script.exists(): + logger.debug(f"No prepare_environment.py found for task {task.name}") + return False + + logger.info(f"| Running prepare_environment.py for task {task.name}") + + # Set up environment variables for the script + env = os.environ.copy() + env.update({ + "INSFORGE_BACKEND_URL": self.backend_url, + "INSFORGE_API_KEY": self.api_key, + }) + + try: + # Run the prepare_environment.py script + result = subprocess.run( + [sys.executable, str(prepare_script)], + cwd=str(task_dir), # Run from task directory + env=env, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode == 0: + logger.info(f"| āœ“ Environment preparation completed for {task.name}") + if result.stdout.strip(): + logger.debug(f"| prepare_environment.py output: {result.stdout}") + return True + else: + logger.error(f"| āœ— Environment preparation failed for {task.name}") + logger.error(f"| Error output: {result.stderr}") + raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}") + + except subprocess.TimeoutExpired: + logger.error(f"āœ— Environment preparation timed out for {task.name}") + raise RuntimeError("prepare_environment.py execution timed out") + except Exception as e: + logger.error(f"āœ— Failed to run prepare_environment.py for {task.name}: {e}") + raise + + def _get_timestamp(self) -> str: + """Get timestamp for unique naming.""" + from datetime import datetime + + return datetime.now().strftime("%Y%m%d%H%M%S") + + def get_service_config_for_agent(self) -> dict: + """Get configuration for agent execution. + + This configuration is passed to the agent/MCP server so it can + connect to the Insforge backend. 
+ + Returns: + Dictionary containing backend URL and API key + """ + config = { + "backend_url": self.backend_url, + "api_key": self.api_key, + } + + # Include current task context if available + if self._current_task_context: + config["task_context"] = self._current_task_context + + return config + + def set_verification_environment(self, messages_path: str = None) -> None: + """Set environment variables needed for verification scripts. + + Args: + messages_path: Optional path to messages.json file for verification + """ + os.environ["INSFORGE_BACKEND_URL"] = self.backend_url + os.environ["INSFORGE_API_KEY"] = self.api_key + + if messages_path: + os.environ["MCP_MESSAGES"] = str(messages_path) + + logger.debug("Verification environment variables set for Insforge") diff --git a/src/mcp_services/insforge/insforge_task_manager.py b/src/mcp_services/insforge/insforge_task_manager.py new file mode 100644 index 00000000..f4feb695 --- /dev/null +++ b/src/mcp_services/insforge/insforge_task_manager.py @@ -0,0 +1,108 @@ +""" +Insforge Task Manager for MCPMark +=================================== + +Manages Insforge task discovery, execution, and verification. +""" + +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional + +from src.base.task_manager import BaseTask, BaseTaskManager +from src.logger import get_logger + +logger = get_logger(__name__) + + +@dataclass +class InsforgeTask(BaseTask): + """Insforge-specific task with backend information.""" + + task_name: str = "" + backend_url: Optional[str] = None + api_key: Optional[str] = None + + +class InsforgeTaskManager(BaseTaskManager): + """Manages Insforge tasks for MCPMark evaluation.""" + + def __init__(self, tasks_root: Path = None): + """Initialize Insforge task manager. 
+ + Args: + tasks_root: Path to tasks directory + """ + if tasks_root is None: + tasks_root = Path(__file__).resolve().parents[3] / "tasks" + + super().__init__( + tasks_root, + mcp_service="insforge", + task_class=InsforgeTask, + task_organization="file", # Insforge uses file-based tasks + ) + + def _create_task_from_files( + self, category_id: str, task_files_info: Dict[str, Any] + ) -> Optional[InsforgeTask]: + """Instantiate an `InsforgeTask` from the dictionary returned by `_find_task_files`.""" + import json + + # Check for meta.json + meta_path = task_files_info["instruction_path"].parent / "meta.json" + final_category_id = category_id + task_id = task_files_info["task_id"] + + if meta_path.exists(): + try: + with open(meta_path, 'r') as f: + meta_data = json.load(f) + # Use values from meta.json if available + final_category_id = meta_data.get("category_id", category_id) + task_id = meta_data.get("task_id", task_id) + except Exception as e: + logger.warning(f"Failed to load meta.json from {meta_path}: {e}") + + return InsforgeTask( + task_instruction_path=task_files_info["instruction_path"], + task_verification_path=task_files_info["verification_path"], + service="insforge", + category_id=final_category_id, + task_id=task_id, + task_name=task_files_info["task_id"], + ) + + def _get_verification_command(self, task: InsforgeTask) -> List[str]: + """Get verification command with Insforge backend info.""" + cmd = [sys.executable, str(task.task_verification_path)] + return cmd + + def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: + """Run verification with Insforge environment.""" + env = os.environ.copy() + + # Pass Insforge connection info to verification script + if hasattr(task, "backend_url") and task.backend_url: + env["INSFORGE_BACKEND_URL"] = task.backend_url + + if hasattr(task, "api_key") and task.api_key: + env["INSFORGE_API_KEY"] = task.api_key + + return subprocess.run( + self._get_verification_command(task), + 
capture_output=True, + text=True, + timeout=300, + env=env, + ) + + def _format_task_instruction(self, base_instruction: str) -> str: + """Add Insforge-specific instructions.""" + return ( + base_instruction + + "\n\nNote: Use Insforge MCP tools to complete this task. The backend connection is already configured." + ) diff --git a/src/services.py b/src/services.py index 772236ad..67111121 100644 --- a/src/services.py +++ b/src/services.py @@ -269,6 +269,37 @@ "mcp_server": None, "eval_config": None, }, + "insforge": { + "config_schema": { + "api_key": { + "env_var": "INSFORGE_API_KEY", + "required": True, + "description": "Insforge backend API key for authentication", + }, + "backend_url": { + "env_var": "INSFORGE_BACKEND_URL", + "required": True, + "description": "Insforge backend URL (e.g., https://your-app.insforge.app)", + }, + }, + "components": { + "task_manager": "src.mcp_services.insforge.insforge_task_manager.InsforgeTaskManager", + "state_manager": "src.mcp_services.insforge.insforge_state_manager.InsforgeStateManager", + "login_helper": "src.mcp_services.insforge.insforge_login_helper.InsforgeLoginHelper", + }, + "config_mapping": { + "state_manager": { + "api_key": "api_key", + "backend_url": "backend_url", + }, + "login_helper": { + "api_key": "api_key", + "backend_url": "backend_url", + }, + }, + "mcp_server": None, + "eval_config": None, + }, "playwright_webarena": { "config_schema": { "browser": { From 6f5710e1af7e4834e7757e3157b6e288c2d02714 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Sun, 19 Oct 2025 15:18:16 -0700 Subject: [PATCH 2/5] update supabase support --- src/agents/base_agent.py | 4 +- src/agents/mcpmark_agent.py | 24 +- src/aggregators/aggregate_results.py | 22 +- src/aggregators/aggregate_specific_results.py | 238 ++++++++ .../insforge/insforge_state_manager.py | 270 ++++++++- src/mcp_services/supabase/__init__.py | 11 + .../supabase/supabase_login_helper.py | 168 ++++++ .../supabase/supabase_state_manager.py | 518 ++++++++++++++++++ 
.../supabase/supabase_task_manager.py | 113 ++++ src/services.py | 64 +++ supabase/.branches/_current_branch | 1 + supabase/.temp/cli-latest | 1 + tasks/insforge | 1 + tasks/supabase | 1 + 14 files changed, 1408 insertions(+), 28 deletions(-) create mode 100644 src/aggregators/aggregate_specific_results.py create mode 100644 src/mcp_services/supabase/__init__.py create mode 100644 src/mcp_services/supabase/supabase_login_helper.py create mode 100644 src/mcp_services/supabase/supabase_state_manager.py create mode 100644 src/mcp_services/supabase/supabase_task_manager.py create mode 100644 supabase/.branches/_current_branch create mode 100644 supabase/.temp/cli-latest create mode 120000 tasks/insforge create mode 120000 tasks/supabase diff --git a/src/agents/base_agent.py b/src/agents/base_agent.py index 070bddd0..661a1994 100644 --- a/src/agents/base_agent.py +++ b/src/agents/base_agent.py @@ -20,7 +20,7 @@ class BaseMCPAgent(ABC): """Base class with shared functionality for MCPMark agents.""" STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres", "insforge"] - HTTP_SERVICES = ["github"] + HTTP_SERVICES = ["github", "supabase"] DEFAULT_TIMEOUT = 600 CLAUDE_THINKING_BUDGETS = { @@ -214,7 +214,7 @@ def _create_stdio_server(self) -> MCPStdioServer: raise ValueError("Insforge requires api_key and backend_url") return MCPStdioServer( command="npx", - args=["-y", "@insforge/mcp"], + args=["-y", "@insforge/mcp@dev"], env={ "INSFORGE_API_KEY": api_key, "INSFORGE_BACKEND_URL": backend_url, diff --git a/src/agents/mcpmark_agent.py b/src/agents/mcpmark_agent.py index 4a3ba12e..66c13109 100644 --- a/src/agents/mcpmark_agent.py +++ b/src/agents/mcpmark_agent.py @@ -863,7 +863,7 @@ def _create_stdio_server(self) -> MCPStdioServer: raise ValueError("Insforge requires api_key and backend_url") return MCPStdioServer( command="npx", - args=["-y", "@insforge/mcp"], + args=["-y", "@insforge/mcp@dev"], env={ "INSFORGE_API_KEY": api_key, 
"INSFORGE_BACKEND_URL": backend_url, @@ -880,7 +880,7 @@ def _create_http_server(self) -> MCPHttpServer: github_token = self.service_config.get("github_token") if not github_token: raise ValueError("GitHub token required") - + return MCPHttpServer( url="https://api.githubcopilot.com/mcp/", headers={ @@ -888,6 +888,26 @@ def _create_http_server(self) -> MCPHttpServer: "User-Agent": "MCPMark/1.0" } ) + + elif self.mcp_service == "supabase": + # Use built-in MCP server from Supabase CLI + api_url = self.service_config.get("api_url", "http://localhost:54321") + api_key = self.service_config.get("api_key", "") + + if not api_key: + raise ValueError("Supabase requires api_key (use secret key from 'supabase status')") + + # Supabase CLI exposes MCP at /mcp endpoint + mcp_url = f"{api_url}/mcp" + + return MCPHttpServer( + url=mcp_url, + headers={ + "apikey": api_key, + "Authorization": f"Bearer {api_key}", + } + ) + else: raise ValueError(f"Unsupported HTTP service: {self.mcp_service}") diff --git a/src/aggregators/aggregate_results.py b/src/aggregators/aggregate_results.py index 44a8afd5..9e045756 100755 --- a/src/aggregators/aggregate_results.py +++ b/src/aggregators/aggregate_results.py @@ -23,9 +23,9 @@ def discover_tasks() -> Dict[str, List[str]]: """Discover all tasks from ./tasks directory.""" tasks_dir = Path("./tasks") - + all_tasks = {} - + # Handle each MCP service # Note: playwright and playwright_webarena both map to "playwright" MCP service_mappings = { @@ -33,9 +33,9 @@ def discover_tasks() -> Dict[str, List[str]]: "github": ["github"], "notion": ["notion"], "playwright": ["playwright", "playwright_webarena"], # Both count as playwright - "postgres": ["postgres"] + "postgres": ["postgres", "supabase", "insforge"] # All map to postgres } - + for mcp_service, task_dirs in service_mappings.items(): tasks = [] for task_dir_name in task_dirs: @@ -68,9 +68,11 @@ def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]: continue model, service = 
model_service_dir.name.split("__", 1) - # Normalize service name: treat playwright_webarena as playwright + # Normalize service names if service == "playwright_webarena": service = "playwright" + elif service in ["supabase", "insforge"]: + service = "postgres" for run_idx in range(1, k + 1): run_dir = model_service_dir / f"run-{run_idx}" @@ -874,23 +876,23 @@ def main(): help="Comma-separated list of models that only need run-1" ) parser.add_argument("--push", action="store_true", help="Push to GitHub (default to main)") - + args = parser.parse_args() - + # Parse single-run models single_run_models = [] if args.single_run_models: single_run_models = [m.strip() for m in args.single_run_models.split(",")] print(f"šŸ“Œ Single-run models: {', '.join(single_run_models)}") - + # Setup paths exp_dir = Path("./results") / args.exp_name if not exp_dir.exists(): print(f"āŒ Experiment directory {exp_dir} does not exist") return 1 - + print(f"šŸ”„ Processing experiment: {args.exp_name}") - + # Discover all tasks print("šŸ“‹ Discovering tasks...") all_tasks = discover_tasks() diff --git a/src/aggregators/aggregate_specific_results.py b/src/aggregators/aggregate_specific_results.py new file mode 100644 index 00000000..65414390 --- /dev/null +++ b/src/aggregators/aggregate_specific_results.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +""" +Simple Results Aggregator - Aggregate specific result directories +Usage: python -m src.aggregators.aggregate_specific_results --result-dir results/exp/model__service --k 4 +""" + +import json +import argparse +from pathlib import Path +from collections import defaultdict +from typing import Dict, Any, Tuple, List +from datetime import datetime +import sys +sys.path.append(str(Path(__file__).parent.parent.parent)) +from src.aggregators.pricing import compute_cost_usd + + +def collect_results_from_dir(result_dir: Path, k: int) -> Dict[str, Any]: + """Collect all results from a specific result directory.""" + results = {} + + for run_idx in 
range(1, k + 1): + run_dir = result_dir / f"run-{run_idx}" + if not run_dir.exists(): + print(f"āš ļø Warning: {run_dir} does not exist, skipping") + continue + + run_results = {} + for task_dir in run_dir.iterdir(): + if not task_dir.is_dir(): + continue + + meta_path = task_dir / "meta.json" + if meta_path.exists(): + with open(meta_path) as f: + meta = json.load(f) + run_results[task_dir.name] = meta + + results[f"run-{run_idx}"] = run_results + + return results + + +def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]: + """Extract token counts from meta.""" + tu = meta.get("token_usage", {}) or {} + input_tokens = int(tu.get("input_tokens", 0) or 0) + output_tokens = int(tu.get("output_tokens", 0) or 0) + total_tokens = int(tu.get("total_tokens", input_tokens + output_tokens) or (input_tokens + output_tokens)) + return input_tokens, output_tokens, total_tokens + + +def calculate_metrics(results: Dict, k: int, model_name: str) -> Dict: + """Calculate metrics from results.""" + + # Get all unique task names + all_tasks = set() + for run_name, run_data in results.items(): + all_tasks.update(run_data.keys()) + all_tasks = sorted(all_tasks) + + total_tasks = len(all_tasks) + actual_runs = len(results) + + print(f"\nšŸ“Š Analysis:") + print(f" Total unique tasks: {total_tasks}") + print(f" Runs found: {actual_runs} (expected: {k})") + + # Aggregates + total_agent_execution_time = 0.0 + total_input_tokens = 0 + total_output_tokens = 0 + total_tokens = 0 + total_turns = 0 + + actual_model_name = None + + # Per-run pass@1 + pass1_rates_per_run = [] + + # For pass@k + pass_k_task_success_any = 0 + pass_power_k_task_success_all = 0 + + for run_idx in range(1, actual_runs + 1): + run_name = f"run-{run_idx}" + successes_this_run = 0 + + for task in all_tasks: + meta = results.get(run_name, {}).get(task) + + if not meta: + continue + + success = bool(meta.get("execution_result", {}).get("success", False)) + if success: + successes_this_run += 1 + + 
total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0) + in_tok, out_tok, ttl_tok = get_token_counts(meta) + total_input_tokens += in_tok + total_output_tokens += out_tok + total_tokens += ttl_tok + total_turns += int(meta.get("turn_count", 0) or 0) + + if actual_model_name is None: + actual_model_name = meta.get("actual_model_name") or None + + pass1_rate = successes_this_run / total_tasks if total_tasks > 0 else 0 + pass1_rates_per_run.append(pass1_rate) + print(f" Run {run_idx}: {successes_this_run}/{total_tasks} = {pass1_rate*100:.1f}%") + + # Calculate pass@k + for task in all_tasks: + successes = [] + for run_idx in range(1, actual_runs + 1): + run_name = f"run-{run_idx}" + meta = results.get(run_name, {}).get(task) + success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False + successes.append(success) + + if any(successes): + pass_k_task_success_any += 1 + if all(successes): + pass_power_k_task_success_all += 1 + + # Averages + denom = total_tasks * actual_runs if total_tasks > 0 else 1 + avg_agent_execution_time = total_agent_execution_time / denom + avg_input_tokens = total_input_tokens / denom + avg_output_tokens = total_output_tokens / denom + avg_total_tokens = total_tokens / denom + avg_turns = total_turns / denom + + # Pass@1 stats + if pass1_rates_per_run: + avg_pass1 = sum(pass1_rates_per_run) / len(pass1_rates_per_run) + mean = avg_pass1 + variance = sum((r - mean) ** 2 for r in pass1_rates_per_run) / len(pass1_rates_per_run) + std_pass1 = variance ** 0.5 + else: + avg_pass1 = 0.0 + std_pass1 = 0.0 + + # Cost calculation + per_run_input_tokens = total_input_tokens / actual_runs if actual_runs else 0 + per_run_output_tokens = total_output_tokens / actual_runs if actual_runs else 0 + model_for_pricing = actual_model_name or model_name + per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens) + + summary = { + "generated_at": datetime.now().isoformat(), 
+ "model": model_name, + "actual_model_name": actual_model_name or model_name, + "runs": actual_runs, + "total_tasks": total_tasks, + "total_agent_execution_time": round(total_agent_execution_time, 2), + "total_input_tokens": total_input_tokens, + "total_output_tokens": total_output_tokens, + "total_tokens": total_tokens, + "total_turns": total_turns, + "avg_agent_execution_time": round(avg_agent_execution_time, 4), + "avg_input_tokens": round(avg_input_tokens, 2), + "avg_output_tokens": round(avg_output_tokens, 2), + "avg_total_tokens": round(avg_total_tokens, 2), + "avg_turns": round(avg_turns, 2), + "per_run_input_tokens": round(per_run_input_tokens, 2), + "per_run_output_tokens": round(per_run_output_tokens, 2), + "per_run_cost": round(per_run_cost, 4) if per_run_cost else None, + "pass@1": { + "avg": round(avg_pass1, 4), + "std": round(std_pass1, 4), + "per_run": [round(r, 4) for r in pass1_rates_per_run] + }, + } + + if actual_runs > 1: + summary[f"pass@{actual_runs}"] = round(pass_k_task_success_any / total_tasks, 4) + summary[f"pass^{actual_runs}"] = round(pass_power_k_task_success_all / total_tasks, 4) + + return summary + + +def main(): + parser = argparse.ArgumentParser(description="Simple results aggregator for specific directories") + parser.add_argument("--result-dir", required=True, help="Path to result directory (e.g., results/exp/model__service)") + parser.add_argument("--k", type=int, default=4, help="Number of runs (default: 4)") + parser.add_argument("--output", help="Output JSON file path (default: /summary.json)") + + args = parser.parse_args() + + result_dir = Path(args.result_dir) + if not result_dir.exists(): + print(f"āŒ Result directory {result_dir} does not exist") + return 1 + + # Extract model name from directory name + model_name = result_dir.name.replace("__", "-") + + print(f"šŸ”„ Processing: {result_dir}") + print(f"šŸ“‹ Model: {model_name}") + + # Collect results + results = collect_results_from_dir(result_dir, args.k) + + if not 
results: + print("āŒ No results found") + return 1 + + # Calculate metrics + summary = calculate_metrics(results, args.k, model_name) + + # Save summary + output_path = Path(args.output) if args.output else result_dir / "summary.json" + with open(output_path, "w") as f: + json.dump(summary, f, indent=2) + + print(f"\nāœ… Summary saved to: {output_path}") + print(f"\nšŸ“ˆ Results:") + print(f" Pass@1: {summary['pass@1']['avg']*100:.1f}% ± {summary['pass@1']['std']*100:.1f}%") + if f"pass@{args.k}" in summary: + print(f" Pass@{args.k}: {summary[f'pass@{args.k}']*100:.1f}%") + print(f" Pass^{args.k}: {summary[f'pass^{args.k}']*100:.1f}%") + print(f" Per-run cost: ${summary['per_run_cost']:.4f}" if summary['per_run_cost'] else " Per-run cost: N/A") + print(f" Avg agent time: {summary['avg_agent_execution_time']:.2f}s") + print(f" Avg turns: {summary['avg_turns']:.2f}") + print(f"\nšŸ“Š Token Usage:") + avg_tokens_per_run = summary['total_tokens'] / summary['runs'] if summary['runs'] > 0 else 0 + print(f" Avg tokens per run: {avg_tokens_per_run:,.0f}") + print(f" Avg tokens per turn: {summary['avg_total_tokens'] / summary['avg_turns']:.0f}" if summary['avg_turns'] > 0 else " Avg tokens per turn: N/A") + print(f" Total tokens (all runs): {summary['total_tokens']:,}") + print(f" Total turns (all runs): {summary['total_turns']:,}") + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/src/mcp_services/insforge/insforge_state_manager.py b/src/mcp_services/insforge/insforge_state_manager.py index cb8a374d..7c5b7961 100644 --- a/src/mcp_services/insforge/insforge_state_manager.py +++ b/src/mcp_services/insforge/insforge_state_manager.py @@ -55,6 +55,12 @@ def __init__( except Exception as e: raise RuntimeError(f"Insforge initialization failed: {e}") + # Store baseline tables (system tables that exist before any tasks run) + self._baseline_tables = set( + (t['schema'], t['name']) for t in self._get_all_tables() + ) + logger.debug(f"Stored baseline: 
{len(self._baseline_tables)} tables") + def _test_connection(self): """Test backend connection.""" try: @@ -80,8 +86,7 @@ def _test_connection(self): def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: """Create initial backend state for a task. - This runs prepare_environment.py script if it exists in the task directory. - The script should use Insforge MCP tools or HTTP API to set up tables, data, etc. + Restores from backup which may place tables in public or task-specific schema. Args: task: Task for which to create initial state @@ -92,21 +97,50 @@ def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: try: # Generate unique state ID for this task run state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}" + schema_name = task.category_id logger.info(f"| Creating initial state for Insforge task: {task.name}") - # Run prepare_environment.py if it exists - task_prepared = self._run_prepare_environment(task) + # Get list of existing tables before restore (to track what we create) + tables_before = self._get_all_tables() + logger.info(f"| Tables before restore: {len(tables_before)}") - if not task_prepared: - logger.debug(f"| No prepare_environment.py found for task {task.name}") + # Create schema for this task (in case backup uses it) + self._drop_schema(schema_name) + self._create_schema(schema_name) - # Track the task context + # Restore from backup if backup exists (may create tables in public or task schema) + if self._restore_from_backup(schema_name): + logger.info(f"| āœ“ Restored '{schema_name}' from backup") + else: + logger.info(f"| ā—‹ No backup found for '{schema_name}'") + # Run prepare_environment.py if it exists + task_prepared = self._run_prepare_environment(task) + if not task_prepared: + logger.debug(f"| No prepare_environment.py found for task {task.name}") + + # Get list of tables after restore (to track what we need to clean up) + tables_after = self._get_all_tables() + + # 
Track ALL new tables created by the restore (compare before/after) + tables_before_set = {(t['schema'], t['name']) for t in tables_before} + created_tables = [ + t for t in tables_after + if (t['schema'], t['name']) not in tables_before_set + ] + + logger.info(f"| Tracked {len(created_tables)} new tables for cleanup") + for t in created_tables: + logger.debug(f"| - {t['schema']}.{t['name']}") + + # Track the task context including created tables context = { "state_id": state_id, "category_id": task.category_id, "task_id": task.task_id, "task_name": task.name, + "schema": schema_name, + "created_tables": created_tables, # Track all created tables } return InitialStateInfo( @@ -134,8 +168,8 @@ def _store_initial_state_info( def _cleanup_task_initial_state(self, task: BaseTask) -> bool: """Clean up task-specific resources. - Note: Actual cleanup of created resources is delegated to prepare_environment.py - cleanup scripts or handled by _cleanup_tracked_resources. + Drops ALL tables created during task (both setup and agent-created) + by comparing against baseline. 
Args: task: Task whose initial state should be cleaned up @@ -146,10 +180,39 @@ def _cleanup_task_initial_state(self, task: BaseTask) -> bool: try: logger.info(f"| Cleaning up initial state for task: {task.name}") - # Clear current task context - if (self._current_task_context and - self._current_task_context.get("task_name") == task.name): - self._current_task_context = None + if self._current_task_context: + schema_name = self._current_task_context.get("schema") + + # Get ALL current tables + all_current_tables = self._get_all_tables() + + # Find tables to drop: anything not in baseline + tables_to_drop = [ + t for t in all_current_tables + if (t['schema'], t['name']) not in self._baseline_tables + ] + + logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)") + + # Drop individual tables + for table_info in tables_to_drop: + try: + self._drop_table(table_info["schema"], table_info["name"]) + logger.debug(f"| āœ“ Dropped table: {table_info['schema']}.{table_info['name']}") + except Exception as e: + logger.warning(f"| Failed to drop table {table_info}: {e}") + + # Drop the task schema (may be empty if all tables were in public) + if schema_name: + try: + self._drop_schema(schema_name) + logger.info(f"| āœ“ Dropped schema: {schema_name}") + except Exception as e: + logger.warning(f"| Failed to drop schema {schema_name}: {e}") + + # Clear task context + if self._current_task_context.get("task_name") == task.name: + self._current_task_context = None logger.info(f"| āœ“ Initial state cleanup completed for {task.name}") return True @@ -237,6 +300,177 @@ def _get_timestamp(self) -> str: return datetime.now().strftime("%Y%m%d%H%M%S") + def _drop_schema(self, schema_name: str) -> None: + """Drop schema and all its contents.""" + import psycopg2 + from psycopg2 import sql + + conn_params = { + "host": "localhost", + "port": 5432, + "user": "postgres", + "password": "postgres", + "database": "insforge", + } + + conn = 
psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format( + sql.Identifier(schema_name) + ) + ) + logger.debug(f"| Dropped schema: {schema_name}") + finally: + conn.close() + + def _create_schema(self, schema_name: str) -> None: + """Create empty schema.""" + import psycopg2 + from psycopg2 import sql + + conn_params = { + "host": "localhost", + "port": 5432, + "user": "postgres", + "password": "postgres", + "database": "insforge", + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name)) + ) + logger.debug(f"| Created schema: {schema_name}") + finally: + conn.close() + + def _get_all_tables(self) -> List[Dict[str, str]]: + """Get list of all user tables. + + Returns: + List of dicts with 'schema' and 'name' keys + """ + import psycopg2 + + conn_params = { + "host": "localhost", + "port": 5432, + "user": "postgres", + "password": "postgres", + "database": "insforge", + } + + conn = psycopg2.connect(**conn_params) + try: + with conn.cursor() as cur: + cur.execute(""" + SELECT table_schema, table_name + FROM information_schema.tables + WHERE table_type = 'BASE TABLE' + AND table_schema NOT IN ('information_schema', 'pg_catalog') + AND table_schema NOT LIKE 'pg_%' + AND table_name NOT LIKE '\\_%' + ORDER BY table_schema, table_name + """) + rows = cur.fetchall() + return [{"schema": row[0], "name": row[1]} for row in rows] + finally: + conn.close() + + def _drop_table(self, schema_name: str, table_name: str) -> None: + """Drop a specific table.""" + import psycopg2 + from psycopg2 import sql + + conn_params = { + "host": "localhost", + "port": 5432, + "user": "postgres", + "password": "postgres", + "database": "insforge", + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + 
cur.execute( + sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( + sql.Identifier(schema_name), + sql.Identifier(table_name) + ) + ) + logger.debug(f"| Dropped table: {schema_name}.{table_name}") + finally: + conn.close() + + def _restore_from_backup(self, category_name: str) -> bool: + """Restore from backup file. + + Tables may be restored into public schema or category-specific schema + depending on how the backup was created. + + Args: + category_name: Name of category (e.g., 'employees', 'chinook', 'lego') + + Returns: + True if backup was restored, False if no backup exists + """ + # Path to backup file + backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state" + backup_file = backup_dir / f"{category_name}.backup" + + logger.debug(f"| Looking for backup at: {backup_file}") + logger.debug(f"| Backup exists: {backup_file.exists()}") + + if not backup_file.exists(): + logger.info(f"| ā—‹ No backup file found: {backup_file}") + return False + + logger.info(f"| Restoring {category_name} from backup...") + + # Set up environment for pg_restore + env = os.environ.copy() + env["PGPASSWORD"] = "postgres" + + try: + # Restore backup without schema filter (tables go to whatever schema they're in) + result = subprocess.run( + [ + "pg_restore", + "-h", "localhost", + "-p", "5432", + "-U", "postgres", + "-d", "insforge", + "-v", + str(backup_file), + ], + env=env, + capture_output=True, + text=True, + timeout=120, # 2 minute timeout + ) + + if result.returncode != 0 and "ERROR" in result.stderr: + logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}") + return False + + logger.info(f"| āœ“ {category_name} restored successfully") + return True + + except subprocess.TimeoutExpired: + logger.error(f"| āœ— Restore timed out for {category_name}") + return False + except Exception as e: + logger.error(f"| āœ— Failed to restore {category_name}: {e}") + return False + def get_service_config_for_agent(self) -> dict: """Get 
configuration for agent execution. @@ -266,7 +500,15 @@ def set_verification_environment(self, messages_path: str = None) -> None: os.environ["INSFORGE_BACKEND_URL"] = self.backend_url os.environ["INSFORGE_API_KEY"] = self.api_key + # Set PostgreSQL connection details for direct database verification + # (Insforge exposes its internal postgres database for verification) + os.environ["POSTGRES_HOST"] = "localhost" + os.environ["POSTGRES_PORT"] = "5432" + os.environ["POSTGRES_DATABASE"] = "insforge" + os.environ["POSTGRES_USERNAME"] = "postgres" + os.environ["POSTGRES_PASSWORD"] = "postgres" + if messages_path: os.environ["MCP_MESSAGES"] = str(messages_path) - logger.debug("Verification environment variables set for Insforge") + logger.debug("Verification environment variables set for Insforge (including direct postgres access)") diff --git a/src/mcp_services/supabase/__init__.py b/src/mcp_services/supabase/__init__.py new file mode 100644 index 00000000..669a7286 --- /dev/null +++ b/src/mcp_services/supabase/__init__.py @@ -0,0 +1,11 @@ +"""Supabase MCP service integration for MCPMark.""" + +from .supabase_login_helper import SupabaseLoginHelper +from .supabase_state_manager import SupabaseStateManager +from .supabase_task_manager import SupabaseTaskManager + +__all__ = [ + "SupabaseLoginHelper", + "SupabaseStateManager", + "SupabaseTaskManager", +] diff --git a/src/mcp_services/supabase/supabase_login_helper.py b/src/mcp_services/supabase/supabase_login_helper.py new file mode 100644 index 00000000..dd7253ac --- /dev/null +++ b/src/mcp_services/supabase/supabase_login_helper.py @@ -0,0 +1,168 @@ +""" +Supabase Login Helper for MCPMark +=================================== + +Handles configuration and validation for Supabase MCP service. 
+""" + +import os +from typing import Dict, Any, Optional + +from src.base.login_helper import BaseLoginHelper +from src.logger import get_logger + +logger = get_logger(__name__) + + +class SupabaseLoginHelper(BaseLoginHelper): + """Login helper for Supabase MCP service. + + Validates PostgREST API URL and API key configuration. + """ + + def __init__(self): + super().__init__("supabase") + + def prepare_credentials(self) -> Dict[str, Any]: + """Prepare credentials for Supabase/PostgREST connection. + + Returns: + Dictionary containing api_url, api_key, and postgres connection details + """ + # Get PostgREST API configuration (from Supabase CLI) + api_url = os.getenv("SUPABASE_API_URL", "http://localhost:54321") + api_key = os.getenv("SUPABASE_API_KEY") + + # Get PostgreSQL connection details (Supabase CLI defaults) + postgres_host = os.getenv("SUPABASE_DB_HOST", "localhost") + postgres_port = int(os.getenv("SUPABASE_DB_PORT", "54322")) + postgres_user = os.getenv("SUPABASE_DB_USER", "postgres") + postgres_password = os.getenv("SUPABASE_DB_PASSWORD", "postgres") + postgres_database = os.getenv("SUPABASE_DB_NAME", "postgres") + + if not api_key: + logger.warning( + "SUPABASE_API_KEY not set.\n" + "Run 'supabase status' to get your anon or service_role key.\n" + "Set SUPABASE_API_KEY in your .mcp_env file." + ) + # Try to get it from supabase status + api_key = self._get_key_from_supabase_status() + + return { + "api_url": api_url, + "api_key": api_key or "", + "postgres_host": postgres_host, + "postgres_port": postgres_port, + "postgres_user": postgres_user, + "postgres_password": postgres_password, + "postgres_database": postgres_database, + } + + def _get_key_from_supabase_status(self) -> Optional[str]: + """Try to get anon key from supabase status command. 
+ + Returns: + Anon key if found, None otherwise + """ + import subprocess + + try: + result = subprocess.run( + ["supabase", "status"], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + # Parse output for anon key + for line in result.stdout.split('\n'): + if 'anon key:' in line.lower(): + # Extract the key after the colon + key = line.split(':', 1)[1].strip() + logger.info("Found anon key from 'supabase status'") + return key + + except (subprocess.SubprocessError, FileNotFoundError): + logger.debug("Could not run 'supabase status' to get anon key") + + return None + + def test_credentials(self, credentials: Dict[str, Any]) -> bool: + """Test if Supabase credentials are valid. + + Args: + credentials: Dictionary with api_url, api_key, and postgres connection details + + Returns: + True if credentials are valid + """ + import requests + import psycopg2 + + api_url = credentials["api_url"] + api_key = credentials.get("api_key", "") + + # Test PostgreSQL connection + try: + conn_params = { + "host": credentials["postgres_host"], + "port": credentials["postgres_port"], + "user": credentials["postgres_user"], + "password": credentials["postgres_password"], + "database": credentials["postgres_database"], + } + conn = psycopg2.connect(**conn_params) + conn.close() + logger.info("āœ“ PostgreSQL connection successful") + except Exception as e: + logger.error(f"āœ— PostgreSQL connection failed: {e}") + return False + + # Test PostgREST API connection (optional - may not be running yet) + try: + headers = {} + if api_key: + headers["apikey"] = api_key + headers["Authorization"] = f"Bearer {api_key}" + + response = requests.get(api_url, headers=headers, timeout=5) + + # Any response (including 404, 401) means the API is reachable + logger.info(f"āœ“ PostgREST API reachable at {api_url} (status: {response.status_code})") + return True + + except requests.exceptions.ConnectionError: + logger.warning( + f"⚠ PostgREST API not reachable at 
{api_url}.\n" + "Make sure PostgREST is running (e.g., docker run -p 3000:3000 postgrest/postgrest)\n" + "or use a cloud Supabase instance URL." + ) + # Still return True as PostgreSQL connection works + return True + except Exception as e: + logger.warning(f"⚠ PostgREST API test failed: {e}") + # Still return True as PostgreSQL connection works + return True + + def format_credentials_info(self, credentials: Dict[str, Any]) -> str: + """Format credentials info for display. + + Args: + credentials: Dictionary with connection details + + Returns: + Formatted string describing the credentials + """ + api_url = credentials["api_url"] + has_api_key = bool(credentials.get("api_key")) + postgres_host = credentials["postgres_host"] + postgres_db = credentials["postgres_database"] + + return ( + f"Supabase Configuration:\n" + f" API URL: {api_url}\n" + f" API Key: {'āœ“ Configured' if has_api_key else 'āœ— Not set'}\n" + f" PostgreSQL: {postgres_host}/{postgres_db}" + ) diff --git a/src/mcp_services/supabase/supabase_state_manager.py b/src/mcp_services/supabase/supabase_state_manager.py new file mode 100644 index 00000000..f9895887 --- /dev/null +++ b/src/mcp_services/supabase/supabase_state_manager.py @@ -0,0 +1,518 @@ +""" +Supabase State Manager for MCPMark +==================================== + +Manages database state for Supabase tasks using the same PostgreSQL backend +as Insforge, but accessed via PostgREST/Supabase MCP server. +""" + +import os +import sys +import subprocess +import psycopg2 +from psycopg2 import sql +from pathlib import Path +from typing import Optional, Dict, Any, List + +from src.base.state_manager import BaseStateManager, InitialStateInfo +from src.base.task_manager import BaseTask +from src.logger import get_logger + +logger = get_logger(__name__) + + +class SupabaseStateManager(BaseStateManager): + """Manages Supabase/PostgREST database state for task evaluation. 
+ + Uses the same PostgreSQL database as Insforge but exposes it via + PostgREST API for the Supabase MCP server to access. + """ + + def __init__( + self, + api_url: str, + api_key: str, + postgres_host: str = "localhost", + postgres_port: int = 54322, # Supabase CLI default port + postgres_user: str = "postgres", + postgres_password: str = "postgres", + postgres_database: str = "postgres", # Supabase CLI default database + ): + """Initialize Supabase state manager. + + Args: + api_url: PostgREST API URL from Supabase CLI (default: http://localhost:54321) + api_key: API key from Supabase CLI (anon or service_role key) + postgres_host: PostgreSQL host for direct database operations + postgres_port: PostgreSQL port (Supabase CLI uses 54322) + postgres_user: PostgreSQL username + postgres_password: PostgreSQL password + postgres_database: Main PostgreSQL database name + """ + super().__init__(service_name="supabase") + + self.api_url = api_url.rstrip('/') + self.api_key = api_key + + # PostgreSQL connection for state management (Supabase CLI instance) + self.postgres_host = postgres_host + self.postgres_port = postgres_port + self.postgres_user = postgres_user + self.postgres_password = postgres_password + self.postgres_database = postgres_database + + # Track current task context for agent configuration + self._current_task_context: Optional[Dict[str, Any]] = None + + # Validate connection on initialization + try: + self._test_connection() + logger.info("Supabase state manager initialized successfully") + except Exception as e: + raise RuntimeError(f"Supabase initialization failed: {e}") + + # Store baseline tables (system tables that exist before any tasks run) + self._baseline_tables = set( + (t['schema'], t['name']) for t in self._get_all_tables() + ) + logger.debug(f"Stored baseline: {len(self._baseline_tables)} tables") + + def _test_connection(self): + """Test PostgreSQL connection.""" + try: + conn_params = { + "host": self.postgres_host, + "port": 
self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + conn = psycopg2.connect(**conn_params) + conn.close() + logger.debug("PostgreSQL connection test successful") + except Exception as e: + raise RuntimeError(f"Cannot connect to PostgreSQL: {e}") + + def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: + """Create initial backend state for a task. + + Restores from backup which may place tables in public or task-specific schema. + + Args: + task: Task for which to create initial state + + Returns: + InitialStateInfo object or None if creation failed + """ + try: + # Generate unique state ID for this task run + state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}" + schema_name = task.category_id + + logger.info(f"| Creating initial state for Supabase task: {task.name}") + + # Drop schema first (cleanup from previous runs) + self._drop_schema(schema_name) + + # Get list of existing tables before restore (to track what we create) + tables_before = self._get_all_tables() + logger.info(f"| Tables before restore: {len(tables_before)}") + + # Create schema for this task (in case backup uses it) + self._create_schema(schema_name) + + # Restore from backup if backup exists (may create tables in public or task schema) + if self._restore_from_backup(schema_name): + logger.info(f"| āœ“ Restored '{schema_name}' from backup") + else: + logger.info(f"| ā—‹ No backup found for '{schema_name}'") + # Run prepare_environment.py if it exists + task_prepared = self._run_prepare_environment(task) + if not task_prepared: + logger.debug(f"| No prepare_environment.py found for task {task.name}") + + # Get list of tables after restore (to track what we need to clean up) + tables_after = self._get_all_tables() + + # Track ALL new tables created by the restore (compare before/after) + tables_before_set = {(t['schema'], t['name']) for t in tables_before} + created_tables 
= [ + t for t in tables_after + if (t['schema'], t['name']) not in tables_before_set + ] + + logger.info(f"| Tracked {len(created_tables)} new tables for cleanup") + for t in created_tables: + logger.debug(f"| - {t['schema']}.{t['name']}") + + # Track the task context including created tables + context = { + "state_id": state_id, + "category_id": task.category_id, + "task_id": task.task_id, + "task_name": task.name, + "schema": schema_name, + "created_tables": created_tables, + } + + return InitialStateInfo( + state_id=state_id, + state_url=self.api_url, + metadata=context, + ) + + except Exception as e: + logger.error(f"Failed to create initial state for {task.name}: {e}") + return None + + def _store_initial_state_info( + self, task: BaseTask, state_info: InitialStateInfo + ) -> None: + """Store backend info in task object for agent access.""" + if hasattr(task, "__dict__"): + task.api_url = self.api_url + task.api_key = self.api_key + task.state_id = state_info.state_id + + # Store current task context for agent configuration + self._current_task_context = state_info.metadata + + def _cleanup_task_initial_state(self, task: BaseTask) -> bool: + """Clean up task-specific resources. + + Drops ALL tables created during task (both setup and agent-created) + by comparing against baseline. 
+ + Args: + task: Task whose initial state should be cleaned up + + Returns: + True if cleanup successful + """ + try: + logger.info(f"| Cleaning up initial state for task: {task.name}") + + if self._current_task_context: + schema_name = self._current_task_context.get("schema") + + # Get ALL current tables + all_current_tables = self._get_all_tables() + + # Find tables to drop: anything not in baseline + tables_to_drop = [ + t for t in all_current_tables + if (t['schema'], t['name']) not in self._baseline_tables + ] + + logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)") + + # Drop individual tables + for table_info in tables_to_drop: + try: + self._drop_table(table_info["schema"], table_info["name"]) + logger.debug(f"| āœ“ Dropped table: {table_info['schema']}.{table_info['name']}") + except Exception as e: + logger.warning(f"| Failed to drop table {table_info}: {e}") + + # Drop the task schema (may be empty if all tables were in public) + if schema_name: + try: + self._drop_schema(schema_name) + logger.info(f"| āœ“ Dropped schema: {schema_name}") + except Exception as e: + logger.warning(f"| Failed to drop schema {schema_name}: {e}") + + # Clear task context + if self._current_task_context.get("task_name") == task.name: + self._current_task_context = None + + logger.info(f"| āœ“ Initial state cleanup completed for {task.name}") + return True + + except Exception as e: + logger.error(f"Failed to cleanup task initial state for {task.name}: {e}") + return False + + def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: + """Clean up a single tracked resource. 
+ + Args: + resource: Resource dictionary with type, id, and metadata + + Returns: + True if cleanup successful + """ + resource_type = resource["type"] + resource_id = resource["id"] + + logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)") + return True + + def _run_prepare_environment(self, task: BaseTask) -> bool: + """Run prepare_environment.py script if it exists in the task directory. + + The script should use database operations to set up required state. + + Args: + task: Task for which to prepare environment + + Returns: + True if script ran successfully, False if script doesn't exist + """ + task_dir = task.task_instruction_path.parent + prepare_script = task_dir / "prepare_environment.py" + + if not prepare_script.exists(): + logger.debug(f"No prepare_environment.py found for task {task.name}") + return False + + logger.info(f"| Running prepare_environment.py for task {task.name}") + + # Set up environment variables for the script + env = os.environ.copy() + env.update({ + "SUPABASE_API_URL": self.api_url, + "SUPABASE_API_KEY": self.api_key, + "POSTGRES_HOST": self.postgres_host, + "POSTGRES_PORT": str(self.postgres_port), + "POSTGRES_DATABASE": self.postgres_database, + "POSTGRES_USERNAME": self.postgres_user, + "POSTGRES_PASSWORD": self.postgres_password, + }) + + try: + # Run the prepare_environment.py script + result = subprocess.run( + [sys.executable, str(prepare_script)], + cwd=str(task_dir), # Run from task directory + env=env, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode == 0: + logger.info(f"| āœ“ Environment preparation completed for {task.name}") + if result.stdout.strip(): + logger.debug(f"| prepare_environment.py output: {result.stdout}") + return True + else: + logger.error(f"| āœ— Environment preparation failed for {task.name}") + logger.error(f"| Error output: {result.stderr}") + raise RuntimeError(f"prepare_environment.py failed with exit code 
{result.returncode}") + + except subprocess.TimeoutExpired: + logger.error(f"āœ— Environment preparation timed out for {task.name}") + raise RuntimeError("prepare_environment.py execution timed out") + except Exception as e: + logger.error(f"āœ— Failed to run prepare_environment.py for {task.name}: {e}") + raise + + def _get_timestamp(self) -> str: + """Get timestamp for unique naming.""" + from datetime import datetime + return datetime.now().strftime("%Y%m%d%H%M%S") + + def _drop_schema(self, schema_name: str) -> None: + """Drop schema and all its contents.""" + conn_params = { + "host": self.postgres_host, + "port": self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format( + sql.Identifier(schema_name) + ) + ) + logger.debug(f"| Dropped schema: {schema_name}") + finally: + conn.close() + + def _create_schema(self, schema_name: str) -> None: + """Create empty schema.""" + conn_params = { + "host": self.postgres_host, + "port": self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name)) + ) + logger.debug(f"| Created schema: {schema_name}") + finally: + conn.close() + + def _get_all_tables(self) -> List[Dict[str, str]]: + """Get list of all user tables. 
+ + Returns: + List of dicts with 'schema' and 'name' keys + """ + conn_params = { + "host": self.postgres_host, + "port": self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + + conn = psycopg2.connect(**conn_params) + try: + with conn.cursor() as cur: + cur.execute(""" + SELECT table_schema, table_name + FROM information_schema.tables + WHERE table_type = 'BASE TABLE' + AND table_schema NOT IN ('information_schema', 'pg_catalog') + AND table_schema NOT LIKE 'pg_%' + AND table_name NOT LIKE '\\_%' + ORDER BY table_schema, table_name + """) + rows = cur.fetchall() + return [{"schema": row[0], "name": row[1]} for row in rows] + finally: + conn.close() + + def _drop_table(self, schema_name: str, table_name: str) -> None: + """Drop a specific table.""" + conn_params = { + "host": self.postgres_host, + "port": self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( + sql.Identifier(schema_name), + sql.Identifier(table_name) + ) + ) + logger.debug(f"| Dropped table: {schema_name}.{table_name}") + finally: + conn.close() + + def _restore_from_backup(self, category_name: str) -> bool: + """Restore from backup file. + + Tables may be restored into public schema or category-specific schema + depending on how the backup was created. 
+ + Args: + category_name: Name of category (e.g., 'employees', 'chinook', 'lego') + + Returns: + True if backup was restored, False if no backup exists + """ + # Path to backup file (same as used by Insforge/Postgres) + backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state" + backup_file = backup_dir / f"{category_name}.backup" + + logger.debug(f"| Looking for backup at: {backup_file}") + + if not backup_file.exists(): + logger.info(f"| ā—‹ No backup file found: {backup_file}") + return False + + logger.info(f"| Restoring {category_name} from backup...") + + # Set up environment for pg_restore + env = os.environ.copy() + env["PGPASSWORD"] = self.postgres_password + + try: + # Restore backup + result = subprocess.run( + [ + "pg_restore", + "-h", self.postgres_host, + "-p", str(self.postgres_port), + "-U", self.postgres_user, + "-d", self.postgres_database, + "-v", + str(backup_file), + ], + env=env, + capture_output=True, + text=True, + timeout=120, # 2 minute timeout + ) + + if result.returncode != 0 and "ERROR" in result.stderr: + logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}") + return False + + logger.info(f"| āœ“ {category_name} restored successfully") + return True + + except subprocess.TimeoutExpired: + logger.error(f"| āœ— Restore timed out for {category_name}") + return False + except Exception as e: + logger.error(f"| āœ— Failed to restore {category_name}: {e}") + return False + + def get_service_config_for_agent(self) -> dict: + """Get configuration for agent execution. + + This configuration is passed to the agent/MCP server so it can + connect to the Supabase/PostgREST endpoint. 
+ + Returns: + Dictionary containing API URL and API key + """ + config = { + "api_url": self.api_url, + "api_key": self.api_key, + "schema": "public", # Default schema for PostgREST + } + + # Include current task context if available + if self._current_task_context: + config["task_context"] = self._current_task_context + # If task uses a specific schema, include it + if self._current_task_context.get("schema"): + config["schema"] = self._current_task_context["schema"] + + return config + + def set_verification_environment(self, messages_path: str = None) -> None: + """Set environment variables needed for verification scripts. + + Args: + messages_path: Optional path to messages.json file for verification + """ + os.environ["SUPABASE_API_URL"] = self.api_url + os.environ["SUPABASE_API_KEY"] = self.api_key + + # Set PostgreSQL connection details for direct database verification + os.environ["POSTGRES_HOST"] = self.postgres_host + os.environ["POSTGRES_PORT"] = str(self.postgres_port) + os.environ["POSTGRES_DATABASE"] = self.postgres_database + os.environ["POSTGRES_USERNAME"] = self.postgres_user + os.environ["POSTGRES_PASSWORD"] = self.postgres_password + + if messages_path: + os.environ["MCP_MESSAGES"] = str(messages_path) + + logger.debug("Verification environment variables set for Supabase (including direct postgres access)") diff --git a/src/mcp_services/supabase/supabase_task_manager.py b/src/mcp_services/supabase/supabase_task_manager.py new file mode 100644 index 00000000..1d69a8ce --- /dev/null +++ b/src/mcp_services/supabase/supabase_task_manager.py @@ -0,0 +1,113 @@ +""" +Supabase Task Manager for MCPMark +=================================== + +Manages Supabase task discovery, execution, and verification. +Reuses Postgres tasks but accesses them via PostgREST/Supabase MCP. 
+""" + +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional + +from src.base.task_manager import BaseTask, BaseTaskManager +from src.logger import get_logger + +logger = get_logger(__name__) + + +@dataclass +class SupabaseTask(BaseTask): + """Supabase-specific task with API information.""" + + task_name: str = "" + api_url: Optional[str] = None + api_key: Optional[str] = None + + +class SupabaseTaskManager(BaseTaskManager): + """Manages Supabase tasks for MCPMark evaluation. + + Uses the same task structure as Postgres tasks but accessed via + PostgREST/Supabase MCP server. + """ + + def __init__(self, tasks_root: Path = None): + """Initialize Supabase task manager. + + Args: + tasks_root: Path to tasks directory + """ + if tasks_root is None: + tasks_root = Path(__file__).resolve().parents[3] / "tasks" + + super().__init__( + tasks_root, + mcp_service="supabase", + task_class=SupabaseTask, + task_organization="file", # Supabase uses file-based tasks (like Postgres) + ) + + def _create_task_from_files( + self, category_id: str, task_files_info: Dict[str, Any] + ) -> Optional[SupabaseTask]: + """Instantiate a `SupabaseTask` from the dictionary returned by `_find_task_files`.""" + import json + + # Check for meta.json + meta_path = task_files_info["instruction_path"].parent / "meta.json" + final_category_id = category_id + task_id = task_files_info["task_id"] + + if meta_path.exists(): + try: + with open(meta_path, 'r') as f: + meta_data = json.load(f) + # Use values from meta.json if available + final_category_id = meta_data.get("category_id", category_id) + task_id = meta_data.get("task_id", task_id) + except Exception as e: + logger.warning(f"Failed to load meta.json from {meta_path}: {e}") + + return SupabaseTask( + task_instruction_path=task_files_info["instruction_path"], + task_verification_path=task_files_info["verification_path"], + service="supabase", + 
category_id=final_category_id, + task_id=task_id, + task_name=task_files_info["task_id"], + ) + + def _get_verification_command(self, task: SupabaseTask) -> List[str]: + """Get verification command with Supabase API info.""" + cmd = [sys.executable, str(task.task_verification_path)] + return cmd + + def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: + """Run verification with Supabase environment.""" + env = os.environ.copy() + + # Pass Supabase connection info to verification script + if hasattr(task, "api_url") and task.api_url: + env["SUPABASE_API_URL"] = task.api_url + + if hasattr(task, "api_key") and task.api_key: + env["SUPABASE_API_KEY"] = task.api_key + + return subprocess.run( + self._get_verification_command(task), + capture_output=True, + text=True, + timeout=300, + env=env, + ) + + def _format_task_instruction(self, base_instruction: str) -> str: + """Add Supabase-specific instructions.""" + return ( + base_instruction + + "\n\nNote: Use Supabase MCP tools (PostgREST) to complete this task. The API connection is already configured." 
+ ) diff --git a/src/services.py b/src/services.py index 67111121..041ce311 100644 --- a/src/services.py +++ b/src/services.py @@ -300,6 +300,70 @@ "mcp_server": None, "eval_config": None, }, + "supabase": { + "config_schema": { + "api_url": { + "env_var": "SUPABASE_API_URL", + "required": False, + "description": "Supabase PostgREST API URL (default: http://localhost:54321 from CLI)", + "default": "http://localhost:54321", + }, + "api_key": { + "env_var": "SUPABASE_API_KEY", + "required": False, + "description": "Supabase API key (anon or service_role key from 'supabase status')", + }, + "postgres_host": { + "env_var": "SUPABASE_DB_HOST", + "required": False, + "description": "PostgreSQL host for Supabase CLI instance", + "default": "localhost", + }, + "postgres_port": { + "env_var": "SUPABASE_DB_PORT", + "required": False, + "description": "PostgreSQL port for Supabase CLI instance (default: 54322)", + "default": 54322, + }, + "postgres_user": { + "env_var": "SUPABASE_DB_USER", + "required": False, + "description": "PostgreSQL username", + "default": "postgres", + }, + "postgres_password": { + "env_var": "SUPABASE_DB_PASSWORD", + "required": False, + "description": "PostgreSQL password", + "default": "postgres", + }, + "postgres_database": { + "env_var": "SUPABASE_DB_NAME", + "required": False, + "description": "PostgreSQL database name", + "default": "postgres", + }, + }, + "components": { + "task_manager": "src.mcp_services.supabase.supabase_task_manager.SupabaseTaskManager", + "state_manager": "src.mcp_services.supabase.supabase_state_manager.SupabaseStateManager", + "login_helper": "src.mcp_services.supabase.supabase_login_helper.SupabaseLoginHelper", + }, + "config_mapping": { + "state_manager": { + "api_url": "api_url", + "api_key": "api_key", + "postgres_host": "postgres_host", + "postgres_port": "postgres_port", + "postgres_user": "postgres_user", + "postgres_password": "postgres_password", + "postgres_database": "postgres_database", + }, + "login_helper": 
{}, + }, + "mcp_server": None, + "eval_config": None, + }, "playwright_webarena": { "config_schema": { "browser": { diff --git a/supabase/.branches/_current_branch b/supabase/.branches/_current_branch new file mode 100644 index 00000000..88d050b1 --- /dev/null +++ b/supabase/.branches/_current_branch @@ -0,0 +1 @@ +main \ No newline at end of file diff --git a/supabase/.temp/cli-latest b/supabase/.temp/cli-latest new file mode 100644 index 00000000..2213dd2c --- /dev/null +++ b/supabase/.temp/cli-latest @@ -0,0 +1 @@ +v2.51.0 \ No newline at end of file diff --git a/tasks/insforge b/tasks/insforge new file mode 120000 index 00000000..7d72bd78 --- /dev/null +++ b/tasks/insforge @@ -0,0 +1 @@ +postgres \ No newline at end of file diff --git a/tasks/supabase b/tasks/supabase new file mode 120000 index 00000000..7d72bd78 --- /dev/null +++ b/tasks/supabase @@ -0,0 +1 @@ +postgres \ No newline at end of file From 757c5688537abe6629dab2e747891f5cb20df965 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Sun, 19 Oct 2025 16:44:27 -0700 Subject: [PATCH 3/5] change state manager --- src/mcp_services/insforge/insforge_state_manager.py | 7 ++++--- src/mcp_services/supabase/supabase_state_manager.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mcp_services/insforge/insforge_state_manager.py b/src/mcp_services/insforge/insforge_state_manager.py index 7c5b7961..0a910bb2 100644 --- a/src/mcp_services/insforge/insforge_state_manager.py +++ b/src/mcp_services/insforge/insforge_state_manager.py @@ -101,13 +101,14 @@ def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: logger.info(f"| Creating initial state for Insforge task: {task.name}") + # Drop schema first (cleanup from previous runs) + self._drop_schema(schema_name) + # Get list of existing tables before restore (to track what we create) tables_before = self._get_all_tables() logger.info(f"| Tables before restore: {len(tables_before)}") - # Create schema for this task (in case 
backup uses it) - self._drop_schema(schema_name) - self._create_schema(schema_name) + # Note: Don't create schema here - pg_restore will create it from the backup # Restore from backup if backup exists (may create tables in public or task schema) if self._restore_from_backup(schema_name): diff --git a/src/mcp_services/supabase/supabase_state_manager.py b/src/mcp_services/supabase/supabase_state_manager.py index f9895887..a8927c0b 100644 --- a/src/mcp_services/supabase/supabase_state_manager.py +++ b/src/mcp_services/supabase/supabase_state_manager.py @@ -118,8 +118,7 @@ def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: tables_before = self._get_all_tables() logger.info(f"| Tables before restore: {len(tables_before)}") - # Create schema for this task (in case backup uses it) - self._create_schema(schema_name) + # Note: Don't create schema here - pg_restore will create it from the backup # Restore from backup if backup exists (may create tables in public or task schema) if self._restore_from_backup(schema_name): From 2540e219c3f925ec2cf263c6ae0c883c59c3b442 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Sun, 19 Oct 2025 22:07:05 -0700 Subject: [PATCH 4/5] materialized view --- src/mcp_services/insforge/insforge_state_manager.py | 12 ++++++++++-- src/mcp_services/supabase/supabase_state_manager.py | 12 ++++++++++-- src/model_config.py | 5 +++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/mcp_services/insforge/insforge_state_manager.py b/src/mcp_services/insforge/insforge_state_manager.py index 0a910bb2..c2d3f927 100644 --- a/src/mcp_services/insforge/insforge_state_manager.py +++ b/src/mcp_services/insforge/insforge_state_manager.py @@ -385,7 +385,7 @@ def _get_all_tables(self) -> List[Dict[str, str]]: conn.close() def _drop_table(self, schema_name: str, table_name: str) -> None: - """Drop a specific table.""" + """Drop a specific table or materialized view.""" import psycopg2 from psycopg2 import sql @@ -401,13 
+401,21 @@ def _drop_table(self, schema_name: str, table_name: str) -> None: conn.autocommit = True try: with conn.cursor() as cur: + # Try dropping as table first cur.execute( sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( sql.Identifier(schema_name), sql.Identifier(table_name) ) ) - logger.debug(f"| Dropped table: {schema_name}.{table_name}") + # Also try dropping as materialized view (in case agent created one) + cur.execute( + sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format( + sql.Identifier(schema_name), + sql.Identifier(table_name) + ) + ) + logger.debug(f"| Dropped table/view: {schema_name}.{table_name}") finally: conn.close() diff --git a/src/mcp_services/supabase/supabase_state_manager.py b/src/mcp_services/supabase/supabase_state_manager.py index a8927c0b..1bea58f2 100644 --- a/src/mcp_services/supabase/supabase_state_manager.py +++ b/src/mcp_services/supabase/supabase_state_manager.py @@ -388,7 +388,7 @@ def _get_all_tables(self) -> List[Dict[str, str]]: conn.close() def _drop_table(self, schema_name: str, table_name: str) -> None: - """Drop a specific table.""" + """Drop a specific table or materialized view.""" conn_params = { "host": self.postgres_host, "port": self.postgres_port, @@ -401,13 +401,21 @@ def _drop_table(self, schema_name: str, table_name: str) -> None: conn.autocommit = True try: with conn.cursor() as cur: + # Try dropping as table first cur.execute( sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( sql.Identifier(schema_name), sql.Identifier(table_name) ) ) - logger.debug(f"| Dropped table: {schema_name}.{table_name}") + # Also try dropping as materialized view (in case agent created one) + cur.execute( + sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format( + sql.Identifier(schema_name), + sql.Identifier(table_name) + ) + ) + logger.debug(f"| Dropped table/view: {schema_name}.{table_name}") finally: conn.close() diff --git a/src/model_config.py b/src/model_config.py index 28209ab3..f8fadde9 
100644 --- a/src/model_config.py +++ b/src/model_config.py @@ -97,6 +97,11 @@ class ModelConfig: "api_key_var": "ANTHROPIC_API_KEY", "litellm_input_model_name": "anthropic/claude-sonnet-4-20250514", }, + "claude-sonnet-4.5": { + "provider": "anthropic", + "api_key_var": "ANTHROPIC_API_KEY", + "litellm_input_model_name": "anthropic/claude-sonnet-4-5-20250929", + }, "claude-opus-4": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", From 75c159694839147edf034402dd80fba3c85b53c1 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Tue, 21 Oct 2025 21:58:58 -0700 Subject: [PATCH 5/5] unused task --- supabase/.branches/_current_branch | 1 - supabase/.temp/cli-latest | 1 - 2 files changed, 2 deletions(-) delete mode 100644 supabase/.branches/_current_branch delete mode 100644 supabase/.temp/cli-latest diff --git a/supabase/.branches/_current_branch b/supabase/.branches/_current_branch deleted file mode 100644 index 88d050b1..00000000 --- a/supabase/.branches/_current_branch +++ /dev/null @@ -1 +0,0 @@ -main \ No newline at end of file diff --git a/supabase/.temp/cli-latest b/supabase/.temp/cli-latest deleted file mode 100644 index 2213dd2c..00000000 --- a/supabase/.temp/cli-latest +++ /dev/null @@ -1 +0,0 @@ -v2.51.0 \ No newline at end of file