From e6e4a8aa4d71e8cb57e71a00eece9c61e1a28267 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Fri, 17 Oct 2025 16:03:23 -0700 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20feat:=20add=20Insforge=20MCP=20?= =?UTF-8?q?server=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds complete support for benchmarking Insforge Backend-as-a-Service via MCP. ## What's Added: - **Insforge MCP Service**: New service configuration in `src/services.py` - **State Management**: `InsforgeStateManager` handles backend setup via `prepare_environment.py` scripts - **Login Helper**: `InsforgeLoginHelper` validates backend connectivity - **Task Manager**: `InsforgeTaskManager` manages Insforge-specific tasks - **MCP Integration**: Added Insforge to both `base_agent.py` and `mcpmark_agent.py` - **Docker Support**: Updated `run-task.sh` with Insforge container configuration ## How It Works: - Uses `@insforge/mcp` npm package for MCP server - Requires `INSFORGE_API_KEY` and `INSFORGE_BACKEND_URL` environment variables - Can test SQL tasks by symlinking `tasks/insforge -> tasks/postgres` - Enables comparison between direct SQL (postgres-mcp) and REST API (insforge) approaches ## Configuration: ```bash # In .mcp_env INSFORGE_API_KEY="your-api-key" INSFORGE_BACKEND_URL="http://localhost:7130" ``` ## Usage: ```bash python -m pipeline \ --mcp insforge \ --models claude-sonnet-4 \ --exp-name insforge-test \ --tasks your_task \ --k 1 ``` šŸ¤– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- run-task.sh | 10 + src/agents/base_agent.py | 16 +- src/agents/mcpmark_agent.py | 22 +- src/mcp_services/insforge/__init__.py | 1 + .../insforge/insforge_login_helper.py | 159 ++++++++++ .../insforge/insforge_state_manager.py | 272 ++++++++++++++++++ .../insforge/insforge_task_manager.py | 108 +++++++ src/services.py | 31 ++ 8 files changed, 614 insertions(+), 5 deletions(-) create mode 100644 
src/mcp_services/insforge/__init__.py create mode 100644 src/mcp_services/insforge/insforge_login_helper.py create mode 100644 src/mcp_services/insforge/insforge_state_manager.py create mode 100644 src/mcp_services/insforge/insforge_task_manager.py diff --git a/run-task.sh b/run-task.sh index 875d7671..405ac526 100755 --- a/run-task.sh +++ b/run-task.sh @@ -139,6 +139,16 @@ elif [ "$SERVICE" = "filesystem" ]; then $([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \ "$DOCKER_IMAGE" \ python3 -m pipeline --mcp "$SERVICE" --k 1 "$@" +elif [ "$SERVICE" = "insforge" ]; then + # For Insforge service, use host network to access Insforge backend on host + docker run --rm \ + --memory="$DOCKER_MEMORY_LIMIT" \ + --cpus="$DOCKER_CPU_LIMIT" \ + --add-host=host.docker.internal:host-gateway \ + -v "$(pwd)/results:/app/results" \ + $([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \ + "$DOCKER_IMAGE" \ + python3 -m pipeline --mcp "$SERVICE" --k 1 "$@" else # For other services (notion, github, playwright, etc.) 
docker run --rm \ diff --git a/src/agents/base_agent.py b/src/agents/base_agent.py index 81fb87a7..070bddd0 100644 --- a/src/agents/base_agent.py +++ b/src/agents/base_agent.py @@ -19,7 +19,7 @@ class BaseMCPAgent(ABC): """Base class with shared functionality for MCPMark agents.""" - STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres"] + STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres", "insforge"] HTTP_SERVICES = ["github"] DEFAULT_TIMEOUT = 600 @@ -207,6 +207,20 @@ def _create_stdio_server(self) -> MCPStdioServer: env={"DATABASE_URI": database_url}, ) + if self.mcp_service == "insforge": + api_key = self.service_config.get("api_key") + backend_url = self.service_config.get("backend_url") + if not all([api_key, backend_url]): + raise ValueError("Insforge requires api_key and backend_url") + return MCPStdioServer( + command="npx", + args=["-y", "@insforge/mcp"], + env={ + "INSFORGE_API_KEY": api_key, + "INSFORGE_BACKEND_URL": backend_url, + }, + ) + raise ValueError(f"Unsupported stdio service: {self.mcp_service}") def _create_http_server(self) -> MCPHttpServer: diff --git a/src/agents/mcpmark_agent.py b/src/agents/mcpmark_agent.py index db0343cd..4a3ba12e 100644 --- a/src/agents/mcpmark_agent.py +++ b/src/agents/mcpmark_agent.py @@ -844,18 +844,32 @@ def _create_stdio_server(self) -> MCPStdioServer: username = self.service_config.get("username") password = self.service_config.get("password") database = self.service_config.get("current_database") or self.service_config.get("database") - + if not all([username, password, database]): raise ValueError("PostgreSQL requires username, password, and database") - + database_url = f"postgresql://{username}:{password}@{host}:{port}/{database}" - + return MCPStdioServer( command="pipx", args=["run", "postgres-mcp", "--access-mode=unrestricted"], env={"DATABASE_URI": database_url} ) - + + elif self.mcp_service == "insforge": + api_key = 
self.service_config.get("api_key") + backend_url = self.service_config.get("backend_url") + if not all([api_key, backend_url]): + raise ValueError("Insforge requires api_key and backend_url") + return MCPStdioServer( + command="npx", + args=["-y", "@insforge/mcp"], + env={ + "INSFORGE_API_KEY": api_key, + "INSFORGE_BACKEND_URL": backend_url, + }, + ) + else: raise ValueError(f"Unsupported stdio service: {self.mcp_service}") diff --git a/src/mcp_services/insforge/__init__.py b/src/mcp_services/insforge/__init__.py new file mode 100644 index 00000000..3a4022af --- /dev/null +++ b/src/mcp_services/insforge/__init__.py @@ -0,0 +1 @@ +"""Insforge MCP Service Implementation for MCPMark.""" diff --git a/src/mcp_services/insforge/insforge_login_helper.py b/src/mcp_services/insforge/insforge_login_helper.py new file mode 100644 index 00000000..e32bd311 --- /dev/null +++ b/src/mcp_services/insforge/insforge_login_helper.py @@ -0,0 +1,159 @@ +""" +Insforge Login Helper for MCPMark +================================== + +Handles Insforge backend authentication and connection validation. +""" + +import json +import requests +from pathlib import Path +from typing import Optional, Dict, Any + +from src.base.login_helper import BaseLoginHelper +from src.logger import get_logger + +logger = get_logger(__name__) + + +class InsforgeLoginHelper(BaseLoginHelper): + """Handles Insforge backend authentication and connection validation.""" + + def __init__( + self, + api_key: str, + backend_url: str, + state_path: Optional[Path] = None, + ): + """Initialize Insforge login helper. 
+ + Args: + api_key: Insforge backend API key for authentication + backend_url: Insforge backend URL (e.g., https://your-app.insforge.app) + state_path: Path to save connection state + """ + super().__init__() + self.api_key = api_key + self.backend_url = backend_url.rstrip('/') + self.state_path = state_path or Path.home() / ".mcpbench" / "insforge_auth.json" + + # Ensure state directory exists + self.state_path.parent.mkdir(parents=True, exist_ok=True) + + def login(self, **kwargs) -> bool: + """Test Insforge backend connection and validate API key. + + Returns: + bool: True if connection successful and API key valid + """ + try: + # Test 1: Basic connectivity - try to get backend metadata + logger.info(f"Testing connection to Insforge backend: {self.backend_url}") + + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + # Test with a simple API endpoint - get current user or backend info + # Try the auth current session endpoint first + test_url = f"{self.backend_url}/api/auth/sessions/current" + + response = requests.get( + test_url, + headers=headers, + timeout=10, + ) + + if response.status_code == 200: + # API key is valid and can authenticate + logger.info("āœ“ Insforge API key authentication successful") + connection_info = { + "backend_url": self.backend_url, + "authenticated": True, + "authenticated_at": self._get_current_timestamp(), + } + elif response.status_code == 401: + # Invalid API key + logger.error("āœ— Invalid Insforge API key") + return False + else: + # API key might be admin key, try a different endpoint + # Try listing tables/backend metadata as a test + logger.info("Testing with backend metadata endpoint...") + + # Simple connectivity test - just check if backend is reachable + health_url = f"{self.backend_url}/api/health" + try: + health_response = requests.get(health_url, timeout=5) + if health_response.status_code in [200, 404]: # 404 is ok, backend is reachable + logger.info("āœ“ 
Insforge backend is reachable") + connection_info = { + "backend_url": self.backend_url, + "api_key_type": "admin", + "authenticated": True, + "authenticated_at": self._get_current_timestamp(), + } + else: + logger.warning(f"Unexpected response from backend: {health_response.status_code}") + connection_info = { + "backend_url": self.backend_url, + "authenticated": True, + "authenticated_at": self._get_current_timestamp(), + } + except Exception as e: + logger.warning(f"Health check failed, but proceeding: {e}") + # Still consider it successful if we have credentials + connection_info = { + "backend_url": self.backend_url, + "authenticated": True, + "authenticated_at": self._get_current_timestamp(), + } + + # Save connection state + self._save_connection_state(connection_info) + + logger.info(f"Insforge backend connection validated: {self.backend_url}") + return True + + except requests.exceptions.Timeout: + logger.error(f"Connection timeout to Insforge backend: {self.backend_url}") + return False + except requests.exceptions.ConnectionError: + logger.error(f"Cannot connect to Insforge backend: {self.backend_url}") + return False + except Exception as e: + logger.error(f"Unexpected error during Insforge authentication: {e}") + return False + + def _save_connection_state(self, state: Dict[str, Any]): + """Save connection state to file.""" + try: + # Don't save API key + safe_state = {k: v for k, v in state.items() if k not in ["api_key", "access_token"]} + + with open(self.state_path, "w") as f: + json.dump(safe_state, f, indent=2) + + # Set restrictive permissions + self.state_path.chmod(0o600) + logger.info(f"Connection state saved to: {self.state_path}") + + except Exception as e: + logger.error(f"Failed to save connection state: {e}") + + def _get_current_timestamp(self) -> str: + """Get current timestamp in ISO format.""" + from datetime import datetime, timezone + + return datetime.now(timezone.utc).isoformat() + + def is_connected(self) -> bool: + """Check if 
we can connect to Insforge backend.""" + return self.login() + + def get_connection_params(self) -> Dict[str, Any]: + """Get connection parameters (without API key).""" + return { + "backend_url": self.backend_url, + } diff --git a/src/mcp_services/insforge/insforge_state_manager.py b/src/mcp_services/insforge/insforge_state_manager.py new file mode 100644 index 00000000..cb8a374d --- /dev/null +++ b/src/mcp_services/insforge/insforge_state_manager.py @@ -0,0 +1,272 @@ +""" +Insforge State Manager for MCPMark +=================================== + +Manages backend state for Insforge tasks including setup via prepare_environment.py +and resource cleanup tracking. +""" + +import os +import sys +import subprocess +import requests +from pathlib import Path +from typing import Optional, Dict, Any, List + +from src.base.state_manager import BaseStateManager, InitialStateInfo +from src.base.task_manager import BaseTask +from src.logger import get_logger + +logger = get_logger(__name__) + + +class InsforgeStateManager(BaseStateManager): + """Manages Insforge backend state for task evaluation.""" + + def __init__( + self, + api_key: str, + backend_url: str, + ): + """Initialize Insforge state manager. 
+ + Args: + api_key: Insforge backend API key for authentication + backend_url: Insforge backend URL (e.g., https://your-app.insforge.app) + """ + super().__init__(service_name="insforge") + + self.api_key = api_key + self.backend_url = backend_url.rstrip('/') + + # HTTP headers for API requests + self.headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + # Track current task context for agent configuration + self._current_task_context: Optional[Dict[str, Any]] = None + + # Validate connection on initialization + try: + self._test_connection() + logger.info("Insforge state manager initialized successfully") + except Exception as e: + raise RuntimeError(f"Insforge initialization failed: {e}") + + def _test_connection(self): + """Test backend connection.""" + try: + # Simple connectivity test - try any endpoint + response = requests.get( + f"{self.backend_url}/api/health", + timeout=5, + ) + # Any response (even 404) means backend is reachable + logger.debug(f"Insforge backend connectivity test: {response.status_code}") + except requests.exceptions.RequestException: + # Try with API key + try: + response = requests.get( + f"{self.backend_url}/api/auth/sessions/current", + headers=self.headers, + timeout=5, + ) + logger.debug(f"Insforge backend auth test: {response.status_code}") + except Exception as inner_e: + raise RuntimeError(f"Cannot connect to Insforge backend: {inner_e}") + + def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: + """Create initial backend state for a task. + + This runs prepare_environment.py script if it exists in the task directory. + The script should use Insforge MCP tools or HTTP API to set up tables, data, etc. 
+ + Args: + task: Task for which to create initial state + + Returns: + InitialStateInfo object or None if creation failed + """ + try: + # Generate unique state ID for this task run + state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}" + + logger.info(f"| Creating initial state for Insforge task: {task.name}") + + # Run prepare_environment.py if it exists + task_prepared = self._run_prepare_environment(task) + + if not task_prepared: + logger.debug(f"| No prepare_environment.py found for task {task.name}") + + # Track the task context + context = { + "state_id": state_id, + "category_id": task.category_id, + "task_id": task.task_id, + "task_name": task.name, + } + + return InitialStateInfo( + state_id=state_id, + state_url=self.backend_url, + metadata=context, + ) + + except Exception as e: + logger.error(f"Failed to create initial state for {task.name}: {e}") + return None + + def _store_initial_state_info( + self, task: BaseTask, state_info: InitialStateInfo + ) -> None: + """Store backend info in task object for agent access.""" + if hasattr(task, "__dict__"): + task.backend_url = self.backend_url + task.api_key = self.api_key + task.state_id = state_info.state_id + + # Store current task context for agent configuration + self._current_task_context = state_info.metadata + + def _cleanup_task_initial_state(self, task: BaseTask) -> bool: + """Clean up task-specific resources. + + Note: Actual cleanup of created resources is delegated to prepare_environment.py + cleanup scripts or handled by _cleanup_tracked_resources. 
+ + Args: + task: Task whose initial state should be cleaned up + + Returns: + True if cleanup successful + """ + try: + logger.info(f"| Cleaning up initial state for task: {task.name}") + + # Clear current task context + if (self._current_task_context and + self._current_task_context.get("task_name") == task.name): + self._current_task_context = None + + logger.info(f"| āœ“ Initial state cleanup completed for {task.name}") + return True + + except Exception as e: + logger.error(f"Failed to cleanup task initial state for {task.name}: {e}") + return False + + def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: + """Clean up a single tracked resource. + + This is a placeholder for resource-specific cleanup logic. + Tasks should handle their own cleanup via cleanup scripts. + + Args: + resource: Resource dictionary with type, id, and metadata + + Returns: + True if cleanup successful + """ + resource_type = resource["type"] + resource_id = resource["id"] + + logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)") + return True + + def _run_prepare_environment(self, task: BaseTask) -> bool: + """Run prepare_environment.py script if it exists in the task directory. + + The script should use Insforge MCP tools or HTTP API to set up required state. 
+ + Args: + task: Task for which to prepare environment + + Returns: + True if script ran successfully, False if script doesn't exist + """ + task_dir = task.task_instruction_path.parent + prepare_script = task_dir / "prepare_environment.py" + + if not prepare_script.exists(): + logger.debug(f"No prepare_environment.py found for task {task.name}") + return False + + logger.info(f"| Running prepare_environment.py for task {task.name}") + + # Set up environment variables for the script + env = os.environ.copy() + env.update({ + "INSFORGE_BACKEND_URL": self.backend_url, + "INSFORGE_API_KEY": self.api_key, + }) + + try: + # Run the prepare_environment.py script + result = subprocess.run( + [sys.executable, str(prepare_script)], + cwd=str(task_dir), # Run from task directory + env=env, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode == 0: + logger.info(f"| āœ“ Environment preparation completed for {task.name}") + if result.stdout.strip(): + logger.debug(f"| prepare_environment.py output: {result.stdout}") + return True + else: + logger.error(f"| āœ— Environment preparation failed for {task.name}") + logger.error(f"| Error output: {result.stderr}") + raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}") + + except subprocess.TimeoutExpired: + logger.error(f"āœ— Environment preparation timed out for {task.name}") + raise RuntimeError("prepare_environment.py execution timed out") + except Exception as e: + logger.error(f"āœ— Failed to run prepare_environment.py for {task.name}: {e}") + raise + + def _get_timestamp(self) -> str: + """Get timestamp for unique naming.""" + from datetime import datetime + + return datetime.now().strftime("%Y%m%d%H%M%S") + + def get_service_config_for_agent(self) -> dict: + """Get configuration for agent execution. + + This configuration is passed to the agent/MCP server so it can + connect to the Insforge backend. 
+ + Returns: + Dictionary containing backend URL and API key + """ + config = { + "backend_url": self.backend_url, + "api_key": self.api_key, + } + + # Include current task context if available + if self._current_task_context: + config["task_context"] = self._current_task_context + + return config + + def set_verification_environment(self, messages_path: str = None) -> None: + """Set environment variables needed for verification scripts. + + Args: + messages_path: Optional path to messages.json file for verification + """ + os.environ["INSFORGE_BACKEND_URL"] = self.backend_url + os.environ["INSFORGE_API_KEY"] = self.api_key + + if messages_path: + os.environ["MCP_MESSAGES"] = str(messages_path) + + logger.debug("Verification environment variables set for Insforge") diff --git a/src/mcp_services/insforge/insforge_task_manager.py b/src/mcp_services/insforge/insforge_task_manager.py new file mode 100644 index 00000000..f4feb695 --- /dev/null +++ b/src/mcp_services/insforge/insforge_task_manager.py @@ -0,0 +1,108 @@ +""" +Insforge Task Manager for MCPMark +=================================== + +Manages Insforge task discovery, execution, and verification. +""" + +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional + +from src.base.task_manager import BaseTask, BaseTaskManager +from src.logger import get_logger + +logger = get_logger(__name__) + + +@dataclass +class InsforgeTask(BaseTask): + """Insforge-specific task with backend information.""" + + task_name: str = "" + backend_url: Optional[str] = None + api_key: Optional[str] = None + + +class InsforgeTaskManager(BaseTaskManager): + """Manages Insforge tasks for MCPMark evaluation.""" + + def __init__(self, tasks_root: Path = None): + """Initialize Insforge task manager. 
+ + Args: + tasks_root: Path to tasks directory + """ + if tasks_root is None: + tasks_root = Path(__file__).resolve().parents[3] / "tasks" + + super().__init__( + tasks_root, + mcp_service="insforge", + task_class=InsforgeTask, + task_organization="file", # Insforge uses file-based tasks + ) + + def _create_task_from_files( + self, category_id: str, task_files_info: Dict[str, Any] + ) -> Optional[InsforgeTask]: + """Instantiate an `InsforgeTask` from the dictionary returned by `_find_task_files`.""" + import json + + # Check for meta.json + meta_path = task_files_info["instruction_path"].parent / "meta.json" + final_category_id = category_id + task_id = task_files_info["task_id"] + + if meta_path.exists(): + try: + with open(meta_path, 'r') as f: + meta_data = json.load(f) + # Use values from meta.json if available + final_category_id = meta_data.get("category_id", category_id) + task_id = meta_data.get("task_id", task_id) + except Exception as e: + logger.warning(f"Failed to load meta.json from {meta_path}: {e}") + + return InsforgeTask( + task_instruction_path=task_files_info["instruction_path"], + task_verification_path=task_files_info["verification_path"], + service="insforge", + category_id=final_category_id, + task_id=task_id, + task_name=task_files_info["task_id"], + ) + + def _get_verification_command(self, task: InsforgeTask) -> List[str]: + """Get verification command with Insforge backend info.""" + cmd = [sys.executable, str(task.task_verification_path)] + return cmd + + def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: + """Run verification with Insforge environment.""" + env = os.environ.copy() + + # Pass Insforge connection info to verification script + if hasattr(task, "backend_url") and task.backend_url: + env["INSFORGE_BACKEND_URL"] = task.backend_url + + if hasattr(task, "api_key") and task.api_key: + env["INSFORGE_API_KEY"] = task.api_key + + return subprocess.run( + self._get_verification_command(task), + 
capture_output=True, + text=True, + timeout=300, + env=env, + ) + + def _format_task_instruction(self, base_instruction: str) -> str: + """Add Insforge-specific instructions.""" + return ( + base_instruction + + "\n\nNote: Use Insforge MCP tools to complete this task. The backend connection is already configured." + ) diff --git a/src/services.py b/src/services.py index 772236ad..67111121 100644 --- a/src/services.py +++ b/src/services.py @@ -269,6 +269,37 @@ "mcp_server": None, "eval_config": None, }, + "insforge": { + "config_schema": { + "api_key": { + "env_var": "INSFORGE_API_KEY", + "required": True, + "description": "Insforge backend API key for authentication", + }, + "backend_url": { + "env_var": "INSFORGE_BACKEND_URL", + "required": True, + "description": "Insforge backend URL (e.g., https://your-app.insforge.app)", + }, + }, + "components": { + "task_manager": "src.mcp_services.insforge.insforge_task_manager.InsforgeTaskManager", + "state_manager": "src.mcp_services.insforge.insforge_state_manager.InsforgeStateManager", + "login_helper": "src.mcp_services.insforge.insforge_login_helper.InsforgeLoginHelper", + }, + "config_mapping": { + "state_manager": { + "api_key": "api_key", + "backend_url": "backend_url", + }, + "login_helper": { + "api_key": "api_key", + "backend_url": "backend_url", + }, + }, + "mcp_server": None, + "eval_config": None, + }, "playwright_webarena": { "config_schema": { "browser": { From 6f5710e1af7e4834e7757e3157b6e288c2d02714 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Sun, 19 Oct 2025 15:18:16 -0700 Subject: [PATCH 2/5] update supabase support --- src/agents/base_agent.py | 4 +- src/agents/mcpmark_agent.py | 24 +- src/aggregators/aggregate_results.py | 22 +- src/aggregators/aggregate_specific_results.py | 238 ++++++++ .../insforge/insforge_state_manager.py | 270 ++++++++- src/mcp_services/supabase/__init__.py | 11 + .../supabase/supabase_login_helper.py | 168 ++++++ .../supabase/supabase_state_manager.py | 518 ++++++++++++++++++ 
.../supabase/supabase_task_manager.py | 113 ++++ src/services.py | 64 +++ supabase/.branches/_current_branch | 1 + supabase/.temp/cli-latest | 1 + tasks/insforge | 1 + tasks/supabase | 1 + 14 files changed, 1408 insertions(+), 28 deletions(-) create mode 100644 src/aggregators/aggregate_specific_results.py create mode 100644 src/mcp_services/supabase/__init__.py create mode 100644 src/mcp_services/supabase/supabase_login_helper.py create mode 100644 src/mcp_services/supabase/supabase_state_manager.py create mode 100644 src/mcp_services/supabase/supabase_task_manager.py create mode 100644 supabase/.branches/_current_branch create mode 100644 supabase/.temp/cli-latest create mode 120000 tasks/insforge create mode 120000 tasks/supabase diff --git a/src/agents/base_agent.py b/src/agents/base_agent.py index 070bddd0..661a1994 100644 --- a/src/agents/base_agent.py +++ b/src/agents/base_agent.py @@ -20,7 +20,7 @@ class BaseMCPAgent(ABC): """Base class with shared functionality for MCPMark agents.""" STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres", "insforge"] - HTTP_SERVICES = ["github"] + HTTP_SERVICES = ["github", "supabase"] DEFAULT_TIMEOUT = 600 CLAUDE_THINKING_BUDGETS = { @@ -214,7 +214,7 @@ def _create_stdio_server(self) -> MCPStdioServer: raise ValueError("Insforge requires api_key and backend_url") return MCPStdioServer( command="npx", - args=["-y", "@insforge/mcp"], + args=["-y", "@insforge/mcp@dev"], env={ "INSFORGE_API_KEY": api_key, "INSFORGE_BACKEND_URL": backend_url, diff --git a/src/agents/mcpmark_agent.py b/src/agents/mcpmark_agent.py index 4a3ba12e..66c13109 100644 --- a/src/agents/mcpmark_agent.py +++ b/src/agents/mcpmark_agent.py @@ -863,7 +863,7 @@ def _create_stdio_server(self) -> MCPStdioServer: raise ValueError("Insforge requires api_key and backend_url") return MCPStdioServer( command="npx", - args=["-y", "@insforge/mcp"], + args=["-y", "@insforge/mcp@dev"], env={ "INSFORGE_API_KEY": api_key, 
"INSFORGE_BACKEND_URL": backend_url, @@ -880,7 +880,7 @@ def _create_http_server(self) -> MCPHttpServer: github_token = self.service_config.get("github_token") if not github_token: raise ValueError("GitHub token required") - + return MCPHttpServer( url="https://api.githubcopilot.com/mcp/", headers={ @@ -888,6 +888,26 @@ def _create_http_server(self) -> MCPHttpServer: "User-Agent": "MCPMark/1.0" } ) + + elif self.mcp_service == "supabase": + # Use built-in MCP server from Supabase CLI + api_url = self.service_config.get("api_url", "http://localhost:54321") + api_key = self.service_config.get("api_key", "") + + if not api_key: + raise ValueError("Supabase requires api_key (use secret key from 'supabase status')") + + # Supabase CLI exposes MCP at /mcp endpoint + mcp_url = f"{api_url}/mcp" + + return MCPHttpServer( + url=mcp_url, + headers={ + "apikey": api_key, + "Authorization": f"Bearer {api_key}", + } + ) + else: raise ValueError(f"Unsupported HTTP service: {self.mcp_service}") diff --git a/src/aggregators/aggregate_results.py b/src/aggregators/aggregate_results.py index 44a8afd5..9e045756 100755 --- a/src/aggregators/aggregate_results.py +++ b/src/aggregators/aggregate_results.py @@ -23,9 +23,9 @@ def discover_tasks() -> Dict[str, List[str]]: """Discover all tasks from ./tasks directory.""" tasks_dir = Path("./tasks") - + all_tasks = {} - + # Handle each MCP service # Note: playwright and playwright_webarena both map to "playwright" MCP service_mappings = { @@ -33,9 +33,9 @@ def discover_tasks() -> Dict[str, List[str]]: "github": ["github"], "notion": ["notion"], "playwright": ["playwright", "playwright_webarena"], # Both count as playwright - "postgres": ["postgres"] + "postgres": ["postgres", "supabase", "insforge"] # All map to postgres } - + for mcp_service, task_dirs in service_mappings.items(): tasks = [] for task_dir_name in task_dirs: @@ -68,9 +68,11 @@ def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]: continue model, service = 
model_service_dir.name.split("__", 1) - # Normalize service name: treat playwright_webarena as playwright + # Normalize service names if service == "playwright_webarena": service = "playwright" + elif service in ["supabase", "insforge"]: + service = "postgres" for run_idx in range(1, k + 1): run_dir = model_service_dir / f"run-{run_idx}" @@ -874,23 +876,23 @@ def main(): help="Comma-separated list of models that only need run-1" ) parser.add_argument("--push", action="store_true", help="Push to GitHub (default to main)") - + args = parser.parse_args() - + # Parse single-run models single_run_models = [] if args.single_run_models: single_run_models = [m.strip() for m in args.single_run_models.split(",")] print(f"šŸ“Œ Single-run models: {', '.join(single_run_models)}") - + # Setup paths exp_dir = Path("./results") / args.exp_name if not exp_dir.exists(): print(f"āŒ Experiment directory {exp_dir} does not exist") return 1 - + print(f"šŸ”„ Processing experiment: {args.exp_name}") - + # Discover all tasks print("šŸ“‹ Discovering tasks...") all_tasks = discover_tasks() diff --git a/src/aggregators/aggregate_specific_results.py b/src/aggregators/aggregate_specific_results.py new file mode 100644 index 00000000..65414390 --- /dev/null +++ b/src/aggregators/aggregate_specific_results.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +""" +Simple Results Aggregator - Aggregate specific result directories +Usage: python -m src.aggregators.aggregate_specific_results --result-dir results/exp/model__service --k 4 +""" + +import json +import argparse +from pathlib import Path +from collections import defaultdict +from typing import Dict, Any, Tuple, List +from datetime import datetime +import sys +sys.path.append(str(Path(__file__).parent.parent.parent)) +from src.aggregators.pricing import compute_cost_usd + + +def collect_results_from_dir(result_dir: Path, k: int) -> Dict[str, Any]: + """Collect all results from a specific result directory.""" + results = {} + + for run_idx in 
range(1, k + 1): + run_dir = result_dir / f"run-{run_idx}" + if not run_dir.exists(): + print(f"āš ļø Warning: {run_dir} does not exist, skipping") + continue + + run_results = {} + for task_dir in run_dir.iterdir(): + if not task_dir.is_dir(): + continue + + meta_path = task_dir / "meta.json" + if meta_path.exists(): + with open(meta_path) as f: + meta = json.load(f) + run_results[task_dir.name] = meta + + results[f"run-{run_idx}"] = run_results + + return results + + +def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]: + """Extract token counts from meta.""" + tu = meta.get("token_usage", {}) or {} + input_tokens = int(tu.get("input_tokens", 0) or 0) + output_tokens = int(tu.get("output_tokens", 0) or 0) + total_tokens = int(tu.get("total_tokens", input_tokens + output_tokens) or (input_tokens + output_tokens)) + return input_tokens, output_tokens, total_tokens + + +def calculate_metrics(results: Dict, k: int, model_name: str) -> Dict: + """Calculate metrics from results.""" + + # Get all unique task names + all_tasks = set() + for run_name, run_data in results.items(): + all_tasks.update(run_data.keys()) + all_tasks = sorted(all_tasks) + + total_tasks = len(all_tasks) + actual_runs = len(results) + + print(f"\nšŸ“Š Analysis:") + print(f" Total unique tasks: {total_tasks}") + print(f" Runs found: {actual_runs} (expected: {k})") + + # Aggregates + total_agent_execution_time = 0.0 + total_input_tokens = 0 + total_output_tokens = 0 + total_tokens = 0 + total_turns = 0 + + actual_model_name = None + + # Per-run pass@1 + pass1_rates_per_run = [] + + # For pass@k + pass_k_task_success_any = 0 + pass_power_k_task_success_all = 0 + + for run_idx in range(1, actual_runs + 1): + run_name = f"run-{run_idx}" + successes_this_run = 0 + + for task in all_tasks: + meta = results.get(run_name, {}).get(task) + + if not meta: + continue + + success = bool(meta.get("execution_result", {}).get("success", False)) + if success: + successes_this_run += 1 + + 
total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0) + in_tok, out_tok, ttl_tok = get_token_counts(meta) + total_input_tokens += in_tok + total_output_tokens += out_tok + total_tokens += ttl_tok + total_turns += int(meta.get("turn_count", 0) or 0) + + if actual_model_name is None: + actual_model_name = meta.get("actual_model_name") or None + + pass1_rate = successes_this_run / total_tasks if total_tasks > 0 else 0 + pass1_rates_per_run.append(pass1_rate) + print(f" Run {run_idx}: {successes_this_run}/{total_tasks} = {pass1_rate*100:.1f}%") + + # Calculate pass@k + for task in all_tasks: + successes = [] + for run_idx in range(1, actual_runs + 1): + run_name = f"run-{run_idx}" + meta = results.get(run_name, {}).get(task) + success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False + successes.append(success) + + if any(successes): + pass_k_task_success_any += 1 + if all(successes): + pass_power_k_task_success_all += 1 + + # Averages + denom = total_tasks * actual_runs if total_tasks > 0 else 1 + avg_agent_execution_time = total_agent_execution_time / denom + avg_input_tokens = total_input_tokens / denom + avg_output_tokens = total_output_tokens / denom + avg_total_tokens = total_tokens / denom + avg_turns = total_turns / denom + + # Pass@1 stats + if pass1_rates_per_run: + avg_pass1 = sum(pass1_rates_per_run) / len(pass1_rates_per_run) + mean = avg_pass1 + variance = sum((r - mean) ** 2 for r in pass1_rates_per_run) / len(pass1_rates_per_run) + std_pass1 = variance ** 0.5 + else: + avg_pass1 = 0.0 + std_pass1 = 0.0 + + # Cost calculation + per_run_input_tokens = total_input_tokens / actual_runs if actual_runs else 0 + per_run_output_tokens = total_output_tokens / actual_runs if actual_runs else 0 + model_for_pricing = actual_model_name or model_name + per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens) + + summary = { + "generated_at": datetime.now().isoformat(), 
+ "model": model_name, + "actual_model_name": actual_model_name or model_name, + "runs": actual_runs, + "total_tasks": total_tasks, + "total_agent_execution_time": round(total_agent_execution_time, 2), + "total_input_tokens": total_input_tokens, + "total_output_tokens": total_output_tokens, + "total_tokens": total_tokens, + "total_turns": total_turns, + "avg_agent_execution_time": round(avg_agent_execution_time, 4), + "avg_input_tokens": round(avg_input_tokens, 2), + "avg_output_tokens": round(avg_output_tokens, 2), + "avg_total_tokens": round(avg_total_tokens, 2), + "avg_turns": round(avg_turns, 2), + "per_run_input_tokens": round(per_run_input_tokens, 2), + "per_run_output_tokens": round(per_run_output_tokens, 2), + "per_run_cost": round(per_run_cost, 4) if per_run_cost else None, + "pass@1": { + "avg": round(avg_pass1, 4), + "std": round(std_pass1, 4), + "per_run": [round(r, 4) for r in pass1_rates_per_run] + }, + } + + if actual_runs > 1: + summary[f"pass@{actual_runs}"] = round(pass_k_task_success_any / total_tasks, 4) + summary[f"pass^{actual_runs}"] = round(pass_power_k_task_success_all / total_tasks, 4) + + return summary + + +def main(): + parser = argparse.ArgumentParser(description="Simple results aggregator for specific directories") + parser.add_argument("--result-dir", required=True, help="Path to result directory (e.g., results/exp/model__service)") + parser.add_argument("--k", type=int, default=4, help="Number of runs (default: 4)") + parser.add_argument("--output", help="Output JSON file path (default: /summary.json)") + + args = parser.parse_args() + + result_dir = Path(args.result_dir) + if not result_dir.exists(): + print(f"āŒ Result directory {result_dir} does not exist") + return 1 + + # Extract model name from directory name + model_name = result_dir.name.replace("__", "-") + + print(f"šŸ”„ Processing: {result_dir}") + print(f"šŸ“‹ Model: {model_name}") + + # Collect results + results = collect_results_from_dir(result_dir, args.k) + + if not 
results: + print("āŒ No results found") + return 1 + + # Calculate metrics + summary = calculate_metrics(results, args.k, model_name) + + # Save summary + output_path = Path(args.output) if args.output else result_dir / "summary.json" + with open(output_path, "w") as f: + json.dump(summary, f, indent=2) + + print(f"\nāœ… Summary saved to: {output_path}") + print(f"\nšŸ“ˆ Results:") + print(f" Pass@1: {summary['pass@1']['avg']*100:.1f}% ± {summary['pass@1']['std']*100:.1f}%") + if f"pass@{args.k}" in summary: + print(f" Pass@{args.k}: {summary[f'pass@{args.k}']*100:.1f}%") + print(f" Pass^{args.k}: {summary[f'pass^{args.k}']*100:.1f}%") + print(f" Per-run cost: ${summary['per_run_cost']:.4f}" if summary['per_run_cost'] else " Per-run cost: N/A") + print(f" Avg agent time: {summary['avg_agent_execution_time']:.2f}s") + print(f" Avg turns: {summary['avg_turns']:.2f}") + print(f"\nšŸ“Š Token Usage:") + avg_tokens_per_run = summary['total_tokens'] / summary['runs'] if summary['runs'] > 0 else 0 + print(f" Avg tokens per run: {avg_tokens_per_run:,.0f}") + print(f" Avg tokens per turn: {summary['avg_total_tokens'] / summary['avg_turns']:.0f}" if summary['avg_turns'] > 0 else " Avg tokens per turn: N/A") + print(f" Total tokens (all runs): {summary['total_tokens']:,}") + print(f" Total turns (all runs): {summary['total_turns']:,}") + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/src/mcp_services/insforge/insforge_state_manager.py b/src/mcp_services/insforge/insforge_state_manager.py index cb8a374d..7c5b7961 100644 --- a/src/mcp_services/insforge/insforge_state_manager.py +++ b/src/mcp_services/insforge/insforge_state_manager.py @@ -55,6 +55,12 @@ def __init__( except Exception as e: raise RuntimeError(f"Insforge initialization failed: {e}") + # Store baseline tables (system tables that exist before any tasks run) + self._baseline_tables = set( + (t['schema'], t['name']) for t in self._get_all_tables() + ) + logger.debug(f"Stored baseline: 
{len(self._baseline_tables)} tables") + def _test_connection(self): """Test backend connection.""" try: @@ -80,8 +86,7 @@ def _test_connection(self): def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: """Create initial backend state for a task. - This runs prepare_environment.py script if it exists in the task directory. - The script should use Insforge MCP tools or HTTP API to set up tables, data, etc. + Restores from backup which may place tables in public or task-specific schema. Args: task: Task for which to create initial state @@ -92,21 +97,50 @@ def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: try: # Generate unique state ID for this task run state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}" + schema_name = task.category_id logger.info(f"| Creating initial state for Insforge task: {task.name}") - # Run prepare_environment.py if it exists - task_prepared = self._run_prepare_environment(task) + # Get list of existing tables before restore (to track what we create) + tables_before = self._get_all_tables() + logger.info(f"| Tables before restore: {len(tables_before)}") - if not task_prepared: - logger.debug(f"| No prepare_environment.py found for task {task.name}") + # Create schema for this task (in case backup uses it) + self._drop_schema(schema_name) + self._create_schema(schema_name) - # Track the task context + # Restore from backup if backup exists (may create tables in public or task schema) + if self._restore_from_backup(schema_name): + logger.info(f"| āœ“ Restored '{schema_name}' from backup") + else: + logger.info(f"| ā—‹ No backup found for '{schema_name}'") + # Run prepare_environment.py if it exists + task_prepared = self._run_prepare_environment(task) + if not task_prepared: + logger.debug(f"| No prepare_environment.py found for task {task.name}") + + # Get list of tables after restore (to track what we need to clean up) + tables_after = self._get_all_tables() + + # 
Track ALL new tables created by the restore (compare before/after) + tables_before_set = {(t['schema'], t['name']) for t in tables_before} + created_tables = [ + t for t in tables_after + if (t['schema'], t['name']) not in tables_before_set + ] + + logger.info(f"| Tracked {len(created_tables)} new tables for cleanup") + for t in created_tables: + logger.debug(f"| - {t['schema']}.{t['name']}") + + # Track the task context including created tables context = { "state_id": state_id, "category_id": task.category_id, "task_id": task.task_id, "task_name": task.name, + "schema": schema_name, + "created_tables": created_tables, # Track all created tables } return InitialStateInfo( @@ -134,8 +168,8 @@ def _store_initial_state_info( def _cleanup_task_initial_state(self, task: BaseTask) -> bool: """Clean up task-specific resources. - Note: Actual cleanup of created resources is delegated to prepare_environment.py - cleanup scripts or handled by _cleanup_tracked_resources. + Drops ALL tables created during task (both setup and agent-created) + by comparing against baseline. 
Args: task: Task whose initial state should be cleaned up @@ -146,10 +180,39 @@ def _cleanup_task_initial_state(self, task: BaseTask) -> bool: try: logger.info(f"| Cleaning up initial state for task: {task.name}") - # Clear current task context - if (self._current_task_context and - self._current_task_context.get("task_name") == task.name): - self._current_task_context = None + if self._current_task_context: + schema_name = self._current_task_context.get("schema") + + # Get ALL current tables + all_current_tables = self._get_all_tables() + + # Find tables to drop: anything not in baseline + tables_to_drop = [ + t for t in all_current_tables + if (t['schema'], t['name']) not in self._baseline_tables + ] + + logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)") + + # Drop individual tables + for table_info in tables_to_drop: + try: + self._drop_table(table_info["schema"], table_info["name"]) + logger.debug(f"| āœ“ Dropped table: {table_info['schema']}.{table_info['name']}") + except Exception as e: + logger.warning(f"| Failed to drop table {table_info}: {e}") + + # Drop the task schema (may be empty if all tables were in public) + if schema_name: + try: + self._drop_schema(schema_name) + logger.info(f"| āœ“ Dropped schema: {schema_name}") + except Exception as e: + logger.warning(f"| Failed to drop schema {schema_name}: {e}") + + # Clear task context + if self._current_task_context.get("task_name") == task.name: + self._current_task_context = None logger.info(f"| āœ“ Initial state cleanup completed for {task.name}") return True @@ -237,6 +300,177 @@ def _get_timestamp(self) -> str: return datetime.now().strftime("%Y%m%d%H%M%S") + def _drop_schema(self, schema_name: str) -> None: + """Drop schema and all its contents.""" + import psycopg2 + from psycopg2 import sql + + conn_params = { + "host": "localhost", + "port": 5432, + "user": "postgres", + "password": "postgres", + "database": "insforge", + } + + conn = 
psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format( + sql.Identifier(schema_name) + ) + ) + logger.debug(f"| Dropped schema: {schema_name}") + finally: + conn.close() + + def _create_schema(self, schema_name: str) -> None: + """Create empty schema.""" + import psycopg2 + from psycopg2 import sql + + conn_params = { + "host": "localhost", + "port": 5432, + "user": "postgres", + "password": "postgres", + "database": "insforge", + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name)) + ) + logger.debug(f"| Created schema: {schema_name}") + finally: + conn.close() + + def _get_all_tables(self) -> List[Dict[str, str]]: + """Get list of all user tables. + + Returns: + List of dicts with 'schema' and 'name' keys + """ + import psycopg2 + + conn_params = { + "host": "localhost", + "port": 5432, + "user": "postgres", + "password": "postgres", + "database": "insforge", + } + + conn = psycopg2.connect(**conn_params) + try: + with conn.cursor() as cur: + cur.execute(""" + SELECT table_schema, table_name + FROM information_schema.tables + WHERE table_type = 'BASE TABLE' + AND table_schema NOT IN ('information_schema', 'pg_catalog') + AND table_schema NOT LIKE 'pg_%' + AND table_name NOT LIKE '\\_%' + ORDER BY table_schema, table_name + """) + rows = cur.fetchall() + return [{"schema": row[0], "name": row[1]} for row in rows] + finally: + conn.close() + + def _drop_table(self, schema_name: str, table_name: str) -> None: + """Drop a specific table.""" + import psycopg2 + from psycopg2 import sql + + conn_params = { + "host": "localhost", + "port": 5432, + "user": "postgres", + "password": "postgres", + "database": "insforge", + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + 
cur.execute( + sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( + sql.Identifier(schema_name), + sql.Identifier(table_name) + ) + ) + logger.debug(f"| Dropped table: {schema_name}.{table_name}") + finally: + conn.close() + + def _restore_from_backup(self, category_name: str) -> bool: + """Restore from backup file. + + Tables may be restored into public schema or category-specific schema + depending on how the backup was created. + + Args: + category_name: Name of category (e.g., 'employees', 'chinook', 'lego') + + Returns: + True if backup was restored, False if no backup exists + """ + # Path to backup file + backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state" + backup_file = backup_dir / f"{category_name}.backup" + + logger.debug(f"| Looking for backup at: {backup_file}") + logger.debug(f"| Backup exists: {backup_file.exists()}") + + if not backup_file.exists(): + logger.info(f"| ā—‹ No backup file found: {backup_file}") + return False + + logger.info(f"| Restoring {category_name} from backup...") + + # Set up environment for pg_restore + env = os.environ.copy() + env["PGPASSWORD"] = "postgres" + + try: + # Restore backup without schema filter (tables go to whatever schema they're in) + result = subprocess.run( + [ + "pg_restore", + "-h", "localhost", + "-p", "5432", + "-U", "postgres", + "-d", "insforge", + "-v", + str(backup_file), + ], + env=env, + capture_output=True, + text=True, + timeout=120, # 2 minute timeout + ) + + if result.returncode != 0 and "ERROR" in result.stderr: + logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}") + return False + + logger.info(f"| āœ“ {category_name} restored successfully") + return True + + except subprocess.TimeoutExpired: + logger.error(f"| āœ— Restore timed out for {category_name}") + return False + except Exception as e: + logger.error(f"| āœ— Failed to restore {category_name}: {e}") + return False + def get_service_config_for_agent(self) -> dict: """Get 
configuration for agent execution. @@ -266,7 +500,15 @@ def set_verification_environment(self, messages_path: str = None) -> None: os.environ["INSFORGE_BACKEND_URL"] = self.backend_url os.environ["INSFORGE_API_KEY"] = self.api_key + # Set PostgreSQL connection details for direct database verification + # (Insforge exposes its internal postgres database for verification) + os.environ["POSTGRES_HOST"] = "localhost" + os.environ["POSTGRES_PORT"] = "5432" + os.environ["POSTGRES_DATABASE"] = "insforge" + os.environ["POSTGRES_USERNAME"] = "postgres" + os.environ["POSTGRES_PASSWORD"] = "postgres" + if messages_path: os.environ["MCP_MESSAGES"] = str(messages_path) - logger.debug("Verification environment variables set for Insforge") + logger.debug("Verification environment variables set for Insforge (including direct postgres access)") diff --git a/src/mcp_services/supabase/__init__.py b/src/mcp_services/supabase/__init__.py new file mode 100644 index 00000000..669a7286 --- /dev/null +++ b/src/mcp_services/supabase/__init__.py @@ -0,0 +1,11 @@ +"""Supabase MCP service integration for MCPMark.""" + +from .supabase_login_helper import SupabaseLoginHelper +from .supabase_state_manager import SupabaseStateManager +from .supabase_task_manager import SupabaseTaskManager + +__all__ = [ + "SupabaseLoginHelper", + "SupabaseStateManager", + "SupabaseTaskManager", +] diff --git a/src/mcp_services/supabase/supabase_login_helper.py b/src/mcp_services/supabase/supabase_login_helper.py new file mode 100644 index 00000000..dd7253ac --- /dev/null +++ b/src/mcp_services/supabase/supabase_login_helper.py @@ -0,0 +1,168 @@ +""" +Supabase Login Helper for MCPMark +=================================== + +Handles configuration and validation for Supabase MCP service. 
+""" + +import os +from typing import Dict, Any, Optional + +from src.base.login_helper import BaseLoginHelper +from src.logger import get_logger + +logger = get_logger(__name__) + + +class SupabaseLoginHelper(BaseLoginHelper): + """Login helper for Supabase MCP service. + + Validates PostgREST API URL and API key configuration. + """ + + def __init__(self): + super().__init__("supabase") + + def prepare_credentials(self) -> Dict[str, Any]: + """Prepare credentials for Supabase/PostgREST connection. + + Returns: + Dictionary containing api_url, api_key, and postgres connection details + """ + # Get PostgREST API configuration (from Supabase CLI) + api_url = os.getenv("SUPABASE_API_URL", "http://localhost:54321") + api_key = os.getenv("SUPABASE_API_KEY") + + # Get PostgreSQL connection details (Supabase CLI defaults) + postgres_host = os.getenv("SUPABASE_DB_HOST", "localhost") + postgres_port = int(os.getenv("SUPABASE_DB_PORT", "54322")) + postgres_user = os.getenv("SUPABASE_DB_USER", "postgres") + postgres_password = os.getenv("SUPABASE_DB_PASSWORD", "postgres") + postgres_database = os.getenv("SUPABASE_DB_NAME", "postgres") + + if not api_key: + logger.warning( + "SUPABASE_API_KEY not set.\n" + "Run 'supabase status' to get your anon or service_role key.\n" + "Set SUPABASE_API_KEY in your .mcp_env file." + ) + # Try to get it from supabase status + api_key = self._get_key_from_supabase_status() + + return { + "api_url": api_url, + "api_key": api_key or "", + "postgres_host": postgres_host, + "postgres_port": postgres_port, + "postgres_user": postgres_user, + "postgres_password": postgres_password, + "postgres_database": postgres_database, + } + + def _get_key_from_supabase_status(self) -> Optional[str]: + """Try to get anon key from supabase status command. 
+ + Returns: + Anon key if found, None otherwise + """ + import subprocess + + try: + result = subprocess.run( + ["supabase", "status"], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + # Parse output for anon key + for line in result.stdout.split('\n'): + if 'anon key:' in line.lower(): + # Extract the key after the colon + key = line.split(':', 1)[1].strip() + logger.info("Found anon key from 'supabase status'") + return key + + except (subprocess.SubprocessError, FileNotFoundError): + logger.debug("Could not run 'supabase status' to get anon key") + + return None + + def test_credentials(self, credentials: Dict[str, Any]) -> bool: + """Test if Supabase credentials are valid. + + Args: + credentials: Dictionary with api_url, api_key, and postgres connection details + + Returns: + True if credentials are valid + """ + import requests + import psycopg2 + + api_url = credentials["api_url"] + api_key = credentials.get("api_key", "") + + # Test PostgreSQL connection + try: + conn_params = { + "host": credentials["postgres_host"], + "port": credentials["postgres_port"], + "user": credentials["postgres_user"], + "password": credentials["postgres_password"], + "database": credentials["postgres_database"], + } + conn = psycopg2.connect(**conn_params) + conn.close() + logger.info("āœ“ PostgreSQL connection successful") + except Exception as e: + logger.error(f"āœ— PostgreSQL connection failed: {e}") + return False + + # Test PostgREST API connection (optional - may not be running yet) + try: + headers = {} + if api_key: + headers["apikey"] = api_key + headers["Authorization"] = f"Bearer {api_key}" + + response = requests.get(api_url, headers=headers, timeout=5) + + # Any response (including 404, 401) means the API is reachable + logger.info(f"āœ“ PostgREST API reachable at {api_url} (status: {response.status_code})") + return True + + except requests.exceptions.ConnectionError: + logger.warning( + f"⚠ PostgREST API not reachable at 
{api_url}.\n" + "Make sure PostgREST is running (e.g., docker run -p 3000:3000 postgrest/postgrest)\n" + "or use a cloud Supabase instance URL." + ) + # Still return True as PostgreSQL connection works + return True + except Exception as e: + logger.warning(f"⚠ PostgREST API test failed: {e}") + # Still return True as PostgreSQL connection works + return True + + def format_credentials_info(self, credentials: Dict[str, Any]) -> str: + """Format credentials info for display. + + Args: + credentials: Dictionary with connection details + + Returns: + Formatted string describing the credentials + """ + api_url = credentials["api_url"] + has_api_key = bool(credentials.get("api_key")) + postgres_host = credentials["postgres_host"] + postgres_db = credentials["postgres_database"] + + return ( + f"Supabase Configuration:\n" + f" API URL: {api_url}\n" + f" API Key: {'āœ“ Configured' if has_api_key else 'āœ— Not set'}\n" + f" PostgreSQL: {postgres_host}/{postgres_db}" + ) diff --git a/src/mcp_services/supabase/supabase_state_manager.py b/src/mcp_services/supabase/supabase_state_manager.py new file mode 100644 index 00000000..f9895887 --- /dev/null +++ b/src/mcp_services/supabase/supabase_state_manager.py @@ -0,0 +1,518 @@ +""" +Supabase State Manager for MCPMark +==================================== + +Manages database state for Supabase tasks using the same PostgreSQL backend +as Insforge, but accessed via PostgREST/Supabase MCP server. +""" + +import os +import sys +import subprocess +import psycopg2 +from psycopg2 import sql +from pathlib import Path +from typing import Optional, Dict, Any, List + +from src.base.state_manager import BaseStateManager, InitialStateInfo +from src.base.task_manager import BaseTask +from src.logger import get_logger + +logger = get_logger(__name__) + + +class SupabaseStateManager(BaseStateManager): + """Manages Supabase/PostgREST database state for task evaluation. 
+ + Uses the same PostgreSQL database as Insforge but exposes it via + PostgREST API for the Supabase MCP server to access. + """ + + def __init__( + self, + api_url: str, + api_key: str, + postgres_host: str = "localhost", + postgres_port: int = 54322, # Supabase CLI default port + postgres_user: str = "postgres", + postgres_password: str = "postgres", + postgres_database: str = "postgres", # Supabase CLI default database + ): + """Initialize Supabase state manager. + + Args: + api_url: PostgREST API URL from Supabase CLI (default: http://localhost:54321) + api_key: API key from Supabase CLI (anon or service_role key) + postgres_host: PostgreSQL host for direct database operations + postgres_port: PostgreSQL port (Supabase CLI uses 54322) + postgres_user: PostgreSQL username + postgres_password: PostgreSQL password + postgres_database: Main PostgreSQL database name + """ + super().__init__(service_name="supabase") + + self.api_url = api_url.rstrip('/') + self.api_key = api_key + + # PostgreSQL connection for state management (Supabase CLI instance) + self.postgres_host = postgres_host + self.postgres_port = postgres_port + self.postgres_user = postgres_user + self.postgres_password = postgres_password + self.postgres_database = postgres_database + + # Track current task context for agent configuration + self._current_task_context: Optional[Dict[str, Any]] = None + + # Validate connection on initialization + try: + self._test_connection() + logger.info("Supabase state manager initialized successfully") + except Exception as e: + raise RuntimeError(f"Supabase initialization failed: {e}") + + # Store baseline tables (system tables that exist before any tasks run) + self._baseline_tables = set( + (t['schema'], t['name']) for t in self._get_all_tables() + ) + logger.debug(f"Stored baseline: {len(self._baseline_tables)} tables") + + def _test_connection(self): + """Test PostgreSQL connection.""" + try: + conn_params = { + "host": self.postgres_host, + "port": 
self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + conn = psycopg2.connect(**conn_params) + conn.close() + logger.debug("PostgreSQL connection test successful") + except Exception as e: + raise RuntimeError(f"Cannot connect to PostgreSQL: {e}") + + def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: + """Create initial backend state for a task. + + Restores from backup which may place tables in public or task-specific schema. + + Args: + task: Task for which to create initial state + + Returns: + InitialStateInfo object or None if creation failed + """ + try: + # Generate unique state ID for this task run + state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}" + schema_name = task.category_id + + logger.info(f"| Creating initial state for Supabase task: {task.name}") + + # Drop schema first (cleanup from previous runs) + self._drop_schema(schema_name) + + # Get list of existing tables before restore (to track what we create) + tables_before = self._get_all_tables() + logger.info(f"| Tables before restore: {len(tables_before)}") + + # Create schema for this task (in case backup uses it) + self._create_schema(schema_name) + + # Restore from backup if backup exists (may create tables in public or task schema) + if self._restore_from_backup(schema_name): + logger.info(f"| āœ“ Restored '{schema_name}' from backup") + else: + logger.info(f"| ā—‹ No backup found for '{schema_name}'") + # Run prepare_environment.py if it exists + task_prepared = self._run_prepare_environment(task) + if not task_prepared: + logger.debug(f"| No prepare_environment.py found for task {task.name}") + + # Get list of tables after restore (to track what we need to clean up) + tables_after = self._get_all_tables() + + # Track ALL new tables created by the restore (compare before/after) + tables_before_set = {(t['schema'], t['name']) for t in tables_before} + created_tables 
= [ + t for t in tables_after + if (t['schema'], t['name']) not in tables_before_set + ] + + logger.info(f"| Tracked {len(created_tables)} new tables for cleanup") + for t in created_tables: + logger.debug(f"| - {t['schema']}.{t['name']}") + + # Track the task context including created tables + context = { + "state_id": state_id, + "category_id": task.category_id, + "task_id": task.task_id, + "task_name": task.name, + "schema": schema_name, + "created_tables": created_tables, + } + + return InitialStateInfo( + state_id=state_id, + state_url=self.api_url, + metadata=context, + ) + + except Exception as e: + logger.error(f"Failed to create initial state for {task.name}: {e}") + return None + + def _store_initial_state_info( + self, task: BaseTask, state_info: InitialStateInfo + ) -> None: + """Store backend info in task object for agent access.""" + if hasattr(task, "__dict__"): + task.api_url = self.api_url + task.api_key = self.api_key + task.state_id = state_info.state_id + + # Store current task context for agent configuration + self._current_task_context = state_info.metadata + + def _cleanup_task_initial_state(self, task: BaseTask) -> bool: + """Clean up task-specific resources. + + Drops ALL tables created during task (both setup and agent-created) + by comparing against baseline. 
+ + Args: + task: Task whose initial state should be cleaned up + + Returns: + True if cleanup successful + """ + try: + logger.info(f"| Cleaning up initial state for task: {task.name}") + + if self._current_task_context: + schema_name = self._current_task_context.get("schema") + + # Get ALL current tables + all_current_tables = self._get_all_tables() + + # Find tables to drop: anything not in baseline + tables_to_drop = [ + t for t in all_current_tables + if (t['schema'], t['name']) not in self._baseline_tables + ] + + logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)") + + # Drop individual tables + for table_info in tables_to_drop: + try: + self._drop_table(table_info["schema"], table_info["name"]) + logger.debug(f"| āœ“ Dropped table: {table_info['schema']}.{table_info['name']}") + except Exception as e: + logger.warning(f"| Failed to drop table {table_info}: {e}") + + # Drop the task schema (may be empty if all tables were in public) + if schema_name: + try: + self._drop_schema(schema_name) + logger.info(f"| āœ“ Dropped schema: {schema_name}") + except Exception as e: + logger.warning(f"| Failed to drop schema {schema_name}: {e}") + + # Clear task context + if self._current_task_context.get("task_name") == task.name: + self._current_task_context = None + + logger.info(f"| āœ“ Initial state cleanup completed for {task.name}") + return True + + except Exception as e: + logger.error(f"Failed to cleanup task initial state for {task.name}: {e}") + return False + + def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: + """Clean up a single tracked resource. 
+ + Args: + resource: Resource dictionary with type, id, and metadata + + Returns: + True if cleanup successful + """ + resource_type = resource["type"] + resource_id = resource["id"] + + logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)") + return True + + def _run_prepare_environment(self, task: BaseTask) -> bool: + """Run prepare_environment.py script if it exists in the task directory. + + The script should use database operations to set up required state. + + Args: + task: Task for which to prepare environment + + Returns: + True if script ran successfully, False if script doesn't exist + """ + task_dir = task.task_instruction_path.parent + prepare_script = task_dir / "prepare_environment.py" + + if not prepare_script.exists(): + logger.debug(f"No prepare_environment.py found for task {task.name}") + return False + + logger.info(f"| Running prepare_environment.py for task {task.name}") + + # Set up environment variables for the script + env = os.environ.copy() + env.update({ + "SUPABASE_API_URL": self.api_url, + "SUPABASE_API_KEY": self.api_key, + "POSTGRES_HOST": self.postgres_host, + "POSTGRES_PORT": str(self.postgres_port), + "POSTGRES_DATABASE": self.postgres_database, + "POSTGRES_USERNAME": self.postgres_user, + "POSTGRES_PASSWORD": self.postgres_password, + }) + + try: + # Run the prepare_environment.py script + result = subprocess.run( + [sys.executable, str(prepare_script)], + cwd=str(task_dir), # Run from task directory + env=env, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode == 0: + logger.info(f"| āœ“ Environment preparation completed for {task.name}") + if result.stdout.strip(): + logger.debug(f"| prepare_environment.py output: {result.stdout}") + return True + else: + logger.error(f"| āœ— Environment preparation failed for {task.name}") + logger.error(f"| Error output: {result.stderr}") + raise RuntimeError(f"prepare_environment.py failed with exit code 
{result.returncode}") + + except subprocess.TimeoutExpired: + logger.error(f"āœ— Environment preparation timed out for {task.name}") + raise RuntimeError("prepare_environment.py execution timed out") + except Exception as e: + logger.error(f"āœ— Failed to run prepare_environment.py for {task.name}: {e}") + raise + + def _get_timestamp(self) -> str: + """Get timestamp for unique naming.""" + from datetime import datetime + return datetime.now().strftime("%Y%m%d%H%M%S") + + def _drop_schema(self, schema_name: str) -> None: + """Drop schema and all its contents.""" + conn_params = { + "host": self.postgres_host, + "port": self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format( + sql.Identifier(schema_name) + ) + ) + logger.debug(f"| Dropped schema: {schema_name}") + finally: + conn.close() + + def _create_schema(self, schema_name: str) -> None: + """Create empty schema.""" + conn_params = { + "host": self.postgres_host, + "port": self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name)) + ) + logger.debug(f"| Created schema: {schema_name}") + finally: + conn.close() + + def _get_all_tables(self) -> List[Dict[str, str]]: + """Get list of all user tables. 
+ + Returns: + List of dicts with 'schema' and 'name' keys + """ + conn_params = { + "host": self.postgres_host, + "port": self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + + conn = psycopg2.connect(**conn_params) + try: + with conn.cursor() as cur: + cur.execute(""" + SELECT table_schema, table_name + FROM information_schema.tables + WHERE table_type = 'BASE TABLE' + AND table_schema NOT IN ('information_schema', 'pg_catalog') + AND table_schema NOT LIKE 'pg_%' + AND table_name NOT LIKE '\\_%' + ORDER BY table_schema, table_name + """) + rows = cur.fetchall() + return [{"schema": row[0], "name": row[1]} for row in rows] + finally: + conn.close() + + def _drop_table(self, schema_name: str, table_name: str) -> None: + """Drop a specific table.""" + conn_params = { + "host": self.postgres_host, + "port": self.postgres_port, + "user": self.postgres_user, + "password": self.postgres_password, + "database": self.postgres_database, + } + + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + try: + with conn.cursor() as cur: + cur.execute( + sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( + sql.Identifier(schema_name), + sql.Identifier(table_name) + ) + ) + logger.debug(f"| Dropped table: {schema_name}.{table_name}") + finally: + conn.close() + + def _restore_from_backup(self, category_name: str) -> bool: + """Restore from backup file. + + Tables may be restored into public schema or category-specific schema + depending on how the backup was created. 
+ + Args: + category_name: Name of category (e.g., 'employees', 'chinook', 'lego') + + Returns: + True if backup was restored, False if no backup exists + """ + # Path to backup file (same as used by Insforge/Postgres) + backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state" + backup_file = backup_dir / f"{category_name}.backup" + + logger.debug(f"| Looking for backup at: {backup_file}") + + if not backup_file.exists(): + logger.info(f"| ā—‹ No backup file found: {backup_file}") + return False + + logger.info(f"| Restoring {category_name} from backup...") + + # Set up environment for pg_restore + env = os.environ.copy() + env["PGPASSWORD"] = self.postgres_password + + try: + # Restore backup + result = subprocess.run( + [ + "pg_restore", + "-h", self.postgres_host, + "-p", str(self.postgres_port), + "-U", self.postgres_user, + "-d", self.postgres_database, + "-v", + str(backup_file), + ], + env=env, + capture_output=True, + text=True, + timeout=120, # 2 minute timeout + ) + + if result.returncode != 0 and "ERROR" in result.stderr: + logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}") + return False + + logger.info(f"| āœ“ {category_name} restored successfully") + return True + + except subprocess.TimeoutExpired: + logger.error(f"| āœ— Restore timed out for {category_name}") + return False + except Exception as e: + logger.error(f"| āœ— Failed to restore {category_name}: {e}") + return False + + def get_service_config_for_agent(self) -> dict: + """Get configuration for agent execution. + + This configuration is passed to the agent/MCP server so it can + connect to the Supabase/PostgREST endpoint. 
+ + Returns: + Dictionary containing API URL and API key + """ + config = { + "api_url": self.api_url, + "api_key": self.api_key, + "schema": "public", # Default schema for PostgREST + } + + # Include current task context if available + if self._current_task_context: + config["task_context"] = self._current_task_context + # If task uses a specific schema, include it + if self._current_task_context.get("schema"): + config["schema"] = self._current_task_context["schema"] + + return config + + def set_verification_environment(self, messages_path: str = None) -> None: + """Set environment variables needed for verification scripts. + + Args: + messages_path: Optional path to messages.json file for verification + """ + os.environ["SUPABASE_API_URL"] = self.api_url + os.environ["SUPABASE_API_KEY"] = self.api_key + + # Set PostgreSQL connection details for direct database verification + os.environ["POSTGRES_HOST"] = self.postgres_host + os.environ["POSTGRES_PORT"] = str(self.postgres_port) + os.environ["POSTGRES_DATABASE"] = self.postgres_database + os.environ["POSTGRES_USERNAME"] = self.postgres_user + os.environ["POSTGRES_PASSWORD"] = self.postgres_password + + if messages_path: + os.environ["MCP_MESSAGES"] = str(messages_path) + + logger.debug("Verification environment variables set for Supabase (including direct postgres access)") diff --git a/src/mcp_services/supabase/supabase_task_manager.py b/src/mcp_services/supabase/supabase_task_manager.py new file mode 100644 index 00000000..1d69a8ce --- /dev/null +++ b/src/mcp_services/supabase/supabase_task_manager.py @@ -0,0 +1,113 @@ +""" +Supabase Task Manager for MCPMark +=================================== + +Manages Supabase task discovery, execution, and verification. +Reuses Postgres tasks but accesses them via PostgREST/Supabase MCP. 
+""" + +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional + +from src.base.task_manager import BaseTask, BaseTaskManager +from src.logger import get_logger + +logger = get_logger(__name__) + + +@dataclass +class SupabaseTask(BaseTask): + """Supabase-specific task with API information.""" + + task_name: str = "" + api_url: Optional[str] = None + api_key: Optional[str] = None + + +class SupabaseTaskManager(BaseTaskManager): + """Manages Supabase tasks for MCPMark evaluation. + + Uses the same task structure as Postgres tasks but accessed via + PostgREST/Supabase MCP server. + """ + + def __init__(self, tasks_root: Path = None): + """Initialize Supabase task manager. + + Args: + tasks_root: Path to tasks directory + """ + if tasks_root is None: + tasks_root = Path(__file__).resolve().parents[3] / "tasks" + + super().__init__( + tasks_root, + mcp_service="supabase", + task_class=SupabaseTask, + task_organization="file", # Supabase uses file-based tasks (like Postgres) + ) + + def _create_task_from_files( + self, category_id: str, task_files_info: Dict[str, Any] + ) -> Optional[SupabaseTask]: + """Instantiate a `SupabaseTask` from the dictionary returned by `_find_task_files`.""" + import json + + # Check for meta.json + meta_path = task_files_info["instruction_path"].parent / "meta.json" + final_category_id = category_id + task_id = task_files_info["task_id"] + + if meta_path.exists(): + try: + with open(meta_path, 'r') as f: + meta_data = json.load(f) + # Use values from meta.json if available + final_category_id = meta_data.get("category_id", category_id) + task_id = meta_data.get("task_id", task_id) + except Exception as e: + logger.warning(f"Failed to load meta.json from {meta_path}: {e}") + + return SupabaseTask( + task_instruction_path=task_files_info["instruction_path"], + task_verification_path=task_files_info["verification_path"], + service="supabase", + 
category_id=final_category_id, + task_id=task_id, + task_name=task_files_info["task_id"], + ) + + def _get_verification_command(self, task: SupabaseTask) -> List[str]: + """Get verification command with Supabase API info.""" + cmd = [sys.executable, str(task.task_verification_path)] + return cmd + + def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: + """Run verification with Supabase environment.""" + env = os.environ.copy() + + # Pass Supabase connection info to verification script + if hasattr(task, "api_url") and task.api_url: + env["SUPABASE_API_URL"] = task.api_url + + if hasattr(task, "api_key") and task.api_key: + env["SUPABASE_API_KEY"] = task.api_key + + return subprocess.run( + self._get_verification_command(task), + capture_output=True, + text=True, + timeout=300, + env=env, + ) + + def _format_task_instruction(self, base_instruction: str) -> str: + """Add Supabase-specific instructions.""" + return ( + base_instruction + + "\n\nNote: Use Supabase MCP tools (PostgREST) to complete this task. The API connection is already configured." 
+ ) diff --git a/src/services.py b/src/services.py index 67111121..041ce311 100644 --- a/src/services.py +++ b/src/services.py @@ -300,6 +300,70 @@ "mcp_server": None, "eval_config": None, }, + "supabase": { + "config_schema": { + "api_url": { + "env_var": "SUPABASE_API_URL", + "required": False, + "description": "Supabase PostgREST API URL (default: http://localhost:54321 from CLI)", + "default": "http://localhost:54321", + }, + "api_key": { + "env_var": "SUPABASE_API_KEY", + "required": False, + "description": "Supabase API key (anon or service_role key from 'supabase status')", + }, + "postgres_host": { + "env_var": "SUPABASE_DB_HOST", + "required": False, + "description": "PostgreSQL host for Supabase CLI instance", + "default": "localhost", + }, + "postgres_port": { + "env_var": "SUPABASE_DB_PORT", + "required": False, + "description": "PostgreSQL port for Supabase CLI instance (default: 54322)", + "default": 54322, + }, + "postgres_user": { + "env_var": "SUPABASE_DB_USER", + "required": False, + "description": "PostgreSQL username", + "default": "postgres", + }, + "postgres_password": { + "env_var": "SUPABASE_DB_PASSWORD", + "required": False, + "description": "PostgreSQL password", + "default": "postgres", + }, + "postgres_database": { + "env_var": "SUPABASE_DB_NAME", + "required": False, + "description": "PostgreSQL database name", + "default": "postgres", + }, + }, + "components": { + "task_manager": "src.mcp_services.supabase.supabase_task_manager.SupabaseTaskManager", + "state_manager": "src.mcp_services.supabase.supabase_state_manager.SupabaseStateManager", + "login_helper": "src.mcp_services.supabase.supabase_login_helper.SupabaseLoginHelper", + }, + "config_mapping": { + "state_manager": { + "api_url": "api_url", + "api_key": "api_key", + "postgres_host": "postgres_host", + "postgres_port": "postgres_port", + "postgres_user": "postgres_user", + "postgres_password": "postgres_password", + "postgres_database": "postgres_database", + }, + "login_helper": 
{}, + }, + "mcp_server": None, + "eval_config": None, + }, "playwright_webarena": { "config_schema": { "browser": { diff --git a/supabase/.branches/_current_branch b/supabase/.branches/_current_branch new file mode 100644 index 00000000..88d050b1 --- /dev/null +++ b/supabase/.branches/_current_branch @@ -0,0 +1 @@ +main \ No newline at end of file diff --git a/supabase/.temp/cli-latest b/supabase/.temp/cli-latest new file mode 100644 index 00000000..2213dd2c --- /dev/null +++ b/supabase/.temp/cli-latest @@ -0,0 +1 @@ +v2.51.0 \ No newline at end of file diff --git a/tasks/insforge b/tasks/insforge new file mode 120000 index 00000000..7d72bd78 --- /dev/null +++ b/tasks/insforge @@ -0,0 +1 @@ +postgres \ No newline at end of file diff --git a/tasks/supabase b/tasks/supabase new file mode 120000 index 00000000..7d72bd78 --- /dev/null +++ b/tasks/supabase @@ -0,0 +1 @@ +postgres \ No newline at end of file From 757c5688537abe6629dab2e747891f5cb20df965 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Sun, 19 Oct 2025 16:44:27 -0700 Subject: [PATCH 3/5] change state manager --- src/mcp_services/insforge/insforge_state_manager.py | 7 ++++--- src/mcp_services/supabase/supabase_state_manager.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mcp_services/insforge/insforge_state_manager.py b/src/mcp_services/insforge/insforge_state_manager.py index 7c5b7961..0a910bb2 100644 --- a/src/mcp_services/insforge/insforge_state_manager.py +++ b/src/mcp_services/insforge/insforge_state_manager.py @@ -101,13 +101,14 @@ def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: logger.info(f"| Creating initial state for Insforge task: {task.name}") + # Drop schema first (cleanup from previous runs) + self._drop_schema(schema_name) + # Get list of existing tables before restore (to track what we create) tables_before = self._get_all_tables() logger.info(f"| Tables before restore: {len(tables_before)}") - # Create schema for this task (in case 
backup uses it) - self._drop_schema(schema_name) - self._create_schema(schema_name) + # Note: Don't create schema here - pg_restore will create it from the backup # Restore from backup if backup exists (may create tables in public or task schema) if self._restore_from_backup(schema_name): diff --git a/src/mcp_services/supabase/supabase_state_manager.py b/src/mcp_services/supabase/supabase_state_manager.py index f9895887..a8927c0b 100644 --- a/src/mcp_services/supabase/supabase_state_manager.py +++ b/src/mcp_services/supabase/supabase_state_manager.py @@ -118,8 +118,7 @@ def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: tables_before = self._get_all_tables() logger.info(f"| Tables before restore: {len(tables_before)}") - # Create schema for this task (in case backup uses it) - self._create_schema(schema_name) + # Note: Don't create schema here - pg_restore will create it from the backup # Restore from backup if backup exists (may create tables in public or task schema) if self._restore_from_backup(schema_name): From 2540e219c3f925ec2cf263c6ae0c883c59c3b442 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Sun, 19 Oct 2025 22:07:05 -0700 Subject: [PATCH 4/5] materialized view --- src/mcp_services/insforge/insforge_state_manager.py | 12 ++++++++++-- src/mcp_services/supabase/supabase_state_manager.py | 12 ++++++++++-- src/model_config.py | 5 +++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/mcp_services/insforge/insforge_state_manager.py b/src/mcp_services/insforge/insforge_state_manager.py index 0a910bb2..c2d3f927 100644 --- a/src/mcp_services/insforge/insforge_state_manager.py +++ b/src/mcp_services/insforge/insforge_state_manager.py @@ -385,7 +385,7 @@ def _get_all_tables(self) -> List[Dict[str, str]]: conn.close() def _drop_table(self, schema_name: str, table_name: str) -> None: - """Drop a specific table.""" + """Drop a specific table or materialized view.""" import psycopg2 from psycopg2 import sql @@ -401,13 
+401,21 @@ def _drop_table(self, schema_name: str, table_name: str) -> None: conn.autocommit = True try: with conn.cursor() as cur: + # Try dropping as table first cur.execute( sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( sql.Identifier(schema_name), sql.Identifier(table_name) ) ) - logger.debug(f"| Dropped table: {schema_name}.{table_name}") + # Also try dropping as materialized view (in case agent created one) + cur.execute( + sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format( + sql.Identifier(schema_name), + sql.Identifier(table_name) + ) + ) + logger.debug(f"| Dropped table/view: {schema_name}.{table_name}") finally: conn.close() diff --git a/src/mcp_services/supabase/supabase_state_manager.py b/src/mcp_services/supabase/supabase_state_manager.py index a8927c0b..1bea58f2 100644 --- a/src/mcp_services/supabase/supabase_state_manager.py +++ b/src/mcp_services/supabase/supabase_state_manager.py @@ -388,7 +388,7 @@ def _get_all_tables(self) -> List[Dict[str, str]]: conn.close() def _drop_table(self, schema_name: str, table_name: str) -> None: - """Drop a specific table.""" + """Drop a specific table or materialized view.""" conn_params = { "host": self.postgres_host, "port": self.postgres_port, @@ -401,13 +401,21 @@ def _drop_table(self, schema_name: str, table_name: str) -> None: conn.autocommit = True try: with conn.cursor() as cur: + # Try dropping as table first cur.execute( sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( sql.Identifier(schema_name), sql.Identifier(table_name) ) ) - logger.debug(f"| Dropped table: {schema_name}.{table_name}") + # Also try dropping as materialized view (in case agent created one) + cur.execute( + sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format( + sql.Identifier(schema_name), + sql.Identifier(table_name) + ) + ) + logger.debug(f"| Dropped table/view: {schema_name}.{table_name}") finally: conn.close() diff --git a/src/model_config.py b/src/model_config.py index 28209ab3..f8fadde9 
100644 --- a/src/model_config.py +++ b/src/model_config.py @@ -97,6 +97,11 @@ class ModelConfig: "api_key_var": "ANTHROPIC_API_KEY", "litellm_input_model_name": "anthropic/claude-sonnet-4-20250514", }, + "claude-sonnet-4.5": { + "provider": "anthropic", + "api_key_var": "ANTHROPIC_API_KEY", + "litellm_input_model_name": "anthropic/claude-sonnet-4-5-20250929", + }, "claude-opus-4": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", From 75c159694839147edf034402dd80fba3c85b53c1 Mon Sep 17 00:00:00 2001 From: yaowenc2 Date: Tue, 21 Oct 2025 21:58:58 -0700 Subject: [PATCH 5/5] unused task --- supabase/.branches/_current_branch | 1 - supabase/.temp/cli-latest | 1 - 2 files changed, 2 deletions(-) delete mode 100644 supabase/.branches/_current_branch delete mode 100644 supabase/.temp/cli-latest diff --git a/supabase/.branches/_current_branch b/supabase/.branches/_current_branch deleted file mode 100644 index 88d050b1..00000000 --- a/supabase/.branches/_current_branch +++ /dev/null @@ -1 +0,0 @@ -main \ No newline at end of file diff --git a/supabase/.temp/cli-latest b/supabase/.temp/cli-latest deleted file mode 100644 index 2213dd2c..00000000 --- a/supabase/.temp/cli-latest +++ /dev/null @@ -1 +0,0 @@ -v2.51.0 \ No newline at end of file