AI Integration Examples
Learn how to integrate popular AI services and frameworks with the Brobot MCP Server.
OpenAI GPT Integration
Basic GPT-4 Integration
import json

from openai import OpenAI
from brobot_client import BrobotClient

# Initialize clients (the OpenAI v1+ client style, matching the vision example below)
openai_client = OpenAI(api_key="your-api-key")
brobot = BrobotClient()

def execute_natural_language_command(instruction: str):
    """Execute a natural language command using GPT-4."""
    # Get current screen state
    observation = brobot.get_observation()
    active_states = [s.name for s in observation.active_states]

    # Create prompt for GPT-4; literal braces in the JSON example are
    # doubled so the f-string does not treat them as format fields
    prompt = f"""
    Current application state: {active_states}
    User instruction: {instruction}

    Available actions:
    - click(image_pattern) - Click on UI element
    - type_text(text) - Type text
    - wait_for_state(state_name) - Wait for state

    Respond with a JSON array of actions to execute.
    Example: [{{"action": "click", "params": {{"pattern": "login_btn.png"}}}}]
    """

    # Get GPT-4 response
    response = openai_client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a UI automation assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )

    # Parse and execute actions
    actions = json.loads(response.choices[0].message.content)
    for action in actions:
        if action["action"] == "click":
            brobot.click(action["params"]["pattern"])
        elif action["action"] == "type_text":
            brobot.type_text(action["params"]["text"])
        elif action["action"] == "wait_for_state":
            brobot.wait_for_state(action["params"]["state"])

# Example usage
execute_natural_language_command("Log into the application with username 'demo'")
Vision-Enabled GPT-4V
from openai import OpenAI
from brobot_client import BrobotClient

client = OpenAI()
brobot = BrobotClient()

def analyze_and_act():
    """Use GPT-4V to analyze screenshots and decide actions."""
    # Get observation with screenshot (already base64-encoded)
    obs = brobot.get_observation()
    image_base64 = obs.screenshot

    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What UI elements do you see? What should I click to login?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        }
                    }
                ]
            }
        ],
        max_tokens=300
    )

    # Execute suggested action
    suggestion = response.choices[0].message.content
    print(f"GPT-4V suggests: {suggestion}")
Anthropic Claude Integration
Claude 3 Automation Agent
import asyncio

from anthropic import Anthropic
from brobot_client import BrobotClient

anthropic = Anthropic(api_key="your-api-key")
brobot = BrobotClient()

class ClaudeAutomationAgent:
    """Agent that uses Claude to control applications."""

    def __init__(self):
        self.conversation = []

    async def process_task(self, task: str):
        """Process a high-level task using Claude."""
        # Get current state
        obs = brobot.get_observation()

        # Build context
        context = f"""
        Task: {task}
        Current screen: {obs.active_states}
        Available actions: click, type, drag, wait

        Plan and execute the steps needed to complete this task.
        """

        response = anthropic.messages.create(
            model="claude-3-opus-20240229",
            messages=[{"role": "user", "content": context}],
            max_tokens=1000
        )

        # Execute Claude's plan; response.content is a list of content
        # blocks, so take the text of the first one
        await self._execute_plan(response.content[0].text)

    async def _execute_plan(self, plan: str):
        """Parse and execute Claude's plan."""
        # Implementation depends on Claude's response format (see sketch below)
        pass

# Usage (await only works inside a coroutine, so run via asyncio)
agent = ClaudeAutomationAgent()
asyncio.run(agent.process_task("Create a new document and save it as 'report.pdf'"))
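_execute_plan is left as a stub because the parsing depends on how you prompt Claude. A minimal sketch, assuming the plan comes back as a numbered list with lines like "1. click login_btn.png" or "2. type hello" (that line format is a convention you would enforce in the prompt, and the subclass name is hypothetical):

import re

class SimplePlanExecutor(ClaudeAutomationAgent):
    async def _execute_plan(self, plan: str):
        """Parse numbered 'verb argument' lines; skip anything else."""
        for line in plan.splitlines():
            match = re.match(r"\s*\d+\.\s*(click|type|wait)\s+(.+)", line, re.IGNORECASE)
            if not match:
                continue
            verb, arg = match.group(1).lower(), match.group(2).strip()
            if verb == "click":
                brobot.click(arg)
            elif verb == "type":
                brobot.type_text(arg)
            else:
                brobot.wait_for_state(arg)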
Interactive Claude Assistant
def create_interactive_assistant():
    """Create an interactive automation assistant with Claude."""

    class InteractiveSession:
        def __init__(self):
            self.messages = []

        def chat(self, user_input: str):
            # Add context about current screen
            obs = brobot.get_observation()
            enhanced_input = f"""
            User: {user_input}
            Current application state: {obs.get_most_confident_state().name}
            Visible elements: {[s.name for s in obs.active_states]}
            """

            self.messages.append({"role": "user", "content": enhanced_input})

            response = anthropic.messages.create(
                model="claude-3-sonnet-20240229",
                messages=self.messages,
                max_tokens=1000
            )

            # Store the reply text so the conversation stays well-formed
            reply = response.content[0].text
            self.messages.append({"role": "assistant", "content": reply})
            return reply

    return InteractiveSession()

# Interactive usage
session = create_interactive_assistant()
print(session.chat("How do I navigate to settings?"))
print(session.chat("Now change the theme to dark mode"))
LangChain Integration
Brobot as LangChain Tool
from langchain.agents import initialize_agent, Tool
from langchain.llms import OpenAI
from brobot_client import BrobotClient

# Create Brobot tools for LangChain
def create_brobot_tools():
    client = BrobotClient()

    def observe_screen(query: str = "") -> str:
        """Observe current screen state (LangChain passes a tool input string; it is unused here)."""
        obs = client.get_observation()
        states = [f"{s.name} ({s.confidence:.0%})" for s in obs.active_states]
        return f"Active states: {', '.join(states)}"

    def click_element(pattern: str) -> str:
        """Click on a UI element."""
        try:
            client.click(pattern)
            return f"Clicked {pattern} successfully"
        except Exception as e:
            return f"Failed to click {pattern}: {str(e)}"

    def type_text(text: str) -> str:
        """Type text in current field."""
        client.type_text(text)
        return f"Typed '{text}'"

    return [
        Tool(
            name="ObserveScreen",
            func=observe_screen,
            description="Get current screen state and active UI elements"
        ),
        Tool(
            name="Click",
            func=click_element,
            description="Click on UI element by image pattern name"
        ),
        Tool(
            name="Type",
            func=type_text,
            description="Type text into current field"
        )
    ]

# Create agent
llm = OpenAI(temperature=0)
tools = create_brobot_tools()
agent = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True)

# Use agent
agent.run("Log into the application with username 'demo@example.com'")
Custom LangChain Chain
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from brobot_client import BrobotClient

class BrobotAutomationChain:
    """Custom chain for complex automations."""

    def __init__(self):
        self.brobot = BrobotClient()
        # input_key is required because the prompt has two input variables
        self.memory = ConversationBufferMemory(input_key="task")

        self.planner_prompt = PromptTemplate(
            input_variables=["task", "current_state"],
            template="""
            Task: {task}
            Current State: {current_state}

            Create a step-by-step plan to complete this task.
            Format: numbered list of actions
            """
        )

        self.planner = LLMChain(
            llm=OpenAI(temperature=0.3),
            prompt=self.planner_prompt,
            memory=self.memory
        )

    def execute_task(self, task: str):
        # Get current state
        obs = self.brobot.get_observation()
        current_state = obs.get_most_confident_state().name

        # Generate plan
        plan = self.planner.run(task=task, current_state=current_state)

        # Execute plan steps (see the parsing sketch below)
        for step in plan.split('\n'):
            if 'click' in step.lower():
                # Extract pattern and click
                pass
            elif 'type' in step.lower():
                # Extract text and type
                pass

# Usage
chain = BrobotAutomationChain()
chain.execute_task("Create a new invoice for $1,500")
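The step loop above leaves extraction as an exercise. One way to fill it in, assuming the planner emits numbered "verb argument" lines (the quoting convention and helper name are assumptions for illustration):

import re

def execute_step(brobot: BrobotClient, step: str) -> None:
    """Parse one plan line, e.g. '1. click login_btn.png' or "2. type 'hello'"."""
    match = re.match(r"\s*\d+\.\s*(click|type)\s+(.+)", step, re.IGNORECASE)
    if not match:
        return  # not an actionable line
    verb, arg = match.group(1).lower(), match.group(2).strip().strip("'\"")
    if verb == "click":
        brobot.click(arg)
    else:
        brobot.type_text(arg)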
AutoGPT/Agent Frameworks
AutoGPT Plugin
from brobot_client import BrobotClient

class BrobotAutoGPTPlugin:
    """Plugin to give AutoGPT UI control capabilities."""

    def __init__(self):
        self.client = BrobotClient()

    def get_commands(self):
        return {
            "ui_observe": self.observe,
            "ui_click": self.click,
            "ui_type": self.type_text,
            "ui_wait": self.wait_for_state
        }

    def observe(self) -> dict:
        """Observe current UI state."""
        obs = self.client.get_observation()
        return {
            "states": [s.name for s in obs.active_states],
            "screenshot_available": bool(obs.screenshot)
        }

    def click(self, target: str) -> dict:
        """Click UI element."""
        try:
            self.client.click(target)
            return {"success": True, "message": f"Clicked {target}"}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def type_text(self, text: str) -> dict:
        """Type text."""
        self.client.type_text(text)
        return {"success": True, "message": f"Typed: {text}"}

    def wait_for_state(self, state: str, timeout: float = 10) -> dict:
        """Wait for specific state."""
        try:
            self.client.wait_for_state(state, timeout)
            return {"success": True, "message": f"Reached state: {state}"}
        except Exception:
            return {"success": False, "error": f"Timeout waiting for state: {state}"}
Multi-Agent Systems
Coordinator-Worker Pattern
import asyncio
from typing import List, Dict

from brobot_client import BrobotClient

class AutomationCoordinator:
    """Coordinates multiple AI agents for complex tasks."""

    def __init__(self):
        self.brobot = BrobotClient()
        self.observer_agent = ObserverAgent()
        self.planner_agent = PlannerAgent()
        self.executor_agent = ExecutorAgent()

    async def execute_complex_task(self, task: str):
        # Observer analyzes current state
        state_analysis = await self.observer_agent.analyze(self.brobot)

        # Planner creates execution plan
        plan = await self.planner_agent.create_plan(task, state_analysis)

        # Executor carries out plan
        results = await self.executor_agent.execute(plan, self.brobot)

        return results

class ObserverAgent:
    """Specialized in understanding UI state."""

    async def analyze(self, brobot: BrobotClient) -> Dict:
        obs = brobot.get_observation()

        # Use AI to analyze screenshot and states; the three helpers are
        # placeholders for your own model calls
        analysis = {
            "current_screen": self._identify_screen(obs),
            "available_actions": self._find_actionable_elements(obs),
            "navigation_options": self._identify_navigation(obs)
        }
        return analysis

class PlannerAgent:
    """Creates execution plans."""

    async def create_plan(self, task: str, state: Dict) -> List[Dict]:
        # Use AI to create step-by-step plan
        pass

class ExecutorAgent:
    """Executes plans reliably."""

    async def execute(self, plan: List[Dict], brobot: BrobotClient):
        results = []
        for step in plan:
            result = await self._execute_step(step, brobot)
            results.append(result)

            if not result["success"]:
                # Handle failures
                break

        return results
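_execute_step is left to the implementer. A minimal sketch, assuming each plan step is a dict like {"action": "click", "target": "save_btn.png"} (that step shape is an assumption about what your PlannerAgent produces, and the subclass name is hypothetical):

class SimpleExecutorAgent(ExecutorAgent):
    async def _execute_step(self, step: Dict, brobot: BrobotClient) -> Dict:
        """Execute one plan step of the form {'action': ..., 'target': ...}."""
        try:
            if step["action"] == "click":
                brobot.click(step["target"])
            elif step["action"] == "type":
                brobot.type_text(step["target"])
            elif step["action"] == "wait":
                brobot.wait_for_state(step["target"])
            else:
                return {"success": False, "error": f"Unknown action: {step['action']}"}
            return {"success": True, "step": step}
        except Exception as e:
            return {"success": False, "error": str(e), "step": step}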
Best Practices
1. Error Handling
def safe_automation(instruction: str):
    """Automation with comprehensive error handling.

    ai_process_instruction, execute_action, ai_plan_recovery and
    ai_should_retry stand in for your own AI and execution helpers.
    """
    max_retries = 3

    for attempt in range(max_retries):
        try:
            # Get current state
            obs = brobot.get_observation()

            # AI processes instruction
            actions = ai_process_instruction(instruction, obs)

            # Execute with validation
            for action in actions:
                result = execute_action(action)
                if not result.success:
                    # AI decides how to recover
                    recovery = ai_plan_recovery(action, result.error)
                    execute_action(recovery)

            return True

        except Exception as e:
            if attempt < max_retries - 1:
                # Let AI decide if we should retry
                should_retry = ai_should_retry(e, attempt)
                if not should_retry:
                    break
            else:
                raise

    return False
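ai_should_retry does not have to be a model call at all. A simple heuristic stand-in might classify the error message (the transient keywords below are illustrative):

def ai_should_retry(error: Exception, attempt: int) -> bool:
    """Heuristic stand-in for an AI call: retry failures that look
    transient, give up immediately on anything else."""
    transient_hints = ("timeout", "connection", "not found")
    message = str(error).lower()
    return any(hint in message for hint in transient_hints)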
2. State Verification
import time

def verify_state_transition(expected_state: str, timeout: float = 10):
    """Verify state transitions with AI assistance.

    ai_verify_state, ai_suggest_correction and execute_action are
    placeholders for your own helpers.
    """
    start_time = time.time()

    while time.time() - start_time < timeout:
        obs = brobot.get_observation()

        # AI verifies if we're in expected state
        is_correct = ai_verify_state(obs, expected_state)

        if is_correct:
            return True

        # AI suggests corrective action
        correction = ai_suggest_correction(obs, expected_state)
        if correction:
            execute_action(correction)

        time.sleep(1)

    return False
3. Context Management
class ContextAwareAutomation:
    """Maintains context across automation sessions."""

    def __init__(self):
        self.context = {
            "application": None,
            "user": None,
            "task_history": [],
            "state_history": []
        }

    def execute_with_context(self, task: str):
        # Add current context to AI prompt
        enhanced_task = f"""
        Task: {task}
        Application: {self.context['application']}
        Previous tasks: {self.context['task_history'][-5:]}
        """

        result = ai_execute(enhanced_task)

        # Update context
        self.context['task_history'].append(task)

        return result
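With history accumulating, follow-up instructions can refer back to earlier ones. A short illustrative session (the application name, tasks, and address are made up, and ai_execute remains a placeholder for your model call):

automation = ContextAwareAutomation()
automation.context["application"] = "invoicing_app"

automation.execute_with_context("Create a new invoice for ACME Corp")
# The follow-up can rely on the first task appearing in task_history
automation.execute_with_context("Email that invoice to billing@example.com")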
Performance Optimization
Caching AI Decisions
import hashlib
import json
from functools import lru_cache

class CachedAIAutomation:
    """Cache AI decisions for repeated scenarios.

    ai_model and execute_decision are placeholders for your model
    wrapper and action dispatcher.
    """

    @lru_cache(maxsize=100)
    def get_ai_decision(self, state_hash: str, task: str):
        """Cache AI decisions based on state and task.

        Note: lru_cache on a method also keys on self and keeps the
        instance alive, which is fine for a long-lived singleton.
        """
        return ai_model.decide(state_hash, task)

    def execute_task(self, task: str):
        obs = brobot.get_observation()

        # Create hash of current state
        state_data = {
            "states": [s.name for s in obs.active_states],
            "screen_size": (obs.screen_width, obs.screen_height)
        }
        state_hash = hashlib.md5(
            json.dumps(state_data, sort_keys=True).encode()
        ).hexdigest()

        # Get cached or new decision
        decision = self.get_ai_decision(state_hash, task)

        # Execute decision
        return execute_decision(decision)
Parallel Processing
async def parallel_ui_analysis():
    """Analyze UI using multiple AI models in parallel.

    gpt_analyze_ui, claude_analyze_ui and local_model_analyze are
    your own async wrappers around each model.
    """

    async def gpt_analysis():
        return await gpt_analyze_ui(brobot.get_observation())

    async def claude_analysis():
        return await claude_analyze_ui(brobot.get_observation())

    async def local_model_analysis():
        return await local_model_analyze(brobot.get_observation())

    # Run all analyses in parallel
    results = await asyncio.gather(
        gpt_analysis(),
        claude_analysis(),
        local_model_analysis()
    )

    # Combine insights
    return combine_ai_insights(results)
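combine_ai_insights is left open above. A simple majority-vote sketch, assuming each analysis returns a dict with a suggested_action key (that shape is an assumption about your analyzer wrappers):

from collections import Counter

def combine_ai_insights(results: list) -> dict:
    """Majority vote over the models' suggested actions; ties fall back
    to the first model's answer via Counter ordering."""
    votes = Counter(r["suggested_action"] for r in results)
    action, count = votes.most_common(1)[0]
    return {"suggested_action": action, "agreement": count / len(results)}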
Next Steps
- Explore the API Reference for detailed endpoint information
- Read Troubleshooting for common issues
- Join our Discord for community support