Skip to main content
Version: Next

AI Integration Examples

Learn how to integrate popular AI services and frameworks with the Brobot MCP Server.

OpenAI GPT Integration

Basic GPT-4 Integration

import openai
from brobot_client import BrobotClient
import json

# Initialize clients
openai.api_key = "your-api-key"
brobot = BrobotClient()

def execute_natural_language_command(instruction: str):
"""Execute a natural language command using GPT-4."""

# Get current screen state
observation = brobot.get_observation()
active_states = [s.name for s in observation.active_states]

# Create prompt for GPT-4
prompt = f"""
Current application state: {active_states}
User instruction: {instruction}

Available actions:
- click(image_pattern) - Click on UI element
- type_text(text) - Type text
- wait_for_state(state_name) - Wait for state

Respond with a JSON array of actions to execute.
Example: [{"action": "click", "params": {"pattern": "login_btn.png"}}]
"""

# Get GPT-4 response
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{"role": "system", "content": "You are a UI automation assistant."},
{"role": "user", "content": prompt}
],
temperature=0.3
)

# Parse and execute actions
actions = json.loads(response.choices[0].message.content)

for action in actions:
if action["action"] == "click":
brobot.click(action["params"]["pattern"])
elif action["action"] == "type_text":
brobot.type_text(action["params"]["text"])
elif action["action"] == "wait_for_state":
brobot.wait_for_state(action["params"]["state"])

# Example usage
execute_natural_language_command("Log into the application with username 'demo'")

Vision-Enabled GPT-4V

import base64
from openai import OpenAI

client = OpenAI()
brobot = BrobotClient()

def analyze_and_act():
"""Use GPT-4V to analyze screenshots and decide actions."""

# Get observation with screenshot
obs = brobot.get_observation()

# Prepare image for GPT-4V
image_base64 = obs.screenshot

response = client.chat.completions.create(
model="gpt-4-vision-preview",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What UI elements do you see? What should I click to login?"
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{image_base64}"
}
}
]
}
],
max_tokens=300
)

# Execute suggested action
suggestion = response.choices[0].message.content
print(f"GPT-4V suggests: {suggestion}")

Anthropic Claude Integration

Claude 3 with Computer Use

from anthropic import Anthropic
from brobot_client import BrobotClient
import asyncio

anthropic = Anthropic(api_key="your-api-key")
brobot = BrobotClient()

class ClaudeAutomationAgent:
"""Agent that uses Claude to control applications."""

def __init__(self):
self.conversation = []

async def process_task(self, task: str):
"""Process a high-level task using Claude."""

# Get current state
obs = brobot.get_observation()

# Build context
context = f"""
Task: {task}
Current screen: {obs.active_states}
Available actions: click, type, drag, wait

Plan and execute the steps needed to complete this task.
"""

response = anthropic.messages.create(
model="claude-3-opus",
messages=[{"role": "user", "content": context}],
max_tokens=1000
)

# Execute Claude's plan
await self._execute_plan(response.content)

async def _execute_plan(self, plan: str):
"""Parse and execute Claude's plan."""
# Implementation depends on Claude's response format
pass

# Usage
agent = ClaudeAutomationAgent()
await agent.process_task("Create a new document and save it as 'report.pdf'")

Interactive Claude Assistant

def create_interactive_assistant():
"""Create an interactive automation assistant with Claude."""

class InteractiveSession:
def __init__(self):
self.messages = []

def chat(self, user_input: str):
# Add context about current screen
obs = brobot.get_observation()

enhanced_input = f"""
User: {user_input}

Current application state: {obs.get_most_confident_state().name}
Visible elements: {[s.name for s in obs.active_states]}
"""

self.messages.append({"role": "user", "content": enhanced_input})

response = anthropic.messages.create(
model="claude-3-sonnet",
messages=self.messages
)

self.messages.append({"role": "assistant", "content": response.content})

return response.content

return InteractiveSession()

# Interactive usage
session = create_interactive_assistant()
print(session.chat("How do I navigate to settings?"))
print(session.chat("Now change the theme to dark mode"))

LangChain Integration

Brobot as LangChain Tool

from langchain.agents import initialize_agent, Tool
from langchain.llms import OpenAI
from brobot_client import BrobotClient

# Create Brobot tools for LangChain
def create_brobot_tools():
client = BrobotClient()

def observe_screen(query: str = "") -> str:
"""Observe current screen state."""
obs = client.get_observation()
states = [f"{s.name} ({s.confidence:.0%})" for s in obs.active_states]
return f"Active states: {', '.join(states)}"

def click_element(pattern: str) -> str:
"""Click on a UI element."""
try:
result = client.click(pattern)
return f"Clicked {pattern} successfully"
except Exception as e:
return f"Failed to click {pattern}: {str(e)}"

def type_text(text: str) -> str:
"""Type text in current field."""
result = client.type_text(text)
return f"Typed '{text}'"

return [
Tool(
name="ObserveScreen",
func=observe_screen,
description="Get current screen state and active UI elements"
),
Tool(
name="Click",
func=click_element,
description="Click on UI element by image pattern name"
),
Tool(
name="Type",
func=type_text,
description="Type text into current field"
)
]

# Create agent
llm = OpenAI(temperature=0)
tools = create_brobot_tools()
agent = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True)

# Use agent
agent.run("Log into the application with username 'demo@example.com'")

Custom LangChain Chain

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

class BrobotAutomationChain:
"""Custom chain for complex automations."""

def __init__(self):
self.brobot = BrobotClient()
self.memory = ConversationBufferMemory()

self.planner_prompt = PromptTemplate(
input_variables=["task", "current_state"],
template="""
Task: {task}
Current State: {current_state}

Create a step-by-step plan to complete this task.
Format: numbered list of actions
"""
)

self.planner = LLMChain(
llm=OpenAI(temperature=0.3),
prompt=self.planner_prompt,
memory=self.memory
)

def execute_task(self, task: str):
# Get current state
obs = self.brobot.get_observation()
current_state = obs.get_most_confident_state().name

# Generate plan
plan = self.planner.run(task=task, current_state=current_state)

# Execute plan steps
for step in plan.split('\n'):
if 'click' in step.lower():
# Extract pattern and click
pass
elif 'type' in step.lower():
# Extract text and type
pass

# Usage
chain = BrobotAutomationChain()
chain.execute_task("Create a new invoice for $1,500")

AutoGPT/Agent Frameworks

AutoGPT Plugin

class BrobotAutoGPTPlugin:
"""Plugin to give AutoGPT UI control capabilities."""

def __init__(self):
self.client = BrobotClient()

def get_commands(self):
return {
"ui_observe": self.observe,
"ui_click": self.click,
"ui_type": self.type_text,
"ui_wait": self.wait_for_state
}

def observe(self) -> dict:
"""Observe current UI state."""
obs = self.client.get_observation()
return {
"states": [s.name for s in obs.active_states],
"screenshot_available": bool(obs.screenshot)
}

def click(self, target: str) -> dict:
"""Click UI element."""
try:
self.client.click(target)
return {"success": True, "message": f"Clicked {target}"}
except Exception as e:
return {"success": False, "error": str(e)}

def type_text(self, text: str) -> dict:
"""Type text."""
self.client.type_text(text)
return {"success": True, "message": f"Typed: {text}"}

def wait_for_state(self, state: str, timeout: float = 10) -> dict:
"""Wait for specific state."""
try:
self.client.wait_for_state(state, timeout)
return {"success": True, "message": f"Reached state: {state}"}
except:
return {"success": False, "error": "Timeout waiting for state"}

Multi-Agent Systems

Coordinator-Worker Pattern

import asyncio
from typing import List, Dict

class AutomationCoordinator:
"""Coordinates multiple AI agents for complex tasks."""

def __init__(self):
self.brobot = BrobotClient()
self.observer_agent = ObserverAgent()
self.planner_agent = PlannerAgent()
self.executor_agent = ExecutorAgent()

async def execute_complex_task(self, task: str):
# Observer analyzes current state
state_analysis = await self.observer_agent.analyze(self.brobot)

# Planner creates execution plan
plan = await self.planner_agent.create_plan(task, state_analysis)

# Executor carries out plan
results = await self.executor_agent.execute(plan, self.brobot)

return results

class ObserverAgent:
"""Specialized in understanding UI state."""

async def analyze(self, brobot: BrobotClient) -> Dict:
obs = brobot.get_observation()

# Use AI to analyze screenshot and states
analysis = {
"current_screen": self._identify_screen(obs),
"available_actions": self._find_actionable_elements(obs),
"navigation_options": self._identify_navigation(obs)
}

return analysis

class PlannerAgent:
"""Creates execution plans."""

async def create_plan(self, task: str, state: Dict) -> List[Dict]:
# Use AI to create step-by-step plan
pass

class ExecutorAgent:
"""Executes plans reliably."""

async def execute(self, plan: List[Dict], brobot: BrobotClient):
results = []
for step in plan:
result = await self._execute_step(step, brobot)
results.append(result)

if not result["success"]:
# Handle failures
break

return results

Best Practices

1. Error Handling

def safe_automation(instruction: str):
"""Automation with comprehensive error handling."""
max_retries = 3

for attempt in range(max_retries):
try:
# Get current state
obs = brobot.get_observation()

# AI processes instruction
actions = ai_process_instruction(instruction, obs)

# Execute with validation
for action in actions:
result = execute_action(action)
if not result.success:
# AI decides how to recover
recovery = ai_plan_recovery(action, result.error)
execute_action(recovery)

return True

except Exception as e:
if attempt < max_retries - 1:
# Let AI decide if we should retry
should_retry = ai_should_retry(e, attempt)
if not should_retry:
break
else:
raise

2. State Verification

def verify_state_transition(expected_state: str, timeout: float = 10):
"""Verify state transitions with AI assistance."""
start_time = time.time()

while time.time() - start_time < timeout:
obs = brobot.get_observation()

# AI verifies if we're in expected state
is_correct = ai_verify_state(obs, expected_state)

if is_correct:
return True

# AI suggests corrective action
correction = ai_suggest_correction(obs, expected_state)
if correction:
execute_action(correction)

time.sleep(1)

return False

3. Context Management

class ContextAwareAutomation:
"""Maintains context across automation sessions."""

def __init__(self):
self.context = {
"application": None,
"user": None,
"task_history": [],
"state_history": []
}

def execute_with_context(self, task: str):
# Add current context to AI prompt
enhanced_task = f"""
Task: {task}
Application: {self.context['application']}
Previous tasks: {self.context['task_history'][-5:]}
"""

result = ai_execute(enhanced_task)

# Update context
self.context['task_history'].append(task)
return result

Performance Optimization

Caching AI Decisions

from functools import lru_cache
import hashlib

class CachedAIAutomation:
"""Cache AI decisions for repeated scenarios."""

@lru_cache(maxsize=100)
def get_ai_decision(self, state_hash: str, task: str):
"""Cache AI decisions based on state and task."""
return ai_model.decide(state_hash, task)

def execute_task(self, task: str):
obs = brobot.get_observation()

# Create hash of current state
state_data = {
"states": [s.name for s in obs.active_states],
"screen_size": (obs.screen_width, obs.screen_height)
}
state_hash = hashlib.md5(
json.dumps(state_data, sort_keys=True).encode()
).hexdigest()

# Get cached or new decision
decision = self.get_ai_decision(state_hash, task)

# Execute decision
return execute_decision(decision)

Parallel Processing

async def parallel_ui_analysis():
"""Analyze UI using multiple AI models in parallel."""

async def gpt_analysis():
return await gpt_analyze_ui(brobot.get_observation())

async def claude_analysis():
return await claude_analyze_ui(brobot.get_observation())

async def local_model_analysis():
return await local_model_analyze(brobot.get_observation())

# Run all analyses in parallel
results = await asyncio.gather(
gpt_analysis(),
claude_analysis(),
local_model_analysis()
)

# Combine insights
return combine_ai_insights(results)

Next Steps