# Playwright Integration - Browser Use (original) (raw)

import asyncio
import os
import subprocess
import sys
import tempfile

from pydantic import BaseModel, Field

# Check for required dependencies first - before other imports
try:
    import aiohttp  # type: ignore
    from playwright.async_api import Browser, Page, async_playwright  # type: ignore
except ImportError as e:
    print(f'❌ Missing dependencies for this example: {e}')
    print('This example requires: playwright aiohttp')
    print('Install with: uv add playwright aiohttp')
    print('Also run: playwright install chromium')
    sys.exit(1)

from browser_use import Agent, BrowserSession, ChatOpenAI, Tools
from browser_use.agent.views import ActionResult

# Global Playwright handles shared between the custom actions below.
# Both start as None; connect_playwright_to_cdp() assigns them after attaching
# to Chrome, and every custom action checks playwright_page before using it.
playwright_browser: Browser | None = None
playwright_page: Page | None = None


# Custom action parameter models
class PlaywrightFillFormAction(BaseModel):
    """Parameters for Playwright form filling action."""

    # All four fields are required: Field(...) declares no default value.
    # The playwright_fill_form action in this file writes them into the form
    # controls named custname, custtel, custemail and size, in that order.
    customer_name: str = Field(..., description='Customer name to fill')
    phone_number: str = Field(..., description='Phone number to fill')
    email: str = Field(..., description='Email address to fill')
    # Used both as the <select> option to pick and as the radio button `value` to match.
    size_option: str = Field(..., description='Size option (small/medium/large)')


class PlaywrightScreenshotAction(BaseModel):
    """Parameters for Playwright screenshot action."""

    # Passed to Playwright as the screenshot `path`; optional, defaults to a PNG filename.
    filename: str = Field(default='playwright_screenshot.png', description='Filename for screenshot')
    # Optional. playwright_screenshot forwards it only when `filename` ends in
    # .jpg/.jpeg (case-insensitive); for any other extension it is ignored.
    # No range validation is declared here despite the 1-100 wording.
    quality: int | None = Field(default=None, description='JPEG quality (1-100), only for .jpg/.jpeg files')


class PlaywrightGetTextAction(BaseModel):
    """Parameters for getting text using Playwright selectors."""

    # Required. playwright_get_text special-cases the value "title"
    # (compared case-insensitively) and returns the page title instead of
    # running a locator query; any other value is passed to page.locator().
    selector: str = Field(..., description='CSS selector to get text from. Use "title" for page title.')


async def start_chrome_with_debug_port(port: int = 9222):
    """
    Start Chrome with remote debugging enabled.

    Probes a fixed list of Chrome/Chromium locations, launches the first one
    that can be executed (using a throwaway profile directory), and polls the
    DevTools HTTP endpoint until it answers with HTTP 200.

    Args:
        port: TCP port passed to Chrome via ``--remote-debugging-port``.

    Returns:
        The asyncio subprocess handle of the running Chrome process. The
        caller is responsible for terminating it.

    Raises:
        RuntimeError: If no candidate executable could be run, or if the CDP
            endpoint did not become ready within the polling window.
    """
    # Local imports keep this block self-contained (only needed for profile cleanup).
    import atexit
    import shutil

    # Candidate executables, probed in order
    chrome_paths = [
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',  # macOS
        '/usr/bin/google-chrome',  # Linux
        '/usr/bin/chromium-browser',  # Linux Chromium
        'chrome',  # Windows/PATH
        'chromium',  # Generic
    ]

    chrome_exe = None
    for path in chrome_paths:
        # Bare command names are resolved through PATH, so they cannot be checked with os.path.exists
        if not (os.path.exists(path) or path in ('chrome', 'chromium')):
            continue
        try:
            # Test if executable works
            test_proc = await asyncio.create_subprocess_exec(
                path, '--version', stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
            )
            await test_proc.wait()
        except OSError:
            # Missing or non-executable candidate: try the next one
            continue
        chrome_exe = path
        break

    if not chrome_exe:
        raise RuntimeError('❌ Chrome not found. Please install Chrome or Chromium.')

    # Create the temporary profile only once we know Chrome exists, and make
    # sure it is removed at interpreter exit instead of accumulating on disk.
    user_data_dir = tempfile.mkdtemp(prefix='chrome_cdp_')
    atexit.register(shutil.rmtree, user_data_dir, ignore_errors=True)

    # Chrome command arguments
    cmd = [
        chrome_exe,
        f'--remote-debugging-port={port}',
        f'--user-data-dir={user_data_dir}',
        '--no-first-run',
        '--no-default-browser-check',
        '--disable-extensions',
        'about:blank',  # Start with blank page
    ]

    # Start Chrome process
    process = await asyncio.create_subprocess_exec(*cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # Wait for Chrome to start and CDP to be ready: up to 20 attempts, each
    # limited to 1 second and followed by a 1 second pause. One HTTP session
    # is reused for all attempts.
    version_url = f'http://localhost:{port}/json/version'
    cdp_ready = False
    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=1)) as session:
        for _ in range(20):
            try:
                async with session.get(version_url) as response:
                    if response.status == 200:
                        cdp_ready = True
                        break
            except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
                # Endpoint not accepting connections yet; retry after the pause
                pass
            await asyncio.sleep(1)

    if not cdp_ready:
        # Only signal a process that is still alive, and reap it so it does not linger
        if process.returncode is None:
            process.terminate()
            try:
                await asyncio.wait_for(process.wait(), 5)
            except asyncio.TimeoutError:
                process.kill()
                await process.wait()
        shutil.rmtree(user_data_dir, ignore_errors=True)
        raise RuntimeError('❌ Chrome failed to start with CDP')

    return process


# Handle returned by async_playwright().start() in connect_playwright_to_cdp();
# kept at module level so main() can stop it during cleanup.
playwright_instance = None

async def connect_playwright_to_cdp(cdp_url: str):
    """
    Connect Playwright to the same Chrome instance Browser-Use is using.
    This enables custom actions to use Playwright functions.

    Stores the started Playwright driver, the connected browser and the page
    the custom actions operate on in the module-level globals
    ``playwright_instance``, ``playwright_browser`` and ``playwright_page``.

    Args:
        cdp_url: HTTP endpoint of the Chrome DevTools Protocol server,
            e.g. ``http://localhost:9222``.
    """
    global playwright_browser, playwright_page, playwright_instance

    playwright_instance = await async_playwright().start()
    playwright_browser = await playwright_instance.chromium.connect_over_cdp(cdp_url)

    # Prefer the browser's existing (default) context so the page shares
    # cookies and storage with the tabs that are already open; only create a
    # fresh context when the browser exposes none at all.
    contexts = playwright_browser.contexts
    context = contexts[0] if contexts else await playwright_browser.new_context()

    # Get or create a page
    playwright_page = context.pages[0] if context.pages else await context.new_page()


# Create custom tools that use Playwright functions.
# The actions decorated with @tools.registry.action below register on this
# instance, and main() passes it to the Agent.
tools = Tools()


@tools.registry.action(
    "Fill out a form using Playwright's precise form filling capabilities. This uses Playwright selectors for reliable form interaction.",
    param_model=PlaywrightFillFormAction,
)
async def playwright_fill_form(params: PlaywrightFillFormAction, browser_session: BrowserSession):
    """
    Custom action that uses Playwright to fill forms with high precision.
    This demonstrates how to create Browser-Use actions that leverage Playwright's capabilities.

    Args:
        params: Name, phone, email and size values to enter.
        browser_session: Supplied by the action registry; not used here because
            the action drives the shared module-level Playwright page instead.

    Returns:
        An ActionResult whose extracted_content reports the values read back
        from the form, or an ActionResult with ``error`` set when Playwright is
        not connected or any step fails.
    """
    try:
        if not playwright_page:
            return ActionResult(error='Playwright not connected. Run setup first.')

        page = playwright_page

        # Selectors used more than once are named to keep fill and read-back in sync
        name_input = 'input[name="custname"]'
        phone_input = 'input[name="custtel"]'
        email_input = 'input[name="custemail"]'
        size_dropdown = 'select[name="size"]'

        # The requested size is embedded in a double-quoted CSS attribute value,
        # so backslashes and quotes must be escaped to keep the selector valid.
        escaped_size = params.size_option.replace('\\', '\\\\').replace('"', '\\"')
        size_radio_button = f'input[name="size"][value="{escaped_size}"]'

        # Wait for form to be ready and fill basic fields
        await page.wait_for_selector(name_input, timeout=10000)
        await page.fill(name_input, params.customer_name)
        await page.fill(phone_input, params.phone_number)
        await page.fill(email_input, params.email)

        # Handle size selection - check if it's a select dropdown or radio buttons
        has_dropdown = await page.locator(size_dropdown).count() > 0
        if has_dropdown:
            await page.select_option(size_dropdown, params.size_option)
        elif await page.locator(size_radio_button).count() > 0:
            await page.check(size_radio_button)
        else:
            # Caught by the handler below and reported as an action error
            raise ValueError(f'Could not find size input field for value: {params.size_option}')

        # Get form data to verify it was filled
        form_data = {
            'name': await page.input_value(name_input),
            'phone': await page.input_value(phone_input),
            'email': await page.input_value(email_input),
        }

        # Get size value based on input type
        if has_dropdown:
            form_data['size'] = await page.input_value(size_dropdown)
        else:
            # For radio buttons, find the checked one
            checked_radio = page.locator('input[name="size"]:checked')
            if await checked_radio.count() > 0:
                form_data['size'] = await checked_radio.get_attribute('value')
            else:
                form_data['size'] = 'none selected'

        success_msg = f'✅ Form filled successfully with Playwright: {form_data}'

        return ActionResult(
            extracted_content=success_msg, include_in_memory=True, long_term_memory=f'Filled form with: {form_data}'
        )

    except Exception as e:
        # Action boundary: report the failure to the agent instead of raising
        error_msg = f'❌ Playwright form filling failed: {str(e)}'
        return ActionResult(error=error_msg)


@tools.registry.action(
    "Take a screenshot using Playwright's screenshot capabilities with high quality and precision.",
    param_model=PlaywrightScreenshotAction,
)
async def playwright_screenshot(params: PlaywrightScreenshotAction, browser_session: BrowserSession):
    """
    Capture a full-page screenshot of the shared Playwright page.

    The image is written to ``params.filename``. ``params.quality`` is passed
    on only when it is set and the filename ends in .jpg/.jpeg. Returns an
    ActionResult describing the saved file, or one with ``error`` set when
    Playwright is not connected or the capture fails.
    """
    try:
        if not playwright_page:
            return ActionResult(error='Playwright not connected. Run setup first.')

        target_file = params.filename
        is_jpeg = target_file.lower().endswith(('.jpg', '.jpeg'))

        # Full-page capture; quality is a JPEG-only option
        options = {'path': target_file, 'full_page': True}
        if is_jpeg and params.quality is not None:
            options['quality'] = params.quality

        await playwright_page.screenshot(**options)

        return ActionResult(
            extracted_content=f'✅ Screenshot saved as {target_file} using Playwright',
            include_in_memory=True,
            long_term_memory=f'Screenshot saved: {target_file}',
        )

    except Exception as exc:
        return ActionResult(error=f'❌ Playwright screenshot failed: {str(exc)}')


@tools.registry.action(
    "Extract text from elements using Playwright's powerful CSS selectors and XPath support.", param_model=PlaywrightGetTextAction
)
async def playwright_get_text(params: PlaywrightGetTextAction, browser_session: BrowserSession):
    """
    Custom action that uses Playwright's advanced text extraction with CSS selectors and XPath.

    Args:
        params: Holds the selector to query. The value "title" (any letter
            case) returns the page title instead of querying an element.
        browser_session: Supplied by the action registry; not used here because
            the action reads from the shared module-level Playwright page.

    Returns:
        An ActionResult whose extracted_content is the string form of a dict
        with the keys selector, text_content, inner_text, tag_name and
        is_visible, or an ActionResult with ``error`` set when Playwright is
        not connected, nothing matches the selector, or extraction fails.
    """
    try:
        if not playwright_page:
            return ActionResult(error='Playwright not connected. Run setup first.')

        # Handle special selectors
        if params.selector.lower() == 'title':
            # Use page.title() for title element
            page_title = await playwright_page.title()
            result_data = {
                'selector': 'title',
                'text_content': page_title,
                'inner_text': page_title,
                'tag_name': 'TITLE',
                'is_visible': True,
            }
        else:
            # Only the first match is inspected
            element = playwright_page.locator(params.selector).first

            if await element.count() == 0:
                return ActionResult(error=f'❌ No element found with selector: {params.selector}')

            result_data = {
                'selector': params.selector,
                'text_content': await element.text_content(),
                'inner_text': await element.inner_text(),
                'tag_name': await element.evaluate('el => el.tagName'),
                'is_visible': await element.is_visible(),
            }

        return ActionResult(
            extracted_content=str(result_data),
            include_in_memory=True,
            long_term_memory=f'Extracted from {params.selector}: {result_data["text_content"]}',
        )

    except Exception as e:
        # Action boundary: report the failure to the agent instead of raising
        error_msg = f'❌ Playwright text extraction failed: {str(e)}'
        return ActionResult(error=error_msg)


async def main():
    """
    Main function demonstrating Browser-Use + Playwright integration with custom actions.

    Starts Chrome with a CDP port, attaches both Playwright and a Browser-Use
    session to it, runs an agent that uses the custom Playwright actions, and
    finally releases the Playwright browser, the Playwright driver and the
    Chrome process. Any exception raised while running is printed and re-raised.
    """
    print('🚀 Advanced Playwright + Browser-Use Integration with Custom Actions')

    # Single source of truth for the port used by both the launcher and the CDP URL
    cdp_port = 9222
    chrome_process = None
    try:
        # Step 1: Start Chrome with CDP debugging
        chrome_process = await start_chrome_with_debug_port(port=cdp_port)
        cdp_url = f'http://localhost:{cdp_port}'

        # Step 2: Connect Playwright to the same Chrome instance
        await connect_playwright_to_cdp(cdp_url)

        # Step 3: Create Browser-Use session connected to same Chrome
        browser_session = BrowserSession(cdp_url=cdp_url)

        # Step 4: Create AI agent with our custom Playwright-powered tools
        agent = Agent(
            task="""
            Please help me demonstrate the integration between Browser-Use and Playwright:
            
            1. First, navigate to https://httpbin.org/forms/post
            2. Use the 'playwright_fill_form' action to fill the form with these details:
               - Customer name: "Alice Johnson"
               - Phone: "555-9876"
               - Email: "alice@demo.com"
               - Size: "large"
            3. Take a screenshot using the 'playwright_screenshot' action and save it as "form_demo.png"
            4. Extract the title of the page using 'playwright_get_text' action with selector "title"
            5. Finally, submit the form and tell me what happened
            
            This demonstrates how Browser-Use AI can orchestrate tasks while using Playwright's precise capabilities for specific operations.
            """,
            llm=ChatOpenAI(model='gpt-4.1-mini'),
            tools=tools,  # Our custom tools with Playwright actions
            browser_session=browser_session,
        )

        print('🎯 Starting AI agent with custom Playwright actions...')

        # Step 5: Run the agent - it will use both Browser-Use actions and our custom Playwright actions
        result = await agent.run()

        # Keep browser open briefly to see results
        print(f'✅ Integration demo completed! Result: {result}')
        await asyncio.sleep(2)  # Brief pause to see results

    except Exception as e:
        print(f'❌ Error: {e}')
        raise

    finally:
        # Clean up resources. Each step is isolated so that a failure in one
        # (e.g. the browser connection is already gone) cannot prevent the
        # Chrome process from being stopped.
        if playwright_browser:
            try:
                await playwright_browser.close()
            except Exception as cleanup_error:
                print(f'⚠️ Failed to close Playwright browser: {cleanup_error}')

        if playwright_instance:
            try:
                await playwright_instance.stop()
            except Exception as cleanup_error:
                print(f'⚠️ Failed to stop Playwright: {cleanup_error}')

        # Signalling a process that already exited raises ProcessLookupError, so check first
        if chrome_process and chrome_process.returncode is None:
            chrome_process.terminate()
            try:
                await asyncio.wait_for(chrome_process.wait(), 5)
            except asyncio.TimeoutError:
                # asyncio.TimeoutError is only an alias of the builtin TimeoutError on Python 3.11+
                chrome_process.kill()
                await chrome_process.wait()

        print('✅ Cleanup complete')


if __name__ == '__main__':
    # Run the advanced integration demo only when executed as a script, not on import
    asyncio.run(main())