computers/shared/base_playwright.py (111 lines of code) (raw):

import time import base64 from typing import List, Dict, Literal from playwright.sync_api import sync_playwright, Browser, Page from utils import check_blocklisted_url # Optional: key mapping if your model uses "CUA" style keys CUA_KEY_TO_PLAYWRIGHT_KEY = { "/": "Divide", "\\": "Backslash", "alt": "Alt", "arrowdown": "ArrowDown", "arrowleft": "ArrowLeft", "arrowright": "ArrowRight", "arrowup": "ArrowUp", "backspace": "Backspace", "capslock": "CapsLock", "cmd": "Meta", "ctrl": "Control", "delete": "Delete", "end": "End", "enter": "Enter", "esc": "Escape", "home": "Home", "insert": "Insert", "option": "Alt", "pagedown": "PageDown", "pageup": "PageUp", "shift": "Shift", "space": " ", "super": "Meta", "tab": "Tab", "win": "Meta", } class BasePlaywrightComputer: """ Abstract base for Playwright-based computers: - Subclasses override `_get_browser_and_page()` to do local or remote connection, returning (Browser, Page). - This base class handles context creation (`__enter__`/`__exit__`), plus standard "Computer" actions like click, scroll, etc. - We also have extra browser actions: `goto(url)` and `back()`. """ def get_environment(self): return "browser" def get_dimensions(self): return (1024, 768) def __init__(self): self._playwright = None self._browser: Browser | None = None self._page: Page | None = None def __enter__(self): # Start Playwright and call the subclass hook for getting browser/page self._playwright = sync_playwright().start() self._browser, self._page = self._get_browser_and_page() # Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS def handle_route(route, request): url = request.url if check_blocklisted_url(url): print(f"Flagging blocked domain: {url}") route.abort() else: route.continue_() self._page.route("**/*", handle_route) return self def __exit__(self, exc_type, exc_val, exc_tb): if self._browser: self._browser.close() if self._playwright: self._playwright.stop() def get_current_url(self) -> str: return self._page.url # --- Common "Computer" actions --- def screenshot(self) -> str: """Capture only the viewport (not full_page).""" png_bytes = self._page.screenshot(full_page=False) return base64.b64encode(png_bytes).decode("utf-8") def click(self, x: int, y: int, button: str = "left") -> None: match button: case "back": self.back() case "forward": self.forward() case "wheel": self._page.mouse.wheel(x, y) case _: button_mapping = {"left": "left", "right": "right"} button_type = button_mapping.get(button, "left") self._page.mouse.click(x, y, button=button_type) def double_click(self, x: int, y: int) -> None: self._page.mouse.dblclick(x, y) def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: self._page.mouse.move(x, y) self._page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})") def type(self, text: str) -> None: self._page.keyboard.type(text) def wait(self, ms: int = 1000) -> None: time.sleep(ms / 1000) def move(self, x: int, y: int) -> None: self._page.mouse.move(x, y) def keypress(self, keys: List[str]) -> None: mapped_keys = [CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) for key in keys] for key in mapped_keys: self._page.keyboard.down(key) for key in reversed(mapped_keys): self._page.keyboard.up(key) def drag(self, path: List[Dict[str, int]]) -> None: if not path: return self._page.mouse.move(path[0]["x"], path[0]["y"]) self._page.mouse.down() for point in path[1:]: self._page.mouse.move(point["x"], point["y"]) self._page.mouse.up() # --- Extra browser-oriented actions --- def goto(self, url: str) -> None: try: return self._page.goto(url) except Exception as e: print(f"Error navigating to {url}: {e}") def back(self) -> None: return self._page.go_back() def forward(self) -> None: return self._page.go_forward() # --- Subclass hook --- def _get_browser_and_page(self) -> tuple[Browser, Page]: """Subclasses must implement, returning (Browser, Page).""" raise NotImplementedError