computers/default/docker.py (105 lines of code) (raw):

import shlex
import subprocess
import time


class DockerComputer:
    """Control a desktop session inside an already-running Docker container.

    Every action is executed with ``docker exec <container> sh -c <cmd>``;
    mouse and keyboard input go through ``xdotool`` and screenshots through
    ImageMagick's ``import``.  This class never starts or stops the
    container: ``__enter__`` only verifies that it is running.
    """

    # Fallback geometry; replaced per instance by __enter__ once the real
    # display size has been queried from the container.
    dimensions: tuple[int, int] = (1280, 720)

    def get_environment(self):
        """Return the environment identifier of this computer (``"linux"``)."""
        return "linux"

    def get_dimensions(self):
        """Return the display size as ``(width, height)``.

        This is the geometry detected by ``__enter__`` when available and
        the ``(1280, 720)`` fallback otherwise.
        """
        return self.dimensions

    def __init__(
        self,
        container_name="cua-sample-app",
        image="ghcr.io/openai/openai-cua-sample-app:latest",
        display=":99",
        port_mapping="5900:5900",
    ):
        """Store the connection settings; no Docker command is run here.

        Args:
            container_name: Name of the running container to attach to.
            image: Image reference (stored only; not used to start anything).
            display: X display used for every xdotool / import command.
            port_mapping: Port mapping shown in the "not running" hint.
        """
        self.container_name = container_name
        self.image = image
        self.display = display
        self.port_mapping = port_mapping

    def __enter__(self):
        """Verify the container is running and read its display geometry.

        Returns:
            This instance.

        Raises:
            RuntimeError: If ``docker ps`` lists no container matching
                ``container_name``.
            subprocess.CalledProcessError: If the geometry query fails.
        """
        result = subprocess.run(
            ["docker", "ps", "-q", "-f", f"name={self.container_name}"],
            capture_output=True,
            text=True,
        )
        if not result.stdout.strip():
            raise RuntimeError(
                f"Container {self.container_name} is not running. Build and run with:\n"
                f"docker build -t {self.container_name} .\n"
                f"docker run --rm -it --name {self.container_name} "
                f"-p {self.port_mapping} -e DISPLAY={self.display} {self.container_name}"
            )

        # xdotool prints "<width> <height>"; keep the fallback if it prints nothing.
        geometry = self._xdotool("getdisplaygeometry").strip()
        if geometry:
            width, height = geometry.split()
            self.dimensions = (int(width), int(height))
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the context; the container is intentionally left running."""
        return None

    def _exec(self, cmd: str) -> str:
        """Run ``cmd`` through ``sh -c`` inside the container.

        The command is handed to ``docker exec`` as a single argv element,
        so no shell on the host ever parses it: ``$``, backticks, quotes
        and backslashes in ``cmd`` reach the container's shell unchanged.

        Args:
            cmd: Shell command line to run in the container.

        Returns:
            The command's stdout decoded as UTF-8 (undecodable bytes dropped).

        Raises:
            subprocess.CalledProcessError: If the command exits non-zero.
        """
        output = subprocess.check_output(
            ["docker", "exec", self.container_name, "sh", "-c", cmd]
        )
        return output.decode("utf-8", errors="ignore")

    def _xdotool(self, *args) -> str:
        """Run ``xdotool`` on the configured display with shell-quoted args."""
        quoted = " ".join(shlex.quote(str(arg)) for arg in args)
        return self._exec(f"DISPLAY={self.display} xdotool {quoted}")

    def screenshot(self) -> str:
        """Capture the root window with ImageMagick's ``import``.

        Returns:
            The PNG image as a single-line base64 string.
        """
        cmd = (
            f"export DISPLAY={self.display} && "
            "import -window root png:- | base64 -w 0"
        )
        return self._exec(cmd)

    def click(self, x: int, y: int, button: str = "left") -> None:
        """Move the pointer to ``(x, y)`` and click.

        ``button`` is ``"left"``, ``"middle"`` or ``"right"``; any other
        value falls back to the left button.
        """
        button_map = {"left": 1, "middle": 2, "right": 3}
        self._xdotool("mousemove", x, y, "click", button_map.get(button, 1))

    def double_click(self, x: int, y: int) -> None:
        """Move the pointer to ``(x, y)`` and double-click the left button."""
        self._xdotool("mousemove", x, y, "click", "--repeat", 2, 1)

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Move the pointer to ``(x, y)`` and scroll with wheel clicks.

        One wheel click is sent per unit of ``scroll_y`` / ``scroll_x``.
        Vertical scrolling uses X11 buttons 4 (up, negative) and 5 (down,
        positive); horizontal scrolling uses buttons 6 (left, negative) and
        7 (right, positive).  Everything is sent as one chained xdotool
        invocation instead of one ``docker exec`` per wheel click.
        """
        args = ["mousemove", x, y]
        for amount, negative_button, positive_button in (
            (scroll_y, 4, 5),
            (scroll_x, 6, 7),
        ):
            if amount:
                button = negative_button if amount < 0 else positive_button
                args += ["click", "--repeat", abs(amount), button]
        self._xdotool(*args)

    def type(self, text: str) -> None:
        """Type ``text`` literally via ``xdotool type``.

        The text is shell-quoted, so spaces, quotes, ``$`` and backticks
        are typed as-is rather than interpreted by a shell.
        """
        self._xdotool("type", "--", text)

    def wait(self, ms: int = 1000) -> None:
        """Sleep for ``ms`` milliseconds (negative values are treated as 0)."""
        time.sleep(max(ms, 0) / 1000)

    def move(self, x: int, y: int) -> None:
        """Move the pointer to ``(x, y)`` without clicking."""
        self._xdotool("mousemove", x, y)

    def keypress(self, keys: list[str]) -> None:
        """Press ``keys`` together as one xdotool key combination.

        Known aliases (``ENTER``, ``ESC``, arrow names, ...) are translated
        to X keysym names case-insensitively; unknown keys are passed
        through unchanged.  An empty list is a no-op.
        """
        if not keys:
            # "xdotool key" with no argument would only fail.
            return
        mapping = {
            "ENTER": "Return",
            "LEFT": "Left",
            "RIGHT": "Right",
            "UP": "Up",
            "DOWN": "Down",
            "ESC": "Escape",
            "SPACE": "space",
            "BACKSPACE": "BackSpace",
            "TAB": "Tab",
        }
        combo = "+".join(mapping.get(key.upper(), key) for key in keys)
        self._xdotool("key", combo)

    def drag(self, path: list[dict[str, int]]) -> None:
        """Drag with the left button along ``path``.

        ``path`` is a list of ``{"x": ..., "y": ...}`` points: the button
        is pressed at the first point, the pointer visits the remaining
        points in order, and the button is then released.  An empty path
        is a no-op.
        """
        if not path:
            return
        start = path[0]
        self._xdotool("mousemove", start["x"], start["y"], "mousedown", 1)
        try:
            for point in path[1:]:
                self._xdotool("mousemove", point["x"], point["y"])
        finally:
            # Always release, so a failed move cannot leave the button held.
            self._xdotool("mouseup", 1)

    def get_current_url(self):
        """Return ``None``; this computer exposes no browser URL."""
        return None