# MIT License
# Copyright (c) 2024
"""Multimodal recorder for Windows desktop sessions."""

from __future__ import annotations

import json
import threading
import time
import uuid
from collections import deque
from pathlib import Path
from typing import List, Optional, Tuple

import cv2  # type: ignore
import mss  # type: ignore
import numpy as np  # type: ignore
import psutil  # type: ignore
import uiautomation as auto  # type: ignore
from pynput import keyboard, mouse

from .schema import (
    EventRecord,
    FramePaths,
    MouseInfo,
    Rect,
    SessionManifest,
    UISnapshot,
    UITreeNode,
    UISelector,
    WindowInfo,
)
from .screen_recorder import ScreenRecorder


class Recorder:
    """Capture UI events, UIA context, screenshots, and screen video.

    A session is written below ``output_dir/<session_id>/`` and consists of
    ``events.jsonl``, ``manifest.json``, ``video.mp4``, and the ``frames``,
    ``frames_crops`` and ``ui_snapshots`` directories.

    Events are produced on several threads (the pynput mouse/keyboard
    listeners, the window-watcher thread and the text-flush timer), so all
    shared mutable state (``events``, ``_event_index``, ``_text_buffer``) is
    only touched while holding ``_lock``.
    """

    def __init__(self, output_dir: Path, hotkey: str = "F9", fps: int = 12, screen: int = 0) -> None:
        """Prepare session paths and in-memory state; nothing is recorded yet.

        Args:
            output_dir: Parent directory; the session gets its own
                sub-directory named after a freshly generated UUID.
            hotkey: Name of the key that stops the recording (compared
                case-insensitively against the pressed key).
            fps: Frame rate handed to the screen recorder.
            screen: Index into ``mss``'s monitor list.
        """
        self.output_dir = output_dir
        self.hotkey = hotkey
        self.fps = fps
        self.screen = screen

        self.session_id = str(uuid.uuid4())
        self.session_dir = self.output_dir / self.session_id
        self.events_path = self.session_dir / "events.jsonl"
        self.video_path = self.session_dir / "video.mp4"
        self.frames_dir = self.session_dir / "frames"
        self.frames_crops_dir = self.session_dir / "frames_crops"
        self.ui_snapshots_dir = self.session_dir / "ui_snapshots"

        self.events: List[EventRecord] = []
        self._stop_event = threading.Event()
        # Guards events, _event_index and _text_buffer (never held across I/O).
        self._lock = threading.Lock()
        self._text_buffer: List[str] = []
        self._flush_timer: Optional[threading.Timer] = None
        self._start_perf = 0.0
        self._start_ts = 0.0
        self._last_hwnd: Optional[int] = None
        self._mouse_controller = mouse.Controller()
        self._screen_recorder: Optional[ScreenRecorder] = None
        self._window_thread: Optional[threading.Thread] = None
        self._mouse_listener: Optional[mouse.Listener] = None
        self._keyboard_listener: Optional[keyboard.Listener] = None
        self._monitor: Optional[dict] = None
        self._event_index = 0
        # Per-thread holder for the UIA initializer token.
        self._uia_local = threading.local()
        self._ensure_uia_initialized()

    # Public API ---------------------------------------------------------
    def start(self) -> Path:
        """Start recording and block until the hotkey is pressed.

        Returns:
            The session directory containing everything that was written.
        """
        self.session_dir.mkdir(parents=True, exist_ok=True)
        self.frames_dir.mkdir(parents=True, exist_ok=True)
        self.frames_crops_dir.mkdir(parents=True, exist_ok=True)
        self.ui_snapshots_dir.mkdir(parents=True, exist_ok=True)

        self._start_perf = time.perf_counter()
        self._start_ts = time.time()
        with mss.mss() as sct:
            monitors = sct.monitors
            # Fall back to the first entry when the requested index is invalid.
            if 0 <= self.screen < len(monitors):
                self._monitor = monitors[self.screen]
            else:
                self._monitor = monitors[0]

        self._screen_recorder = ScreenRecorder(self.video_path, fps=self.fps, screen=self.screen)
        self._screen_recorder.start()

        self._window_thread = threading.Thread(target=self._watch_window, daemon=True)
        self._window_thread.start()

        self._mouse_listener = mouse.Listener(on_click=self._on_click)
        self._keyboard_listener = keyboard.Listener(on_press=self._on_key_press)
        self._mouse_listener.start()
        self._keyboard_listener.start()

        try:
            self._stop_event.wait()
            self._flush_text_buffer()
        finally:
            # Always persist what was captured and stop the helpers, even if
            # the wait is interrupted or the final flush raises.
            self._stop_event.set()
            self._shutdown()
        return self.session_dir

    # Event handlers -----------------------------------------------------
    def _on_click(self, x: int, y: int, button: mouse.Button, pressed: bool) -> None:
        """Record a ``mouse_click`` event for button presses (releases are ignored)."""
        if not pressed or self._stop_event.is_set():
            return
        window_info = self._get_window_info()
        selector = self._hit_test(x, y)
        mouse_info = MouseInfo(x=int(x), y=int(y), button=str(button).split(".")[-1], action="down")
        self._record_event(
            event_type="mouse_click",
            mouse_info=mouse_info,
            text=None,
            uia_selector=selector,
            window=window_info,
        )

    def _on_key_press(self, key: keyboard.Key | keyboard.KeyCode) -> Optional[bool]:
        """Buffer typed characters; stop the session when the hotkey is pressed.

        Returns:
            ``False`` once recording should stop (hotkey pressed or stop
            already requested), otherwise ``None``.
        """
        if self._is_hotkey(key):
            self._stop_event.set()
            return False
        if self._stop_event.is_set():
            return False
        ch = self._key_to_char(key)
        if ch is None:
            return None
        with self._lock:
            self._text_buffer.append(ch)
        self._schedule_flush()
        return None

    # Background watchers ------------------------------------------------
    def _watch_window(self, interval: float = 0.5) -> None:
        """Poll the foreground window and record a ``window_change`` event when its handle changes.

        Args:
            interval: Seconds between polls.
        """
        while not self._stop_event.is_set():
            info = self._get_window_info()
            hwnd = info.hwnd if info else None
            if hwnd and hwnd != self._last_hwnd:
                self._last_hwnd = hwnd
                selector = self._hit_test(*self._current_mouse_position())
                self._record_event(
                    event_type="window_change",
                    mouse_info=self._current_mouse_info(),
                    text=None,
                    uia_selector=selector,
                    window=info,
                )
            # Wait on the stop event instead of sleeping so shutdown does not
            # have to sit out the remainder of the poll interval.
            self._stop_event.wait(interval)

    # Recording helpers --------------------------------------------------
    def _shutdown(self) -> None:
        """Stop timer, listeners, watcher thread and video, then write events and manifest."""
        if self._flush_timer and self._flush_timer.is_alive():
            self._flush_timer.cancel()
        if self._mouse_listener:
            self._mouse_listener.stop()
        if self._keyboard_listener:
            self._keyboard_listener.stop()
        if self._window_thread and self._window_thread.is_alive():
            self._window_thread.join(timeout=1.0)
        if self._screen_recorder:
            self._screen_recorder.stop()
        self._write_events()
        self._write_manifest()

    def _schedule_flush(self) -> None:
        """(Re)arm the 0.8 s timer that flushes the text buffer, replacing any pending one."""
        if self._flush_timer and self._flush_timer.is_alive():
            self._flush_timer.cancel()
        self._flush_timer = threading.Timer(0.8, self._flush_text_buffer)
        self._flush_timer.daemon = True
        self._flush_timer.start()

    def _drain_text_buffer(self) -> str:
        """Atomically take the buffered characters and leave the buffer empty."""
        with self._lock:
            text = "".join(self._text_buffer)
            self._text_buffer = []
        return text

    def _flush_text_buffer(self) -> None:
        """Emit the buffered keystrokes as one ``text_input`` event (no-op when empty)."""
        text = self._drain_text_buffer()
        if not text:
            return
        mouse_info = self._current_mouse_info()
        selector = None
        if mouse_info:
            selector = self._hit_test(mouse_info.x, mouse_info.y)
        window_info = self._get_window_info()
        self._record_event(
            event_type="text_input",
            mouse_info=mouse_info,
            text=text,
            uia_selector=selector,
            window=window_info,
        )

    def _next_event_index(self) -> int:
        """Return the next 1-based event index; safe to call from any thread."""
        with self._lock:
            self._event_index += 1
            return self._event_index

    def _record_event(
        self,
        event_type: str,
        mouse_info: Optional[MouseInfo],
        text: Optional[str],
        uia_selector: Optional[UISelector],
        window: Optional[WindowInfo],
    ) -> None:
        """Capture a frame and UI snapshot for the event and append it to ``events``.

        The index is reserved under the lock first so that concurrent events
        never share frame / snapshot file names.
        """
        event_index = self._next_event_index()
        ts = time.time()
        offset_ms = int((time.perf_counter() - self._start_perf) * 1000)
        frame_paths = self._capture_frame(event_type, event_index, mouse_info, uia_selector, window)
        ui_snapshot_path = self._save_ui_snapshot(event_index, uia_selector)

        record = EventRecord(
            ts=ts,
            event_type=event_type,
            window=window,
            mouse=mouse_info,
            text=text,
            uia=uia_selector,
            frame_paths=frame_paths,
            video_time_offset_ms=offset_ms,
            ui_snapshot=ui_snapshot_path,
        )
        with self._lock:
            self.events.append(record)

    def _capture_frame(
        self,
        tag: str,
        event_index: int,
        mouse_info: Optional[MouseInfo],
        uia_selector: Optional[UISelector],
        window: Optional[WindowInfo],
    ) -> Optional[FramePaths]:
        """Grab a screenshot for the event and save it plus optional crops.

        Args:
            tag: Suffix used in the frame file name (the event type).
            event_index: Index used in all file names for this event.
            mouse_info: When given, a crop around the pointer is attempted.
            uia_selector: When it has a bounding rect, a crop of it is attempted.
            window: Window whose rect defines the captured region, if usable.

        Returns:
            The saved paths, or ``None`` when no monitor has been selected yet.
        """
        if not self._monitor:
            return None

        region = self._monitor_region(window)
        with mss.mss() as sct:
            shot = np.array(sct.grab(region))
        frame = cv2.cvtColor(shot, cv2.COLOR_BGRA2BGR)

        full_path = self.frames_dir / f"frame_{event_index:05d}_{tag}.png"
        cv2.imwrite(str(full_path), frame)

        crop_mouse_path = None
        crop_element_path = None
        if mouse_info:
            crop_mouse_path = self._save_mouse_crop(frame, region, mouse_info, event_index)
        if uia_selector and uia_selector.bounding_rect:
            crop_element_path = self._save_element_crop(frame, region, uia_selector.bounding_rect, event_index)

        return FramePaths(
            full=str(full_path),
            crop_mouse=str(crop_mouse_path) if crop_mouse_path else None,
            crop_element=str(crop_element_path) if crop_element_path else None,
        )

    @staticmethod
    def _mouse_crop_box(
        width: int,
        height: int,
        center_x: int,
        center_y: int,
        crop_w: int = 400,
        crop_h: int = 300,
    ) -> Optional[Tuple[int, int, int, int]]:
        """Compute the ``(x0, y0, x1, y1)`` crop box around a pointer position.

        Coordinates are relative to a frame of ``width`` x ``height`` pixels.
        The box is clipped to the frame. ``None`` is returned when the pointer
        lies outside the frame, because a crop would then not show the pointer.
        """
        if not (0 <= center_x < width and 0 <= center_y < height):
            return None
        x0 = max(0, center_x - crop_w // 2)
        y0 = max(0, center_y - crop_h // 2)
        x1 = min(width, x0 + crop_w)
        y1 = min(height, y0 + crop_h)
        if x1 <= x0 or y1 <= y0:
            return None
        return x0, y0, x1, y1

    def _save_mouse_crop(self, frame: np.ndarray, region: dict, mouse_info: MouseInfo, event_index: int) -> Optional[Path]:
        """Save a crop (up to 400x300) around the pointer; ``None`` if the pointer is off-frame."""
        width, height = frame.shape[1], frame.shape[0]
        # Translate screen coordinates into frame coordinates.
        center_x = int(mouse_info.x - region["left"])
        center_y = int(mouse_info.y - region["top"])
        box = self._mouse_crop_box(width, height, center_x, center_y)
        if box is None:
            return None
        x0, y0, x1, y1 = box
        crop = frame[y0:y1, x0:x1]
        path = self.frames_crops_dir / f"frame_{event_index:05d}_mouse.png"
        cv2.imwrite(str(path), crop)
        return path

    def _save_element_crop(self, frame: np.ndarray, region: dict, rect: Rect, event_index: int) -> Optional[Path]:
        """Save the part of ``rect`` that lies inside the frame; ``None`` if they do not overlap."""
        width, height = frame.shape[1], frame.shape[0]
        x0 = max(0, int(rect.left - region["left"]))
        y0 = max(0, int(rect.top - region["top"]))
        x1 = min(width, int(rect.right - region["left"]))
        y1 = min(height, int(rect.bottom - region["top"]))
        if x1 <= x0 or y1 <= y0:
            return None
        crop = frame[y0:y1, x0:x1]
        path = self.frames_crops_dir / f"frame_{event_index:05d}_element.png"
        cv2.imwrite(str(path), crop)
        return path

    def _monitor_region(self, window: Optional[WindowInfo]) -> dict:
        """Return the grab region: the window rect when it has positive size, else the monitor."""
        if window and window.rect and window.rect.width > 0 and window.rect.height > 0:
            return {
                "left": int(window.rect.left),
                "top": int(window.rect.top),
                "width": int(window.rect.width),
                "height": int(window.rect.height),
            }
        return {
            "left": int(self._monitor["left"]),
            "top": int(self._monitor["top"]),
            "width": int(self._monitor["width"]),
            "height": int(self._monitor["height"]),
        }

    def _save_ui_snapshot(self, event_index: int, selector: Optional[UISelector]) -> Optional[str]:
        """Write selector + shallow UI tree to ``ui_<index>.json``.

        Returns:
            The file path as a string, or ``None`` when there is neither a
            tree nor a selector to store.
        """
        tree = self._capture_tree(max_depth=3)
        if not tree and selector is None:
            return None
        path = self.ui_snapshots_dir / f"ui_{event_index:05d}.json"
        snapshot = UISnapshot(selector=selector, tree=tree)
        with path.open("w", encoding="utf-8") as f:
            json.dump(snapshot.dict(exclude_none=True), f, ensure_ascii=False)
        return str(path)

    # UI helpers ---------------------------------------------------------
    def _capture_tree(self, max_depth: int = 3) -> List[UITreeNode]:
        """Walk the foreground control breadth-first down to ``max_depth``.

        Returns:
            Nodes in breadth-first order (root has depth 0); empty when there
            is no foreground control or ``max_depth`` is negative.
        """
        self._ensure_uia_initialized()
        root = auto.GetForegroundControl()
        if root is None or max_depth < 0:
            return []
        nodes: List[UITreeNode] = []
        queue: deque[Tuple[auto.Control, int]] = deque([(root, 0)])  # type: ignore
        while queue:
            node, depth = queue.popleft()
            nodes.append(
                UITreeNode(
                    name=node.Name,
                    automation_id=node.AutomationId,
                    class_name=node.ClassName,
                    control_type=node.ControlTypeName,
                    depth=depth,
                )
            )
            if depth >= max_depth:
                # Children would exceed the limit; skip the GetChildren call.
                continue
            try:
                children = list(node.GetChildren())
            except Exception:
                # Best effort: a control whose children cannot be listed is
                # kept as a leaf.
                children = []
            queue.extend((child, depth + 1) for child in children)
        return nodes

    def _hit_test(self, x: int, y: int) -> Optional[UISelector]:
        """Return a selector for the control at screen point ``(x, y)``, or ``None``."""
        try:
            self._ensure_uia_initialized()
            ctrl = auto.ControlFromPoint((int(x), int(y)))
        except Exception:
            # Best effort: a failed lookup just means "no element".
            ctrl = None
        if not ctrl:
            return None
        return self._build_selector(ctrl)

    @staticmethod
    def _rect_from_control(ctrl: auto.Control) -> Optional[Rect]:  # type: ignore
        """Convert the control's ``BoundingRectangle`` (if any) into a ``Rect`` model."""
        rect = getattr(ctrl, "BoundingRectangle", None)
        if not rect:
            return None
        return Rect(left=int(rect.left), top=int(rect.top), right=int(rect.right), bottom=int(rect.bottom))

    def _get_window_info(self) -> Optional[WindowInfo]:
        """Describe the foreground window (handle, title, process name, rect), or ``None``."""
        self._ensure_uia_initialized()
        ctrl = auto.GetForegroundControl()
        if ctrl is None:
            return None
        rect_model = self._rect_from_control(ctrl)
        try:
            process_name = psutil.Process(ctrl.ProcessId).name()
        except Exception:
            # The process name is optional metadata; leave it unset on failure.
            process_name = None
        hwnd = getattr(ctrl, "NativeWindowHandle", None) or getattr(ctrl, "Handle", None)
        return WindowInfo(
            hwnd=int(hwnd) if hwnd else None,
            title=ctrl.Name,
            process_name=process_name,
            rect=rect_model,
        )

    def _build_selector(self, ctrl: auto.Control) -> UISelector:  # type: ignore
        """Build a ``UISelector`` from the identifying attributes of ``ctrl``."""
        return UISelector(
            automation_id=getattr(ctrl, "AutomationId", None),
            name=getattr(ctrl, "Name", None),
            class_name=getattr(ctrl, "ClassName", None),
            control_type=getattr(ctrl, "ControlTypeName", None),
            bounding_rect=self._rect_from_control(ctrl),
        )

    # Utility ------------------------------------------------------------
    def _key_to_char(self, key: keyboard.Key | keyboard.KeyCode) -> Optional[str]:
        """Map a key press to the text it contributes.

        Printable keys return their character, space and enter return
        ``" "`` and ``"\\n"``. Backspace removes the last buffered character
        (side effect) and returns ``None``, as does every other key.
        """
        if isinstance(key, keyboard.KeyCode) and key.char:
            return key.char
        if key == keyboard.Key.space:
            return " "
        if key == keyboard.Key.enter:
            return "\n"
        if key == keyboard.Key.backspace:
            with self._lock:
                if self._text_buffer:
                    self._text_buffer.pop()
        return None

    def _is_hotkey(self, key: keyboard.Key | keyboard.KeyCode) -> bool:
        """Return whether ``key`` matches the configured hotkey (case-insensitive)."""
        target = self.hotkey.lower()
        name = None
        if isinstance(key, keyboard.Key):
            name = (key.name or "").lower()
        elif isinstance(key, keyboard.KeyCode):
            name = (key.char or "").lower()
        return name == target

    def _current_mouse_position(self) -> Tuple[int, int]:
        """Return the current pointer position as integer ``(x, y)``."""
        pos = self._mouse_controller.position
        return int(pos[0]), int(pos[1])

    def _current_mouse_info(self) -> Optional[MouseInfo]:
        """Return a ``MouseInfo`` for the current pointer position without button/action."""
        x, y = self._current_mouse_position()
        return MouseInfo(x=int(x), y=int(y), button=None, action=None)

    def _ensure_uia_initialized(self) -> None:
        """Create the UIA initializer for the calling thread once and keep it alive."""
        if getattr(self._uia_local, "token", None) is None:
            self._uia_local.token = auto.UIAutomationInitializerInThread()

    # Persistence --------------------------------------------------------
    def _write_events(self) -> None:
        """Write all recorded events to ``events.jsonl``, one JSON object per line."""
        # Snapshot under the lock: a producer thread may still be appending.
        with self._lock:
            events = list(self.events)
        with self.events_path.open("w", encoding="utf-8") as f:
            for event in events:
                f.write(json.dumps(event.dict(exclude_none=True), ensure_ascii=False))
                f.write("\n")

    def _write_manifest(self) -> None:
        """Write ``manifest.json`` describing the session and its output locations."""
        resolution = self._resolution()
        manifest = SessionManifest(
            session_id=self.session_id,
            start_time=self._start_ts,
            end_time=time.time(),
            resolution=resolution,
            fps=self.fps,
            screen=self.screen,
            video_path=str(self.video_path),
            events_path=str(self.events_path),
            frames_dir=str(self.frames_dir),
            frames_crops_dir=str(self.frames_crops_dir),
            ui_snapshots_dir=str(self.ui_snapshots_dir),
        )
        path = self.session_dir / "manifest.json"
        with path.open("w", encoding="utf-8") as f:
            json.dump(manifest.dict(exclude_none=True), f, ensure_ascii=False, indent=2)

    def _resolution(self) -> str:
        """Return ``"<width>x<height>"`` from the monitor, the UIA screen size, or ``"unknown"``."""
        if self._monitor:
            return f"{self._monitor['width']}x{self._monitor['height']}"
        try:
            width, height = auto.GetScreenSize()
            return f"{width}x{height}"
        except Exception:
            return "unknown"