Skip to content

[BUG]: to_image gets stuck very rarely #5549

@leviska

Description

@leviska

Description

to_image randomly gets stuck indefinitely. It seems that the call into the browser made by kaleido breaks somewhere.

I captured these call stacks with py-spy:

Thread 3146004 (idle): "ThreadPoolExecutor-0_22"
    _wait_for_tstate_lock (threading.py:1169)
    join (threading.py:1149)
    oneshot_async_run (kaleido/_sync_server.py:128)
    calc_fig_sync (kaleido/__init__.py:171)
    to_image (plotly/io/_kaleido.py:398)
    render_image (testtest/test_plotly.py:111)
    run (concurrent/futures/thread.py:59)
    _worker (concurrent/futures/thread.py:93)
    run (threading.py:1012)
    _bootstrap_inner (threading.py:1075)
    _bootstrap (threading.py:1032)

Thread 3197159 (idle): "browser_procThread"
    read_pipe (logistro/_api.py:223)
    run (threading.py:1012)
    _bootstrap_inner (threading.py:1075)
    _bootstrap (threading.py:1032)

Steps to reproduce

On my machine this script produces roughly 1 stuck thread out of 1000 tasks, so you might need to rerun it a few times.

Example of broken output:

  [OK] task=0996 total=1.70s to_image=1.70s make_fig=0.00s 31472 bytes
  [OK] task=0995 total=1.78s to_image=1.78s make_fig=0.00s 27284 bytes
  [OK] task=0999 total=1.68s to_image=1.68s make_fig=0.00s 32319 bytes
  [WATCHDOG] ⚠️  'render_total[265]' has been running for >40s (elapsed: 40.4s)
  [WATCHDOG] ⚠️  'to_image[265]' has been running for >40s (elapsed: 40.4s)
  [WATCHDOG] ⚠️  'render_total[265]' has been running for >45s (elapsed: 45.4s)
  [WATCHDOG] ⚠️  'to_image[265]' has been running for >45s (elapsed: 45.4s)
  [...]
  [WATCHDOG] ⚠️  'render_total[265]' has been running for >445s (elapsed: 445.5s)
  [WATCHDOG] ⚠️  'to_image[265]' has been running for >445s (elapsed: 445.5s)

I'm not sure what the problem is exactly — perhaps it is related to multithreading — but we have long-running scripts that call to_image a few thousand times (they update the image as new data appears), and they periodically get stuck on this same call stack.

Note: we encountered this problem in our real codebase; I asked Claude to write this reproducible example.

import threading
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed

import plotly.graph_objects as go
import plotly.io as pio


# --- Watchdog ---

class Watchdog:
    """Monitors active tasks and warns if any exceed 5s intervals."""

    def __init__(self, interval: float = 1.0, warn_every: float = 5.0):
        self.interval = interval
        self.warn_every = warn_every
        self._lock = threading.Lock()
        self._tasks: dict[str, float] = {}  # task name -> perf_counter() start time
        self._thread = threading.Thread(target=self._run, daemon=True, name="Watchdog")

    def register(self, name: str):
        """Begin tracking *name* from the current moment."""
        with self._lock:
            self._tasks[name] = time.perf_counter()

    def unregister(self, name: str):
        """Stop tracking *name*; silently ignores unknown names."""
        with self._lock:
            self._tasks.pop(name, None)

    def _run(self):
        # name -> highest multiple of warn_every already reported for it
        last_level: dict[str, int] = {}
        while True:
            time.sleep(self.interval)
            now = time.perf_counter()
            # Copy under the lock so iteration happens without holding it.
            with self._lock:
                active = dict(self._tasks)
            for name, started in active.items():
                running_for = now - started
                level = int(running_for / self.warn_every)
                if level < 1 or last_level.get(name, 0) >= level:
                    continue
                last_level[name] = level
                print(
                    f"  [WATCHDOG] ⚠️  '{name}' has been running for >{level * self.warn_every:.0f}s "
                    f"(elapsed: {running_for:.1f}s)",
                    flush=True,
                )
            # Drop bookkeeping for tasks that have since finished.
            for finished in [n for n in last_level if n not in active]:
                del last_level[finished]

    def start(self):
        """Launch the background monitor thread; returns self for chaining."""
        self._thread.start()
        return self


# Module-level singleton; .start() launches the daemon monitor thread at import time.
watchdog = Watchdog(interval=1.0, warn_every=5.0).start()


# --- Timed context manager ---

class timed:
    """Context manager that registers a task with the watchdog and records duration.

    After the ``with`` block exits, ``elapsed`` holds the wall-clock seconds
    spent inside the block (measured with time.perf_counter()).
    """

    def __init__(self, name: str):
        self.name = name
        self.start = 0.0
        self.elapsed = 0.0

    def __enter__(self):
        # Take the timestamp first, then announce the task to the watchdog.
        self.start = time.perf_counter()
        watchdog.register(self.name)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Record duration even when the body raised, then stop the watchdog tracking.
        self.elapsed = time.perf_counter() - self.start
        watchdog.unregister(self.name)


# --- Plotly rendering ---

def make_test_fig(thread_id: int) -> go.Figure:
    """Build a tiny scatter figure whose y-values depend on *thread_id*."""
    with timed(f"make_fig[{thread_id}]") as timer:
        xs = list(range(10))
        ys = [i * thread_id % 7 for i in range(10)]
        fig = go.Figure(
            data=[go.Scatter(x=xs, y=ys)],
            layout=go.Layout(title=f"Thread {thread_id}"),
        )
    # Figure construction should be near-instant; flag anything over a second.
    if timer.elapsed > 1.0:
        print(f"  [SLOW] make_fig[{thread_id}] took {timer.elapsed:.2f}s", flush=True)
    return fig


def render_image(thread_id: int) -> dict:
    """Render one test figure to PNG bytes and return a result/timing record.

    Returns a dict with keys: thread_id, success, error (str or None),
    bytes (PNG size), and the durations of figure construction, to_image,
    and the whole call, in seconds.
    """
    result = {
        "thread_id": thread_id,
        "success": False,
        "error": None,
        "bytes": 0,
        "duration_make_fig": 0.0,
        "duration_to_image": 0.0,
        "duration_total": 0.0,
    }

    with timed(f"render_total[{thread_id}]") as total:
        try:
            # BUG FIX: the original read total.elapsed here, but timed only
            # assigns elapsed in __exit__, so duration_make_fig was always
            # 0.0 (see the "make_fig=0.00s" lines in the sample output).
            # Time the figure construction directly instead.
            fig_start = time.perf_counter()
            fig = make_test_fig(thread_id)
            result["duration_make_fig"] = time.perf_counter() - fig_start

            with timed(f"to_image[{thread_id}]") as t_img:
                img_bytes = pio.to_image(fig, format="png")
            result["duration_to_image"] = t_img.elapsed

            result["success"] = True
            result["bytes"] = len(img_bytes)

        except Exception as e:
            # Keep the full traceback so failures are diagnosable from logs.
            result["error"] = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"

    # total.elapsed is valid here: the with block above has exited.
    result["duration_total"] = total.elapsed
    return result


# --- Main ---

def main():
    """Fan render_image out across a thread pool, print per-task lines, then a summary."""
    n_threads = 64
    n_tasks = 1000
    print(f"Launching {n_tasks} tasks across {n_threads} workers...\n")

    wall_start = time.perf_counter()
    results = []

    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        pending = {executor.submit(render_image, i): i for i in range(n_tasks)}
        # Report each task as soon as it finishes, not in submission order.
        for fut in as_completed(pending):
            record = fut.result()
            tag = "OK" if record["success"] else "FAIL"
            line = (
                f"  [{tag}] task={record['thread_id']:04d} "
                f"total={record['duration_total']:.2f}s "
                f"to_image={record['duration_to_image']:.2f}s "
                f"make_fig={record['duration_make_fig']:.2f}s "
                f"{record['bytes']} bytes"
            )
            print(line, flush=True)
            if record["error"]:
                print(f"         ERROR: {record['error'][:300]}", flush=True)
            results.append(record)

    wall_elapsed = time.perf_counter() - wall_start
    successes = [r for r in results if r["success"]]
    failures = [r for r in results if not r["success"]]

    print()
    print("=== Summary ===")
    print(f"  Total tasks   : {n_tasks}")
    print(f"  Successes     : {len(successes)}")
    print(f"  Failures      : {len(failures)}")
    print(f"  Wall time     : {wall_elapsed:.2f}s")
    if successes:
        stats = [
            ("total", "duration_total"),
            ("to_image", "duration_to_image"),
            ("make_fig", "duration_make_fig"),
        ]
        for label, key in stats:
            vals = [r[key] for r in successes]
            print(f"  {label:10s}    : min={min(vals):.2f}s  max={max(vals):.2f}s  avg={sum(vals)/len(vals):.2f}s")

    if failures:
        print("\n=== Errors ===")
        for r in failures:
            print(f"  Task {r['thread_id']}: {r['error']}")


# Entry point: run the stress test only when executed as a script, not on import.
if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugsomething broken

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions