-
-
Notifications
You must be signed in to change notification settings - Fork 2.8k
[BUG]: to_image gets stuck very rarely #5549
Copy link
Copy link
Open
Labels
bug — something broken
Description
Description
to_image randomly gets stuck indefinitely. It seems that the call into the browser in kaleido breaks somewhere.
I captured these call stacks with py-spy:
Thread 3146004 (idle): "ThreadPoolExecutor-0_22"
_wait_for_tstate_lock (threading.py:1169)
join (threading.py:1149)
oneshot_async_run (kaleido/_sync_server.py:128)
calc_fig_sync (kaleido/__init__.py:171)
to_image (plotly/io/_kaleido.py:398)
render_image (testtest/test_plotly.py:111)
run (concurrent/futures/thread.py:59)
_worker (concurrent/futures/thread.py:93)
run (threading.py:1012)
_bootstrap_inner (threading.py:1075)
_bootstrap (threading.py:1032)
Thread 3197159 (idle): "browser_procThread"
read_pipe (logistro/_api.py:223)
run (threading.py:1012)
_bootstrap_inner (threading.py:1075)
_bootstrap (threading.py:1032)
Steps to reproduce
On my machine this script produces roughly 1 stuck thread out of 1000 tasks, so you might need to rerun it a few times.
Example of broken output:
[OK] task=0996 total=1.70s to_image=1.70s make_fig=0.00s 31472 bytes
[OK] task=0995 total=1.78s to_image=1.78s make_fig=0.00s 27284 bytes
[OK] task=0999 total=1.68s to_image=1.68s make_fig=0.00s 32319 bytes
[WATCHDOG] ⚠️ 'render_total[265]' has been running for >40s (elapsed: 40.4s)
[WATCHDOG] ⚠️ 'to_image[265]' has been running for >40s (elapsed: 40.4s)
[WATCHDOG] ⚠️ 'render_total[265]' has been running for >45s (elapsed: 45.4s)
[WATCHDOG] ⚠️ 'to_image[265]' has been running for >45s (elapsed: 45.4s)
[...]
[WATCHDOG] ⚠️ 'render_total[265]' has been running for >445s (elapsed: 445.5s)
[WATCHDOG] ⚠️ 'to_image[265]' has been running for >445s (elapsed: 445.5s)
I'm not sure what the problem is exactly — whether it is multithreading or something else — but we have long-running scripts that call to_image a few thousand times (they update the image as new data appears), and they periodically get stuck with the same call stack.
Note: we encountered this problem in our real codebase; I asked Claude to write this reproducible example.
import threading
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
import plotly.graph_objects as go
import plotly.io as pio
# --- Watchdog ---
class Watchdog:
    """Background monitor that warns when a registered task runs long.

    Tasks are registered by name with a start timestamp; a daemon thread
    wakes up every ``interval`` seconds and prints one warning each time a
    task crosses another multiple of ``warn_every`` seconds of runtime.
    """

    def __init__(self, interval: float = 1.0, warn_every: float = 5.0):
        self.interval = interval
        self.warn_every = warn_every
        self._lock = threading.Lock()
        # Task name -> perf_counter() timestamp taken at registration.
        self._tasks: dict[str, float] = {}
        self._thread = threading.Thread(target=self._run, daemon=True, name="Watchdog")

    def register(self, name: str):
        """Begin tracking *name* from the current instant."""
        with self._lock:
            self._tasks[name] = time.perf_counter()

    def unregister(self, name: str):
        """Stop tracking *name*; silently ignores unknown names."""
        with self._lock:
            self._tasks.pop(name, None)

    def _run(self):
        # Highest warn_every multiple already reported, per task name.
        already_warned: dict[str, int] = {}
        while True:
            time.sleep(self.interval)
            now = time.perf_counter()
            # Copy under the lock so iteration happens without holding it.
            with self._lock:
                active = dict(self._tasks)
            for task_name, started in active.items():
                running_for = now - started
                bucket = int(running_for / self.warn_every)
                if bucket >= 1 and already_warned.get(task_name, 0) < bucket:
                    already_warned[task_name] = bucket
                    threshold = bucket * self.warn_every
                    print(
                        f" [WATCHDOG] ⚠️ '{task_name}' has been running for >{threshold:.0f}s "
                        f"(elapsed: {running_for:.1f}s)",
                        flush=True,
                    )
            # Drop bookkeeping for tasks that have since finished.
            for task_name in [t for t in already_warned if t not in active]:
                del already_warned[task_name]

    def start(self):
        """Launch the monitor thread and return self for chaining."""
        self._thread.start()
        return self
# Module-level watchdog shared by the timed context manager.
watchdog = Watchdog(interval=1.0, warn_every=5.0)
watchdog.start()
# --- Timed context manager ---
class timed:
    """Context manager that times a named task and reports it to the watchdog.

    On exit, ``self.elapsed`` holds the wall-clock duration in seconds;
    it remains 0.0 while the body is still executing.
    """

    def __init__(self, name: str):
        self.name = name
        self.start = 0.0
        self.elapsed = 0.0

    def __enter__(self):
        # Take the timestamp first, then make the task visible to the watchdog.
        self.start = time.perf_counter()
        watchdog.register(self.name)
        return self

    def __exit__(self, *_exc):
        end = time.perf_counter()
        self.elapsed = end - self.start
        watchdog.unregister(self.name)
# --- Plotly rendering ---
def make_test_fig(thread_id: int) -> go.Figure:
    """Build a small deterministic scatter figure for the given task id."""
    with timed(f"make_fig[{thread_id}]") as timer:
        fig = go.Figure(
            data=[go.Scatter(x=list(range(10)), y=[i * thread_id % 7 for i in range(10)])],
            layout=go.Layout(title=f"Thread {thread_id}"),
        )
    # Flag unusually slow figure construction (elapsed is set on exit above).
    if timer.elapsed > 1.0:
        print(f" [SLOW] make_fig[{thread_id}] took {timer.elapsed:.2f}s", flush=True)
    return fig
def render_image(thread_id: int) -> dict:
    """Render one test figure to PNG, recording timings and any error.

    Returns a dict with keys: thread_id, success, error, bytes,
    duration_make_fig, duration_to_image, duration_total. Any exception is
    captured into ``error`` rather than raised, so pool workers never die.
    """
    result = {
        "thread_id": thread_id,
        "success": False,
        "error": None,
        "bytes": 0,
        "duration_make_fig": 0.0,
        "duration_to_image": 0.0,
        "duration_total": 0.0,
    }
    with timed(f"render_total[{thread_id}]") as total:
        try:
            # BUG FIX: the original assigned total.elapsed here, but timed only
            # sets .elapsed in __exit__, so duration_make_fig was always 0.0
            # (visible in the issue output: make_fig=0.00s on every task).
            # Measure the figure-construction time directly instead.
            t0 = time.perf_counter()
            fig = make_test_fig(thread_id)
            result["duration_make_fig"] = time.perf_counter() - t0
            with timed(f"to_image[{thread_id}]") as t_img:
                img_bytes = pio.to_image(fig, format="png")
            result["duration_to_image"] = t_img.elapsed
            result["success"] = True
            result["bytes"] = len(img_bytes)
        except Exception as e:
            result["error"] = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
    # total.elapsed is populated once the with-block above exits.
    result["duration_total"] = total.elapsed
    return result
# --- Main ---
def main():
    """Fan many render_image tasks across a thread pool and print a summary."""
    n_threads = 64
    n_tasks = 1000
    print(f"Launching {n_tasks} tasks across {n_threads} workers...\n")
    wall_start = time.perf_counter()
    results = []
    with ThreadPoolExecutor(max_workers=n_threads) as pool:
        pending = {pool.submit(render_image, task_id): task_id for task_id in range(n_tasks)}
        # Report each task as it finishes, not in submission order.
        for done in as_completed(pending):
            res = done.result()
            status = "OK" if res["success"] else "FAIL"
            print(
                f" [{status}] task={res['thread_id']:04d} "
                f"total={res['duration_total']:.2f}s "
                f"to_image={res['duration_to_image']:.2f}s "
                f"make_fig={res['duration_make_fig']:.2f}s "
                f"{res['bytes']} bytes",
                flush=True,
            )
            if res["error"]:
                print(f" ERROR: {res['error'][:300]}", flush=True)
            results.append(res)
    wall_elapsed = time.perf_counter() - wall_start

    successes = [r for r in results if r["success"]]
    failures = [r for r in results if not r["success"]]
    print()
    print("=== Summary ===")
    print(f" Total tasks : {n_tasks}")
    print(f" Successes : {len(successes)}")
    print(f" Failures : {len(failures)}")
    print(f" Wall time : {wall_elapsed:.2f}s")
    if successes:
        stat_columns = [
            ("total", "duration_total"),
            ("to_image", "duration_to_image"),
            ("make_fig", "duration_make_fig"),
        ]
        for label, key in stat_columns:
            vals = [r[key] for r in successes]
            print(f" {label:10s} : min={min(vals):.2f}s max={max(vals):.2f}s avg={sum(vals)/len(vals):.2f}s")
    if failures:
        print("\n=== Errors ===")
        for r in failures:
            print(f" Task {r['thread_id']}: {r['error']}")
# Script entry point.
if __name__ == "__main__":
    main()
Metadata
Metadata
Assignees
Labels
bug — something broken