Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions examples/example.c
Original file line number Diff line number Diff line change
Expand Up @@ -904,7 +904,8 @@ main(int argc, char **argv)
options, SENTRY_CRASH_UPLOAD_MODE_ASYNC);
}

#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS)
#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS) \
|| defined(SENTRY_PLATFORM_LINUX)
if (has_arg(argc, argv, "app-hang")) {
sentry_options_set_app_hang_enabled(options, 1);
sentry_options_set_app_hang_timeout_ms(options, 1000);
Expand All @@ -922,7 +923,8 @@ main(int argc, char **argv)
return EXIT_FAILURE;
}

#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS)
#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS) \
|| defined(SENTRY_PLATFORM_LINUX)
/* app-hang: spawn the demo thread BEFORE any other post-init work so it
* begins heartbeating immediately. The thread freezes for 3x the timeout,
* giving the daemon time to detect the hang and ship the envelope. We wait
Expand Down
8 changes: 4 additions & 4 deletions include/sentry.h
Original file line number Diff line number Diff line change
Expand Up @@ -1703,14 +1703,14 @@ SENTRY_EXPERIMENTAL_API void sentry_options_set_session_replay_duration(
* Enable app-hang detection via the native crash backend.
*
* When enabled, the out-of-process daemon monitors the thread first emitting
* a heatbeat through `sentry_app_hang_heartbeat`.
* a heartbeat through `sentry_app_hang_heartbeat`.
* If the heartbeat goes stale for longer than the configured timeout, the
* daemon walks the thread's stack remotely and emits an `AppHang` event.
* The host process keeps running.
*
* Off by default. This setting only has an effect when using the `native`
* backend. The feature is supported on macOS and Windows; the call is a silent
* no-op on other platforms.
* backend. The feature is supported on Windows, macOS, and Linux; the call is a
* silent no-op on other platforms.
*/
SENTRY_EXPERIMENTAL_API void sentry_options_set_app_hang_enabled(
sentry_options_t *opts, int enabled);
Expand All @@ -1737,7 +1737,7 @@ SENTRY_EXPERIMENTAL_API void sentry_options_set_app_hang_timeout_ms(
* No-op if
* - app-hang detection is not enabled
* - the native backend is not active
* - the platform is neither macOS nor Windows
* - the platform is not Windows, macOS, or Linux
*/
SENTRY_EXPERIMENTAL_API void sentry_app_hang_heartbeat(void);

Expand Down
9 changes: 8 additions & 1 deletion src/backends/native/sentry_crash_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ typedef struct {
uint32_t module_count;
sentry_module_info_t modules[SENTRY_CRASH_MAX_MODULES];

/* App-hang detection (Windows + macOS, native backend only).
/* App-hang detection (Windows + macOS + Linux, native backend only).
*
* Sync model:
* - app_hang_enabled, app_hang_timeout_ms: written by host before daemon
Expand All @@ -345,6 +345,13 @@ typedef struct {
volatile uint64_t app_hang_target_tid;
volatile uint64_t app_hang_last_heartbeat_ms;

/* Daemon-only (Linux): set true for the duration of
* capture_and_send_app_hang so the stacktrace builder will remote-unwind the
* (live, attachable) hung thread, which is otherwise excluded as the "crashed"
* thread. Written and read only by the daemon process; it lives in shmem but
* the host never touches it. */
bool app_hang_in_progress;

} sentry_crash_context_t;

// Shared memory size: calculated at compile-time based on actual struct size
Expand Down
181 changes: 177 additions & 4 deletions src/backends/native/sentry_crash_daemon.c
Original file line number Diff line number Diff line change
Expand Up @@ -955,7 +955,10 @@ build_stacktrace_for_thread(
bool is_crashed_thread
= thread_idx == SIZE_MAX || tid == ctx->crashed_tid;

if (tid > 0 && !is_crashed_thread) {
/* For a real crash we must not ptrace-attach the crashed thread. During
* an app hang the "crashed" thread is the live, attachable hung thread,
* so allow remote unwinding of it. */
if (tid > 0 && (!is_crashed_thread || ctx->app_hang_in_progress)) {
sentry_remote_registers_t registers = { 0 };
sentry_remote_frame_t *remote_frames
= sentry_malloc(sizeof(*remote_frames) * MAX_STACK_FRAMES);
Expand Down Expand Up @@ -3573,7 +3576,177 @@ capture_and_send_app_hang(const sentry_options_t *options,
sentry__path_free(env_path);
}
}
#endif /* SENTRY_PLATFORM_WINDOWS / SENTRY_PLATFORM_MACOS */

#elif defined(SENTRY_PLATFORM_LINUX)

/**
 * Linux app-hang capture: turn a frozen-but-alive host thread into an
 * `AppHang` event and hand it to the transport.
 *
 * The daemon samples the hung thread out-of-process with the same
 * ptrace-based remote unwinder the crash path uses for non-crashed threads.
 * To reuse the existing event/envelope builders, the hung thread is written
 * into the shared context as if it were the crashed thread, and
 * `app_hang_in_progress` is raised for the duration of the capture so the
 * stacktrace builder does not exclude it.
 *
 * The daemon must be allowed to ptrace the host; per the original author the
 * host issues prctl(PR_SET_PTRACER, daemon_pid) at startup. When no frames
 * can be captured (daemon built without remote libunwind, or the attach
 * fails) nothing is sent.
 *
 * @param options   Daemon-side options; supplies the transport and run state.
 * @param ipc       IPC handle whose `shmem` is the shared crash context.
 * @param freeze_ms Time since the last heartbeat at detection; it is embedded
 *                  in the exception value and the event message.
 */
static void
capture_and_send_app_hang(const sentry_options_t *options,
    sentry_crash_ipc_t *ipc, uint64_t freeze_ms)
{
    /* NOTE (race, experimental first cut): as on macOS/Windows, the fields
     * mutated below are the same shmem fields the host's signal handler
     * writes on a real crash. The daemon loop is single-threaded and handles
     * a pending crash before getting here, so the remaining window is the
     * host crashing while this capture is running. */
    sentry_crash_context_t *ctx = ipc->shmem;
    const uint64_t target_tid = ctx->app_hang_target_tid;

    /* No signal handler ran for a hang, so the module list has to be filled
     * from /proc/<pid>/maps here; debug_meta and frame enrichment depend on
     * it. */
    capture_modules_from_proc_maps(ctx);

    /* Single-thread report (matching macOS/Windows): the hung thread stands
     * in for the crashed thread. */
    ctx->crashed_tid = (pid_t)target_tid;
    ctx->platform.num_threads = 1;
    ctx->platform.threads[0].tid = (pid_t)target_tid;
    memset(&ctx->platform.threads[0].context, 0,
        sizeof(ctx->platform.threads[0].context));
    /* The process-level context is what the exception stacktrace
     * (thread_idx == SIZE_MAX) reads. Leaving it zeroed means a failed remote
     * unwind keeps ip at 0, build_stacktrace_for_thread returns null, and the
     * event is skipped below. */
    memset(&ctx->platform.context, 0, sizeof(ctx->platform.context));

    /* Best-effort thread name from /proc/<pid>/task/<tid>/comm; the name
     * stays empty when the file cannot be opened or read. */
    ctx->platform.threads[0].name[0] = '\0';
    char comm_path[64];
    snprintf(comm_path, sizeof(comm_path), "/proc/%d/task/%d/comm",
        (int)ctx->crashed_pid, (int)target_tid);
    FILE *comm_stream = fopen(comm_path, "r");
    if (comm_stream != NULL) {
        const bool got_name
            = fgets(ctx->platform.threads[0].name,
                  sizeof(ctx->platform.threads[0].name), comm_stream)
            != NULL;
        fclose(comm_stream);
        if (got_name) {
            /* comm ends with a newline; drop it. */
            const size_t name_len = strlen(ctx->platform.threads[0].name);
            if (name_len > 0
                && ctx->platform.threads[0].name[name_len - 1] == '\n') {
                ctx->platform.threads[0].name[name_len - 1] = '\0';
            }
        }
    }

    /* Per-event description. `freeze_ms` is measured at detection and is
     * therefore at least the configured timeout, hence "at least". */
    char description[128];
    snprintf(description, sizeof(description),
        "App hung for at least %llu ms.", (unsigned long long)freeze_ms);

    /* The host keeps the scope file current; reusing it gives the app-hang
     * event the same scope/breadcrumbs/attachments as a crash event. */
    const char *event_file_path = NULL;
    sentry_path_t *run_folder = NULL;
    if (ctx->event_path[0]) {
        event_file_path = ctx->event_path;
        sentry_path_t *event_file = sentry__path_from_str(event_file_path);
        if (event_file) {
            run_folder = sentry__path_dir(event_file);
            sentry__path_free(event_file);
        }
    }

    /* From here until the flag is cleared, the Linux stacktrace builder may
     * remote-unwind the (live) hung thread. */
    ctx->app_hang_in_progress = true;

    /* Handled, error-level AppHang exception; building its stacktrace is what
     * triggers the remote unwind of the hung thread. */
    sentry_value_t event = build_native_event(ctx, event_file_path, run_folder,
        /*exception_type=*/"AppHang",
        /*exception_value=*/description, /*level=*/"error",
        /*mechanism_type=*/"AppHang", /*handled=*/true);

    /* Until the envelope writer takes it, `event` is owned here and must be
     * released on every bail-out. The get_by_key / get_by_index results are
     * borrowed references and must not be decref'd. */
    sentry_value_t exception = sentry_value_get_by_key(event, "exception");
    sentry_value_t exception_values
        = sentry_value_get_by_key(exception, "values");
    sentry_value_t first_exception
        = sentry_value_get_by_index(exception_values, 0);
    sentry_value_t stacktrace
        = sentry_value_get_by_key(first_exception, "stacktrace");

    char envelope_path[SENTRY_CRASH_MAX_PATH];
    bool ready_to_write = false;
    if (sentry_value_is_null(stacktrace)) {
        /* No frames: daemon built without remote libunwind, or the ptrace
         * attach failed. */
        SENTRY_DEBUG("app-hang: no frames captured, skipping event");
    } else {
        /* Also expose the freeze duration as the event message so the issue
         * title reads "App hung for at least X ms." and not just the type. */
        sentry_value_set_by_key(
            event, "message", sentry_value_new_string(description));

        const int path_len = snprintf(envelope_path, sizeof(envelope_path),
            "%s/sentry-app-hang-%lu-%llu.env", ctx->database_path,
            (unsigned long)ctx->crashed_pid,
            (unsigned long long)ctx->app_hang_last_heartbeat_ms);
        if (path_len < 0 || path_len >= (int)sizeof(envelope_path)) {
            SENTRY_WARN("app-hang: envelope path truncated or invalid");
        } else {
            ready_to_write = true;
        }
    }

    if (!ready_to_write) {
        sentry_value_decref(event);
        ctx->app_hang_in_progress = false;
        if (run_folder) {
            sentry__path_free(run_folder);
        }
        return;
    }

    /* Ownership of `event` moves to write_envelope_with_native_stacktrace,
     * which decrefs it internally; it must not be released again here. */
    const bool envelope_written = write_envelope_with_native_stacktrace(
        options, envelope_path, ctx, event, /*minidump_path=*/NULL, run_folder);

    if (run_folder) {
        sentry__path_free(run_folder);
    }
    ctx->app_hang_in_progress = false;

    if (!envelope_written) {
        SENTRY_WARN("app-hang: failed to write envelope");
        return;
    }

    /* Pull the latest user consent from shmem into the run state before
     * sending (as the crash and macOS app-hang paths do) so a revoke/grant is
     * honored.
     * NOTE(review): `options` is dereferenced here without a null check while
     * the send below guards on `options &&`; one of the two is redundant —
     * confirm whether `options` can ever be NULL on this path. */
    if (options->run) {
        sentry__atomic_store(&options->run->user_consent,
            sentry__atomic_fetch(&ctx->user_consent));
    }

    /* Load the envelope back from disk and pass it to the transport. */
    sentry_path_t *env_path = sentry__path_from_str(envelope_path);
    if (!env_path) {
        return;
    }

    sentry_envelope_t *envelope = sentry__envelope_from_path(env_path);
    if (envelope) {
        if (options && options->transport && options->run) {
            sentry__capture_envelope(options->transport, envelope, options);
        } else {
            /* No transport/run available: capture would not free it. */
            sentry_envelope_free(envelope);
        }
    }
    sentry__path_remove(env_path);
    sentry__path_free(env_path);
}
#endif /* SENTRY_PLATFORM_WINDOWS / SENTRY_PLATFORM_MACOS / SENTRY_PLATFORM_LINUX */

/**
* Manually write a Sentry envelope with event, minidump, and attachments.
Expand Down Expand Up @@ -4642,7 +4815,7 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle,
CloseHandle(timer);
}
#else
# if defined(SENTRY_PLATFORM_MACOS)
# if defined(SENTRY_PLATFORM_MACOS) || defined(SENTRY_PLATFORM_LINUX)
/* App-hang detector state. Daemon-local; the timeout is cached here so it
* does not race the host on subsequent shmem mutations. When enabled, the
* loop polls on a short cadence (so it can evaluate the heartbeat each
Expand Down Expand Up @@ -4708,7 +4881,7 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle,
// If crash already processed, just ignore spurious notifications
SENTRY_DEBUG("Spurious notification or already processed");
}
# if defined(SENTRY_PLATFORM_MACOS)
# if defined(SENTRY_PLATFORM_MACOS) || defined(SENTRY_PLATFORM_LINUX)
else if (app_hang_enabled && !crash_processed) {
/* No crash notification this wake (timeout or spurious) — evaluate
* the app-hang heartbeat. */
Expand Down
51 changes: 51 additions & 0 deletions src/sentry_app_hang.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
# include <pthread.h>
# include <stdatomic.h>
# include <time.h>
# elif defined(SENTRY_PLATFORM_LINUX)
# include <stdatomic.h>
# include <sys/syscall.h>
# include <time.h>
# include <unistd.h>
# endif
#endif

Expand Down Expand Up @@ -178,6 +183,52 @@ app_hang_record_heartbeat(sentry_crash_context_t *ctx)
ctx->app_hang_last_heartbeat_ms = sentry__app_hang_now_ms();
}

# elif defined(SENTRY_PLATFORM_LINUX)

/* Current CLOCK_MONOTONIC reading in milliseconds, or 0 if the clock cannot
 * be read. */
uint64_t
sentry__app_hang_now_ms(void)
{
    /* CLOCK_MONOTONIC does not advance while the system is suspended (in line
     * with macOS CLOCK_UPTIME_RAW and the Windows unbiased clock), and host
     * and daemon observe the same clock because they run on one machine. */
    struct timespec now;
    if (clock_gettime(CLOCK_MONOTONIC, &now) == 0) {
        const uint64_t whole_seconds_ms = (uint64_t)now.tv_sec * 1000ULL;
        const uint64_t fraction_ms = (uint64_t)now.tv_nsec / 1000000ULL;
        return whole_seconds_ms + fraction_ms;
    }
    return 0;
}

/* Record a heartbeat from the calling thread into the shared crash context.
 * The first thread to heartbeat is latched as the monitored target; only that
 * thread's heartbeats refresh the timestamp afterwards. */
static void
app_hang_record_heartbeat(sentry_crash_context_t *ctx)
{
    /* Kernel thread id via the raw syscall, so no libc gettid() wrapper is
     * needed. This is the id the daemon matches when it enumerates
     * /proc/<pid>/task and ptrace-attaches. */
    const uint64_t self_tid = (uint64_t)syscall(SYS_gettid);
    if (self_tid != 0) {
        /* First heartbeat self-registers: install our TID in the latch only
         * while it is still 0, so the first heartbeating thread wins. The
         * field is declared `volatile uint64_t` in shmem and is viewed as an
         * atomic just for this compare-exchange. */
        uint64_t unset = 0;
        atomic_compare_exchange_strong(
            (_Atomic uint64_t *)(void *)&ctx->app_hang_target_tid, &unset,
            self_tid);

        /* Only the latched thread may refresh the timestamp; a stray
         * heartbeat from another thread must not hide a frozen target. */
        if (ctx->app_hang_target_tid == self_tid) {
            /* Relaxed 64-bit store; aligned on a 64-bit target so it is
             * atomic and cannot tear. The daemon reads it with a relaxed
             * load. */
            ctx->app_hang_last_heartbeat_ms = sentry__app_hang_now_ms();
        }
    }
}

# endif

void
Expand Down
6 changes: 3 additions & 3 deletions src/sentry_app_hang.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
#include <stdint.h>

/* The host-side heartbeat machinery (clock, latch, shmem registration) is
* available on the native backend on Windows (non-Xbox) and macOS. Linux and
* other targets fall back to no-op stubs. */
* available on the native backend on Windows (non-Xbox), macOS, and Linux.
* Android and other targets fall back to no-op stubs. */
#if (((defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX)) \
|| defined(SENTRY_PLATFORM_MACOS))) \
|| defined(SENTRY_PLATFORM_MACOS) || defined(SENTRY_PLATFORM_LINUX))) \
&& defined(SENTRY_BACKEND_NATIVE)
# define SENTRY_APP_HANG_HOST_SUPPORTED 1
#endif
Expand Down
4 changes: 2 additions & 2 deletions tests/test_integration_native.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,8 +1142,8 @@ def test_native_restart_on_crash(cmake, httpserver):


@pytest.mark.skipif(
sys.platform not in ("win32", "darwin"),
reason="app-hang detection is implemented on Windows and macOS",
sys.platform not in ("win32", "darwin", "linux"),
reason="app-hang detection is implemented on Windows, macOS, and Linux",
)
def test_native_app_hang(cmake, httpserver):
"""App hang detection emits exactly one AppHang event.
Expand Down
Loading