diff --git a/ddprof-lib/src/main/cpp/arguments.cpp b/ddprof-lib/src/main/cpp/arguments.cpp index 72b8aec22..9ea35588c 100644 --- a/ddprof-lib/src/main/cpp/arguments.cpp +++ b/ddprof-lib/src/main/cpp/arguments.cpp @@ -374,6 +374,12 @@ Error Arguments::parse(const char *args) { } } + CASE("nativemem") + _nativemem = value == NULL ? 0 : parseUnits(value, BYTES); + if (_nativemem < 0) { + msg = "nativemem must be >= 0"; + } + DEFAULT() if (_unknown_arg == NULL) _unknown_arg = arg; @@ -385,7 +391,7 @@ Error Arguments::parse(const char *args) { return Error(msg); } - if (_event == NULL && _cpu < 0 && _wall < 0 && _memory < 0) { + if (_event == NULL && _cpu < 0 && _wall < 0 && _memory < 0 && _nativemem < 0) { _event = EVENT_CPU; } diff --git a/ddprof-lib/src/main/cpp/arguments.h b/ddprof-lib/src/main/cpp/arguments.h index 462be5b53..96a5b9400 100644 --- a/ddprof-lib/src/main/cpp/arguments.h +++ b/ddprof-lib/src/main/cpp/arguments.h @@ -1,5 +1,6 @@ /* * Copyright 2017 Andrei Pangin + * Copyright 2026, Datadog, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -174,6 +175,7 @@ class Arguments { double _live_samples_ratio; bool _record_heap_usage; bool _gc_generations; + long _nativemem; int _jstackdepth; int _safe_mode; StackWalkFeatures _features; @@ -209,6 +211,7 @@ class Arguments { _live_samples_ratio(0.1), // default to liveness-tracking 10% of the allocation samples _record_heap_usage(false), _gc_generations(false), + _nativemem(-1), _jstackdepth(DEFAULT_JSTACKDEPTH), _safe_mode(0), _features{1, 1, 1, 1, 1, 1}, diff --git a/ddprof-lib/src/main/cpp/codeCache.cpp b/ddprof-lib/src/main/cpp/codeCache.cpp index 9ce154687..a3c6e29d1 100644 --- a/ddprof-lib/src/main/cpp/codeCache.cpp +++ b/ddprof-lib/src/main/cpp/codeCache.cpp @@ -1,5 +1,6 @@ /* * Copyright The async-profiler authors + * Copyright 2026, Datadog, Inc. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -308,6 +309,11 @@ void CodeCache::saveImport(ImportId id, void** entry) { void CodeCache::addImport(void **entry, const char *name) { switch (name[0]) { + case 'a': + if (strcmp(name, "aligned_alloc") == 0) { + saveImport(im_aligned_alloc, entry); + } + break; case 'c': if (strcmp(name, "calloc") == 0) { saveImport(im_calloc, entry); @@ -337,6 +343,8 @@ void CodeCache::addImport(void **entry, const char *name) { saveImport(im_pthread_setspecific, entry); } else if (strcmp(name, "poll") == 0) { saveImport(im_poll, entry); + } else if (strcmp(name, "posix_memalign") == 0) { + saveImport(im_posix_memalign, entry); } break; case 'r': diff --git a/ddprof-lib/src/main/cpp/codeCache.h b/ddprof-lib/src/main/cpp/codeCache.h index 170ac22a3..5c9a5b155 100644 --- a/ddprof-lib/src/main/cpp/codeCache.h +++ b/ddprof-lib/src/main/cpp/codeCache.h @@ -1,5 +1,6 @@ /* * Copyright The async-profiler authors + * Copyright 2026, Datadog, Inc. * SPDX-License-Identifier: Apache-2.0 */ @@ -34,6 +35,8 @@ enum ImportId { im_calloc, im_realloc, im_free, + im_posix_memalign, + im_aligned_alloc, im_sigaction, NUM_IMPORTS }; diff --git a/ddprof-lib/src/main/cpp/event.h b/ddprof-lib/src/main/cpp/event.h index e9363165f..752db842d 100644 --- a/ddprof-lib/src/main/cpp/event.h +++ b/ddprof-lib/src/main/cpp/event.h @@ -1,5 +1,6 @@ /* * Copyright 2020 Andrei Pangin + * Copyright 2026, Datadog, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -88,6 +89,16 @@ class ObjectLivenessEvent : public Event { Context _ctx; }; +class MallocEvent : public Event { +public: + u64 _start_time; + uintptr_t _address; + u64 _size; + float _weight; + + MallocEvent() : Event(), _start_time(0), _address(0), _size(0), _weight(1.0f) {} +}; + class WallClockEpochEvent { public: bool _dirty; diff --git a/ddprof-lib/src/main/cpp/flightRecorder.cpp b/ddprof-lib/src/main/cpp/flightRecorder.cpp index eeaa3f32d..7472ed1c5 100644 --- a/ddprof-lib/src/main/cpp/flightRecorder.cpp +++ b/ddprof-lib/src/main/cpp/flightRecorder.cpp @@ -906,6 +906,12 @@ void Recording::writeSettings(Buffer *buf, Arguments &args) { writeBoolSetting(buf, T_ALLOC, "enabled", args._record_allocations); writeBoolSetting(buf, T_HEAP_LIVE_OBJECT, "enabled", args._record_liveness); + writeBoolSetting(buf, T_MALLOC, "enabled", args._nativemem >= 0); + if (args._nativemem >= 0) { + writeIntSetting(buf, T_MALLOC, "nativemem", args._nativemem); + // samplingInterval=-1 means every allocation is recorded (nativemem=0). + writeIntSetting(buf, T_MALLOC, "samplingInterval", args._nativemem == 0 ? 
-1 : args._nativemem); + } writeBoolSetting(buf, T_ACTIVE_RECORDING, "debugSymbols", VMStructs::libjvm()->hasDebugSymbols()); @@ -1575,6 +1581,21 @@ void Recording::recordAllocation(RecordingBuffer *buf, int tid, flushIfNeeded(buf); } +void Recording::recordMallocSample(Buffer *buf, int tid, u64 call_trace_id, + MallocEvent *event) { + int start = buf->skip(1); + buf->putVar64(T_MALLOC); + buf->putVar64(event->_start_time); + buf->putVar64(tid); + buf->putVar64(call_trace_id); + buf->putVar64(event->_address); + buf->putVar64(event->_size); + buf->putFloat(event->_weight); + writeCurrentContext(buf); + writeEventSizePrefix(buf, start); + flushIfNeeded(buf); +} + void Recording::recordHeapLiveObject(Buffer *buf, int tid, u64 call_trace_id, ObjectLivenessEvent *event) { int start = buf->skip(1); @@ -1817,6 +1838,9 @@ void FlightRecorder::recordEvent(int lock_index, int tid, u64 call_trace_id, case BCI_PARK: rec->recordThreadPark(buf, tid, call_trace_id, (LockEvent *)event); break; + case BCI_NATIVE_MALLOC: + rec->recordMallocSample(buf, tid, call_trace_id, (MallocEvent *)event); + break; } rec->flushIfNeeded(buf); rec->addThread(lock_index, tid); diff --git a/ddprof-lib/src/main/cpp/flightRecorder.h b/ddprof-lib/src/main/cpp/flightRecorder.h index 02efdebc0..e9aa3cde1 100644 --- a/ddprof-lib/src/main/cpp/flightRecorder.h +++ b/ddprof-lib/src/main/cpp/flightRecorder.h @@ -281,6 +281,8 @@ class Recording { void recordQueueTime(Buffer *buf, int tid, QueueTimeEvent *event); void recordAllocation(RecordingBuffer *buf, int tid, u64 call_trace_id, AllocEvent *event); + void recordMallocSample(Buffer *buf, int tid, u64 call_trace_id, + MallocEvent *event); void recordHeapLiveObject(Buffer *buf, int tid, u64 call_trace_id, ObjectLivenessEvent *event); void recordMonitorBlocked(Buffer *buf, int tid, u64 call_trace_id, diff --git a/ddprof-lib/src/main/cpp/hotspot/hotspotSupport.cpp b/ddprof-lib/src/main/cpp/hotspot/hotspotSupport.cpp index f1aa809f3..7e60be921 100644 --- 
a/ddprof-lib/src/main/cpp/hotspot/hotspotSupport.cpp +++ b/ddprof-lib/src/main/cpp/hotspot/hotspotSupport.cpp @@ -66,6 +66,8 @@ inline EventType eventTypeFromBCI(jint bci_type) { return LOCK_SAMPLE; case BCI_PARK: return PARK_SAMPLE; + case BCI_NATIVE_MALLOC: + return MALLOC_SAMPLE; default: // For unknown or invalid BCI types, default to EXECUTION_SAMPLE // This maintains backward compatibility and prevents undefined behavior @@ -756,7 +758,7 @@ int HotspotSupport::getJavaTraceAsync(void *ucontext, ASGCT_CallFrame *frames, } HotspotStackFrame frame(ucontext); - uintptr_t saved_pc, saved_sp, saved_fp; + uintptr_t saved_pc = 0, saved_sp = 0, saved_fp = 0; if (ucontext != NULL) { saved_pc = frame.pc(); saved_sp = frame.sp(); @@ -780,9 +782,13 @@ int HotspotSupport::getJavaTraceAsync(void *ucontext, ASGCT_CallFrame *frames, } return 1; } - } else { - return 0; } + // Ported from upstream async-profiler (Profiler::getJavaTraceAsync in + // src/profiler.cpp): when ucontext is NULL — as it is for malloc hooks, + // which run outside any signal context — skip the PC-dependent pre-checks + // and fall through to ASGCT. ASGCT then resolves the top Java frame from + // JavaThread::last_Java_sp / last_Java_pc, which the JVM populates on every + // Java → native transition. 
JVMJavaThreadState state = vm_thread->state(); bool in_java = (state == _thread_in_Java || state == _thread_in_Java_trans); @@ -955,7 +961,7 @@ int HotspotSupport::walkJavaStack(StackWalkRequest& request) { int java_frames = 0; if (features.mixed) { java_frames = walkVM(ucontext, frames, max_depth, features, eventTypeFromBCI(request.event_type), lock_index, truncated); - } else if (request.event_type == BCI_CPU || request.event_type == BCI_WALL) { + } else if (request.event_type == BCI_CPU || request.event_type == BCI_WALL || request.event_type == BCI_NATIVE_MALLOC) { if (cstack >= CSTACK_VM) { java_frames = walkVM(ucontext, frames, max_depth, features, eventTypeFromBCI(request.event_type), lock_index, truncated); } else { diff --git a/ddprof-lib/src/main/cpp/jfrMetadata.cpp b/ddprof-lib/src/main/cpp/jfrMetadata.cpp index 127f99c87..54e0f6a15 100644 --- a/ddprof-lib/src/main/cpp/jfrMetadata.cpp +++ b/ddprof-lib/src/main/cpp/jfrMetadata.cpp @@ -1,5 +1,6 @@ /* * Copyright 2020 Andrei Pangin + * Copyright 2026, Datadog, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -298,6 +299,18 @@ void JfrMetadata::initialize( << field("name", T_STRING, "Name") << field("count", T_LONG, "Count")) + << (type("profiler.Malloc", T_MALLOC, "malloc") + << category("Java Virtual Machine", "Native Memory") + << field("startTime", T_LONG, "Start Time", F_TIME_TICKS) + << field("eventThread", T_THREAD, "Event Thread", F_CPOOL) + << field("stackTrace", T_STACK_TRACE, "Stack Trace", F_CPOOL) + << field("address", T_LONG, "Address", F_ADDRESS) + << field("size", T_LONG, "Size", F_BYTES) + << field("weight", T_FLOAT, "Sample weight") + << field("spanId", T_LONG, "Span ID") + << field("localRootSpanId", T_LONG, "Local Root Span ID") || + contextAttributes) + << (type("jdk.OSInformation", T_OS_INFORMATION, "OS Information") << category("Operating System") << field("startTime", T_LONG, "Start Time", F_TIME_TICKS) diff --git a/ddprof-lib/src/main/cpp/jfrMetadata.h b/ddprof-lib/src/main/cpp/jfrMetadata.h index b6cfc054a..52c2e0ae8 100644 --- a/ddprof-lib/src/main/cpp/jfrMetadata.h +++ b/ddprof-lib/src/main/cpp/jfrMetadata.h @@ -1,5 +1,6 @@ /* * Copyright 2020 Andrei Pangin + * Copyright 2026, Datadog, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -78,6 +79,7 @@ enum JfrType { T_DATADOG_CLASSREF_CACHE = 124, T_DATADOG_COUNTER = 125, T_UNWIND_FAILURE = 126, + T_MALLOC = 127, T_ANNOTATION = 200, T_LABEL = 201, T_CATEGORY = 202, diff --git a/ddprof-lib/src/main/cpp/jvmSupport.cpp b/ddprof-lib/src/main/cpp/jvmSupport.cpp index 661eca786..6e3f5bc3a 100644 --- a/ddprof-lib/src/main/cpp/jvmSupport.cpp +++ b/ddprof-lib/src/main/cpp/jvmSupport.cpp @@ -20,7 +20,9 @@ int JVMSupport::walkJavaStack(StackWalkRequest& request) { if (VM::isHotspot()) { return HotspotSupport::walkJavaStack(request); } else if (VM::isOpenJ9() || VM::isZing()) { - assert(request.event_type == BCI_CPU || request.event_type == BCI_WALL); + assert(request.event_type == BCI_CPU || + request.event_type == BCI_WALL || + request.event_type == BCI_NATIVE_MALLOC); return asyncGetCallTrace(request.frames, request.max_depth, request.ucontext); } assert(false && "Unsupported JVM"); diff --git a/ddprof-lib/src/main/cpp/mallocTracer.cpp b/ddprof-lib/src/main/cpp/mallocTracer.cpp new file mode 100644 index 000000000..9974804d3 --- /dev/null +++ b/ddprof-lib/src/main/cpp/mallocTracer.cpp @@ -0,0 +1,403 @@ +/* + * Copyright The async-profiler authors + * Copyright 2026, Datadog, Inc. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include <math.h> +#include <pthread.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "codeCache.h" +#include "libraries.h" +#include "mallocTracer.h" +#include "os.h" +#include "pidController.h" +#include "profiler.h" +#include "symbols.h" +#include "tsc.h" +#include "vmEntry.h" + +#ifdef __clang__ +# define NO_OPTIMIZE __attribute__((optnone)) +#else +# define NO_OPTIMIZE __attribute__((optimize("-fno-omit-frame-pointer,-fno-optimize-sibling-calls"))) +#endif + +#define SAVE_IMPORT(FUNC) \ + do { \ + void** _entry = lib->findImport(im_##FUNC); \ + if (_entry != NULL) _orig_##FUNC = (decltype(_orig_##FUNC))*_entry; \ + } while (0) + +static void* (*_orig_malloc)(size_t); +static void (*_orig_free)(void*); +static void* (*_orig_calloc)(size_t, size_t); +static void* (*_orig_realloc)(void*, size_t); +static int (*_orig_posix_memalign)(void**, size_t, size_t); +static void* (*_orig_aligned_alloc)(size_t, size_t); + +// Inline helper to avoid repeating the running+ret+size guard in each hook. +static inline void maybeRecord(void* ret, size_t size) { + if (MallocTracer::running() && ret && size) { + MallocTracer::recordMalloc(ret, size); + } +} + +extern "C" void* malloc_hook(size_t size) { + void* ret = _orig_malloc(size); + maybeRecord(ret, size); + return ret; +} + +extern "C" void* calloc_hook(size_t num, size_t size) { + void* ret = _orig_calloc(num, size); + // ret != NULL guarantees no overflow per POSIX, so num * size is safe.
+ if (MallocTracer::running() && ret && num && size) { + MallocTracer::recordMalloc(ret, num * size); + } + return ret; +} + +// Make sure this is not optimized away (function-scoped -fno-optimize-sibling-calls) +extern "C" NO_OPTIMIZE +void* calloc_hook_dummy(size_t num, size_t size) { + return _orig_calloc(num, size); +} + +extern "C" void* realloc_hook(void* addr, size_t size) { + void* ret = _orig_realloc(addr, size); + if (MallocTracer::running() && ret != NULL && size > 0) { + MallocTracer::recordMalloc(ret, size); + } + return ret; +} + +extern "C" int posix_memalign_hook(void** memptr, size_t alignment, size_t size) { + int ret = _orig_posix_memalign(memptr, alignment, size); + if (MallocTracer::running() && ret == 0 && memptr && *memptr && size) { + MallocTracer::recordMalloc(*memptr, size); + } + return ret; +} + +// Make sure this is not optimized away (function-scoped -fno-optimize-sibling-calls) +extern "C" NO_OPTIMIZE +int posix_memalign_hook_dummy(void** memptr, size_t alignment, size_t size) { + return _orig_posix_memalign(memptr, alignment, size); +} + +extern "C" void* aligned_alloc_hook(size_t alignment, size_t size) { + void* ret = _orig_aligned_alloc(alignment, size); + maybeRecord(ret, size); + return ret; +} + +volatile u64 MallocTracer::_interval; +volatile u64 MallocTracer::_bytes_until_sample; +u64 MallocTracer::_configured_interval; +volatile u64 MallocTracer::_sample_count; +volatile u64 MallocTracer::_last_config_update_ts; +volatile bool MallocTracer::_running = false; +PidController MallocTracer::_pid(MallocTracer::TARGET_SAMPLES_PER_WINDOW, + 31, 511, 3, MallocTracer::CONFIG_UPDATE_CHECK_PERIOD_SECS, 15); + +Mutex MallocHooker::_patch_lock; +int MallocHooker::_patched_libs = 0; +bool MallocHooker::_initialized = false; +void* MallocHooker::_calloc_hook_fn = nullptr; +void* MallocHooker::_posix_memalign_hook_fn = nullptr; + +// xoroshiro128+ PRNG state — shared, relaxed atomics. 
+// Benign races are acceptable: occasional duplicate output is harmless +// for a sampling PRNG and thread_local cannot be used on the malloc path. +static u64 _xo_state[2]; + +static pthread_t _current_thread; +static volatile bool _nested_malloc = false; +static volatile bool _nested_posix_memalign = false; + +// Test if calloc() implementation calls malloc() +static void* nested_malloc_hook(size_t size) { + if (pthread_self() == _current_thread) { + _nested_malloc = true; + } + return _orig_malloc(size); +} + +// Test if posix_memalign() implementation calls aligned_alloc() +static void* nested_aligned_alloc_hook(size_t alignment, size_t size) { + if (pthread_self() == _current_thread) { + _nested_posix_memalign = true; + } + return _orig_aligned_alloc(alignment, size); +} + +// In some implementations, specifically on musl, calloc() calls malloc() internally, +// and posix_memalign() calls aligned_alloc(). Detect such cases to prevent double-accounting. +void MallocHooker::detectNestedMalloc() { + if (_orig_malloc != NULL && _orig_calloc != NULL) { + CodeCache* libc = Libraries::instance()->findLibraryByAddress((void*)_orig_calloc); + if (libc != NULL) { + UnloadProtection handle(libc); + if (handle.isValid()) { + libc->patchImport(im_malloc, (void*)nested_malloc_hook); + + _current_thread = pthread_self(); + free(_orig_calloc(1, 1)); + _current_thread = pthread_t(0); + + // Restore original malloc so libc doesn't carry the probe hook until patchLibraries() runs. 
+ libc->patchImport(im_malloc, (void*)_orig_malloc); + } + } + } + + if (_orig_posix_memalign != NULL && _orig_aligned_alloc != NULL) { + CodeCache* libc = Libraries::instance()->findLibraryByAddress((void*)_orig_posix_memalign); + if (libc != NULL) { + UnloadProtection handle(libc); + if (handle.isValid()) { + libc->patchImport(im_aligned_alloc, (void*)nested_aligned_alloc_hook); + + _current_thread = pthread_self(); + void* pm_probe = NULL; + _orig_posix_memalign(&pm_probe, sizeof(void*), sizeof(void*)); + _current_thread = pthread_t(0); + if (pm_probe != NULL) _orig_free(pm_probe); + + // Restore original aligned_alloc so libc doesn't carry the probe hook. + libc->patchImport(im_aligned_alloc, (void*)_orig_aligned_alloc); + } + } + } +} + +// Call each intercepted function at least once to ensure its GOT entry is updated +static void resolveMallocSymbols() { + static volatile intptr_t sink; + + void* p0 = malloc(1); + void* p1 = realloc(p0, 2); + if (p1 == NULL) { + // realloc failed; p0 is still valid and must be freed explicitly. + free(p0); + } + void* p2 = calloc(1, 1); + void* p3 = aligned_alloc(sizeof(void*), sizeof(void*)); + void* p4 = NULL; + if (posix_memalign(&p4, sizeof(void*), sizeof(void*)) == 0) free(p4); + free(p3); + free(p2); + free(p1); + + sink = (intptr_t)p1 + (intptr_t)p2 + (intptr_t)p3 + (intptr_t)p4; +} + +// Seed xoroshiro128+ state from a 64-bit value using splitmix64. 
+static void splitmix64_seed(u64 seed) { + seed += 0x9e3779b97f4a7c15ULL; + seed = (seed ^ (seed >> 30)) * 0xbf58476d1ce4e5b9ULL; + seed = (seed ^ (seed >> 27)) * 0x94d049bb133111ebULL; + __atomic_store_n(&_xo_state[0], seed ^ (seed >> 31), __ATOMIC_RELAXED); + seed += 0x9e3779b97f4a7c15ULL; + seed = (seed ^ (seed >> 30)) * 0xbf58476d1ce4e5b9ULL; + seed = (seed ^ (seed >> 27)) * 0x94d049bb133111ebULL; + __atomic_store_n(&_xo_state[1], seed ^ (seed >> 31), __ATOMIC_RELAXED); +} + +bool MallocHooker::initialize() { + if (_initialized) return _orig_malloc != NULL; + + CodeCache* lib = Libraries::instance()->findLibraryByAddress((void*)MallocTracer::recordMalloc); + if (lib == NULL) { + _initialized = true; + return false; + } + + resolveMallocSymbols(); + + SAVE_IMPORT(malloc); + SAVE_IMPORT(free); + SAVE_IMPORT(calloc); + SAVE_IMPORT(realloc); + SAVE_IMPORT(posix_memalign); + SAVE_IMPORT(aligned_alloc); + + detectNestedMalloc(); + + // Pre-compute hook pointers so patchLibraries() avoids repeated conditionals. + _calloc_hook_fn = _nested_malloc ? (void*)calloc_hook_dummy : (void*)calloc_hook; + _posix_memalign_hook_fn = _nested_posix_memalign ? (void*)posix_memalign_hook_dummy : (void*)posix_memalign_hook; + + lib->mark( + [](const char* s) -> bool { + return strcmp(s, "malloc_hook") == 0 + || strcmp(s, "calloc_hook") == 0 + || strcmp(s, "calloc_hook_dummy") == 0 + || strcmp(s, "realloc_hook") == 0 + || strcmp(s, "posix_memalign_hook") == 0 + || strcmp(s, "posix_memalign_hook_dummy") == 0 + || strcmp(s, "aligned_alloc_hook") == 0; + }, + MARK_ASYNC_PROFILER); + + splitmix64_seed(TSC::ticks()); + _initialized = true; + return _orig_malloc != NULL; +} + +// To avoid complexity in hooking and tracking reentrancy, a TLS-based approach is not used. +// Reentrant allocation calls would result in double-accounting. 
However, this does not impact +// the leak detector, as it correctly tracks memory as freed regardless of how many times +// recordMalloc is called with the same address. +void MallocHooker::patchLibraries() { + // If initialize() hasn't resolved _orig_malloc yet, advancing _patched_libs here + // would consume library slots without patching them, causing a later real call + // (from MallocTracer::start) to find _patched_libs == native_lib_count and skip + // all libraries. This happens when dlopen_hook fires during a non-nativemem session. + if (_orig_malloc == NULL) return; + + MutexLocker ml(_patch_lock); + + const CodeCacheArray& native_libs = Libraries::instance()->native_libs(); + int native_lib_count = native_libs.count(); + + // _patched_libs is intentionally monotonic: hooks are permanent and cannot be + // uninstalled safely (library unloading races). On profiler restart, only + // newly-loaded libraries need patching. + TEST_LOG("MallocHooker::patchLibraries: _patched_libs=%d native_lib_count=%d _orig_malloc=%p", + _patched_libs, native_lib_count, (void*)_orig_malloc); + while (_patched_libs < native_lib_count) { + CodeCache* cc = native_libs[_patched_libs++]; + + UnloadProtection handle(cc); + if (!handle.isValid()) { + TEST_LOG("MallocHooker::patchLibraries: skipping (invalid handle) %s", cc->name()); + continue; + } + + TEST_LOG("MallocHooker::patchLibraries: patching %s has_malloc=%d", + cc->name(), cc->findImport(im_malloc) != nullptr); + if (_orig_malloc) cc->patchImport(im_malloc, (void*)malloc_hook); + if (_orig_realloc) cc->patchImport(im_realloc, (void*)realloc_hook); + if (_orig_aligned_alloc) cc->patchImport(im_aligned_alloc, (void*)aligned_alloc_hook); + if (_orig_calloc) cc->patchImport(im_calloc, _calloc_hook_fn); + if (_orig_posix_memalign) cc->patchImport(im_posix_memalign, _posix_memalign_hook_fn); + } +} + +void MallocHooker::installHooks() { + patchLibraries(); +} + +static inline u64 xo_rotl(u64 x, int k) { + return (x << k) | (x >> 
(64 - k)); +} + +u64 MallocTracer::nextPoissonInterval() { + // xoroshiro128+ — relaxed atomics tolerate benign races on the shared state. + u64 s0 = __atomic_load_n(&_xo_state[0], __ATOMIC_RELAXED); + u64 s1 = __atomic_load_n(&_xo_state[1], __ATOMIC_RELAXED); + u64 result = s0 + s1; + s1 ^= s0; + __atomic_store_n(&_xo_state[0], xo_rotl(s0, 55) ^ s1 ^ (s1 << 14), __ATOMIC_RELAXED); + __atomic_store_n(&_xo_state[1], xo_rotl(s1, 36), __ATOMIC_RELAXED); + double u = (double)(result >> 11) / (double)(1ULL << 53); + if (u < 1e-18) u = 1e-18; + return (u64)(__atomic_load_n(&_interval, __ATOMIC_ACQUIRE) * -log(u)); +} + +bool MallocTracer::shouldSample(size_t size) { + if (__atomic_load_n(&_interval, __ATOMIC_ACQUIRE) <= 1) return true; + while (true) { + u64 prev = __atomic_load_n(&_bytes_until_sample, __ATOMIC_RELAXED); + if (size < prev) { + if (__atomic_compare_exchange_n(&_bytes_until_sample, &prev, prev - size, + false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) + return false; + } else { + u64 next = nextPoissonInterval(); + if (__atomic_compare_exchange_n(&_bytes_until_sample, &prev, next, + false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) + return true; + } + } +} + +void MallocTracer::updateConfiguration(u64 events, double time_coefficient) { + double signal = _pid.compute(events, time_coefficient); + int64_t new_interval = (int64_t)__atomic_load_n(&_interval, __ATOMIC_ACQUIRE) - (int64_t)signal; + if (new_interval < (int64_t)_configured_interval) + new_interval = (int64_t)_configured_interval; + if (new_interval > (int64_t)(1ULL << 40)) + new_interval = (int64_t)(1ULL << 40); + __atomic_store_n(&_interval, (u64)new_interval, __ATOMIC_RELEASE); +} + +void MallocTracer::recordMalloc(void* address, size_t size) { + if (shouldSample(size)) { + u64 current_interval = __atomic_load_n(&_interval, __ATOMIC_ACQUIRE); + MallocEvent event; + event._start_time = TSC::ticks(); + event._address = (uintptr_t)address; + event._size = size; + // _interval == 0 means sample every 
allocation; weight is 1.0. + if (size == 0 || current_interval <= 1) { + event._weight = 1.0f; + } else { + event._weight = (float)(1.0 / (1.0 - exp(-(double)size / (double)current_interval))); + } + + Profiler::instance()->recordSample(NULL, size, OS::threadId(), BCI_NATIVE_MALLOC, 0, &event); + + u64 current_samples = __atomic_add_fetch(&_sample_count, 1, __ATOMIC_RELAXED); + if ((current_samples % TARGET_SAMPLES_PER_WINDOW) == 0) { + u64 now = OS::nanotime(); + u64 prev_ts = __atomic_load_n(&_last_config_update_ts, __ATOMIC_ACQUIRE); + u64 time_diff = now - prev_ts; + u64 check_period_ns = (u64)CONFIG_UPDATE_CHECK_PERIOD_SECS * 1000000000ULL; + if (time_diff > check_period_ns) { + if (__atomic_compare_exchange_n(&_last_config_update_ts, &prev_ts, now, + false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) { + __atomic_fetch_sub(&_sample_count, current_samples, __ATOMIC_RELEASE); + updateConfiguration(current_samples, + (double)check_period_ns / time_diff); + } + } + } + } +} + +Error MallocTracer::start(Arguments& args) { + _configured_interval = args._nativemem > 0 ? args._nativemem : 0; + __atomic_store_n(&_interval, _configured_interval, __ATOMIC_RELEASE); + __atomic_store_n(&_bytes_until_sample, + _configured_interval > 1 ? nextPoissonInterval() : 0, + __ATOMIC_RELEASE); + __atomic_store_n(&_sample_count, (u64)0, __ATOMIC_RELEASE); + // Clear accumulated integral/derivative so a fresh session is not biased by + // state from a prior one (relevant for tests that stop and restart the profiler). + _pid.reset(); + __atomic_store_n(&_last_config_update_ts, OS::nanotime(), __ATOMIC_RELEASE); + + // initialize() is idempotent and returns false when symbol resolution fails. + if (!MallocHooker::initialize()) { + return Error("Failed to resolve malloc symbols; native memory profiling unavailable"); + } + + // Enable recording before patching so a concurrent dlopen() during patchLibraries() + // sees running()==true and patches the new library via installHooks(). 
+ // _orig_* pointers are already resolved in initialize(), so this is safe. + __atomic_store_n(&_running, true, __ATOMIC_RELEASE); + MallocHooker::patchLibraries(); + + return Error::OK; +} + +void MallocTracer::stop() { + // Ideally, we should reset original malloc entries, but it's not currently safe + // in the view of library unloading. Consider using dl_iterate_phdr. + __atomic_store_n(&_running, false, __ATOMIC_RELEASE); +} diff --git a/ddprof-lib/src/main/cpp/mallocTracer.h b/ddprof-lib/src/main/cpp/mallocTracer.h new file mode 100644 index 000000000..d93c2e01e --- /dev/null +++ b/ddprof-lib/src/main/cpp/mallocTracer.h @@ -0,0 +1,71 @@ +/* + * Copyright The async-profiler authors + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef _MALLOCTRACER_H +#define _MALLOCTRACER_H + +#include +#include "engine.h" +#include "event.h" +#include "mutex.h" +#include "pidController.h" + +// Manages GOT-patching for malloc interception across all loaded native libraries. +class MallocHooker { + private: + static Mutex _patch_lock; + static int _patched_libs; + static bool _initialized; + static void* _calloc_hook_fn; + static void* _posix_memalign_hook_fn; + + static void detectNestedMalloc(); + + public: + // Returns true if symbols were successfully resolved. 
+ static bool initialize(); + static void patchLibraries(); + static void installHooks(); +}; + +class MallocTracer : public Engine { + private: + static volatile u64 _interval; + static volatile u64 _bytes_until_sample; + + static u64 _configured_interval; + static volatile u64 _sample_count; + static volatile u64 _last_config_update_ts; + static const int CONFIG_UPDATE_CHECK_PERIOD_SECS = 1; + static const int TARGET_SAMPLES_PER_WINDOW = 100; + + static volatile bool _running; + static PidController _pid; + + static u64 nextPoissonInterval(); + static bool shouldSample(size_t size); + static void updateConfiguration(u64 events, double time_coefficient); + + public: + const char* name() { + return "MallocTracer"; + } + + Error start(Arguments& args); + void stop(); + + static inline bool running() { + return __atomic_load_n(&_running, __ATOMIC_ACQUIRE); + } + + static inline void installHooks() { + MallocHooker::installHooks(); + } + + static void recordMalloc(void* address, size_t size); +}; + +#endif // _MALLOCTRACER_H diff --git a/ddprof-lib/src/main/cpp/pidController.h b/ddprof-lib/src/main/cpp/pidController.h index 030530b3b..051356c4a 100644 --- a/ddprof-lib/src/main/cpp/pidController.h +++ b/ddprof-lib/src/main/cpp/pidController.h @@ -57,6 +57,14 @@ class PidController { _integral_value(0) {} double compute(u64 input, double time_delta_seconds); + + // Reset integrator/derivative state. Intended for tests and for cases where the + // controller must start from a clean slate (e.g. a new profiling session on a + // different workload). 
+ inline void reset() { + _avg_error = 0; + _integral_value = 0; + } }; #endif \ No newline at end of file diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp index 18a6230f1..dd8a52473 100644 --- a/ddprof-lib/src/main/cpp/profiler.cpp +++ b/ddprof-lib/src/main/cpp/profiler.cpp @@ -7,6 +7,7 @@ #include #include "profiler.h" #include "asyncSampleMutex.h" +#include "mallocTracer.h" #include "context.h" #include "context_api.h" #include "guards.h" @@ -59,6 +60,7 @@ static void (*orig_segvHandler)(int signo, siginfo_t *siginfo, void *ucontext); static void (*orig_busHandler)(int signo, siginfo_t *siginfo, void *ucontext); static Engine noop_engine; +static MallocTracer malloc_tracer; static PerfEvents perf_events; static WallClockASGCT wall_asgct_engine; static J9WallClock j9_engine; @@ -545,10 +547,9 @@ void Profiler::recordSample(void *ucontext, u64 counter, int tid, num_frames += getNativeTrace(ucontext, native_stop, event_type, tid, &java_ctx, &truncated, lock_index); assert(num_frames >= 0); - + int max_remaining = _max_stack_depth - num_frames; if (max_remaining > 0) { - // Walk Java frames if we have room, but only for mixed mode or CPU/Wall events with cstack enabled. For async events, we want to avoid walking Java frames in the signal handler if possible, since it can lead to deadlocks. Instead, we'll try to get the Java trace asynchronously after the signal handler returns. 
StackWalkRequest request = {event_type, lock_index, ucontext, frames + num_frames, max_remaining, &java_ctx, &truncated}; num_frames += JVMSupport::walkJavaStack(request); } @@ -576,6 +577,7 @@ void Profiler::recordSample(void *ucontext, u64 counter, int tid, _locks[lock_index].unlock(); } + void Profiler::recordWallClockEpoch(int tid, WallClockEpochEvent *event) { u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && @@ -684,6 +686,7 @@ void *Profiler::dlopen_hook(const char *filename, int flags) { Libraries::instance()->updateSymbols(false); // Patch sigaction in newly loaded libraries LibraryPatcher::patch_sigaction(); + MallocTracer::installHooks(); // Extract build-ids for newly loaded libraries if remote symbolication is enabled Profiler* profiler = instance(); if (profiler != nullptr && profiler->_remote_symbolication) { @@ -1040,7 +1043,8 @@ Error Profiler::start(Arguments &args, bool reset) { (args._cpu >= 0 ? EM_CPU : 0) | (args._wall >= 0 ? EM_WALL : 0) | (args._record_allocations || args._record_liveness || args._gc_generations ? EM_ALLOC - : 0); + : 0) | + (args._nativemem >= 0 ? EM_NATIVEMEM : 0); if (_event_mask == 0) { return Error("No profiling events specified"); @@ -1136,8 +1140,8 @@ Error Profiler::start(Arguments &args, bool reset) { Log::warn("Branch stack is supported only with PMU events"); } else if (_cstack == CSTACK_VM) { if (!VMStructs::hasStackStructs()) { - return Error( - "VMStructs stack walking is not supported on this JVM/platform"); + _cstack = DWARF_SUPPORTED ? 
CSTACK_DWARF : CSTACK_NO; + Log::error("VMStructs stack walking is not supported on this JVM/platform, defaulting to the default native call stack unwinding mode."); } } @@ -1197,6 +1201,24 @@ Error Profiler::start(Arguments &args, bool reset) { } } } + if (_event_mask & EM_NATIVEMEM) { + error = malloc_tracer.start(args); + if (error) { + Log::warn("%s", error.message()); + if (_event_mask == EM_NATIVEMEM) { + // nativemem is the only requested mode: propagate the real error + disableEngines(); + switchLibraryTrap(false); + lockAll(); + _jfr.stop(); + unlockAll(); + return error; + } + error = Error::OK; // recoverable when other modes are also active + } else { + activated |= EM_NATIVEMEM; + } + } if (activated) { switchThreadEvents(JVMTI_ENABLE); @@ -1235,6 +1257,8 @@ Error Profiler::stop() { if (_event_mask & EM_ALLOC) _alloc_engine->stop(); + if (_event_mask & EM_NATIVEMEM) + malloc_tracer.stop(); if (_event_mask & EM_WALL) _wall_engine->stop(); if (_event_mask & EM_CPU) @@ -1283,6 +1307,9 @@ Error Profiler::check(Arguments &args) { _alloc_engine = selectAllocEngine(args); error = _alloc_engine->check(args); } + if (!error && args._nativemem >= 0) { + error = malloc_tracer.check(args); + } if (!error) { if (args._cstack == CSTACK_DWARF && !DWARF_SUPPORTED) { return Error("DWARF unwinding is not supported on this platform"); diff --git a/ddprof-lib/src/main/cpp/symbols_linux.cpp b/ddprof-lib/src/main/cpp/symbols_linux.cpp index bcf08b080..3193cff54 100644 --- a/ddprof-lib/src/main/cpp/symbols_linux.cpp +++ b/ddprof-lib/src/main/cpp/symbols_linux.cpp @@ -22,6 +22,7 @@ #include #include #include +#include "common.h" #include "symbols.h" #include "dwarf.h" #include "fdtransferClient.h" @@ -977,6 +978,7 @@ void Symbols::parseLibraries(CodeCacheArray* array, bool kernel_symbols) { } else if (lib.image_base == NULL) { // Unlikely case when image base has not been found: not safe to access program headers. // Be careful: executable file is not always ELF, e.g. 
classes.jsa + TEST_LOG("parseLibraries: image_base==NULL for %s, skipping program headers", lib.file); ElfParser::parseFile(cc, lib.map_start, lib.file, true); } else { // Parse debug symbols first @@ -985,6 +987,8 @@ void Symbols::parseLibraries(CodeCacheArray* array, bool kernel_symbols) { UnloadProtection handle(cc); if (handle.isValid()) { ElfParser::parseProgramHeaders(cc, lib.image_base, lib.map_end, OS::isMusl()); + } else { + TEST_LOG("parseLibraries: UnloadProtection invalid for %s, skipping program headers", lib.file); } } diff --git a/ddprof-lib/src/main/cpp/vmEntry.h b/ddprof-lib/src/main/cpp/vmEntry.h index 322436558..6be4c87bb 100644 --- a/ddprof-lib/src/main/cpp/vmEntry.h +++ b/ddprof-lib/src/main/cpp/vmEntry.h @@ -33,6 +33,7 @@ enum ASGCT_CallFrameType { BCI_THREAD_ID = -17, // method_id designates a thread BCI_ERROR = -18, // method_id is an error string BCI_NATIVE_FRAME_REMOTE = -19, // method_id points to RemoteFrameInfo for remote symbolication + BCI_NATIVE_MALLOC = -20, // native malloc/free sample (size stored in counter) }; // See hotspot/src/share/vm/prims/forte.cpp diff --git a/ddprof-test-native/src/main/cpp/nativealloc.c b/ddprof-test-native/src/main/cpp/nativealloc.c new file mode 100644 index 000000000..2fb080523 --- /dev/null +++ b/ddprof-test-native/src/main/cpp/nativealloc.c @@ -0,0 +1,21 @@ +/* + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +/* Drive malloc through libddproftest.so's PLT, which is patched by MallocTracer. + * ByteBuffer.allocateDirect() routes through libjvm.so which may use -Bsymbolic-functions, + * binding malloc internally and bypassing GOT patching entirely. 
*/ +JNIEXPORT void JNICALL +Java_com_datadoghq_profiler_nativemem_NativeAllocHelper_nativeMalloc( + JNIEnv *env, jclass clazz, jlong size, jint count) { + for (jint i = 0; i < count; i++) { + void *p = malloc((size_t)size); + if (p != NULL) { + free(p); + } + } +} diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/AbstractProfilerTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/AbstractProfilerTest.java index 53137f8df..de75c2f06 100644 --- a/ddprof-test/src/test/java/com/datadoghq/profiler/AbstractProfilerTest.java +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/AbstractProfilerTest.java @@ -286,8 +286,15 @@ private void checkConfig() { long wallIntervalMillis = wallIntervalAccessor.getMember(item).longValueIn(MILLISECOND); if (!Platform.isJ9() && Platform.isJavaVersionAtLeast(11)) { // fixme J9 engine have weird defaults and need fixing - assertEquals(cpuInterval.toMillis(), cpuIntervalMillis); - assertEquals(wallInterval.toMillis(), wallIntervalMillis); + // Only assert intervals that were explicitly requested in the profiler + // command; engines not requested carry default intervals that do not + // match the (absent) command value. 
+ if (cpuInterval.toMillis() > 0) { + assertEquals(cpuInterval.toMillis(), cpuIntervalMillis); + } + if (wallInterval.toMillis() > 0) { + assertEquals(wallInterval.toMillis(), wallIntervalMillis); + } } } } diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/junit/CStackInjector.java b/ddprof-test/src/test/java/com/datadoghq/profiler/junit/CStackInjector.java index 75dfda181..7f4c9534f 100644 --- a/ddprof-test/src/test/java/com/datadoghq/profiler/junit/CStackInjector.java +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/junit/CStackInjector.java @@ -191,7 +191,11 @@ public Object resolveParameter(ParameterContext parameterContext, ExtensionConte method.invoke(testInstance); } } - testMethod.invoke(testInstance, parameter); + if (testMethod.getParameterCount() == 0) { + testMethod.invoke(testInstance); + } else { + testMethod.invoke(testInstance, parameter); + } return; // If the test passes, stop retrying } catch (InvocationTargetException e) { throwable = e.getTargetException(); diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativeAllocHelper.java b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativeAllocHelper.java new file mode 100644 index 000000000..e0dbf5352 --- /dev/null +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativeAllocHelper.java @@ -0,0 +1,15 @@ +/* + * Copyright 2026, Datadog, Inc. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +package com.datadoghq.profiler.nativemem; + +final class NativeAllocHelper { + static { + System.loadLibrary("ddproftest"); + } + + static native void nativeMalloc(long size, int count); + + private NativeAllocHelper() {} +} diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativememProfilerTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativememProfilerTest.java new file mode 100644 index 000000000..c62f0ed43 --- /dev/null +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativememProfilerTest.java @@ -0,0 +1,113 @@ +/* + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.datadoghq.profiler.nativemem; + +import com.datadoghq.profiler.CStackAwareAbstractProfilerTest; +import com.datadoghq.profiler.Platform; +import com.datadoghq.profiler.junit.CStack; +import com.datadoghq.profiler.junit.RetryTest; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.params.provider.ValueSource; +import org.openjdk.jmc.common.item.IAttribute; +import org.openjdk.jmc.common.item.IItem; +import org.openjdk.jmc.common.item.IItemCollection; +import org.openjdk.jmc.common.item.IItemIterable; +import org.openjdk.jmc.common.item.IMemberAccessor; +import org.openjdk.jmc.common.unit.IQuantity; + +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.openjdk.jmc.common.item.Attribute.attr; +import static org.openjdk.jmc.common.unit.UnitLookup.ADDRESS; + +/** + * Smoke tests for native memory (malloc) profiling. + * + *

Runs with {@code cstack=vm}, {@code cstack=vmx}, {@code cstack=dwarf}, and + * {@code cstack=fp}. All modes produce usable Java stacks for malloc events: + * vm/vmx seed from {@code callerPC()}/{@code JavaFrameAnchor} via + * {@code HotspotSupport::walkVM}; dwarf/fp hand a {@code NULL ucontext} to + * {@code AsyncGetCallTrace}, which falls back to the JavaFrameAnchor populated + * by the Java → native transition. + */ +public class NativememProfilerTest extends CStackAwareAbstractProfilerTest { + + private static final IAttribute MALLOC_ADDRESS = attr("address", "address", "", ADDRESS); + + @BeforeAll + static void preloadNativeLib() { + // Ensure libddproftest.so is loaded before the profiler starts in @BeforeEach. + // patchLibraries() only patches libraries already in native_libs at call time; + // if the library loads after start() via dlopen_hook, glibc JVMs may not forward + // the System.loadLibrary dlopen through the patched GOT entry. + NativeAllocHelper.nativeMalloc(0, 0); + } + + public NativememProfilerTest(@CStack String cstack) { + super(cstack); + } + + @Override + protected String getProfilerCommand() { + return "nativemem=0"; // sample every allocation + } + + @Override + protected boolean isPlatformSupported() { + return Platform.isLinux() && !Platform.isJ9() && !Platform.isZing(); + } + + @RetryTest(3) + @TestTemplate + @ValueSource(strings = {"vm", "vmx", "dwarf", "fp"}) + public void shouldRecordMallocSamples() throws InterruptedException { + // GOT patching conflicts with ASan/TSan interceptors: both replace malloc/free + // symbols, causing undefined behavior or crashes when hooks chain into each other. 
+ Assumptions.assumeFalse(isAsan() || isTsan()); + + triggerAllocations(1000); + + stopProfiler(); + + IItemCollection events = verifyEvents("profiler.Malloc"); + boolean foundMinSize = false; + for (IItemIterable items : events) { + IMemberAccessor sizeAccessor = SIZE.getAccessor(items.getType()); + IMemberAccessor weightAccessor = WEIGHT.getAccessor(items.getType()); + IMemberAccessor addrAccessor = MALLOC_ADDRESS.getAccessor(items.getType()); + if (sizeAccessor == null) { + continue; + } + assertNotNull(addrAccessor, "profiler.Malloc events must carry an address field"); + assertNotNull(weightAccessor, "profiler.Malloc events must carry a weight field"); + for (IItem item : items) { + IQuantity size = sizeAccessor.getMember(item); + assertNotNull(size, "profiler.Malloc event must have a non-null size field"); + assertTrue(size.longValue() > 0, "allocation size must be positive"); + if (size.longValue() >= 1024) { + foundMinSize = true; + } + IQuantity addr = addrAccessor.getMember(item); + assertTrue(addr == null || addr.longValue() != 0, "malloc address must not be zero"); + // nativemem=0 samples every allocation; weight must be exactly 1.0. + IQuantity weight = weightAccessor.getMember(item); + assertNotNull(weight, "profiler.Malloc event must have a non-null weight field"); + assertTrue(Math.abs(weight.doubleValue() - 1.0) < 1e-6, + "weight must be 1.0 for nativemem=0 (all allocations sampled), got " + weight.doubleValue()); + } + } + assertTrue(foundMinSize, "expected at least one malloc event with size >= 1024 bytes"); + + // triggerAllocations is a Java wrapper so it appears in all cstack modes, including fp/dwarf. 
+ verifyStackTraces("profiler.Malloc", "triggerAllocations", "shouldRecordMallocSamples"); + } + + private static void triggerAllocations(int count) { + NativeAllocHelper.nativeMalloc(1024, count); + } + +} diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativememSampledProfilerTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativememSampledProfilerTest.java new file mode 100644 index 000000000..2a8a71cb0 --- /dev/null +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativememSampledProfilerTest.java @@ -0,0 +1,97 @@ +/* + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.datadoghq.profiler.nativemem; + +import com.datadoghq.profiler.CStackAwareAbstractProfilerTest; +import com.datadoghq.profiler.Platform; +import com.datadoghq.profiler.junit.CStack; +import com.datadoghq.profiler.junit.RetryTest; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.params.provider.ValueSource; +import org.openjdk.jmc.common.item.IItem; +import org.openjdk.jmc.common.item.IItemCollection; +import org.openjdk.jmc.common.item.IItemIterable; +import org.openjdk.jmc.common.item.IMemberAccessor; +import org.openjdk.jmc.common.unit.IQuantity; + +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Covers the sampled path of the native memory profiler (interval > 1), which + * exercises the Poisson interval generator, PID controller update, and the + * {@code 1 / (1 - exp(-size/interval))} weight formula. The smoke test class + * {@link NativememProfilerTest} only covers {@code nativemem=0}, which bypasses + * these code paths via the {@code _interval <= 1} fast path. 
+ */ +public class NativememSampledProfilerTest extends CStackAwareAbstractProfilerTest { + + @BeforeAll + static void preloadNativeLib() { + // Same as NativememProfilerTest: load libddproftest.so before the profiler starts + // so patchLibraries() finds it in native_libs and patches its malloc GOT entry. + NativeAllocHelper.nativeMalloc(0, 0); + } + + public NativememSampledProfilerTest(@CStack String cstack) { + super(cstack); + } + + @Override + protected String getProfilerCommand() { + return "nativemem=512"; + } + + @Override + protected boolean isPlatformSupported() { + return Platform.isLinux() && !Platform.isJ9() && !Platform.isZing(); + } + + @RetryTest(3) + @TestTemplate + @ValueSource(strings = {"vm", "vmx", "dwarf", "fp"}) + public void shouldEmitWeightedMallocSamples() throws InterruptedException { + // GOT patching conflicts with ASan/TSan interceptors. + Assumptions.assumeFalse(isAsan() || isTsan()); + + // Drive enough allocation volume through malloc to yield several Poisson samples. + triggerAllocations(20_000); + + stopProfiler(); + + IItemCollection events = verifyEvents("profiler.Malloc"); + int sampleCount = 0; + for (IItemIterable items : events) { + IMemberAccessor sizeAccessor = SIZE.getAccessor(items.getType()); + IMemberAccessor weightAccessor = WEIGHT.getAccessor(items.getType()); + assertNotNull(sizeAccessor, "profiler.Malloc events must carry a size field"); + assertNotNull(weightAccessor, "profiler.Malloc events must carry a weight field"); + for (IItem item : items) { + IQuantity size = sizeAccessor.getMember(item); + IQuantity weight = weightAccessor.getMember(item); + assertNotNull(size, "profiler.Malloc event must have a non-null size field"); + assertNotNull(weight, "profiler.Malloc event must have a non-null weight field"); + // Weight is 1 / (1 - exp(-size/interval)); that function is strictly > 1 + // for all positive sizes, so any Poisson-sampled event must carry weight >= 1. 
+ assertTrue(weight.doubleValue() >= 1.0, + "weight must be >= 1.0 on the sampled path, got " + weight.doubleValue() + + " (size=" + size.longValue() + ")"); + sampleCount++; + } + } + + // With ~20M bytes allocated and a 512-byte interval we expect plenty of samples. + // The assertion is loose to tolerate CI variance but tight enough to catch + // regressions where the sampling path silently produces zero events. + assertTrue(sampleCount >= 8, + "expected at least 8 sampled malloc events, got " + sampleCount); + } + + private static void triggerAllocations(int count) { + NativeAllocHelper.nativeMalloc(1024, count); + } +} diff --git a/doc/architecture/NativeMemoryProfiling.md b/doc/architecture/NativeMemoryProfiling.md new file mode 100644 index 000000000..b49daefd3 --- /dev/null +++ b/doc/architecture/NativeMemoryProfiling.md @@ -0,0 +1,336 @@ +# Native Memory Allocation Profiling + +## Overview + +The native memory profiler tracks heap allocations made through the C standard +library (`malloc`, `calloc`, `realloc`, `posix_memalign`, `aligned_alloc`). It +instruments these functions at the GOT (Global Offset Table) level so that every +intercepted call is accounted for without modifying application source code or +requiring a custom allocator. The `free` function is also hooked (to forward calls +correctly through the GOT) but free events are not recorded. + +Sampled allocation events carry a full Java + native stack trace and are emitted as +`profiler.Malloc` JFR events. + +The feature is activated by passing `nativemem=` to the profiler, where +`` is the byte-sampling interval (e.g. `nativemem=524288` samples roughly +one event per 512 KiB allocated). Passing `nativemem=0` records every allocation. 
+ +--- + +## Component Map + +``` + Application code + │ malloc() / calloc() / realloc() / free() / … + ▼ + ┌─────────────┐ GOT patch ┌──────────────────────────┐ + │ libc / musl│ ◄────────── │ malloc_hook / free_hook │ mallocTracer.cpp + └─────────────┘ │ calloc_hook / … │ + └────────────┬─────────────┘ + │ recordMalloc + ▼ + ┌──────────────────────────┐ + │ MallocTracer:: │ mallocTracer.cpp/h + │ shouldSample() │ + │ recordMalloc() ──────► │ profiler.cpp + └────────────┬─────────────┘ + │ walkVM (CSTACK_VM) + ▼ + ┌──────────────────────────┐ + │ JFR buffer │ flightRecorder.cpp + │ profiler.Malloc │ + └──────────────────────────┘ +``` + +--- + +## GOT Patching + +The profiler redirects allocator calls by writing hook function addresses directly +into the importing library's GOT. This is cheaper than `LD_PRELOAD` (no process +restart) and works for libraries loaded at any time. + +### Import IDs + +`codeCache.h` defines an `ImportId` enum with one entry per hooked symbol: + +``` +im_malloc, im_calloc, im_realloc, im_free, im_posix_memalign, im_aligned_alloc +``` + +`CodeCache::patchImport(ImportId, void*)` walks the library's PLT/GOT and overwrites +the matching entry. 
+ +### Hook signatures + +Each hook calls the saved original function first, then records the event: + +| Hook | Calls | Records | +|------|-------|---------| +| `malloc_hook(size)` | `_orig_malloc(size)` | `recordMalloc(ret, size)` if `ret != NULL && size != 0` | +| `calloc_hook(num, size)` | `_orig_calloc(num, size)` | `recordMalloc(ret, total)` if `ret != NULL && num != 0 && size != 0` (total = num×size, clamped to `SIZE_MAX` on overflow) | +| `realloc_hook(addr, size)` | `_orig_realloc(addr, size)` | `recordMalloc(ret, size)` if `ret != NULL && size > 0` | +| `free_hook(addr)` | `_orig_free(addr)` | — (forwards only) | +| `posix_memalign_hook(…)` | `_orig_posix_memalign(…)` | `recordMalloc(*memptr, size)` if `ret == 0 && memptr != NULL && *memptr != NULL && size != 0` | +| `aligned_alloc_hook(align, size)` | `_orig_aligned_alloc(align, size)` | `recordMalloc(ret, size)` if `ret != NULL && size != 0` | + +--- + +## Initialization Sequence + +`MallocTracer::start()` (called once per profiler session) runs: + +1. Resets per-session counters (`_interval`, `_bytes_until_sample`, `_sample_count`, + `_last_config_update_ts`). + +2. On the **first call only** (`!_initialized`), calls `initialize()`: + + a. **`resolveMallocSymbols()`** — calls each intercepted function at least once so + the profiler library's own PLT stubs are resolved by the dynamic linker. This + ensures that subsequent `SAVE_IMPORT` reads get the real libc function pointers + rather than the PLT resolver. + + b. **`SAVE_IMPORT(func)`** — reads the resolved GOT entry for each symbol from the + profiler library's own import table and stores it in the corresponding + `_orig_` static pointer. + + c. **`detectNestedMalloc()`** — probes whether the platform's `calloc` + implementation calls `malloc` internally (as musl does), and whether + `posix_memalign` calls `aligned_alloc` internally. 
If either is detected, the + corresponding hook is replaced with a dummy variant (`calloc_hook_dummy` or + `posix_memalign_hook_dummy`) that forwards to the original without recording, + preventing double-accounting. The dummy hooks preserve the caller frame pointer + so that the actual call site is not obscured. + + d. **`lib->mark(...)`** — marks the profiler's own hook functions in the code cache + so the stack walker can identify them as profiler frames. + + Then sets `_initialized = true`. + +3. **`patchLibraries()`** — iterates over all currently loaded native libraries and + writes the hook addresses into each library's GOT, under `_patch_lock`. + `_patched_libs` is a monotonic counter so that already-patched libraries are + skipped on subsequent calls. + +4. Sets `_running = true` to enable recording. + +`patchLibraries()` is called again on every `start()` to pick up any libraries +loaded between profiler sessions. + +--- + +## Dynamic Library Handling + +When the application calls `dlopen`, the profiler's `dlopen_hook` (installed as a +GOT hook for `dlopen`) calls `MallocTracer::installHooks()` after the library is +loaded: + +```cpp +// profiler.cpp +void* Profiler::dlopen_hook(const char* filename, int flags) { + void* result = dlopen(filename, flags); + if (result != NULL) { + Libraries::instance()->updateSymbols(false); + MallocTracer::installHooks(); + } + return result; +} +``` + +`installHooks()` calls `patchLibraries()` only if `_running` is `true`, so newly +loaded libraries are automatically hooked without requiring a profiler restart. 
+ +--- + +## Sampling + +Allocation recording uses Poisson-interval sampling via `MallocTracer::shouldSample()`: + +```cpp +// mallocTracer.cpp — lock-free CAS loop with Poisson jitter +static bool shouldSample(size_t size) { + if (_interval <= 1) return true; // nativemem=0 or nativemem=1: record every allocation + while (true) { + u64 prev = _bytes_until_sample; + if (size < prev) { + if (__sync_bool_compare_and_swap(&_bytes_until_sample, prev, prev - size)) + return false; + } else { + u64 next = nextPoissonInterval(); + if (__sync_bool_compare_and_swap(&_bytes_until_sample, prev, next)) + return true; + } + } +} +``` + +`_bytes_until_sample` is a shared volatile counter decremented by each allocation's +size. When exhausted, a new Poisson-distributed interval is generated via +`nextPoissonInterval()` (using `-interval * ln(uniform_random)` where the random +value is derived from TSC ticks via XOR-shift), providing random jitter that avoids +synchronization artifacts. Multiple threads compete via CAS so no mutex is needed. + +A PID controller (`updateConfiguration()`) periodically adjusts `_interval` to +maintain approximately `TARGET_SAMPLES_PER_WINDOW` (100) samples per second. + +--- + +## Stack Trace Capture + +### Why `CSTACK_VM` is needed + +The malloc hooks execute on the calling thread with no signal context (`ucontext == +NULL`). Two distinct levels of stack capture are possible: + +- **Java-only stacks** (`CSTACK_DEFAULT`, `CSTACK_FP`, `CSTACK_DWARF`): Java frames + are still available via ASGCT / `JavaFrameAnchor`. When `ucontext == NULL`, the + profiler falls through to ASGCT so these modes do produce Java-level traces for + malloc events. + +- **Interleaved native + Java stacks** (`CSTACK_VM` only): Native frame unwinding + via frame pointers or DWARF requires a signal context as the starting point. 
+ `CSTACK_VM` avoids this by seeding the unwind from `callerPC()` (no signal context + needed) and transitioning to Java frames via HotSpot's `JavaFrameAnchor`. + +`CSTACK_VM` starts from `callerPC()` (which expands to `__builtin_return_address(0)` +on x86/x86_64/aarch64) for the initial frame and uses HotSpot's `JavaFrameAnchor` +(lastJavaPC / lastJavaSP / lastJavaFP) to transition from native to Java frames. +This works correctly from inside a malloc hook because the anchor is set whenever +the JVM has transitioned from Java to native. + +### Default stack mode + +`CSTACK_DEFAULT` is the initial default (`arguments.h`). At profiler start, +`profiler.cpp` promotes it to `CSTACK_VM` when VMStructs are available **and the OS +is Linux**. If neither condition is met, it falls back to `CSTACK_DWARF` (if +supported) or `CSTACK_NO`: + +```cpp +if (_cstack == CSTACK_DEFAULT) { + if (VMStructs::hasStackStructs() && OS::isLinux()) { + _cstack = CSTACK_VM; + } else if (DWARF_SUPPORTED) { + _cstack = CSTACK_DWARF; + } +} +``` + +If `CSTACK_VM` is explicitly requested but `VMStructs` are not available, the +profiler resets to `CSTACK_DWARF` (if supported) or `CSTACK_NO` and logs an error: + +```cpp +} else if (_cstack == CSTACK_VM) { + if (!VMStructs::hasStackStructs()) { + _cstack = DWARF_SUPPORTED ? CSTACK_DWARF : CSTACK_NO; + Log::error("VMStructs stack walking is not supported on this JVM/platform, defaulting to the default native call stack unwinding mode."); + } +} +``` + +### Code path for malloc stack walking + +`recordSample` in `profiler.cpp` calls `getNativeTrace()` first. For +`_cstack >= CSTACK_VM`, `getNativeTrace` returns 0 immediately (native frames are +not collected via `walkFP`/`walkDwarf`). 
Then `JVMSupport::walkJavaStack()` is +called, which dispatches to `HotspotSupport::walkJavaStack()`: + +```cpp +// hotspot/hotspotSupport.cpp — walkJavaStack for malloc events +} else if (request.event_type == BCI_CPU || request.event_type == BCI_WALL || request.event_type == BCI_NATIVE_MALLOC) { + if (cstack >= CSTACK_VM) { + java_frames = walkVM(ucontext, frames, max_depth, features, + eventTypeFromBCI(request.event_type), + lock_index, truncated); + } + // ... +} +``` + +`HotspotSupport::walkVM` is the sole source of both native and Java frames for +malloc events. When called with `ucontext == NULL` (as it is for malloc hooks), +it seeds the unwind with `callerPC()` / `callerSP()` / `callerFP()`. + +--- + +## JFR Event Format + +A single event type is defined in `jfrMetadata.cpp` under the +`Java Virtual Machine / Native Memory` category: + +### `profiler.Malloc` (`T_MALLOC`) + +| Field | Type | Description | +|-------|------|-------------| +| `startTime` | `long` (ticks) | TSC timestamp of the allocation | +| `eventThread` | thread ref | Thread that performed the allocation | +| `stackTrace` | stack trace ref | Call stack at the allocation site | +| `address` | `long` (address) | Returned pointer value | +| `size` | `long` (bytes) | Requested allocation size | +| `weight` | `float` | Statistical sample weight based on Poisson sampling probability | +| `spanId` | `long` | Span ID from current context (optional, from context attributes) | +| `localRootSpanId` | `long` | Local root span ID from current context (optional, from context attributes) | + +Events are written by `Recording::recordMallocSample()` in `flightRecorder.cpp`: + +```cpp +buf->putVar64(T_MALLOC); +buf->putVar64(event->_start_time); +buf->putVar32(tid); +buf->putVar64(call_trace_id); +buf->putVar64(event->_address); +buf->putVar64(event->_size); +buf->putFloat(event->_weight); +writeCurrentContext(buf); +``` + +--- + +## Concurrency and Thread Safety + +| Concern | Mechanism | 
+|---------|-----------| +| GOT patching across threads | `_patch_lock` (Mutex) in `patchLibraries()` | +| Library unload during patching | `UnloadProtection` handle per library | +| Allocation byte counter | Lock-free CAS loop in `shouldSample` | +| JFR buffer writes | Per-lock-index try-lock with 3 attempts; events dropped on contention | +| Hook enable / disable | `volatile bool _running` — checked before every recording call | +| `_initialized` write ordering | Serialized by the profiler's outer state lock (caller responsibility) | + +--- + +## Known Limitations and Design Trade-offs + +**No reentrancy guard.** As documented in `mallocTracer.cpp`: + +> To avoid complexity in hooking and tracking reentrancy, a TLS-based approach is +> not used. Reentrant allocation calls would result in double-accounting. + +When `recordMalloc` calls into the profiler (stack walking, JFR buffer writes), any +allocations made by the profiler itself will re-enter the hooks. Infinite recursion +is prevented because the hook functions call `_orig_malloc` (a saved direct function +pointer) instead of going through the GOT, but profiler-internal allocations may be +double-counted as application allocations. +Leak detection is unaffected: the same address being recorded multiple times is +handled correctly by the tracking logic. + +**Hooks are never uninstalled.** `stop()` only sets `_running = false`. The GOT +entries remain patched for the lifetime of the process. After stopping, every +malloc/free incurs the overhead of one function-pointer indirection plus a volatile +bool read, which is negligible in practice. Uninstalling hooks safely would require +iterating all libraries again under `_patch_lock`, which is deferred. + +**`nativemem=0` records every allocation.** When `_interval == 0`, +`shouldSample` returns `true` on every call (the `interval <= 1` fast path). This +is intentional for 100% sampling but can produce very high event volumes. 
+ +**No free event tracking.** Free calls are hooked (to forward through the GOT +correctly) but not recorded. Sampled mallocs mean most frees would match nothing, +and the immense event volume with no stack traces provides no actionable insight. + +**HotSpot / Linux only for interleaved native+Java stack traces.** `CSTACK_VM` +requires `VMStructs::hasStackStructs() && OS::isLinux()`, which is only true on +HotSpot JVMs on Linux. On other platforms the profiler falls back to `CSTACK_DWARF` +(if supported) or `CSTACK_DEFAULT`. Native frames are still captured via FP/DWARF +unwinding and Java frames via ASGCT, but they are not interleaved through +`JavaFrameAnchor` as they are with `CSTACK_VM`.