diff --git a/ddprof-lib/src/main/cpp/javaApi.cpp b/ddprof-lib/src/main/cpp/javaApi.cpp index b886681fe..76cdb104e 100644 --- a/ddprof-lib/src/main/cpp/javaApi.cpp +++ b/ddprof-lib/src/main/cpp/javaApi.cpp @@ -530,7 +530,7 @@ Java_com_datadoghq_profiler_OTelContext_readProcessCtx0(JNIEnv *env, jclass unus #endif } -extern "C" DLLEXPORT jobjectArray JNICALL +extern "C" DLLEXPORT jobject JNICALL Java_com_datadoghq_profiler_JavaProfiler_initializeContextTLS0(JNIEnv* env, jclass unused, jlongArray metadata) { ProfiledThread* thrd = ProfiledThread::current(); assert(thrd != nullptr); @@ -541,8 +541,21 @@ Java_com_datadoghq_profiler_JavaProfiler_initializeContextTLS0(JNIEnv* env, jcla OtelThreadContextRecord* record = thrd->getOtelContextRecord(); + // Contiguity of record + tag_encodings + LRS is enforced by alignas(8) on _otel_ctx_record + // plus sizeof(OtelThreadContextRecord) being a multiple of 8 (see thread.h). + // Compile-time alignment check always runs; runtime pointer-layout check is debug-only. + static_assert(DD_TAGS_CAPACITY * sizeof(u32) % alignof(u64) == 0, + "tag encodings array size must be aligned to u64 for contiguous sidecar layout"); +#ifdef DEBUG + uint8_t* record_start = reinterpret_cast<uint8_t*>(record); + uint8_t* sidecar_start = reinterpret_cast<uint8_t*>(thrd->getOtelTagEncodingsPtr()); + assert(sidecar_start == record_start + OTEL_MAX_RECORD_SIZE + && "_otel_ctx_record and _otel_tag_encodings must be contiguous"); +#endif + // Fill metadata[6]: [VALID_OFFSET, TRACE_ID_OFFSET, SPAN_ID_OFFSET, - // ATTRS_DATA_SIZE_OFFSET, ATTRS_DATA_OFFSET, LRS_SIDECAR_OFFSET] + // ATTRS_DATA_SIZE_OFFSET, ATTRS_DATA_OFFSET, LRS_OFFSET]. + // All offsets are absolute within the unified buffer returned below. 
if (metadata != nullptr && env->GetArrayLength(metadata) >= 6) { jlong meta[6]; meta[0] = (jlong)offsetof(OtelThreadContextRecord, valid); @@ -550,26 +563,15 @@ Java_com_datadoghq_profiler_JavaProfiler_initializeContextTLS0(JNIEnv* env, jcla meta[2] = (jlong)offsetof(OtelThreadContextRecord, span_id); meta[3] = (jlong)offsetof(OtelThreadContextRecord, attrs_data_size); meta[4] = (jlong)offsetof(OtelThreadContextRecord, attrs_data); - meta[5] = (jlong)(DD_TAGS_CAPACITY * sizeof(u32)); // LRS sidecar offset in sidecar buffer + meta[5] = (jlong)(OTEL_MAX_RECORD_SIZE + DD_TAGS_CAPACITY * sizeof(u32)); env->SetLongArrayRegion(metadata, 0, 6, meta); } - // Create 2 DirectByteBuffers: [record, sidecar] - jclass bbClass = env->FindClass("java/nio/ByteBuffer"); - jobjectArray result = env->NewObjectArray(2, bbClass, nullptr); - - // recordBuffer: 640 bytes over the OtelThreadContextRecord - jobject recordBuf = env->NewDirectByteBuffer((void*)record, (jlong)OTEL_MAX_RECORD_SIZE); - env->SetObjectArrayElement(result, 0, recordBuf); - - // sidecarBuffer: covers _otel_tag_encodings[DD_TAGS_CAPACITY] + _otel_local_root_span_id (contiguous) - static_assert(DD_TAGS_CAPACITY * sizeof(u32) % alignof(u64) == 0, - "tag encodings array size must be aligned to u64 for contiguous sidecar layout"); - size_t sidecarSize = DD_TAGS_CAPACITY * sizeof(u32) + sizeof(u64); - jobject sidecarBuf = env->NewDirectByteBuffer((void*)thrd->getOtelTagEncodingsPtr(), (jlong)sidecarSize); - env->SetObjectArrayElement(result, 1, sidecarBuf); - - return result; + // Single contiguous view over [record | tag_encodings | LRS] — used for per-field + // access and for bulk snapshot/restore. All three regions are in one ProfiledThread + // memory block. 
+ size_t totalSize = OTEL_MAX_RECORD_SIZE + DD_TAGS_CAPACITY * sizeof(u32) + sizeof(u64); + return env->NewDirectByteBuffer((void*)record, (jlong)totalSize); } extern "C" DLLEXPORT jint JNICALL diff --git a/ddprof-lib/src/main/cpp/thread.h b/ddprof-lib/src/main/cpp/thread.h index 852129bc0..b3c721bfb 100644 --- a/ddprof-lib/src/main/cpp/thread.h +++ b/ddprof-lib/src/main/cpp/thread.h @@ -73,10 +73,15 @@ class ProfiledThread : public ThreadLocalData { UnwindFailures _unwind_failures; bool _otel_ctx_initialized; bool _crash_protection_active; - OtelThreadContextRecord _otel_ctx_record; + // alignas(8) + sizeof(OtelThreadContextRecord)==640 (multiple of 8) guarantee + // _otel_tag_encodings sits at +640 with no padding, so the three fields form one + // 688-byte contiguous region exposed as a combined DirectByteBuffer. + alignas(8) OtelThreadContextRecord _otel_ctx_record; - // These two fields MUST be contiguous and 8-byte aligned — the JNI layer - // exposes them as a single DirectByteBuffer (sidecar), and VarHandle long - // views require 8-byte alignment for the buffer base address. + // These two fields MUST directly follow _otel_ctx_record, contiguous and 8-byte aligned — + // the JNI layer exposes all three as one combined DirectByteBuffer (record + sidecar), and + // VarHandle long views require 8-byte alignment for the buffer base address. + // Read invariant: sidecar readers must gate on record->valid (see ContextApi::get). + // ThreadContext.restore() relies on this to perform a bulk memcpy under valid=0. 
alignas(8) u32 _otel_tag_encodings[DD_TAGS_CAPACITY]; u64 _otel_local_root_span_id; diff --git a/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java b/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java index c690b9e47..33e0bdc13 100644 --- a/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java +++ b/ddprof-lib/src/main/java/com/datadoghq/profiler/JavaProfiler.java @@ -225,7 +225,6 @@ void copyTags(int[] snapshot) { tlsContextStorage.get().copyCustoms(snapshot); } - /** /** * Dumps the JFR recording at the provided path * @param recording the path to the recording @@ -305,11 +304,11 @@ public Map getDebugCounters() { private static ThreadContext initializeThreadContext() { long[] metadata = new long[6]; - ByteBuffer[] buffers = initializeContextTLS0(metadata); - if (buffers == null) { + ByteBuffer buffer = initializeContextTLS0(metadata); + if (buffer == null) { throw new IllegalStateException("Failed to initialize OTEL TLS — ProfiledThread not available"); } - return new ThreadContext(buffers[0], buffers[1], metadata); + return new ThreadContext(buffer, metadata); } private static native boolean init0(); @@ -342,19 +341,20 @@ private static ThreadContext initializeThreadContext() { private static native String getStatus0(); /** - * Initializes context TLS for the current thread and returns 2 DirectByteBuffers. - * Sets otel_thread_ctx_v1 permanently to the thread's OtelThreadContextRecord. + * Initializes context TLS for the current thread and returns a single DirectByteBuffer + * spanning the OTEP record + tag-encoding sidecar + LRS (688 bytes, contiguous in + * ProfiledThread). Sets otel_thread_ctx_v1 permanently to the thread's + * OtelThreadContextRecord. 
* - * @param metadata output array filled with: - * [0] VALID_OFFSET — offset of 'valid' field in the record - * [1] TRACE_ID_OFFSET — offset of 'trace_id' field in the record - * [2] SPAN_ID_OFFSET — offset of 'span_id' field in the record + * @param metadata output array filled with absolute offsets into the returned buffer: + * [0] VALID_OFFSET — offset of 'valid' field + * [1] TRACE_ID_OFFSET — offset of 'trace_id' field + * [2] SPAN_ID_OFFSET — offset of 'span_id' field * [3] ATTRS_DATA_SIZE_OFFSET — offset of 'attrs_data_size' field * [4] ATTRS_DATA_OFFSET — offset of 'attrs_data' field - * [5] LRS_SIDECAR_OFFSET — offset of local_root_span_id in sidecar buffer - * @return array of 2 ByteBuffers: [recordBuffer, sidecarBuffer] + * [5] LRS_OFFSET — offset of local_root_span_id */ - private static native ByteBuffer[] initializeContextTLS0(long[] metadata); + private static native ByteBuffer initializeContextTLS0(long[] metadata); public ThreadContext getThreadContext() { return tlsContextStorage.get(); diff --git a/ddprof-lib/src/main/java/com/datadoghq/profiler/ScopeStack.java b/ddprof-lib/src/main/java/com/datadoghq/profiler/ScopeStack.java new file mode 100644 index 000000000..65d6cb332 --- /dev/null +++ b/ddprof-lib/src/main/java/com/datadoghq/profiler/ScopeStack.java @@ -0,0 +1,92 @@ +/* + * Copyright 2026 Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.datadoghq.profiler; + +import java.util.Arrays; + +/** + * Per-thread stack of {@link ThreadContext} snapshots for nested scopes. + * + *

<p>Provides bulk save/restore of the full OTEP record + sidecar state via one memcpy per + * transition. Not thread-safe: a single stack instance must be accessed only from its + * owning thread. + * + *
<p>Storage is tiered to keep shallow nesting allocation-free: + *
<ul> + *
<li>Depths 0 .. {@value #FAST_DEPTH}-1: one contiguous byte[] allocated eagerly.</li> + *
<li>Depths {@value #FAST_DEPTH} and beyond: lazily allocated {@value #CHUNK_DEPTH}-slot + * chunks, each a single byte[]. Chunks are allocated once per depth band and reused.</li> + *
</ul>
+ */ +public final class ScopeStack { + private static final int FAST_DEPTH = 6; + private static final int CHUNK_DEPTH = 12; + private static final int SLOT_SIZE = ThreadContext.SNAPSHOT_SIZE; + + private final byte[] fast = new byte[FAST_DEPTH * SLOT_SIZE]; + // chunks[i] covers depths [FAST_DEPTH + i*CHUNK_DEPTH .. FAST_DEPTH + (i+1)*CHUNK_DEPTH). + private byte[][] chunks; + private int depth; + + public void enter(ThreadContext ctx) { + int d = depth; + ctx.snapshot(bufferFor(d), offsetFor(d)); + depth = d + 1; + } + + public void exit(ThreadContext ctx) { + int d = depth - 1; + if (d < 0) { + throw new IllegalStateException("ScopeStack underflow"); + } + ctx.restore(bufferFor(d), offsetFor(d)); + depth = d; + } + + /** Current nesting depth (number of outstanding {@link #enter} calls). */ + public int depth() { + return depth; + } + + private byte[] bufferFor(int d) { + if (d < FAST_DEPTH) { + return fast; + } + // chunkFor is idempotent: if this depth was previously populated (via a matching enter), + // it returns the existing chunk without allocating. + return chunkFor((d - FAST_DEPTH) / CHUNK_DEPTH); + } + + private static int offsetFor(int d) { + int slot = d < FAST_DEPTH ? 
d : (d - FAST_DEPTH) % CHUNK_DEPTH; + return slot * SLOT_SIZE; + } + + private byte[] chunkFor(int idx) { + byte[][] cs = chunks; + if (cs == null) { + cs = new byte[4][]; + chunks = cs; + } else if (idx >= cs.length) { + int newLen = cs.length; + while (newLen <= idx) { + newLen <<= 1; + } + cs = Arrays.copyOf(cs, newLen); + chunks = cs; + } + byte[] c = cs[idx]; + if (c == null) { + c = new byte[CHUNK_DEPTH * SLOT_SIZE]; + cs[idx] = c; + } + return c; + } +} diff --git a/ddprof-lib/src/main/java/com/datadoghq/profiler/ThreadContext.java b/ddprof-lib/src/main/java/com/datadoghq/profiler/ThreadContext.java index ed699d57c..349b25c13 100644 --- a/ddprof-lib/src/main/java/com/datadoghq/profiler/ThreadContext.java +++ b/ddprof-lib/src/main/java/com/datadoghq/profiler/ThreadContext.java @@ -15,6 +15,7 @@ */ package com.datadoghq.profiler; +import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; @@ -28,7 +29,14 @@ */ public final class ThreadContext { private static final int MAX_CUSTOM_SLOTS = 10; + // Max UTF-8 byte length for a custom attribute value. Matches the 1-byte length + // field in the OTEP attrs_data entry header. Enforced up front in setContextAttribute + // so replaceOtepAttribute can assume the input always fits. + private static final int MAX_VALUE_BYTES = 255; private static final int OTEL_MAX_RECORD_SIZE = 640; + private static final int SIDECAR_SIZE = MAX_CUSTOM_SLOTS * Integer.BYTES + Long.BYTES; // 48 + // Package-private so ScopeStack can size its byte[] scratch. + static final int SNAPSHOT_SIZE = OTEL_MAX_RECORD_SIZE + SIDECAR_SIZE; // 688 private static final int LRS_OTEP_KEY_INDEX = 0; // LRS is always a fixed 16-hex-char value in attrs_data (zero-padded u64). // The entry header is 2 bytes (key_index + length), giving 18 bytes total. 
@@ -66,53 +74,56 @@ public final class ThreadContext { private final int attrsDataSizeOffset; private final int attrsDataOffset; private final int maxAttrsDataSize; - private final int lrsSidecarOffset; // localRootSpanId offset in sidecar + private final int lrsOffset; // localRootSpanId offset in the unified buffer + // Base offset of the tag-encoding sidecar within the unified buffer. Every tag slot i + // lives at ctxBuffer[tagEncodingsOffset + i * Integer.BYTES]. Equal to OTEL_MAX_RECORD_SIZE. + private static final int TAG_ENCODINGS_OFFSET = OTEL_MAX_RECORD_SIZE; - private final ByteBuffer recordBuffer; // 640 bytes, OtelThreadContextRecord - private final ByteBuffer sidecarBuffer; // tag encodings + LRS + // Single buffer spanning [OTEP record | tag_encodings | LRS] — 688 bytes contiguous. + // Used for per-field access AND for bulk snapshot/restore memcpy. Position state is + // thread-confined to snapshot/restore, which reset it before each bulk op. + private final ByteBuffer ctxBuffer; /** - * Creates a ThreadContext from the two DirectByteBuffers returned by native initializeContextTLS0. + * Creates a ThreadContext from the single DirectByteBuffer returned by native initializeContextTLS0. * - * @param recordBuffer 640-byte buffer over OtelThreadContextRecord - * @param sidecarBuffer buffer over tag encodings + local root span id - * @param metadata array with [VALID_OFFSET, TRACE_ID_OFFSET, SPAN_ID_OFFSET, - * ATTRS_DATA_SIZE_OFFSET, ATTRS_DATA_OFFSET, LRS_SIDECAR_OFFSET] + * @param ctxBuffer 688-byte unified buffer spanning record + tag_encodings + LRS + * @param metadata array with absolute offsets [VALID, TRACE_ID, SPAN_ID, + * ATTRS_DATA_SIZE, ATTRS_DATA, LRS] */ - public ThreadContext(ByteBuffer recordBuffer, ByteBuffer sidecarBuffer, long[] metadata) { - // Record buffer uses native order for uint16_t attrs_data_size (read by C as native uint16_t). 
+ public ThreadContext(ByteBuffer ctxBuffer, long[] metadata) { + // Uses native order for uint16_t attrs_data_size (read by C as native uint16_t). // trace_id/span_id are uint8_t[] arrays requiring big-endian — handled via Long.reverseBytes() // in setContextDirect(). Only little-endian platforms are supported. - this.recordBuffer = recordBuffer.order(ByteOrder.nativeOrder()); - this.sidecarBuffer = sidecarBuffer.order(ByteOrder.nativeOrder()); + this.ctxBuffer = ctxBuffer.order(ByteOrder.nativeOrder()); this.validOffset = (int) metadata[0]; this.traceIdOffset = (int) metadata[1]; this.spanIdOffset = (int) metadata[2]; this.attrsDataSizeOffset = (int) metadata[3]; this.attrsDataOffset = (int) metadata[4]; this.maxAttrsDataSize = OTEL_MAX_RECORD_SIZE - this.attrsDataOffset; - this.lrsSidecarOffset = (int) metadata[5]; + this.lrsOffset = (int) metadata[5]; if (ByteOrder.nativeOrder() != ByteOrder.LITTLE_ENDIAN) { throw new UnsupportedOperationException( "ByteBuffer context path requires little-endian platform"); } // Zero sidecar + record to prevent stale encodings from a previous profiler session. - // The native ProfiledThread survives across sessions, so the sidecar may hold + // The native ProfiledThread survives across sessions, so the buffer may hold // old tag encodings and the record may hold old attrs_data. for (int i = 0; i < MAX_CUSTOM_SLOTS; i++) { - this.sidecarBuffer.putInt(i * Integer.BYTES, 0); + this.ctxBuffer.putInt(TAG_ENCODINGS_OFFSET + i * Integer.BYTES, 0); } - this.sidecarBuffer.putLong(this.lrsSidecarOffset, 0); - this.recordBuffer.put(this.validOffset, (byte) 0); + this.ctxBuffer.putLong(this.lrsOffset, 0); + this.ctxBuffer.put(this.validOffset, (byte) 0); // Pre-initialize the fixed-size LRS entry at attrs_data[0..LRS_ENTRY_SIZE-1]: // key_index=0, length=16, value=16 zero hex bytes. // The entry is always present; updates overwrite only the 16 value bytes. 
- this.recordBuffer.put(this.attrsDataOffset, (byte) LRS_OTEP_KEY_INDEX); - this.recordBuffer.put(this.attrsDataOffset + 1, (byte) LRS_FIXED_VALUE_LEN); + this.ctxBuffer.put(this.attrsDataOffset, (byte) LRS_OTEP_KEY_INDEX); + this.ctxBuffer.put(this.attrsDataOffset + 1, (byte) LRS_FIXED_VALUE_LEN); for (int i = 0; i < LRS_FIXED_VALUE_LEN; i++) { - this.recordBuffer.put(this.attrsDataOffset + 2 + i, (byte) '0'); + this.ctxBuffer.put(this.attrsDataOffset + 2 + i, (byte) '0'); } - this.recordBuffer.putShort(this.attrsDataSizeOffset, (short) LRS_ENTRY_SIZE); + this.ctxBuffer.putShort(this.attrsDataSizeOffset, (short) LRS_ENTRY_SIZE); } /** @@ -120,15 +131,15 @@ public ThreadContext(ByteBuffer recordBuffer, ByteBuffer sidecarBuffer, long[] m * Reads directly from the OTEP record buffer (big-endian bytes → native long). */ public long getSpanId() { - return Long.reverseBytes(recordBuffer.getLong(spanIdOffset)); + return Long.reverseBytes(ctxBuffer.getLong(spanIdOffset)); } /** * Returns the current local root span ID. - * Reads directly from the sidecar buffer (native long). + * Reads directly from the LRS region of ctxBuffer (native long). 
*/ public long getRootSpanId() { - return sidecarBuffer.getLong(lrsSidecarOffset); + return ctxBuffer.getLong(lrsOffset); } /** @@ -171,7 +182,7 @@ public void clearContextAttribute(int keyIndex) { } int otepKeyIndex = keyIndex + 1; detach(); - sidecarBuffer.putInt(keyIndex * Integer.BYTES, 0); + ctxBuffer.putInt(TAG_ENCODINGS_OFFSET + keyIndex * Integer.BYTES, 0); removeOtepAttribute(otepKeyIndex); attach(); } @@ -179,7 +190,59 @@ public void clearContextAttribute(int keyIndex) { public void copyCustoms(int[] value) { int len = Math.min(value.length, MAX_CUSTOM_SLOTS); for (int i = 0; i < len; i++) { - value[i] = sidecarBuffer.getInt(i * Integer.BYTES); + value[i] = ctxBuffer.getInt(TAG_ENCODINGS_OFFSET + i * Integer.BYTES); + } + } + + /** + * Captures the full record + sidecar state into {@code scratch[offset..offset+SNAPSHOT_SIZE)}. + * Pair with {@link #restore} for nested-scope propagation. + * + *

The detach/memcpy/re-publish pair hides the bulk read from any signal handler going + * through {@code ContextApi::get} — while {@code valid=0}, sidecar reads are gated off. The + * pre-snapshot {@code valid} state is preserved in {@code scratch[offset + validOffset]} so + * {@link #restore} can replay it. If the record was already invalid (e.g. the all-zero clear + * path in {@link #setContextDirect} leaves {@code valid=0} with a stale {@code attrs_data_size} + * / {@code attrs_data}), the live buffer is left invalid after snapshot — re-publishing would + * expose a cleared-but-stale record. + */ + public void snapshot(byte[] scratch, int offset) { + byte priorValid = ctxBuffer.get(validOffset); + detach(); + // Cast to Buffer: ByteBuffer.position(int) only returns ByteBuffer since JDK 9 (covariant + // return). This source is compiled for Java 8 runtimes where the method lives on Buffer. + ((Buffer) ctxBuffer).position(0); + ctxBuffer.get(scratch, offset, SNAPSHOT_SIZE); + // Overwrite the valid byte in scratch (memcpy captured the post-detach 0) with the + // pre-snapshot value. restore() consults this to decide whether to re-attach. + scratch[offset + validOffset] = priorValid; + if (priorValid != 0) { + attach(); + } + } + + /** + * Restores a previously captured state. The detach/memcpy/conditional-attach pair hides the + * memcpy from readers going through {@link #ctxBuffer}'s valid flag ({@code ContextApi::get} + * in native code), which is the sole gate for sidecar reads (see {@code thread.h}). + * + *

The valid byte inside scratch is cleared to 0 for the duration of the memcpy so that + * even if the captured state had {@code valid=1}, the live buffer cannot transiently observe + * {@code valid=1} alongside partially-written fields. The captured value is restored into + * scratch after the memcpy so subsequent snapshot/restore cycles keep working, and + * {@link #attach} re-publishes only when the saved state was itself valid — matching the + * semantics of {@link #snapshot}. + */ + public void restore(byte[] scratch, int offset) { + int validIdx = offset + validOffset; + byte wasValid = scratch[validIdx]; + scratch[validIdx] = 0; + detach(); + ((Buffer) ctxBuffer).position(0); + ctxBuffer.put(scratch, offset, SNAPSHOT_SIZE); + if (wasValid != 0) { + scratch[validIdx] = wasValid; + attach(); } } @@ -198,10 +261,14 @@ public void copyCustoms(int[] value) { * request IDs, and other per-request-unique strings will exhaust the * Dictionary and cause attributes to be silently dropped. * + *

Value size limit. The UTF-8 encoding of {@code value} must fit in + * {@value #MAX_VALUE_BYTES} bytes (the OTEP attrs_data entry length field is one byte). + * Oversized values are rejected up front — they never reach the Dictionary or attrs_data. + * * @param keyIndex Index into the registered attribute key map (0-based) * @param value The string value for this attribute - * @return true if the attribute was set successfully, false if the - * Dictionary is full or the keyIndex is out of range + * @return true if the attribute was set successfully, false if the value is too long, + * the Dictionary is full, attrs_data overflows, or keyIndex is out of range */ public boolean setContextAttribute(int keyIndex, String value) { if (keyIndex < 0 || keyIndex >= MAX_CUSTOM_SLOTS || value == null) { @@ -221,11 +288,17 @@ private boolean setContextAttributeDirect(int keyIndex, String value) { int encoding; byte[] utf8; if (value.equals(attrCacheKeys[slot])) { + // Cache hit — the value was previously validated and cached; no re-check needed. encoding = attrCacheEncodings[slot]; utf8 = attrCacheBytes[slot]; } else { - // Cache miss: register in Dictionary, encode UTF-8, cache both. - // Allocates byte[] once per unique value; cached for reuse. + // Cache miss: encode UTF-8 and validate size BEFORE touching the Dictionary. + // Rejecting here avoids an orphan Dictionary entry (the native Dictionary is + // write-only for the JVM lifetime and cannot be undone). 
+ utf8 = value.getBytes(StandardCharsets.UTF_8); + if (utf8.length > MAX_VALUE_BYTES) { + return false; + } encoding = registerConstant0(value); if (encoding < 0) { // Dictionary full: clear sidecar AND remove the OTEP attrs_data entry @@ -233,7 +306,6 @@ private boolean setContextAttributeDirect(int keyIndex, String value) { clearContextAttribute(keyIndex); return false; } - utf8 = value.getBytes(StandardCharsets.UTF_8); attrCacheEncodings[slot] = encoding; attrCacheBytes[slot] = utf8; attrCacheKeys[slot] = value; @@ -243,8 +315,13 @@ private boolean setContextAttributeDirect(int keyIndex, String value) { // so a signal handler never sees a new sidecar encoding alongside old attrs_data. int otepKeyIndex = keyIndex + 1; detach(); - sidecarBuffer.putInt(keyIndex * Integer.BYTES, encoding); + ctxBuffer.putInt(TAG_ENCODINGS_OFFSET + keyIndex * Integer.BYTES, encoding); boolean written = replaceOtepAttribute(otepKeyIndex, utf8); + if (!written) { + // attrs_data overflow: the old entry was compacted out and the new one + // couldn't fit. Zero the sidecar so both views agree there is no value. + ctxBuffer.putInt(TAG_ENCODINGS_OFFSET + keyIndex * Integer.BYTES, 0); + } attach(); return written; } @@ -263,23 +340,23 @@ private void setContextDirect(long localRootSpanId, long spanId, long trHi, long } // Write trace_id (big-endian) + span_id (big-endian) - recordBuffer.putLong(traceIdOffset, Long.reverseBytes(trHi)); - recordBuffer.putLong(traceIdOffset + 8, Long.reverseBytes(trLo)); - recordBuffer.putLong(spanIdOffset, Long.reverseBytes(spanId)); + ctxBuffer.putLong(traceIdOffset, Long.reverseBytes(trHi)); + ctxBuffer.putLong(traceIdOffset + 8, Long.reverseBytes(trLo)); + ctxBuffer.putLong(spanIdOffset, Long.reverseBytes(spanId)); // Reset custom attribute state so the previous span's values don't leak // into this span. Callers set attributes again via setContextAttribute(). 
for (int i = 0; i < MAX_CUSTOM_SLOTS; i++) { - // i * Integer.BYTES: byte offset into sidecar buffer for int slot i - sidecarBuffer.putInt(i * Integer.BYTES, 0); + // offset into ctxBuffer for tag-encoding slot i + ctxBuffer.putInt(TAG_ENCODINGS_OFFSET + i * Integer.BYTES, 0); } // Reset attrs_data_size to contain only the fixed LRS entry, discarding // any custom attribute entries written during the previous span. - recordBuffer.putShort(attrsDataSizeOffset, (short) LRS_ENTRY_SIZE); + ctxBuffer.putShort(attrsDataSizeOffset, (short) LRS_ENTRY_SIZE); // Update LRS sidecar and OTEP attrs_data inside the detach/attach window so a // signal handler never sees the new LRS with old trace/span IDs. - sidecarBuffer.putLong(lrsSidecarOffset, localRootSpanId); + ctxBuffer.putLong(lrsOffset, localRootSpanId); writeLrsHex(localRootSpanId); attach(); @@ -301,14 +378,14 @@ private void setContextDirect(long localRootSpanId, long spanId, long trHi, long * readers until the next non-zero setContext call publishes it. 
*/ private void clearContextDirect() { - recordBuffer.putLong(traceIdOffset, 0); - recordBuffer.putLong(traceIdOffset + 8, 0); - recordBuffer.putLong(spanIdOffset, 0); + ctxBuffer.putLong(traceIdOffset, 0); + ctxBuffer.putLong(traceIdOffset + 8, 0); + ctxBuffer.putLong(spanIdOffset, 0); writeLrsHex(0); for (int i = 0; i < MAX_CUSTOM_SLOTS; i++) { - sidecarBuffer.putInt(i * Integer.BYTES, 0); + ctxBuffer.putInt(TAG_ENCODINGS_OFFSET + i * Integer.BYTES, 0); } - sidecarBuffer.putLong(lrsSidecarOffset, 0); + ctxBuffer.putLong(lrsOffset, 0); } /** @@ -319,7 +396,7 @@ private void clearContextDirect() { private void writeLrsHex(long val) { int base = attrsDataOffset + 2; // skip key_index byte + length byte for (int i = 15; i >= 0; i--) { - recordBuffer.put(base + i, HEX_DIGITS[(int)(val & 0xF)]); + ctxBuffer.put(base + i, HEX_DIGITS[(int)(val & 0xF)]); val >>>= 4; } } @@ -331,7 +408,7 @@ private void writeLrsHex(long val) { * clear path intentionally leaves the record invalid without calling attach(). */ private void detach() { - recordBuffer.put(validOffset, (byte) 0); + ctxBuffer.put(validOffset, (byte) 0); BUFFER_WRITER.storeFence(); } @@ -344,36 +421,39 @@ private void attach() { // Plain put is sufficient: signal handlers run on the same hardware thread, // so they observe stores in program order — no volatile needed for same-thread // visibility. The preceding storeFence() provides the release barrier. - recordBuffer.put(validOffset, (byte) 1); + ctxBuffer.put(validOffset, (byte) 1); } /** * Replace or insert an attribute in attrs_data. Record must be detached. * Writes the pre-encoded UTF-8 bytes into the record. + * + *

Caller contract: {@code utf8.length <= MAX_VALUE_BYTES}, enforced at the public + * entry point in {@link #setContextAttributeDirect}. */ private boolean replaceOtepAttribute(int otepKeyIndex, byte[] utf8) { int currentSize = compactOtepAttribute(otepKeyIndex); - int valueLen = Math.min(utf8.length, 255); + int valueLen = utf8.length; int entrySize = 2 + valueLen; if (currentSize + entrySize <= maxAttrsDataSize) { int base = attrsDataOffset + currentSize; - recordBuffer.put(base, (byte) otepKeyIndex); - recordBuffer.put(base + 1, (byte) valueLen); + ctxBuffer.put(base, (byte) otepKeyIndex); + ctxBuffer.put(base + 1, (byte) valueLen); for (int i = 0; i < valueLen; i++) { - recordBuffer.put(base + 2 + i, utf8[i]); + ctxBuffer.put(base + 2 + i, utf8[i]); } currentSize += entrySize; - recordBuffer.putShort(attrsDataSizeOffset, (short) currentSize); + ctxBuffer.putShort(attrsDataSizeOffset, (short) currentSize); return true; } - recordBuffer.putShort(attrsDataSizeOffset, (short) currentSize); + ctxBuffer.putShort(attrsDataSizeOffset, (short) currentSize); return false; } /** Remove an attribute from attrs_data by compacting. Record must be detached. */ private void removeOtepAttribute(int otepKeyIndex) { int currentSize = compactOtepAttribute(otepKeyIndex); - recordBuffer.putShort(attrsDataSizeOffset, (short) currentSize); + ctxBuffer.putShort(attrsDataSizeOffset, (short) currentSize); } /** @@ -384,13 +464,13 @@ private void removeOtepAttribute(int otepKeyIndex) { * so it is never 0. Index 0 is reserved for the fixed LRS entry. 
*/ private int compactOtepAttribute(int otepKeyIndex) { - int currentSize = recordBuffer.getShort(attrsDataSizeOffset) & 0xFFFF; + int currentSize = ctxBuffer.getShort(attrsDataSizeOffset) & 0xFFFF; int readPos = 0; int writePos = 0; boolean found = false; while (readPos + 2 <= currentSize) { - int k = recordBuffer.get(attrsDataOffset + readPos) & 0xFF; - int len = recordBuffer.get(attrsDataOffset + readPos + 1) & 0xFF; + int k = ctxBuffer.get(attrsDataOffset + readPos) & 0xFF; + int len = ctxBuffer.get(attrsDataOffset + readPos + 1) & 0xFF; if (readPos + 2 + len > currentSize) { currentSize = writePos; break; } if (k == otepKeyIndex) { found = true; @@ -398,8 +478,8 @@ private int compactOtepAttribute(int otepKeyIndex) { } else { if (found && writePos < readPos) { for (int i = 0; i < 2 + len; i++) { - recordBuffer.put(attrsDataOffset + writePos + i, - recordBuffer.get(attrsDataOffset + readPos + i)); + ctxBuffer.put(attrsDataOffset + writePos + i, + ctxBuffer.get(attrsDataOffset + readPos + i)); } } writePos += 2 + len; @@ -410,9 +490,15 @@ private int compactOtepAttribute(int otepKeyIndex) { } /** - * Reads a custom attribute value from attrs_data by key index. - * Scans the attrs_data entries and returns the UTF-8 string for the matching key. - * Intended for tests only. + * Reads a custom attribute value by key index by scanning {@code attrs_data}. + * + *

Test-only. The only caller is {@code TagContextTest}, which uses it via + * {@link JavaProfiler#getThreadContext()} to verify that writes to the OTEP record are + * observable after set / clear / span-reset cycles. No production path — neither the DD + * signal handler nor the OTEL eBPF reader — ever calls this method: the DD handler reads + * sidecar encoding IDs and the OTEL reader parses {@code attrs_data} directly from native + * memory. The per-call {@code byte[]} / {@code String} allocation is therefore acceptable; + * do not introduce a readback cache unless a real production consumer appears. * * @param keyIndex 0-based user key index (same as passed to setContextAttribute) * @return the attribute value string, or null if not set @@ -421,19 +507,23 @@ public String readContextAttribute(int keyIndex) { if (keyIndex < 0 || keyIndex >= MAX_CUSTOM_SLOTS) { return null; } + // valid=0 → record was detached or never published. No attrs_data to trust. + if (ctxBuffer.get(validOffset) == 0) { + return null; + } int otepKeyIndex = keyIndex + 1; - int size = recordBuffer.getShort(attrsDataSizeOffset) & 0xFFFF; + int size = ctxBuffer.getShort(attrsDataSizeOffset) & 0xFFFF; int pos = 0; while (pos + 2 <= size) { - int k = recordBuffer.get(attrsDataOffset + pos) & 0xFF; - int len = recordBuffer.get(attrsDataOffset + pos + 1) & 0xFF; + int k = ctxBuffer.get(attrsDataOffset + pos) & 0xFF; + int len = ctxBuffer.get(attrsDataOffset + pos + 1) & 0xFF; if (pos + 2 + len > size) { break; } if (k == otepKeyIndex) { byte[] bytes = new byte[len]; for (int i = 0; i < len; i++) { - bytes[i] = recordBuffer.get(attrsDataOffset + pos + 2 + i); + bytes[i] = ctxBuffer.get(attrsDataOffset + pos + 2 + i); } return new String(bytes, StandardCharsets.UTF_8); } @@ -450,7 +540,7 @@ public String readContextAttribute(int keyIndex) { public String readTraceId() { StringBuilder sb = new StringBuilder(32); for (int i = 0; i < 16; i++) { - int b = recordBuffer.get(traceIdOffset + i) & 0xFF; + int 
b = ctxBuffer.get(traceIdOffset + i) & 0xFF; sb.append((char) HEX_DIGITS[b >> 4]); sb.append((char) HEX_DIGITS[b & 0xF]); } diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/ScopeStackTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/ScopeStackTest.java new file mode 100644 index 000000000..5078181df --- /dev/null +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/ScopeStackTest.java @@ -0,0 +1,164 @@ +package com.datadoghq.profiler; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.Test; + +/** + * Pure-Java unit test for {@link ScopeStack}. Uses heap-backed {@link ByteBuffer}s so + * no native library is required. Exercises depth accounting, underflow, and round-trip + * preservation of trace/span IDs across fast-path and chunked-path depths. + */ +public class ScopeStackTest { + + // Offsets mirror OtelThreadContextRecord in otel_context.h and the sidecar layout + // built by initializeContextTLS0 in javaApi.cpp. These are spec-fixed; guarded by + // static_asserts in native code. All are absolute within the unified buffer. 
+ private static final int TRACE_ID_OFFSET = 0; + private static final int SPAN_ID_OFFSET = 16; + private static final int VALID_OFFSET = 24; + private static final int ATTRS_DATA_SIZE_OFFSET = 26; + private static final int ATTRS_DATA_OFFSET = 28; + private static final int LRS_OFFSET = 640 + 40; // after 640-byte record + 10 * sizeof(u32) + + private static ThreadContext newContext() { + ByteBuffer buf = ByteBuffer.allocate(ThreadContext.SNAPSHOT_SIZE).order(ByteOrder.nativeOrder()); + long[] metadata = { + VALID_OFFSET, TRACE_ID_OFFSET, SPAN_ID_OFFSET, + ATTRS_DATA_SIZE_OFFSET, ATTRS_DATA_OFFSET, LRS_OFFSET + }; + return new ThreadContext(buf, metadata); + } + + private static void assumeLittleEndian() { + Assumptions.assumeTrue( + ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN, + "ThreadContext only supports little-endian platforms"); + } + + @Test + public void depthBalance() { + assumeLittleEndian(); + ThreadContext ctx = newContext(); + ScopeStack stack = new ScopeStack(); + assertEquals(0, stack.depth()); + stack.enter(ctx); + assertEquals(1, stack.depth()); + stack.enter(ctx); + assertEquals(2, stack.depth()); + stack.exit(ctx); + assertEquals(1, stack.depth()); + stack.exit(ctx); + assertEquals(0, stack.depth()); + } + + @Test + public void exitUnderflowThrows() { + assumeLittleEndian(); + ThreadContext ctx = newContext(); + ScopeStack stack = new ScopeStack(); + assertThrows(IllegalStateException.class, () -> stack.exit(ctx)); + } + + @Test + public void fastPathRoundTrip() { + assumeLittleEndian(); + ThreadContext ctx = newContext(); + ScopeStack stack = new ScopeStack(); + + ctx.put(/*lrs*/ 100L, /*span*/ 200L, /*trHi*/ 0L, /*trLo*/ 300L); + assertEquals(200L, ctx.getSpanId()); + assertEquals(100L, ctx.getRootSpanId()); + + stack.enter(ctx); + ctx.put(500L, 600L, 0L, 700L); + assertEquals(600L, ctx.getSpanId()); + assertEquals(500L, ctx.getRootSpanId()); + + stack.exit(ctx); + assertEquals(200L, ctx.getSpanId(), "span must be restored"); + 
assertEquals(100L, ctx.getRootSpanId(), "root span must be restored"); + } + + @Test + public void chunkedPathRoundTrip() { + // Push past FAST_DEPTH (6) to exercise the lazy-chunk path and Arrays.copyOf growth. + assumeLittleEndian(); + ThreadContext ctx = newContext(); + ScopeStack stack = new ScopeStack(); + + final int depth = 20; // FAST_DEPTH + one full 12-slot chunk + 2 into the next + for (int i = 0; i < depth; i++) { + ctx.put(1000L + i, 2000L + i, 0L, 3000L + i); + stack.enter(ctx); + } + assertEquals(depth, stack.depth()); + + // Scramble state so restore has something to correct. + ctx.put(99L, 99L, 0L, 99L); + + for (int i = depth - 1; i >= 0; i--) { + stack.exit(ctx); + assertEquals(2000L + i, ctx.getSpanId(), "span mismatch at depth " + i); + assertEquals(1000L + i, ctx.getRootSpanId(), "root mismatch at depth " + i); + } + assertEquals(0, stack.depth()); + } + + @Test + public void reusesStackAfterFullUnwind() { + // After the stack returns to depth 0, re-entering must not leak state from the prior run. + assumeLittleEndian(); + ThreadContext ctx = newContext(); + ScopeStack stack = new ScopeStack(); + + ctx.put(1L, 2L, 0L, 3L); + stack.enter(ctx); + ctx.put(10L, 20L, 0L, 30L); + stack.exit(ctx); + assertEquals(2L, ctx.getSpanId()); + + ctx.put(4L, 5L, 0L, 6L); + stack.enter(ctx); + ctx.put(40L, 50L, 0L, 60L); + stack.exit(ctx); + assertEquals(5L, ctx.getSpanId()); + } + + @Test + public void snapshotOverClearedContextDoesNotRepublish() { + // Regression: snapshot() used to unconditionally re-attach, flipping valid back to 1 + // after a zero-put clear. The clear path leaves attrs_data_size / attrs_data stale and + // relies on valid=0 to keep external readers from seeing the stale bytes. Here we verify + // the valid byte directly since setContextAttribute is a native path unavailable to + // pure-Java tests. 
+ assumeLittleEndian(); + ByteBuffer buf = ByteBuffer.allocate(ThreadContext.SNAPSHOT_SIZE).order(ByteOrder.nativeOrder()); + long[] metadata = { + VALID_OFFSET, TRACE_ID_OFFSET, SPAN_ID_OFFSET, + ATTRS_DATA_SIZE_OFFSET, ATTRS_DATA_OFFSET, LRS_OFFSET + }; + ThreadContext ctx = new ThreadContext(buf, metadata); + ScopeStack stack = new ScopeStack(); + + ctx.put(1L, 2L, 0L, 3L); + assertEquals(1, buf.get(VALID_OFFSET), "record must be published after non-zero put"); + + // Zero-put clear: leaves valid=0 (the all-zero early-return in setContextDirect). + ctx.put(0L, 0L, 0L, 0L); + assertEquals(0, buf.get(VALID_OFFSET), "record must be invalid after zero-put clear"); + + stack.enter(ctx); + assertEquals(0, buf.get(VALID_OFFSET), + "snapshot must preserve valid=0 — not republish a cleared record"); + + stack.exit(ctx); + assertEquals(0, buf.get(VALID_OFFSET), + "restore must replay valid=0 — not republish a cleared record"); + } +} diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/context/TagContextTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/context/TagContextTest.java index 244865576..9b8fe4404 100644 --- a/ddprof-test/src/test/java/com/datadoghq/profiler/context/TagContextTest.java +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/context/TagContextTest.java @@ -1,8 +1,10 @@ package com.datadoghq.profiler.context; import java.util.Arrays; +import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; @@ -38,7 +40,12 @@ public void test() throws InterruptedException { registerCurrentThreadForWallClockProfiling(); ContextSetter contextSetter = new ContextSetter(profiler, Arrays.asList("tag1", "tag2", "tag1")); - String[] strings = IntStream.range(0, 10).mapToObj(String::valueOf).toArray(String[]::new); + // Use session-unique prefix so each @RetryingTest attempt registers fresh values in the + // native 
Dictionary. Without this, on musl (no JVM fork) the per-thread attrCacheKeys + // persists across retries: cache hits skip registerConstant0(), leaving + // dictionary_context_keys=0 on every retry after the first. + String pfx = Long.toHexString(System.nanoTime()) + "_"; + String[] strings = IntStream.range(0, 10).mapToObj(i -> pfx + i).toArray(String[]::new); for (int i = 0; i < strings.length * 10; i++) { work(contextSetter, "tag1", strings[i % strings.length]); } @@ -138,6 +145,100 @@ public void test() throws InterruptedException { } } + /** + * Reads the current value of {@code tag} via {@link ThreadContext#readContextAttribute} + * — the only readback path retained on the Java side (test-only). + */ + private String readTag(ContextSetter contextSetter, String tag) { + return profiler.getThreadContext().readContextAttribute(contextSetter.offsetOf(tag)); + } + + @Test + public void testSnapshotRestore() throws Exception { + // J9 does not initialize ThreadContext for non-profiled threads; skip. 
+ Assumptions.assumeTrue(!Platform.isJ9()); + registerCurrentThreadForWallClockProfiling(); + ContextSetter contextSetter = new ContextSetter(profiler, Arrays.asList("tag1", "tag2")); + + // Initially both slots are empty + assertNull(readTag(contextSetter, "tag1")); + assertNull(readTag(contextSetter, "tag2")); + + // Set a value and read it back + assertTrue(contextSetter.setContextValue("tag1", "before")); + assertEquals("before", readTag(contextSetter, "tag1")); + + // Snapshot the string, overwrite, then restore + String saved = readTag(contextSetter, "tag1"); + assertTrue(contextSetter.setContextValue("tag1", "inside")); + assertEquals("inside", readTag(contextSetter, "tag1")); + + // Restore via setContextValue + assertTrue(contextSetter.setContextValue("tag1", saved)); + assertEquals("before", readTag(contextSetter, "tag1")); + + // put/clear/put cycle: verify offset stability across state transitions + assertTrue(contextSetter.clearContextValue("tag1")); + assertNull(readTag(contextSetter, "tag1")); + assertTrue(contextSetter.setContextValue("tag1", "after")); + assertEquals("after", readTag(contextSetter, "tag1")); + + // tag2 was never set; readContextAttribute returns null + assertNull(readTag(contextSetter, "tag2")); + } + + @Test + public void testAttrsDataOverflow() throws Exception { + Assumptions.assumeTrue(!Platform.isJ9()); + registerCurrentThreadForWallClockProfiling(); + List<String> attrs = new ArrayList<>(); + for (int i = 1; i <= 10; i++) { + attrs.add("tag" + i); + } + ContextSetter contextSetter = new ContextSetter(profiler, attrs); + char[] chars = new char[255]; + java.util.Arrays.fill(chars, 'x'); + String bigValue = new String(chars); + int overflowIndex = -1; + for (int i = 1; i <= 10; i++) { + if (!contextSetter.setContextValue("tag" + i, bigValue)) { + overflowIndex = i; + break; + } + } + assertTrue(overflowIndex >= 0, "Expected at least one write to overflow attrs_data"); + assertNull(readTag(contextSetter, "tag" + overflowIndex), +
"Overflowed slot must read null — the entry never landed in attrs_data"); + } + + @Test + public void testPutClearsCustomSlots() throws Exception { + Assumptions.assumeTrue(!Platform.isJ9()); + registerCurrentThreadForWallClockProfiling(); + ContextSetter contextSetter = new ContextSetter(profiler, Arrays.asList("tag1", "tag2")); + + assertTrue(contextSetter.setContextValue("tag1", "before-put")); + assertEquals("before-put", readTag(contextSetter, "tag1")); + + // setContext() triggers setContextDirect which resets attrs_data_size to the LRS entry only, + // dropping all user attribute entries — so scanning attrs_data for tag1 returns null. + profiler.setContext(1L, 42L, 0L, 42L); + assertNull(readTag(contextSetter, "tag1"), "tag1 must be null after setContext resets attrs_data"); + } + + @Test + public void testCrossSlotIsolation() throws Exception { + Assumptions.assumeTrue(!Platform.isJ9()); + registerCurrentThreadForWallClockProfiling(); + ContextSetter contextSetter = new ContextSetter(profiler, Arrays.asList("tag1", "tag2")); + + assertTrue(contextSetter.setContextValue("tag1", "v1")); + assertTrue(contextSetter.setContextValue("tag2", "v2")); + assertTrue(contextSetter.clearContextValue("tag2")); + assertEquals("v1", readTag(contextSetter, "tag1")); + assertNull(readTag(contextSetter, "tag2")); + } + private void work(ContextSetter contextSetter, String contextAttribute, String contextValue) throws InterruptedException { assertTrue(contextSetter.setContextValue(contextAttribute, contextValue)); diff --git a/doc/architecture/TLSContext.md b/doc/architecture/TLSContext.md index 1d3e24c5c..0a17829dc 100644 --- a/doc/architecture/TLSContext.md +++ b/doc/architecture/TLSContext.md @@ -64,30 +64,32 @@ For benchmark data, see │ ┌───────────────────────────────────────────────────────────────┐ │ │ │ setContextDirect() │ │ │ │ 1. detach() — valid ← 0, storeFence │ │ -│ │ 2. 
recordBuffer.putLong(traceIdOffset, reverseBytes(trHi)) │ │ -│ │ recordBuffer.putLong(traceIdOffset+8, reverseBytes(trLo)) │ │ -│ │ recordBuffer.putLong(spanIdOffset, reverseBytes(spanId)) │ │ -│ │ 3. sidecar[0..9] ← 0 │ │ +│ │ 2. ctxBuffer.putLong(traceIdOffset, reverseBytes(trHi)) │ │ +│ │ ctxBuffer.putLong(traceIdOffset+8, reverseBytes(trLo)) │ │ +│ │ ctxBuffer.putLong(spanIdOffset, reverseBytes(spanId)) │ │ +│ │ 3. tag_encodings[0..9] ← 0 │ │ │ │ attrs_data_size ← LRS_ENTRY_SIZE (keeps fixed LRS at [0]) │ │ -│ │ 4. sidecarBuffer.putLong(lrsSidecarOffset, lrs) │ │ +│ │ 4. ctxBuffer.putLong(lrsOffset, lrs) │ │ │ │ writeLrsHex(lrs) — update fixed LRS entry in attrs_data │ │ │ │ 5. attach() — storeFence, valid ← 1 │ │ │ └───────────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ -│ ┌────────────────────────────────┐ ┌───────────────────────────┐ │ -│ │ OtelThreadContextRecord (640B) │ │ Sidecar buffer │ │ -│ │ ┌──────────────────────┐ │ │ ┌────────────────────────┐│ │ -│ │ │ trace_id[16] (BE) │ │ │ │ tag_encodings[10] (u32)││ │ -│ │ │ span_id[8] (BE) │ │ │ │ local_root_span_id(u64)││ │ -│ │ │ valid (u8) │ │ │ └────────────────────────┘│ │ -│ │ │ reserved (u8) │ │ └───────────────────────────┘ │ -│ │ │ attrs_data_size(u16) │ │ ▲ (DD signal handler) │ -│ │ │ attrs_data[612] │ │ ┌───────────────────────────┐ │ -│ │ └──────────────────────┘ │ │ TLS pointer (8B) │ │ -│ └────────────────────────────────┘ │ otel_thread_ctx_v1 │ │ -│ ▲ ▲ │ (thread_local, DLLEXPORT) │ │ -│ │ │ └───────────────────────────┘ │ +│ ┌──────────────────────────────────────────────────────────────┐ │ +│ │ Unified ctxBuffer (688B, single DirectByteBuffer) │ │ +│ │ ┌──────────────────────┐ ┌───────────────────────────┐ │ │ +│ │ │ OtelThreadContextRec │ │ tag_encodings[10] (u32) │ │ │ +│ │ │ trace_id[16] (BE) │ │ local_root_span_id (u64) │ │ │ +│ │ │ span_id[8] (BE) │ └───────────────────────────┘ │ │ +│ │ │ valid (u8)│ offsets 640..688 in ctxBuffer │ │ +│ │ │ reserved (u8)│ │ │ 
+│ │ │ attrs_data_size(u16)│ ┌──────────────────────────────┐ │ │ +│ │ │ attrs_data[612] │ │ TLS pointer (8B) │ │ │ +│ │ └──────────────────────┘ │ otel_thread_ctx_v1 │ │ │ +│ │ offsets 0..640 │ (thread_local, DLLEXPORT) │ │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +│ ▲ ▲ │ +│ │ │ │ │ DD signal handler External OTEP │ │ reads span_id profiler reads │ │ from record full record via │ @@ -104,14 +106,14 @@ For benchmark data, see │ │ │ JavaProfiler │ │ ├─ ThreadLocal tlsContextStorage │ -│ ├─ initializeContextTLS0(long[] metadata) → ByteBuffer[2] │ +│ ├─ initializeContextTLS0(long[] metadata) → ByteBuffer (688B) │ │ └─ registerConstant0(String value) → int encoding │ │ │ │ ThreadContext (per thread) │ -│ ├─ recordBuffer (640B DirectByteBuffer → OtelThreadContextRecord)│ -│ ├─ sidecarBuffer (DirectByteBuffer → tag encodings + LRS) │ +│ ├─ ctxBuffer (688B DirectByteBuffer — record + sidecar contiguous)│ │ ├─ put(lrs, spanId, trHi, trLo) → setContextDirect() │ │ ├─ setContextAttribute(keyIdx, value) → setContextAttributeDirect │ +│ ├─ snapshot(byte[], int) / restore(byte[], int) ← nested scopes │ │ └─ Per-thread caches: │ │ └─ attrCache[CACHE_SIZE]: String → {int encoding, byte[] utf8}│ │ │ @@ -216,22 +218,22 @@ a partially-written record. Java writer timeline: ────────────────────────────────────────────────────────────────── Time 0: detach() - recordBuffer.put(validOffset, 0) ← mark invalid + ctxBuffer.put(validOffset, 0) ← mark invalid storeFence() ← drain store buffer Time 1: Mutate record fields - recordBuffer.putLong(traceIdOffset, ...) - recordBuffer.putLong(spanIdOffset, ...) - sidecar[0..9] ← 0 ← zero tag encodings - attrs_data_size ← LRS_ENTRY_SIZE ← keep only fixed LRS entry at attrs_data[0] - sidecarBuffer.putLong(lrsSidecarOffset, lrs) ← update sidecar LRS - writeLrsHex(lrs) ← update LRS in attrs_data + ctxBuffer.putLong(traceIdOffset, ...) + ctxBuffer.putLong(spanIdOffset, ...) 
+ tag_encodings[0..9] ← 0 ← zero tag encodings (offsets 640..680) + attrs_data_size ← LRS_ENTRY_SIZE ← keep only fixed LRS entry at attrs_data[0] + ctxBuffer.putLong(lrsOffset, lrs) ← update LRS at offset 680 + writeLrsHex(lrs) ← update LRS hex entry in attrs_data ⚡ SIGPROF may arrive here — handler sees valid=0, skips record Time 2: attach() storeFence() ← ensure writes visible - recordBuffer.put(validOffset, 1) ← mark valid + ctxBuffer.put(validOffset, 1) ← mark valid ────────────────────────────────────────────────────────────────── ``` @@ -330,8 +332,8 @@ When a thread first accesses its `ThreadContext` via the `ThreadLocal`: ```java // JavaProfiler.initializeThreadContext() long[] metadata = new long[6]; -ByteBuffer[] buffers = initializeContextTLS0(metadata); -return new ThreadContext(buffers[0], buffers[1], metadata); +ByteBuffer buffer = initializeContextTLS0(metadata); +return new ThreadContext(buffer, metadata); ``` The native `initializeContextTLS0` (in `javaApi.cpp`): @@ -339,16 +341,20 @@ The native `initializeContextTLS0` (in `javaApi.cpp`): 1. Gets the calling thread's `ProfiledThread` (creates one if needed). 2. Sets `otel_thread_ctx_v1` permanently to the thread's `OtelThreadContextRecord` (triggering TLS slot init on musl). -3. Fills the `metadata` array with field offsets (computed via - `offsetof`), so Java code writes to the correct positions regardless - of struct packing changes. -4. Creates two `DirectByteBuffer`s mapped to: - - `_otel_ctx_record` (640 bytes) - - `_otel_tag_encodings` + `_otel_local_root_span_id` (48 bytes) -5. Returns the buffer array. +3. Fills the `metadata` array with absolute offsets into the unified + buffer (computed via `offsetof` for record fields; `OTEL_MAX_RECORD_SIZE + + DD_TAGS_CAPACITY*sizeof(u32) = 680` for the LRS offset), so Java code + writes to the correct positions regardless of struct packing changes. +4. 
Creates a single `DirectByteBuffer` spanning the contiguous 688-byte + region: `_otel_ctx_record` (640 B) followed immediately by + `_otel_tag_encodings` (40 B) and `_otel_local_root_span_id` (8 B). + Contiguity is enforced by `alignas(8)` on `_otel_ctx_record` plus + `sizeof(OtelThreadContextRecord)` being a multiple of 8. +5. Returns the single buffer. This is the only JNI call in the initialization path. After this, all -hot-path operations are pure Java ByteBuffer writes. +hot-path operations are pure Java ByteBuffer writes into offset regions +of the one buffer. ### Signal-Safe TLS Access @@ -395,7 +401,7 @@ if (value.equals(attrCacheKeys[slot])) { // Both sidecar and OTEP attrs_data are written inside the detach/attach window // so a signal handler never sees a new sidecar encoding alongside old attrs_data. detach(); -sidecarBuffer.putInt(keyIndex * 4, encoding); +ctxBuffer.putInt(TAG_ENCODINGS_OFFSET + keyIndex * 4, encoding); replaceOtepAttribute(otepKeyIndex, utf8); attach(); ```