Bug report
Bug description:
While familiarizing myself with the state of the consts, for the purpose of #148972 (comment), I've noticed that there's thread_count in the format:
|
memcpy(&thread_count, &data[HDR_OFF_THREADS], HDR_SIZE_THREADS); |
That said, it seems that reader_get_or_create_thread_state in the BinaryReader just ignores it:
|
/* Get or create reader thread state for stack reconstruction */ |
|
static ReaderThreadState * |
|
reader_get_or_create_thread_state(BinaryReader *reader, uint64_t thread_id, |
|
uint32_t interpreter_id) |
|
{ |
|
/* Search existing threads (key is thread_id + interpreter_id) */ |
|
for (size_t i = 0; i < reader->thread_state_count; i++) { |
|
if (reader->thread_states[i].thread_id == thread_id && |
|
reader->thread_states[i].interpreter_id == interpreter_id) { |
|
return &reader->thread_states[i]; |
|
} |
|
} |
|
|
|
if (!reader->thread_states) { |
|
reader->thread_state_capacity = 16; |
|
reader->thread_states = PyMem_Calloc(reader->thread_state_capacity, sizeof(ReaderThreadState)); |
|
if (!reader->thread_states) { |
|
PyErr_NoMemory(); |
|
return NULL; |
|
} |
|
} else if (reader->thread_state_count >= reader->thread_state_capacity) { |
|
ReaderThreadState *new_states = grow_array(reader->thread_states, |
|
&reader->thread_state_capacity, |
|
sizeof(ReaderThreadState)); |
|
if (!new_states) { |
|
return NULL; |
|
} |
|
reader->thread_states = new_states; |
|
} |
|
|
|
ReaderThreadState *ts = &reader->thread_states[reader->thread_state_count]; |
|
memset(ts, 0, sizeof(ReaderThreadState)); |
|
ts->thread_id = thread_id; |
|
ts->interpreter_id = interpreter_id; |
|
ts->prev_timestamp = reader->start_time_us; |
|
ts->current_stack_capacity = MAX_STACK_DEPTH; |
|
ts->current_stack = PyMem_Malloc(ts->current_stack_capacity * sizeof(uint32_t)); |
|
if (!ts->current_stack) { |
|
PyErr_NoMemory(); |
|
return NULL; |
|
} |
|
// Increment count only after successful allocation to avoid |
|
// leaving a half-initialized entry visible to future lookups |
|
reader->thread_state_count++; |
|
return ts; |
|
} |
I believe that an evil file with a lot of (thread_id, interpreter_id) could effectively DoS the reader.
Reproduction
import struct
import tempfile
import resource
from _remote_debugging import (
BinaryReader,
FrameInfo,
InterpreterInfo,
LocationInfo,
ThreadInfo,
)
from profiling.sampling.binary_collector import BinaryCollector
# Thanks Claude for this idea...
class ThreadCountingSink:
def __init__(self):
self.unique_threads = set()
def collect(self, sample, timestamps):
for interp in sample:
for thread in interp.threads:
self.unique_threads.add(
(interp.interpreter_id, thread.thread_id)
)
N = 100_000
frame = FrameInfo(("x.py", LocationInfo((1, 1, 0, 0)), "f", None))
threads = [ThreadInfo((i, 0, [frame])) for i in range(N)]
with tempfile.NamedTemporaryFile(suffix=".bin") as f:
writer = BinaryCollector(f.name, 1000, compression="none")
writer.collect([InterpreterInfo((0, threads))], timestamp_us=1000)
writer.export(None)
# Introduce a lie. :-)
with open(f.name, "r+b") as raw:
raw.seek(32)
raw.write(struct.pack("<I", 1))
print("Reading...")
reader = BinaryReader(f.name)
before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
sink = ThreadCountingSink()
samples = reader.replay(sink)
after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print(reader.get_info()["thread_count"])
print(f"{len(sink.unique_threads)} unique (interp_id, thread_id) pairs")
print(f"RSS diff: {(after - before) / 1024:.1f} MB")
print(f"samples: {samples}")
2026-04-26T13:47:07.878443000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (main 6d4ca16) % ./python.exe bye-sweet-prince.py
Reading...
1
100000 unique (interp_id, thread_id) pairs
RSS diff: 20528.0 MB
samples: 100000
(20528.0 MB is higher than I expected...)
Solution
I think the fix is either a preallocation or a simple if (reader->thread_state_count >= reader->thread_count).
Truth to be told, something like MAX_THREADS might be warranted there, too:
|
const size_t MAX_THREADS = 8192; |
Another obvious angle: check other values in the header.
CPython versions tested on:
CPython main branch
Operating systems tested on:
macOS
Bug report
Bug description:
While familiarizing myself with the state of the consts, for the purpose of #148972 (comment), I've noticed that there's
thread_countin the format:cpython/Modules/_remote_debugging/binary_io.h
Line 314 in 6d4ca16
cpython/Modules/_remote_debugging/binary_io_reader.c
Line 95 in 6d4ca16
That said, it seems that
reader_get_or_create_thread_statein theBinaryReaderjust ignores it:cpython/Modules/_remote_debugging/binary_io_reader.c
Lines 553 to 598 in 6d4ca16
I believe that an evil file with a lot of
(thread_id, interpreter_id)could effectively DoS the reader.Reproduction
(20528.0 MB is higher than I expected...)
Solution
I think the fix is either a preallocation or a simple
if (reader->thread_state_count >= reader->thread_count).Truth to be told, something like
MAX_THREADSmight be warranted there, too:cpython/Modules/_remote_debugging/threads.c
Line 32 in 6d4ca16
Another obvious angle: check other values in the header.
CPython versions tested on:
CPython main branch
Operating systems tested on:
macOS