Is this a duplicate?
Type of Bug
Performance
Component
cuda.bindings
Describe the bug
When using cuda.bindings.nvrtc to compile code from multiple threads, the threads effectively run sequentially. This appears to be a GIL issue, since NVRTC itself supports multi-threaded compilation.
How to Reproduce
import cuda.bindings.nvrtc as nvrtc
import concurrent.futures
import threading
import time
# Number of concurrent compilation tasks per executor.
num_workers = 8

# Build one (executor, label) pair per executor flavor so the benchmark
# below can compare thread-based vs process-based parallelism.
_executor_kinds = (
    (concurrent.futures.ThreadPoolExecutor, "thread"),
    (concurrent.futures.ProcessPoolExecutor, "process"),
)
pools = [(cls(max_workers=num_workers), label) for cls, label in _executor_kinds]
def task(start_time):
    """Compile a deliberately heavy CUDA kernel with NVRTC, logging timings.

    Args:
        start_time: wall-clock reference (``time.time()``) used to print
            start/end timestamps relative to the benchmark's origin.

    Raises:
        RuntimeError: if program creation or compilation reports a
            non-success NVRTC result (the original repro ignored both
            return codes, which hides failures such as a missing arch).
    """
    # Kernel body is intentionally long so each compilation takes a
    # measurable amount of time; the braces are doubled because this is
    # an f-string (kept from the original repro).
    code = f"""
extern "C" __global__
void test(float a, float *x, float *y, float *out, size_t n)
{{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {{
        float val = x[tid];
        for (int i = 0; i < 1000; ++i) {{
            val = val * a + y[tid];
            val = sinf(val) * cosf(y[tid] + (float)i);
            val = val / (1.0f + sqrtf(val * val + (float)i));
        }}
        for (int i = 0; i < 1000; ++i) {{
            val = val + a + y[tid];
            val = sinf(val) * cosf(y[tid] + (float)i);
            val = val / (1.0f + sqrtf(val * val + (float)i));
        }}
        for (int i = 0; i < 1000; ++i) {{
            float divisor_a = a;
            if (fabsf(divisor_a) < 1e-7f) {{
                divisor_a = copysignf(1e-7f, a);
            }}
            val = val / divisor_a + y[tid];
            val = sinf(val) * cosf(y[tid] + (float)i);
            val = val * (1.0f + sqrtf(val * val + (float)i));
        }}
        for (int j = 0; j < 40; ++j) {{
            val = val * sinf(((float)j + 1.0f) * 0.1f + y[tid] * a) + cosf(val - ((float)j + 1.0f) * 0.05f);
            val = sqrtf(fabsf(val) + 0.1f + ((float)j + 1.0f) * 0.01f) / (0.1f + fabsf(y[tid] - ((float)j + 1.0f) * 0.2f + a));
            val = val + expf(0.001f * (val - a + y[tid] - ((float)j + 1.0f) * 0.15f));
            val = logf(fabsf(val) + 0.1f + fabsf(a) + ((float)j + 1.0f) * 0.02f) * tanhf(y[tid] / (1.0f + fabsf(val) + ((float)j + 1.0f) * 0.03f));
            val = val - a * cosf(val + ((float)j + 1.0f) * 0.08f) + y[tid] * sinf(((float)j + 1.0f) * 0.03f - val);
        }}
        out[tid] = val;
    }}
}}
"""
    current_time_start = time.time()
    print(f"[{threading.get_native_id()}] Start at {current_time_start-start_time}")

    # Original repro discarded the result code; check it so failures are loud.
    err, program = nvrtc.nvrtcCreateProgram(bytes(code, "utf-8"), bytes("test.cu", "utf-8"), 0, [], [])
    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
        raise RuntimeError(f"nvrtcCreateProgram failed: {err}")
    try:
        # NOTE(review): sm_90 is hard-coded; adjust to the local GPU arch
        # if reproducing on different hardware.
        options = [b"--gpu-architecture=sm_90"]
        (err,) = nvrtc.nvrtcCompileProgram(program, len(options), options)
        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError(f"nvrtcCompileProgram failed: {err}")
    finally:
        # The original leaked the program handle; always release it.
        nvrtc.nvrtcDestroyProgram(program)

    current_time_end = time.time()
    print(f"[{threading.get_native_id()}] End at {current_time_end-start_time}")
# Run the same batch of compilations on each executor flavor and wait for
# completion; `future.result()` re-raises any exception from a worker.
for pool, name in pools:
    print(f"Using {name} pool")
    start_time = time.time()
    futures = [pool.submit(task, start_time) for _ in range(num_workers)]
    for future in concurrent.futures.as_completed(futures):
        future.result()
    # Original repro never shut the executors down; release their workers.
    pool.shutdown()
A sample output is as follows:
Using thread pool
[237132] Start at 0.0005047321319580078
[237133] Start at 0.01818108558654785
[237134] Start at 0.8023033142089844
[237132] End at 0.802753210067749
[237177] Start at 0.8028905391693115
[237132] Start at 1.0601465702056885
[237134] End at 1.0604710578918457
[237132] End at 1.3186068534851074
[237208] Start at 1.3188414573669434
[237134] Start at 1.3195312023162842
[237177] End at 1.5766639709472656
[237178] Start at 1.3193464279174805
[237133] End at 1.0606462955474854
[237208] End at 1.8349909782409668
[237134] End at 2.3467905521392822
[237178] End at 2.3475685119628906
Using process pool
[237225] Start at 0.025279998779296875
[237226] Start at 0.025611162185668945
[237227] Start at 0.025903940200805664
[237228] Start at 0.026208162307739258
[237229] Start at 0.026407718658447266
[237230] Start at 0.026773691177368164
[237231] Start at 0.027005672454833984
[237232] Start at 0.027050018310546875
[237229] End at 0.27670884132385254
[237232] End at 0.28432679176330566
[237228] End at 0.28521203994750977
[237227] End at 0.28534388542175293
[237226] End at 0.28702807426452637
[237231] End at 0.28710436820983887
[237230] End at 0.28752636909484863
[237225] End at 0.28855133056640625
Which clearly shows that when using ThreadPoolExecutor, cuda.bindings.nvrtc is affected by GIL and runs sequentially.
Expected behavior
cuda.bindings.nvrtc should not be affected by GIL and should run in parallel when used in multiple threads.
Operating System
Ubuntu Linux 24.04
nvidia-smi output
No response
Is this a duplicate?
Type of Bug
Performance
Component
cuda.bindings
Describe the bug
When using cuda.bindings.nvrtc to compile code from multiple threads, the threads effectively run sequentially. This appears to be a GIL issue, since NVRTC itself supports multi-threaded compilation.
How to Reproduce
A sample output is as follows:
Which clearly shows that when using ThreadPoolExecutor, cuda.bindings.nvrtc is affected by the GIL and runs sequentially.
Expected behavior
cuda.bindings.nvrtc should not be affected by the GIL and should run in parallel when used from multiple threads.
Operating System
Ubuntu Linux 24.04
nvidia-smi output
No response