[core] improve error handling when wake up from sleep mode (#12981)
Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
parent
29f1d47e73
commit
59fff4a01a
@ -12,14 +12,20 @@ extern "C" {
|
|||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
|
|
||||||
|
char error_msg[10240]; // 10KB buffer to store error messages
|
||||||
|
CUresult no_error = CUresult(0);
|
||||||
|
CUresult error_code = no_error; // store error code
|
||||||
|
|
||||||
#define CUDA_CHECK(condition) \
|
#define CUDA_CHECK(condition) \
|
||||||
do { \
|
do { \
|
||||||
CUresult error = condition; \
|
CUresult error = condition; \
|
||||||
if (error != 0) { \
|
if (error != 0) { \
|
||||||
|
error_code = error; \
|
||||||
char* error_string; \
|
char* error_string; \
|
||||||
cuGetErrorString(error, (const char**)&error_string); \
|
cuGetErrorString(error, (const char**)&error_string); \
|
||||||
std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \
|
snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \
|
||||||
<< __LINE__ << std::endl; \
|
error_string, __FILE__, __LINE__); \
|
||||||
|
std::cerr << error_msg << std::endl; \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
@ -54,14 +60,22 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
|
|||||||
|
|
||||||
// Allocate memory using cuMemCreate
|
// Allocate memory using cuMemCreate
|
||||||
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
|
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
|
||||||
|
if (error_code != 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0));
|
CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0));
|
||||||
|
if (error_code != 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
CUmemAccessDesc accessDesc = {};
|
CUmemAccessDesc accessDesc = {};
|
||||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||||
accessDesc.location.id = device;
|
accessDesc.location.id = device;
|
||||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||||
|
|
||||||
CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1));
|
CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1));
|
||||||
|
if (error_code != 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
// std::cout << "create_and_map: device=" << device << ", size=" << size << ",
|
// std::cout << "create_and_map: device=" << device << ", size=" << size << ",
|
||||||
// d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
|
// d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
|
||||||
}
|
}
|
||||||
@ -73,7 +87,13 @@ void unmap_and_release(unsigned long long device, ssize_t size,
|
|||||||
// ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
|
// ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
|
||||||
ensure_context(device);
|
ensure_context(device);
|
||||||
CUDA_CHECK(cuMemUnmap(d_mem, size));
|
CUDA_CHECK(cuMemUnmap(d_mem, size));
|
||||||
|
if (error_code != 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
CUDA_CHECK(cuMemRelease(*p_memHandle));
|
CUDA_CHECK(cuMemRelease(*p_memHandle));
|
||||||
|
if (error_code != 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject* create_tuple_from_c_integers(unsigned long long a,
|
PyObject* create_tuple_from_c_integers(unsigned long long a,
|
||||||
@ -121,12 +141,16 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
|
|||||||
size_t granularity;
|
size_t granularity;
|
||||||
CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
|
CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
|
||||||
CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
||||||
|
if (error_code != 0) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
|
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
|
||||||
|
|
||||||
CUdeviceptr d_mem;
|
CUdeviceptr d_mem;
|
||||||
CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));
|
CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));
|
||||||
|
if (error_code != 0) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
// allocate the CUmemGenericAllocationHandle
|
// allocate the CUmemGenericAllocationHandle
|
||||||
CUmemGenericAllocationHandle* p_memHandle =
|
CUmemGenericAllocationHandle* p_memHandle =
|
||||||
(CUmemGenericAllocationHandle*)malloc(
|
(CUmemGenericAllocationHandle*)malloc(
|
||||||
@ -208,6 +232,9 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) {
|
|||||||
|
|
||||||
// free address and the handle
|
// free address and the handle
|
||||||
CUDA_CHECK(cuMemAddressFree(d_mem, size));
|
CUDA_CHECK(cuMemAddressFree(d_mem, size));
|
||||||
|
if (error_code != 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
free(p_memHandle);
|
free(p_memHandle);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -258,6 +285,12 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
|
|||||||
|
|
||||||
unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
|
unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
|
||||||
|
|
||||||
|
if (error_code != 0) {
|
||||||
|
error_code = no_error;
|
||||||
|
PyErr_SetString(PyExc_RuntimeError, error_msg);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -282,6 +315,12 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
|
|||||||
|
|
||||||
create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
|
create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
|
||||||
|
|
||||||
|
if (error_code != 0) {
|
||||||
|
error_code = no_error;
|
||||||
|
PyErr_SetString(PyExc_RuntimeError, error_msg);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
@ -9,6 +10,32 @@ from vllm.utils import GiB_bytes
|
|||||||
from ..utils import fork_new_process_for_each_test
|
from ..utils import fork_new_process_for_each_test
|
||||||
|
|
||||||
|
|
||||||
|
@fork_new_process_for_each_test
|
||||||
|
def test_python_error():
|
||||||
|
"""
|
||||||
|
Test if Python error occurs when there's low-level
|
||||||
|
error happening from the C++ side.
|
||||||
|
"""
|
||||||
|
allocator = CuMemAllocator.get_instance()
|
||||||
|
total_bytes = torch.cuda.mem_get_info()[1]
|
||||||
|
alloc_bytes = int(total_bytes * 0.7)
|
||||||
|
tensors = []
|
||||||
|
with allocator.use_memory_pool():
|
||||||
|
# allocate 70% of the total memory
|
||||||
|
x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
|
||||||
|
tensors.append(x)
|
||||||
|
# release the memory
|
||||||
|
allocator.sleep()
|
||||||
|
|
||||||
|
# allocate more memory than the total memory
|
||||||
|
y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
|
||||||
|
tensors.append(y)
|
||||||
|
with pytest.raises(RuntimeError):
|
||||||
|
# when the allocator is woken up, it should raise an error
|
||||||
|
# because we don't have enough memory
|
||||||
|
allocator.wake_up()
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_basic_cumem():
|
def test_basic_cumem():
|
||||||
# some tensors from default memory pool
|
# some tensors from default memory pool
|
||||||
|
Loading…
x
Reference in New Issue
Block a user