//===-- CUDAPlatformDevice.cpp - CUDAPlatformDevice implementation --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Implementation of CUDAPlatformDevice.
///
//===----------------------------------------------------------------------===//

#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
#include "streamexecutor/PlatformDevice.h"

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"

#include "cuda.h"

namespace streamexecutor {
namespace cuda {

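// Returns Base advanced by Offset bytes. Constness is cast away so the result
// can be handed to driver calls that take mutable pointers.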
static void *offset(const void *Base, size_t Offset) {
  return const_cast<char *>(static_cast<const char *>(Base) + Offset);
}

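// Converts a CUDA driver API return code into an llvm::Error, attaching the
// driver-reported error name and description when they can be retrieved.
// Returns Error::success() for CUDA_SUCCESS.
//
// Typical use, as throughout this file:
//   if (CUresult Result = cuCtxSetCurrent(Context))
//     return CUresultToError(Result, "cuCtxSetCurrent");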
Error CUresultToError(int CUResult, const llvm::Twine &Message) {
  CUresult Result = static_cast<CUresult>(CUResult);
  if (Result == CUDA_SUCCESS)
    return Error::success();
  const char *ErrorName;
  if (cuGetErrorName(Result, &ErrorName))
    ErrorName = "UNKNOWN ERROR NAME";
  const char *ErrorString;
  if (cuGetErrorString(Result, &ErrorString))
    ErrorString = "UNKNOWN ERROR DESCRIPTION";
  return make_error("CUDA driver error: '" + Message + "', error code = " +
                    llvm::Twine(static_cast<int>(Result)) + ", name = " +
                    ErrorName + ", description = '" + ErrorString + "'");
}

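// Returns a human-readable name like "CUDA device 0: <driver-reported name>".
// Note that the name is computed once and cached in a function-local static,
// so in a process with multiple devices every instance reports the name
// computed for whichever device called this first.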
std::string CUDAPlatformDevice::getName() const {
  static std::string CachedName = [](int DeviceIndex) {
    static constexpr size_t MAX_DRIVER_NAME_BYTES = 1024;
    std::string Name = "CUDA device " + std::to_string(DeviceIndex);
    char NameFromDriver[MAX_DRIVER_NAME_BYTES];
    if (!cuDeviceGetName(NameFromDriver, MAX_DRIVER_NAME_BYTES - 1,
                         DeviceIndex)) {
      NameFromDriver[MAX_DRIVER_NAME_BYTES - 1] = '\0';
      Name.append(": ").append(NameFromDriver);
    }
    return Name;
  }(DeviceIndex);
  return CachedName;
}

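// Creates a CUDAPlatformDevice for the device at DeviceIndex. This retains
// the device's primary context and makes it current on the calling thread;
// the matching cuDevicePrimaryCtxRelease happens in the destructor.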
Expected<CUDAPlatformDevice> CUDAPlatformDevice::create(size_t DeviceIndex) {
  CUdevice DeviceHandle;
  if (CUresult Result = cuDeviceGet(&DeviceHandle, DeviceIndex))
    return CUresultToError(Result, "cuDeviceGet");

  CUcontext ContextHandle;
  if (CUresult Result = cuDevicePrimaryCtxRetain(&ContextHandle, DeviceHandle))
    return CUresultToError(Result, "cuDevicePrimaryCtxRetain");

  if (CUresult Result = cuCtxSetCurrent(ContextHandle))
    return CUresultToError(Result, "cuCtxSetCurrent");

  return CUDAPlatformDevice(DeviceIndex);
}

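// Move operations transfer ownership of the retained primary context by
// copying the device index and setting the source's index to -1 as a
// moved-from sentinel.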
CUDAPlatformDevice::CUDAPlatformDevice(CUDAPlatformDevice &&Other) noexcept
    : DeviceIndex(Other.DeviceIndex) {
  Other.DeviceIndex = -1;
}

CUDAPlatformDevice &CUDAPlatformDevice::
operator=(CUDAPlatformDevice &&Other) noexcept {
  DeviceIndex = Other.DeviceIndex;
  Other.DeviceIndex = -1;
  return *this;
}

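// Releases the primary context retained in create(). After a move, the
// moved-from object still runs this release with its -1 sentinel index; the
// resulting driver error is currently ignored (see the TODO below).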
CUDAPlatformDevice::~CUDAPlatformDevice() {
  CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex);
  (void)Result;
  // TODO(jhen): Log error.
}

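// Loads the PTX from Spec that matches this device's compute capability and
// resolves the named kernel function inside it. The returned opaque handle
// is the CUfunction for the loaded kernel.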
Expected<const void *>
CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) {
  // TODO(jhen): Maybe first check loaded modules?
  if (!Spec.hasCUDAPTXInMemory())
    return make_error("no CUDA code available to create kernel");

  CUdevice Device = static_cast<int>(DeviceIndex);
  int ComputeCapabilityMajor = 0;
  int ComputeCapabilityMinor = 0;
  if (CUresult Result = cuDeviceGetAttribute(
          &ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
          Device))
    return CUresultToError(
        Result,
        "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR");
  if (CUresult Result = cuDeviceGetAttribute(
          &ComputeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
          Device))
    return CUresultToError(
        Result,
        "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR");
  const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor,
                                                       ComputeCapabilityMinor);

  if (!Code)
    return make_error("no suitable CUDA source found for compute capability " +
                      llvm::Twine(ComputeCapabilityMajor) + "." +
                      llvm::Twine(ComputeCapabilityMinor));

  CUmodule Module;
  if (CUresult Result = cuModuleLoadData(&Module, Code))
    return CUresultToError(Result, "cuModuleLoadData");

  CUfunction Function;
  if (CUresult Result =
          cuModuleGetFunction(&Function, Module, Spec.getKernelName().c_str()))
    return CUresultToError(Result, "cuModuleGetFunction");

  // TODO(jhen): Should I save this function pointer in case someone asks for
  // it again?

  // TODO(jhen): Should I save the module pointer so I can unload it when I
  // destroy this device?

  return static_cast<const void *>(Function);
}

Error CUDAPlatformDevice::destroyKernel(const void *Handle) {
  // TODO(jhen): Maybe keep track of kernels for each module and unload the
  // module after they are all destroyed.
  return Error::success();
}

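// Creates a new CUDA stream with default (blocking) flags and returns it as
// an opaque handle.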
Expected<const void *> CUDAPlatformDevice::createStream() {
  CUstream Stream;
  if (CUresult Result = cuStreamCreate(&Stream, CU_STREAM_DEFAULT))
    return CUresultToError(Result, "cuStreamCreate");
  return Stream;
}

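// Destroys a stream previously returned by createStream().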
Error CUDAPlatformDevice::destroyStream(const void *Handle) {
  return CUresultToError(
      cuStreamDestroy(static_cast<CUstream>(const_cast<void *>(Handle))),
      "cuStreamDestroy");
}

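// Launches the kernel identified by PKernelHandle on the given stream.
// Shared-device-memory arguments are not forwarded to the driver as regular
// arguments; their sizes are summed into the dynamic shared memory byte count
// passed to cuLaunchKernel, and only the remaining argument addresses are
// forwarded.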
Error CUDAPlatformDevice::launch(
    const void *PlatformStreamHandle, BlockDimensions BlockSize,
    GridDimensions GridSize, const void *PKernelHandle,
    const PackedKernelArgumentArrayBase &ArgumentArray) {
  CUfunction Function =
      reinterpret_cast<CUfunction>(const_cast<void *>(PKernelHandle));
  CUstream Stream =
      reinterpret_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));

  auto Launch = [Function, Stream, BlockSize,
                 GridSize](size_t SharedMemoryBytes, void **ArgumentAddresses) {
    return CUresultToError(
        cuLaunchKernel(Function,                              //
                       GridSize.X, GridSize.Y, GridSize.Z,    //
                       BlockSize.X, BlockSize.Y, BlockSize.Z, //
                       SharedMemoryBytes, Stream, ArgumentAddresses, nullptr),
        "cuLaunchKernel");
  };

  void **ArgumentAddresses = const_cast<void **>(ArgumentArray.getAddresses());
  size_t SharedArgumentCount = ArgumentArray.getSharedCount();
  if (SharedArgumentCount) {
    // The argument handling in this case is not very efficient. We may need to
    // come back and optimize it later.
    //
    // Perhaps introduce another branch for the case where there is exactly one
    // shared memory argument and it is the first one. This is the only case
    // that will be used for compiler-generated CUDA kernels, and OpenCL users
    // can choose to take advantage of it by combining their dynamic shared
    // memory arguments and putting them first in the kernel signature.
    unsigned SharedMemoryBytes = 0;
    size_t ArgumentCount = ArgumentArray.getArgumentCount();
    llvm::SmallVector<void *, 16> NonSharedArgumentAddresses(
        ArgumentCount - SharedArgumentCount);
    size_t NonSharedIndex = 0;
    for (size_t I = 0; I < ArgumentCount; ++I)
      if (ArgumentArray.getType(I) == KernelArgumentType::SHARED_DEVICE_MEMORY)
        SharedMemoryBytes += ArgumentArray.getSize(I);
      else
        NonSharedArgumentAddresses[NonSharedIndex++] = ArgumentAddresses[I];
    return Launch(SharedMemoryBytes, NonSharedArgumentAddresses.data());
  }
  return Launch(0, ArgumentAddresses);
}

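// The next three methods enqueue asynchronous copies on the given stream.
// Device handles are the opaque pointers produced by allocateDeviceMemory;
// byte offsets are applied to both endpoints before calling the driver.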
Error CUDAPlatformDevice::copyD2H(const void *PlatformStreamHandle,
                                  const void *DeviceSrcHandle,
                                  size_t SrcByteOffset, void *HostDst,
                                  size_t DstByteOffset, size_t ByteCount) {
  return CUresultToError(
      cuMemcpyDtoHAsync(
          offset(HostDst, DstByteOffset),
          reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
          ByteCount,
          static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
      "cuMemcpyDtoHAsync");
}

Error CUDAPlatformDevice::copyH2D(const void *PlatformStreamHandle,
                                  const void *HostSrc, size_t SrcByteOffset,
                                  const void *DeviceDstHandle,
                                  size_t DstByteOffset, size_t ByteCount) {
  return CUresultToError(
      cuMemcpyHtoDAsync(
          reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
          offset(HostSrc, SrcByteOffset), ByteCount,
          static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
      "cuMemcpyHtoDAsync");
}

Error CUDAPlatformDevice::copyD2D(const void *PlatformStreamHandle,
                                  const void *DeviceSrcHandle,
                                  size_t SrcByteOffset,
                                  const void *DeviceDstHandle,
                                  size_t DstByteOffset, size_t ByteCount) {
  return CUresultToError(
      cuMemcpyDtoDAsync(
          reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
          reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
          ByteCount,
          static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
      "cuMemcpyDtoDAsync");
}

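// Blocks the calling host thread until all work queued on the given stream
// has completed.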
Error CUDAPlatformDevice::blockHostUntilDone(const void *PlatformStreamHandle) {
  return CUresultToError(cuStreamSynchronize(static_cast<CUstream>(
                             const_cast<void *>(PlatformStreamHandle))),
                         "cuStreamSynchronize");
}

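// Allocates ByteCount bytes of device memory and returns the device pointer
// as an opaque host-side handle.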
Expected<void *> CUDAPlatformDevice::allocateDeviceMemory(size_t ByteCount) {
  CUdeviceptr Pointer;
  if (CUresult Result = cuMemAlloc(&Pointer, ByteCount))
    return CUresultToError(Result, "cuMemAlloc");
  return reinterpret_cast<void *>(Pointer);
}

Error CUDAPlatformDevice::freeDeviceMemory(const void *Handle) {
  return CUresultToError(cuMemFree(reinterpret_cast<CUdeviceptr>(Handle)),
                         "cuMemFree");
}

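// Page-locks (pins) an existing host allocation so the driver can perform
// asynchronous DMA transfers to and from it.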
Error CUDAPlatformDevice::registerHostMemory(void *Memory, size_t ByteCount) {
  return CUresultToError(cuMemHostRegister(Memory, ByteCount, 0u),
                         "cuMemHostRegister");
}

Error CUDAPlatformDevice::unregisterHostMemory(const void *Memory) {
  return CUresultToError(cuMemHostUnregister(const_cast<void *>(Memory)),
                         "cuMemHostUnregister");
}

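// The remaining copy methods are synchronous: they return only after the
// copy has completed.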
Error CUDAPlatformDevice::synchronousCopyD2H(const void *DeviceSrcHandle,
                                             size_t SrcByteOffset,
                                             void *HostDst,
                                             size_t DstByteOffset,
                                             size_t ByteCount) {
  return CUresultToError(cuMemcpyDtoH(offset(HostDst, DstByteOffset),
                                      reinterpret_cast<CUdeviceptr>(offset(
                                          DeviceSrcHandle, SrcByteOffset)),
                                      ByteCount),
                         "cuMemcpyDtoH");
}

Error CUDAPlatformDevice::synchronousCopyH2D(const void *HostSrc,
                                             size_t SrcByteOffset,
                                             const void *DeviceDstHandle,
                                             size_t DstByteOffset,
                                             size_t ByteCount) {
  return CUresultToError(
      cuMemcpyHtoD(
          reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
          offset(HostSrc, SrcByteOffset), ByteCount),
      "cuMemcpyHtoD");
}

Error CUDAPlatformDevice::synchronousCopyD2D(const void *DeviceDstHandle,
                                             size_t DstByteOffset,
                                             const void *DeviceSrcHandle,
                                             size_t SrcByteOffset,
                                             size_t ByteCount) {
  return CUresultToError(
      cuMemcpyDtoD(
          reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
          reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
          ByteCount),
      "cuMemcpyDtoD");
}

} // namespace cuda
} // namespace streamexecutor