How MNN Uses float16 on the GPU

I noticed that when MNN runs on the GPU through OpenCL, it defaults to the float16 format, which leads to different timings when benchmarking a model, as shown in the figure below. So I dug into the MNN source code and found a few interesting things.

(Figure: Snipaste_2024-03-11_16-44-19.png, model benchmark timings)

When MNN is configured with MNN::BackendConfig::Precision_Low, it decides whether to use the float16 data format based on what the GPU actually supports. The relevant check is in the OpenCLRuntime.cpp snippet attached at the end of this post.
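
As a quick illustration, here is a minimal sketch of how a caller requests this mode through MNN's public Interpreter / ScheduleConfig / BackendConfig API; the model path is a placeholder, and the shared_ptr deleter assumes a recent MNN release that provides Interpreter::destroy.

```cpp
#include <memory>
#include <MNN/Interpreter.hpp>
#include <MNN/MNNForwardType.h>

int main() {
    // "model.mnn" is a placeholder path for illustration.
    std::shared_ptr<MNN::Interpreter> net(
        MNN::Interpreter::createFromFile("model.mnn"), MNN::Interpreter::destroy);

    MNN::ScheduleConfig config;
    config.type = MNN_FORWARD_OPENCL;   // run on the OpenCL backend

    MNN::BackendConfig backendConfig;
    // Precision_Low allows fp16 when the device reports half-float support;
    // Precision_High keeps every OpenCL buffer/image in fp32.
    backendConfig.precision = MNN::BackendConfig::Precision_Low;
    config.backendConfig = &backendConfig;

    auto session = net->createSession(config);
    // ... fill inputs, net->runSession(session), read outputs ...
    return 0;
}
```

Switching the same field to Precision_High forces fp32 regardless of device support, which is what makes the two benchmark configurations comparable.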

An exported model can store its weights as either float32 or float16. When the weights are transferred to the GPU, the format is converted on the fly; in the ConvBufExecution.cpp code attached below, this happens in the copy loop of setConv1x1WeightBuffer, where each weight is cast to half_float::half whenever isSupportedFP16() returns true.
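
Ignoring the channel re-packing, the dtype side of that conversion is just an element-wise cast through the bundled half_float library; the sketch below (packWeights is a hypothetical helper, not an MNN function) shows the idea.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
#include "half.hpp"   // half_float::half, the third-party header MNN bundles

// Hypothetical helper: convert fp32 weights into the byte layout that will be
// uploaded to the GPU, choosing fp16 or fp32 the same way setConv1x1WeightBuffer
// does via isSupportedFP16().
std::vector<uint8_t> packWeights(const float* src, size_t count, bool deviceSupportsFP16) {
    if (deviceSupportsFP16) {
        std::vector<uint8_t> out(count * sizeof(half_float::half));
        auto dst = reinterpret_cast<half_float::half*>(out.data());
        for (size_t i = 0; i < count; ++i) {
            dst[i] = (half_float::half)src[i];   // fp32 -> fp16 (may lose precision)
        }
        return out;
    }
    std::vector<uint8_t> out(count * sizeof(float));
    std::memcpy(out.data(), src, count * sizeof(float));
    return out;
}
```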

The OpenCL buffers referenced by the tensors stored in the Pipeline still hold float16 data. However, the implementations of OpenCLBackend::onMapTensor and OpenCLBackend::onUnmapTensor guarantee that the host-side memory you get before and after mapping is float32. OpenCLBackend::onAcquire determines the format of the buffer backing a tensor: it calls isSupportedFP16() to check whether float16 is currently enabled and, if so, allocates the buffer with the float16 element size.
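
In other words, the padded element count is the same in both cases and only the element width changes, so an fp16 buffer takes half the bytes. A minimal sketch of that sizing rule (clBufferBytes is a made-up name for illustration; the real code multiplies its padded size by sizeof(half_float::half) or sizeof(float)):

```cpp
#include <cstddef>
#include <cstdint>

// Sketch of the sizing decision in OpenCLBackend::onAcquire (buffer mode).
size_t clBufferBytes(size_t paddedElementCount, bool supportsFP16) {
    const size_t elementBytes = supportsFP16 ? sizeof(uint16_t) /* half */ : sizeof(float);
    return paddedElementCount * elementBytes;
}
```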

OpenCLBackend::onMapTensor implements mapping a GPU buffer to the CPU. If SVM is not supported, onMapTensor allocates a fresh block of CPU memory (svmPtr = allocMapTensorMemory) on which arbitrary operations can then be performed. Once you are done with that memory, if the tensor was mapped with MAP_TENSOR_WRITE, onUnmapTensor writes it back to the GPU buffer (onCopyBuffer(&srcTensor, dstTensor)), and at that point a data-format conversion is performed, including float32 to float16. This also means that only one GPU tensor can be mapped at a time, not two, because the backend tracks the current mapping through a single svmPtr member.
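
From the application side this machinery is reached through Tensor::map / Tensor::unmap. The hedged sketch below (assuming device tensors obtained from an OpenCL session, and the public map/unmap API available in recent MNN releases) shows a usage pattern that respects the one-mapping-at-a-time constraint.

```cpp
#include <MNN/Tensor.hpp>

// Sketch only: `input` and `output` are assumed to be device tensors returned by
// getSessionInput()/getSessionOutput() of an OpenCL session. Because the backend
// keeps a single svmPtr for the current mapping, each tensor is unmapped before
// the next one is mapped.
void writeInputThenReadOutput(MNN::Tensor* input, MNN::Tensor* output) {
    // Map the input for writing: the backend hands back plain fp32 host memory.
    void* inPtr = input->map(MNN::Tensor::MAP_TENSOR_WRITE, MNN::Tensor::CAFFE);
    // ... fill inPtr with fp32 data ...
    // Unmapping a MAP_TENSOR_WRITE mapping copies the data back into the GPU
    // buffer and converts fp32 to fp16 when the backend runs in fp16 mode.
    input->unmap(MNN::Tensor::MAP_TENSOR_WRITE, MNN::Tensor::CAFFE, inPtr);

    // ... run the session here ...

    // Only now map the output; mapping it while the input was still mapped
    // would clobber the backend's single svmPtr.
    void* outPtr = output->map(MNN::Tensor::MAP_TENSOR_READ, MNN::Tensor::CAFFE);
    // ... read fp32 results from outPtr ...
    output->unmap(MNN::Tensor::MAP_TENSOR_READ, MNN::Tensor::CAFFE, outPtr);
}
```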

// MNN OpenCLBackend.cpp
Backend::MemObj* OpenCLBackend::onAcquire(const Tensor* nativeTensor, StorageType storageType) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start OpenCLBackend::onAcquireBuffer !\n");
#endif

    auto tensorShape = OpenCL::tensorShapeFormat(nativeTensor);
    int N = tensorShape.at(0);
    int H = tensorShape.at(1);
    int W = tensorShape.at(2);
    int C = tensorShape.at(3);

#ifdef LOG_VERBOSE
    MNN_PRINT("OpenCLBackend::onAcquireBuffer: NHWC:[%d, %d, %d, %d]\n", N, H, W, C);
#endif

#ifndef MNN_OPENCL_BUFFER_CLOSED
    if(mOpenCLRuntime->getGpuMemType() == BUFFER) {
        size_t size;
        if (nativeTensor->dimensions() >= 2) {
            auto alignC = ROUND_UP(C, 8);
            // increment of height and width
            auto hR = ROUND_UP(H + 3, 4) - H;
            auto wR = ROUND_UP(W + 3, 4) - W;
            size = N * alignC * W * H;
            size = size + hR * W * 4 + wR * 4;
        } else {
            size = nativeTensor->elementSize();
            size = ROUND_UP(size, 4);
        }

        if (mOpenCLRuntime->isSupportedIntelSubgroup()) {
            int cPack = TensorUtils::getTensorChannelPack(nativeTensor);
            auto pads = TensorUtils::getDescribe(nativeTensor)->mPads;
            size_t imageWidth = (size_t) ROUND_UP(UP_DIV(C, cPack), 2) * ROUND_UP(pads.left + W + pads.right, 4);//C-round to 8,W-round to 4, for memory alloc
            size_t imageHeight = (size_t)N * H;
            size = imageWidth*imageHeight*cPack;
        }
        cl_channel_type dataType = CL_FLOAT;
        //when support and want fp16, use half datatype
        if (getOpenCLRuntime()->isSupportedFP16()) {
            dataType = CL_HALF_FLOAT;
        }

        if (storageType == DYNAMIC_SEPERATE) {
            auto buffer = mBufferPool->alloc(size*
                (dataType==CL_HALF_FLOAT?sizeof(half_float::half):sizeof(float)), true);
            ((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
            return new CLMemReleaseBuffer(buffer, mBufferPool.get());
        }
        if (storageType == DYNAMIC) {
            auto buffer = mBufferPool->alloc(size*
                (dataType==CL_HALF_FLOAT?sizeof(half_float::half):sizeof(float)));
            ((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
            return new CLMemReleaseBuffer(buffer, mBufferPool.get());
        }
        MNN_ASSERT(storageType == STATIC);
#ifdef MNN_LOW_MEMORY
        // for weight quant model's weight
        if ((nativeTensor->getType().code == halide_type_int) &&
            (nativeTensor->getType().bits == 8 || nativeTensor->getType().bits == 4)) {
            // int8 quant
            size_t alloc_size = size;
            if (nativeTensor->getType().bits == 4) {
                // int4 quant
                alloc_size = size / 2;
            }
            auto buffer = mStaticBufferPool->alloc(alloc_size);
            ((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
            return new CLMemReleaseBuffer(buffer, mStaticBufferPool.get());
        }
#endif
        auto buffer = mStaticBufferPool->alloc(size*
            (dataType == CL_HALF_FLOAT ? sizeof(half_float::half) : sizeof(float)));
        ((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer; // fix
        return new CLMemReleaseBuffer(buffer, mStaticBufferPool.get());
    }
    else
#endif /* MNN_OPENCL_BUFFER_CLOSED */
    {
        size_t imageWidth = (size_t) (UP_DIV(C, 4) * W);//image mode only C pack to 4
        size_t imageHeight = (size_t)N * H;
        cl_channel_type dataType = CL_HALF_FLOAT;
        //when user want high precision, use float datatype
        if (mPrecision == BackendConfig::Precision_High) {
            dataType = CL_FLOAT;
        }

        if (storageType == DYNAMIC_SEPERATE) {
            auto image = mImagePool->alloc(imageWidth, imageHeight, dataType, true);
            ((Tensor*)nativeTensor)->buffer().device = (uint64_t)image; // fix
            return new CLMemReleaseImage(image, mImagePool.get());
        }
        if (storageType == DYNAMIC) {
            auto image = mImagePool->alloc(imageWidth, imageHeight, dataType);
            ((Tensor*)nativeTensor)->buffer().device = (uint64_t)image; // fix
            return new CLMemReleaseImage(image, mImagePool.get());
        }
        MNN_ASSERT(storageType == STATIC);
        auto image = mStaticImagePool->alloc(imageWidth, imageHeight, dataType);
        ((Tensor*)nativeTensor)->buffer().device = (uint64_t)image; // fix
        return new CLMemReleaseImage(image, mStaticImagePool.get());
    }
}

void* OpenCLBackend::onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) {
    auto needSize = srcTensor->size();
    clearRecord();
#ifdef MNN_OPENCL_SVM_ENABLE
    auto svm_cap_ = mOpenCLRuntime->getSvmCapabilities();
    bool use_svm = (svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER);//support fine grain svm
    use_svm |= ((svm_cap_ & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) && mOpenCLRuntime->getGpuType() == ADRENO);//support coarse grain svm and adreno gpu

    mUseSvm = (mOpenCLRuntime->getCLVersion() > 1.99f && use_svm);
    if(mUseSvm) {// CL version beyond 2.0 & support svm
        svmPtr = allocMapTensorMemory(needSize, true, svm_cap_);

        if(mtype == Tensor::MAP_TENSOR_READ) {
            //tmpTensor alloc
            MNN::Tensor tmpTensor(srcTensor, dtype, false);
            tmpTensor.buffer().device = (uint64_t)svmPtr;

            //Convert format
            MNN_DATA_FORMAT format_type = MNN_DATA_FORMAT_NCHW;
            if(dtype == MNN::Tensor::TENSORFLOW) {
                format_type = MNN_DATA_FORMAT_NHWC;
            } else if(dtype == MNN::Tensor::CAFFE_C4) {
                format_type = MNN_DATA_FORMAT_NC4HW4;
            }
            mCLRuntime->convertFromDevice(srcTensor, &tmpTensor, format_type, true);
        }

        if(svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
            //Make sure command finished
            mOpenCLRuntime->commandQueue().finish();
            return svmPtr;
        }

        auto map_flag = CL_MAP_WRITE;
        if(mtype == Tensor::MAP_TENSOR_READ) {
            map_flag = CL_MAP_READ;
        }

        cl_int res = clEnqueueSVMMap(mOpenCLRuntime->commandQueue().get(), true, map_flag, svmPtr, needSize, 0, nullptr, nullptr);

        MNN_CHECK_CL_SUCCESS(res, "svm_map")
        return svmPtr;
    }
#endif

    /**
    Not Support Svm, Use onopyBuffer
    */
    svmPtr = allocMapTensorMemory(needSize, false);

    if(mtype == Tensor::MAP_TENSOR_READ) {
        //tmpTensor alloc
        MNN::Tensor tmpTensor(srcTensor, dtype, false);
        tmpTensor.buffer().host = (uint8_t *)svmPtr;

        //use onCopyBuffer
        onCopyBuffer(srcTensor, &tmpTensor);
    }
    return svmPtr;
}

bool OpenCLBackend::onUnmapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* dstTensor, void* mapPtr) {
#ifdef MNN_OPENCL_SVM_ENABLE
    auto svm_cap_ = mOpenCLRuntime->getSvmCapabilities();
    if(mUseSvm) {// CL version beyond 2.0 & support svm

        //If COARSE_SVM, Unmap first
        if(!(svm_cap_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
            cl_int res = clEnqueueSVMUnmap(mOpenCLRuntime->commandQueue().get(), svmPtr, 0, nullptr, nullptr);
            MNN_CHECK_CL_SUCCESS(res, "svm_unmap")
        }

        if(mtype == Tensor::MAP_TENSOR_WRITE) {
            //interTensor alloc
            MNN::Tensor interTensor(dstTensor, dtype, false);
            interTensor.buffer().device = (uint64_t)svmPtr;

            //Convert format
            MNN_DATA_FORMAT format_type = MNN_DATA_FORMAT_NCHW;
            if(dtype == MNN::Tensor::TENSORFLOW) {
                format_type = MNN_DATA_FORMAT_NHWC;
            } else if(dtype == MNN::Tensor::CAFFE_C4) {
                format_type = MNN_DATA_FORMAT_NC4HW4;
            }
            mCLRuntime->convertToDevice(&interTensor, dstTensor, format_type, true);
        }
        mOpenCLRuntime->commandQueue().finish();

        return true;
    }
#endif

    /**
    Not Support Svm, Use onopyBuffer
    */
    if(mtype == Tensor::MAP_TENSOR_WRITE) {
        //srcTensor alloc
        MNN::Tensor srcTensor(dstTensor, dtype, false);
        srcTensor.buffer().host = (uint8_t *)svmPtr;

        //use onCopyBuffer
        onCopyBuffer(&srcTensor, dstTensor);
    }
    return true;
}


// MNN ConvBufExecution.cpp
void ConvBufExecution::setConv1x1WeightBuffer(int packCout, int packCin, const float* filterDataPtr) {
    cl_int res;
    std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({ROUND_UP(mOutputChannel, 8)/*Cout pack set to max 8*/, ROUND_UP(mInputChannel, packCin), mKernelWidth, mKernelHeight}));

    int buffer_size = filterBuffer->elementSize();
    if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
        buffer_size *= sizeof(half_float::half);
    } else {
        buffer_size *= sizeof(float);
    }
    mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
    auto kernelBufferPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*(mKernelBuffer.get()), true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res);
    if(kernelBufferPtr != nullptr && res == CL_SUCCESS){
        ::memset(kernelBufferPtr, 0, buffer_size);
        for(int o = 0; o < mOutputChannel; o++){
            for(int i = 0 ; i < mInputChannel; i++){
                int bufferIdx = (o/packCout) * ROUND_UP(mInputChannel, packCin)*packCout + (i/packCin)*packCin*packCout + (o%packCout)*packCin + (i%packCin);//(Co/packCout, Ci/packCin, packCout, packCin)
                int filterIdx = o*mInputChannel + i;
                if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()){
                    ((half_float::half*)kernelBufferPtr)[bufferIdx] = (half_float::half)(filterDataPtr[filterIdx]);
                }else{
                    ((float*)kernelBufferPtr)[bufferIdx] = (float)(filterDataPtr[filterIdx]);
                }
            }
        }
    }else{
        MNN_ERROR("Map error ptrCL == nullptr \n");
        MNN_ASSERT(false);
    }
    mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*(mKernelBuffer.get()), kernelBufferPtr);
}
// MNN OpenCLRuntime.cpp
cl_device_fp_config fpConfig;
auto success = mFirstGPUDevicePtr->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fpConfig);
mIsDeviceSupportedFP16 = CL_SUCCESS == success && fpConfig > 0;

//set gpu mode, tuning level and memory object
setGpuMode(cl_mode);

if(mMemType == AUTO) {
    if(mGpuType == MALI || mGpuType == INTEL) {
        mMemType = BUFFER;
    } else {
        mMemType = IMAGE;
    }
}

auto permitFloat16 = false;
if (precision == BackendConfig::Precision_Low || (mMemType == BUFFER && precision == BackendConfig::Precision_Normal)) {//buffer mode not support Normal Precision yet
    permitFloat16 = true;
}
mIsSupportedFP16 = mIsDeviceSupportedFP16 && permitFloat16;
MNN_PRINT("opencl support fp16: %d, device support fp16: %d, permit fp16: %d\n", mIsSupportedFP16, mIsDeviceSupportedFP16, permitFloat16);

