opencl-tips

为什么从 CPU 上拷贝到 Image 里使用的是【CPU -> Buffer -> Image】而不是【CPU -> Image】？

因为 Image 一般都是用 RGBA 格式，需要填充为 4 通道，以及长宽 4 对齐等需求。如果在 CPU 上做，需要对内存进行补边再拷贝到 Image 里；不如【CPU -> Buffer -> Image】的兼容性更好。前者不用考虑 clEnqueueWriteBuffer 的参数，在 buffer_to_image 内核中注意边界条件即可。

为什么大部分 OpenCL 编程使用的都是全局内存，少用局部内存/共享内存？

【不使用局部内存】可能是一种编程规范。这样做有以下优点：

可以通过在线调优 local_work_size 的方式找到最优值，从而提高性能，因为使用局部内存可能需要已知的 lws 大小；如果不需要局部内存，lws 的值可以是任意的，从而可以更灵活地调优。
不同厂商（vendor）的实现可能不同，使用全局内存可以避免不同厂商的差异。比如 mali GPU 使用的是 global memory 模拟 local memory，此时使用 local memory 并不能带来较高的提升；而 NV 显卡就不同，local memory 的确是速度更快的片上内存。
一次编码要适应到所有的 case，这样做能够降低编码难度

MNN 的 Depthwise Conv 实现

关键参数列表如下：

参数名	SHAPE	IMAGE SHAPE	描述
输入	N * IC * IH * IW	(IW * IC/4) * (IH * N)	image shape 是 width * height，同时因为 Image 格式是 RGBA，所以将 IC/4
输出	N * OC * OH * OW	(OW * OC/4) * (OH * N)	同上
filter	OC * IC/4 * KH * KW	(KW * KH) * (OC / 4)	width * height，同时因为这是 depthwise conv，即 group == input_channel，所以 filter 实际上只有一个 input_channel，即 IC == 1
global_work_size	(OW / 4 * OC / 4) * (OH * B)		gws 没有 image shape，考虑的是总共启动多少个线程进行计算

注意到，gws 的大小为 (OW / 4 * OC / 4) * (OH * B)，即每个线程计算 OW/4 个输出；假设 OW 为 112，即每个线程计算 28 个输出。

kernel 原理

__kernel
void depthwise_conv2d_s1(GLOBAL_SIZE_2_DIMS __read_only image2d_t input, __read_only image2d_t filter,
                                  #ifndef NO_BIAS
                                  __read_only image2d_t bias,
                                  #endif
                                  __write_only image2d_t output,
                                  __private const int2 inputShape,
                                  __private const int inChannelBlocks, 
                                  __private const int2 outputShape,
                                  __private const int2 filterShape,
                                  __private const int2 paddingShape) {

    const int outChannelWidthIdx = get_global_id(0);
    const int outHeightBlockIdx     = get_global_id(1);
    // 忽略为了让 GWS 整除 LWS 而额外启动的线程数量
    DEAL_NON_UNIFORM_DIM2(outChannelWidthIdx, outHeightBlockIdx);
    // 计算输出宽度向上整除 4 的结果
    int ow4              = (outputShape.y + 3) / 4;
    // 因为 GWS 是 (OW / 4 * OC / 4)，所以这里需要将 OW 向上整除 4，即 ow4 * 4 = OW
    const int outChannelBlockIdx = outChannelWidthIdx / ow4;
    const int outWidthBlockidx   = outChannelWidthIdx % ow4;

    const int inChannelBlockIdx = outChannelBlockIdx;

    #ifndef NO_BIAS
    FLOAT4 outValue0 = RI_F(bias, SAMPLER, (int2)(outChannelBlockIdx, 0));
    #else
    FLOAT4 outValue0 = (FLOAT4)(0.0f);
    #endif
    FLOAT4 outValue1 = outValue0;
    FLOAT4 outValue2 = outValue0;
    FLOAT4 outValue3 = outValue0;

    const int outWidthBlockidx4 = outWidthBlockidx << 2;
    // 计算最左边像素的起始索引，后三个像素逐步 + 1
    const int inWidthOffset0             = outWidthBlockidx4 - paddingShape.y;
    const int inWidthOffset1             = inWidthOffset0 + 1;
    const int inWidthOffset2             = inWidthOffset0 + 2;
    const int inWidthOffset3             = inWidthOffset0 + 3;

    // 计算高度索引
    int heightIdx            = outHeightBlockIdx % outputShape.x - paddingShape.x;
    const int outBatchIdx = mul24((outHeightBlockIdx / outputShape.x), inputShape.x);
    const int inCurIdx = mul24(inChannelBlockIdx, inputShape.y);

    // 计算宽度索引
    const int inWidthIdx0 = select(inCurIdx + inWidthOffset0, -1, (inWidthOffset0 < 0 || inWidthOffset0 >= inputShape.y));
    const int inWidthIdx1 = select(inCurIdx + inWidthOffset1, -1, (inWidthOffset1 < 0 || inWidthOffset1 >= inputShape.y));
    const int inWidthIdx2 = select(inCurIdx + inWidthOffset2, -1, (inWidthOffset2 < 0 || inWidthOffset2 >= inputShape.y));

    FLOAT4 inValue0, inValue1, inValue2, inValue3;
    for (int kh = 0; kh < filterShape.x; kh++) {
        int inHeightIdx = select(heightIdx + outBatchIdx, -1, (heightIdx < 0 || heightIdx >= inputShape.x));
        heightIdx++;
        // 读取滑动窗口的前 3 个像素
        inValue1       = RI_F(input, SAMPLER, (int2)(inWidthIdx0, inHeightIdx));
        inValue2       = RI_F(input, SAMPLER, (int2)(inWidthIdx1, inHeightIdx));
        inValue3       = RI_F(input, SAMPLER, (int2)(inWidthIdx2, inHeightIdx));
        for (int kw = 0; kw < filterShape.y; kw++) {

            // 每次读取 1 个像素，并更新滑动窗口，将 inValue0 移动到 inValue1
            int filterIdx   = mad24(kh, filterShape.y, kw);
            inValue0 = inValue1;
            inValue1 = inValue2;
            inValue2 = inValue3;

            // 读取最新像素
            int inWidthIdx = inWidthOffset3 + kw;
            inWidthIdx = select(inCurIdx + inWidthIdx, -1, (inWidthIdx < 0 || inWidthIdx >= inputShape.y));
            inValue3  = RI_F(input, SAMPLER, (int2)(inWidthIdx, inHeightIdx));

            // 读取权重并计算结果
            FLOAT4 weights = RI_F(filter, SAMPLER, (int2)(filterIdx, inChannelBlockIdx));

            outValue0 = mad(inValue0, weights, outValue0);
            outValue1 = mad(inValue1, weights, outValue1);
            outValue2 = mad(inValue2, weights, outValue2);
            outValue3 = mad(inValue3, weights, outValue3);
        }
    }

#ifdef RELU
    outValue0 = fmax(outValue0, (FLOAT4)0);
    outValue1 = fmax(outValue1, (FLOAT4)0);
    outValue2 = fmax(outValue2, (FLOAT4)0);
    outValue3 = fmax(outValue3, (FLOAT4)0);
#endif

#ifdef RELU6
    outValue0 = clamp(outValue0, (FLOAT4)0, (FLOAT4)6);
    outValue1 = clamp(outValue1, (FLOAT4)0, (FLOAT4)6);
    outValue2 = clamp(outValue2, (FLOAT4)0, (FLOAT4)6);
    outValue3 = clamp(outValue3, (FLOAT4)0, (FLOAT4)6);
#endif

    // 确保通道不整除 4 也能够得到正确输出
    const int remain     = outputShape.y - outWidthBlockidx4;
    int outWidthIdx       = mul24(outChannelBlockIdx, outputShape.y) + outWidthBlockidx4;
    if (remain >= 4) {
        WI_F(output, (int2)(outWidthIdx, outHeightBlockIdx), outValue0);
        WI_F(output, (int2)(outWidthIdx + 1, outHeightBlockIdx), outValue1);
        WI_F(output, (int2)(outWidthIdx + 2, outHeightBlockIdx), outValue2);
        WI_F(output, (int2)(outWidthIdx + 3, outHeightBlockIdx), outValue3);
    } else if (remain == 3) {
        WI_F(output, (int2)(outWidthIdx, outHeightBlockIdx), outValue0);
        WI_F(output, (int2)(outWidthIdx + 1, outHeightBlockIdx), outValue1);
        WI_F(output, (int2)(outWidthIdx + 2, outHeightBlockIdx), outValue2);
    } else if (remain == 2) {
        WI_F(output, (int2)(outWidthIdx, outHeightBlockIdx), outValue0);
        WI_F(output, (int2)(outWidthIdx + 1, outHeightBlockIdx), outValue1);
    } else if (remain == 1) {
        WI_F(output, (int2)(outWidthIdx, outHeightBlockIdx), outValue0);
    }
}

#opencl

opencl-tips

http://hebangwen.github.io/2024/11/21/opencl-tips/

作者

何榜文

发布于

2024年11月21日

许可协议

关于 DeepSeek-R1-Distill-Qwen-1.5B 显存占用比下载权重高非常多的现象分析上一篇

将 ViT 的第一个卷积层转为 MatMul，要求步长和卷积核相等下一篇