Metal并行计算以及Metal程序的命令行编译(2)

// Timing note: DispatchTime.now().uptimeNanoseconds is already expressed in
// nanoseconds. mach_absolute_time() returns raw Mach ticks whose duration depends
// on the machine's timebase, so dividing a tick difference by NSEC_PER_SEC only
// yields seconds on hardware where one tick happens to equal one nanosecond.
start = DispatchTime.now().uptimeNanoseconds
// Actually submit the work to the GPU.
cmds!.commit()
// Block until the GPU has finished computing.
cmds!.waitUntilCompleted()
// The GPU only produces per-batch partial sums; by now there are few of them,
// so the CPU adds them up to obtain the final total.
for elem in results {
    result += elem
}
end = DispatchTime.now().uptimeNanoseconds
// Show the GPU result and the elapsed time in seconds.
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
// Reset the accumulator so the CPU run below starts from zero.
result = 0

// Now compute the complete sum on the CPU alone and show its result and elapsed time.
start = DispatchTime.now().uptimeNanoseconds
data.withUnsafeBufferPointer { buffer in
    for elem in buffer {
        result += elem
    }
}
end = DispatchTime.now().uptimeNanoseconds
print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")


shader程序命名为:shader.metal

// Every data type used here must match the corresponding definition on the Swift side.
#include <metal_stdlib>
typedef unsigned int uint;
typedef int DataType;

// Partial-sum kernel: each thread adds up one contiguous slice of `data` and
// accumulates that slice's total into its own slot of `sums`.
//
//   data            input values (buffer 0)
//   dataLength      number of elements in `data` (buffer 1)
//   sums            one output slot per thread (buffer 2); the slice total is
//                   ADDED to the slot, so whatever the slot already holds is
//                   included in the result
//   elementsPerSum  number of consecutive elements each thread sums (buffer 3)
//   tgPos / tPerTg / tPos
//                   threadgroup position in the grid, threads per threadgroup,
//                   and thread position inside its threadgroup
//
// A thread whose slice starts at or beyond `dataLength` does nothing and does
// not touch `sums`.
kernel void parsum(const device DataType* data [[ buffer(0) ]],
                   const device uint& dataLength [[ buffer(1) ]],
                   device DataType* sums [[ buffer(2) ]],
                   const device uint& elementsPerSum [[ buffer(3) ]],
                   const uint tgPos [[ threadgroup_position_in_grid ]],
                   const uint tPerTg [[ threads_per_threadgroup ]],
                   const uint tPos [[ thread_position_in_threadgroup ]]) {
    // Flat index of this thread across the whole grid; unique per thread.
    const uint resultIndex = tgPos * tPerTg + tPos;

    // Half-open range [beginIndex, endIndex) of this thread's slice, clamped to
    // the end of the data for the last (possibly shorter) slice.
    const uint beginIndex = resultIndex * elementsPerSum;
    const uint endIndex = beginIndex + elementsPerSum < dataLength
                              ? beginIndex + elementsPerSum
                              : dataLength;

    // Nothing to sum: leave `sums` untouched, exactly as an empty loop would.
    if (beginIndex >= endIndex)
        return;

    // Accumulate in a thread-local value and touch device memory once at the
    // end, instead of a read-modify-write of sums[resultIndex] per element.
    DataType partial = 0;
    for (uint i = beginIndex; i < endIndex; i++)
        partial += data[i];

    sums[resultIndex] += partial;
}

给一个在命令行使用的编译脚本:

#!/bin/bash
# Builds the Metal shader library and the Swift host program from the command line.
# NOTE(review): these invocations match the Xcode 9.4.1 toolchain named in the
# article; newer toolchains may need `metal -c` and may no longer ship
# `metal-ar` - confirm before reusing on a current Xcode.

# Stop at the first failing step so a broken shader build is not followed by
# later steps working on stale or missing files.
set -euo pipefail

# 1. Compile the shader source into shader.air.
xcrun metal -o shader.air shader.metal
# 2. Archive shader.air into shader.metal-ar.
xcrun metal-ar rcs shader.metal-ar shader.air
# 3. Build default.metallib from the archive.
xcrun metallib -o default.metallib shader.metal-ar
# 4. Compile the Swift host program.
swiftc testCompute.swift

在我的笔记本上运行效果如下:

metal> ./testCompute
Metal result: 495056208, time: 0.017362745
CPU result: 495056208, time: 1.210801891

作为一个比较片面的比较,按上面的输出计算(1.210801891 ÷ 0.017362745 ≈ 70),GPU的计算速度约为CPU的70倍。
 测试环境:
MacBook Pro (13-inch, 2017, Four Thunderbolt 3 Ports)
 CPU:3.1 GHz Intel Core i5
 Graphics:Intel Iris Plus Graphics 650 1536 MB
 Memory:8 GB 2133 MHz LPDDR3
 Xcode:9.4.1

参考资料:
https://stackoverflow.com/questions/38164634/compute-sum-of-array-values-in-parallel-with-metal-swift

Linux公社的RSS地址:https://www.linuxidc.com/rssFeed.aspx

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/a67cff15609b54af8c345a5ff42529c5.html