start = mach_absolute_time()
// Actually submit the encoded work to the GPU.
cmds!.commit()
// Block until the GPU has finished the computation.
cmds!.waitUntilCompleted()
// The GPU produced per-thread partial sums, so only a small array is left;
// finish the reduction on the CPU.
for elem in results {
    result += elem
}
end = mach_absolute_time()
// mach_absolute_time() counts Mach time units, NOT nanoseconds. The units
// happen to be 1:1 with ns on Intel Macs, but in general (e.g. Apple
// Silicon) the numer/denom ratio from mach_timebase_info must be applied.
var timebase = mach_timebase_info_data_t()
mach_timebase_info(&timebase)
let toSeconds = Double(timebase.numer) / Double(timebase.denom) / Double(NSEC_PER_SEC)
// Show the GPU result and the elapsed time in seconds.
print("Metal result: \(result), time: \(Double(end - start) * toSeconds)")
result = 0
// Now run the full summation once on the CPU and report its result/time.
start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
    for elem in buffer {
        result += elem
    }
}
end = mach_absolute_time()
print("CPU result: \(result), time: \(Double(end - start) * toSeconds)")
shader程序命名为:shader.metal
//各项数据类型必须跟Swift中定义的相同
#include <metal_stdlib>
typedef unsigned int uint;
typedef int DataType;
// Each thread sums a contiguous slice of `elementsPerSum` input values and
// writes exactly one partial sum into `sums`; the small array of partial
// sums is then reduced on the CPU. All data types here must match the ones
// declared on the Swift side.
kernel void parsum(const device DataType* data [[ buffer(0) ]],
                   const device uint& dataLength [[ buffer(1) ]],
                   device DataType* sums [[ buffer(2) ]],
                   const device uint& elementsPerSum [[ buffer(3) ]],
                   const uint tgPos [[ threadgroup_position_in_grid ]],
                   const uint tPerTg [[ threads_per_threadgroup ]],
                   const uint tPos [[ thread_position_in_threadgroup ]]) {
    // Globally unique index of this thread across the whole grid,
    // built from the threadgroup index and the position inside it.
    uint resultIndex = tgPos * tPerTg + tPos;
    // First element of this thread's slice.
    uint dataIndex = resultIndex * elementsPerSum;
    // Trailing threads whose slice starts past the end have nothing to do;
    // bail out without touching `sums` at all.
    if (dataIndex >= dataLength)
        return;
    // One past the last element of the slice, clamped to the input length.
    uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength;
    // Accumulate into a thread-local variable instead of doing a
    // read-modify-write on device memory every iteration. This is faster
    // and also removes the original code's hidden reliance on the host
    // zero-initializing the `sums` buffer before dispatch.
    DataType partial = 0;
    for (; dataIndex < endIndex; dataIndex++)
        partial += data[dataIndex];
    sums[resultIndex] = partial;
}
给一个在命令行使用的编译脚本:
#!/bin/bash
# Build the Metal shader into default.metallib and compile the Swift host.
# Abort on the first failing step so a broken shader build is not masked.
set -e
# 1. Compile the shader source to AIR (Apple's intermediate representation).
xcrun metal -o shader.air shader.metal
# 2. Archive the AIR object file.
xcrun metal-ar rcs shader.metal-ar shader.air
# 3. Link the archive into the library the Swift program loads by default.
xcrun metallib -o default.metallib shader.metal-ar
# 4. Compile the host program (produces ./testCompute).
swiftc testCompute.swift
在我的笔记本上运行效果如下:
metal> ./testCompute
Metal result: 495056208, time: 0.017362745
CPU result: 495056208, time: 1.210801891
作为一个比较片面的比较,GPU计算速度,比CPU快约70倍(1.2108 ÷ 0.0174 ≈ 70)。
测试环境:
MacBook Pro (13-inch, 2017, Four Thunderbolt 3 Ports)
CPU:3.1 GHz Intel Core i5
Graphics:Intel Iris Plus Graphics 650 1536 MB
Memory:8 GB 2133 MHz LPDDR3
Xcode:9.4.1
Linux公社的RSS地址:https://www.linuxidc.com/rssFeed.aspx