基于C#的GPU计算
acmilan2016/11/05软件综合 IP:四川

所需环境:

  • 系统:Windows 7或更高版本
  • 硬件:支持DirectX 10(含Compute Shader 4.0)或DirectX 11的显卡
  • 环境:
    • Visual Studio 2010+
    • DirectX SDK June 2010(或DirectX Redist June 2010)
    • SharpDX 2.6.3(或其它2.x版本)
  • Windows 7如果要使用WARP11模拟GPU,需安装KB2670838、KB2834140

GPU寄存器:

  • b0 - b# :常量缓冲区(cbuffer)
  • t0 - t# :Shader资源视图(ShaderResourceView)绑定的只读缓冲区(StructuredBuffer)
  • u0 - u# :乱序访问视图(UnorderedAccessView)绑定的可读写缓冲区(RWStructuredBuffer)

除了StructuredBuffer和RWStructuredBuffer以外,还有ByteAddressBuffer和RWByteAddressBuffer,它们使用比较麻烦。

GPU线程模型:

thread.png

下面的示例启动4个线程组,每个线程组4个线程(共16个线程),每个线程都计算1+2+3+4=10,并把结果写入输出缓冲区中与自己线程ID对应的位置。

<code class="language-cs">using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using SharpDX;
using SharpDX.Direct3D;
using SharpDX.Direct3D11;
using SharpDX.D3DCompiler;
using D3D11 = SharpDX.Direct3D11;
using DXGI = SharpDX.DXGI;
// 引用:
// SharpDX.dll
// SharpDX.Direct3D11.dll
// SharpDX.DXGI.dll
// SharpDX.D3DCompiler.dll
// 附加:
// d3dx11_43.dll(DirectX Redist June 2010)
// d3dcompiler_43.dll(DirectX Redist June 2010)
// d3dcsx_43.dll(DirectX Redist June 2010)

namespace dcomputecsharp
{
    /// <summary>
    /// Minimal SharpDX / Direct3D 11 compute-shader sample. It uploads four
    /// constants, dispatches a shader in which every thread writes their sum
    /// into its own element of a structured buffer, copies the buffer to a
    /// CPU-readable staging buffer and prints the elements.
    /// </summary>
    class Program
    {
        // Number of uint elements in the GPU buffer, its views and the staging buffer.
        // Must equal NUM_GROUPS times the x value of [numthreads] in the shader (4 * 4).
        const int NUM_ELEMENTS = 16;

        // Number of thread groups passed to Dispatch.
        const int NUM_GROUPS = 4;

        // HLSL source compiled at run time with entry point "main" and profile cs_4_0.
        static string CS = @"
// 要运行的GPU程序

// 常量缓冲区(必须为16的倍数)
cbuffer CB : register(b0)
{
    unsigned int a;
    unsigned int b;
    unsigned int c;
    unsigned int d;
};

// u0对应UnorderedAccessView
RWStructuredBuffer<unsigned int> Data : register(u0);

// 主程序(注意cs_4_0只支持M,N,1,只有cs_5_0才支持M,N,P)
[numthreads(4, 1, 1)]
void main(uint3 Gid : SV_GroupID, // 组别ID(Dispatch函数三个参数)
    uint3 DTid : SV_DispatchThreadID, // 总ID
    uint3 GTid : SV_GroupThreadID, // 组内线程ID(numthreads属性三个参数)
    uint GI : SV_GroupIndex) // 组内序号
{
    Data[DTid.x] = a + b + c + d;
}
";

        static Device dev;
        static DeviceContext immctx;
        static CompilationResult cs_main_comp;
        static ComputeShader cs_main;
        static D3D11.Buffer constbuf;
        static D3D11.Buffer gpubuf1;
        static ShaderResourceView gpubuf1_srv;
        static UnorderedAccessView gpubuf1_uav;
        static D3D11.Buffer cpubuf;

        // CPU-side mirror of the shader's cbuffer CB: four uints, 16 bytes in total.
        struct ConstBuffer
        {
            public uint a;
            public uint b;
            public uint c;
            public uint d;
        }

        /// <summary>
        /// Creates the device, shader, buffers and views, runs <see cref="DoCompute"/>,
        /// and disposes every created object whether or not an exception occurred.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            try
            {
                // Create the device and fetch its immediate context.
                // DriverType.Hardware : run on the GPU
                // DriverType.Warp     : run on the CPU (Windows 7 needs KB2670838 for WARP11)
                dev = new Device(
                    DriverType.Hardware,
                    DeviceCreationFlags.None,
                    FeatureLevel.Level_11_0,
                    FeatureLevel.Level_10_1,
                    FeatureLevel.Level_10_0);
                immctx = dev.ImmediateContext;

                // Check whether Compute Shader 4.0 is supported.
                if (!dev.CheckFeatureSupport(Feature.D3D10XHardwareOptions))
                {
                    Console.WriteLine("No support for compute shaders.");
                    return;
                }

                // Compile the HLSL source and create the compute shader object.
                cs_main_comp = ShaderBytecode.Compile(CS, "main", "cs_4_0");
                cs_main = new ComputeShader(dev, cs_main_comp.Bytecode, null);

                // Constant buffer (its size must be a multiple of 16 bytes).
                constbuf = new D3D11.Buffer(dev, 16, ResourceUsage.Default, BindFlags.ConstantBuffer, CpuAccessFlags.None, ResourceOptionFlags.None, 0);

                // GPU-side structured buffer of NUM_ELEMENTS uints.
                gpubuf1 = new D3D11.Buffer(dev, sizeof(uint) * NUM_ELEMENTS, ResourceUsage.Default, BindFlags.ShaderResource | BindFlags.UnorderedAccess, CpuAccessFlags.None, ResourceOptionFlags.BufferStructured, sizeof(uint));

                // Shader resource view over the GPU buffer.
                ShaderResourceViewDescription srvdesc = new ShaderResourceViewDescription();
                srvdesc.Format = DXGI.Format.Unknown;
                srvdesc.Dimension = ShaderResourceViewDimension.Buffer;
                srvdesc.Buffer.ElementCount = NUM_ELEMENTS;
                gpubuf1_srv = new ShaderResourceView(dev, gpubuf1, srvdesc);

                // Unordered access view over the GPU buffer (bound to u0 in DoCompute).
                UnorderedAccessViewDescription uavdesc = new UnorderedAccessViewDescription();
                uavdesc.Format = DXGI.Format.Unknown;
                uavdesc.Dimension = UnorderedAccessViewDimension.Buffer;
                uavdesc.Buffer.ElementCount = NUM_ELEMENTS;
                gpubuf1_uav = new UnorderedAccessView(dev, gpubuf1, uavdesc);

                // CPU-readable staging buffer used to download the result.
                cpubuf = new D3D11.Buffer(dev, sizeof(uint) * NUM_ELEMENTS, ResourceUsage.Staging, BindFlags.None, CpuAccessFlags.Read, ResourceOptionFlags.BufferStructured, sizeof(uint));

                // Run the computation.
                DoCompute();
            }
            finally
            {
                // Release everything in reverse order of creation; fields that were
                // never assigned (early return or exception) are still null and skipped.
                Release(cpubuf);
                Release(gpubuf1_uav);
                Release(gpubuf1_srv);
                Release(gpubuf1);
                Release(constbuf);
                Release(cs_main);
                Release(cs_main_comp);
                Release(immctx);
                Release(dev);
            }
        }

        // Disposes the given object if it is not null.
        private static void Release(IDisposable resource)
        {
            if (resource != null)
            {
                resource.Dispose();
            }
        }

        /// <summary>
        /// Uploads the constants and a zero-filled input array, dispatches the shader,
        /// downloads the result through the staging buffer and prints all elements
        /// on one line separated by spaces.
        /// </summary>
        private static void DoCompute()
        {
            // Set the constants and bind the constant buffer to b0.
            ConstBuffer cb = new ConstBuffer() { a = 1, b = 2, c = 3, d = 4 };
            immctx.UpdateSubresource(ref cb, constbuf);
            immctx.ComputeShader.SetConstantBuffer(0, constbuf);

            // Upload the (all-zero) data and bind the UAV to u0.
            uint[] buf = new uint[NUM_ELEMENTS];
            immctx.UpdateSubresource(buf, gpubuf1);
            immctx.ComputeShader.SetUnorderedAccessView(0, gpubuf1_uav);

            // Run the shader on NUM_GROUPS thread groups.
            immctx.ComputeShader.SetShader(cs_main, null, 0);
            immctx.Dispatch(NUM_GROUPS, 1, 1);

            // Download: copy to the staging buffer, map it and read the elements.
            immctx.CopyResource(gpubuf1, cpubuf);
            uint[] outbuf;
            DataStream ds;
            immctx.MapSubresource(cpubuf, 0, MapMode.Read, MapFlags.None, out ds);
            try
            {
                outbuf = ds.ReadRange<uint>(NUM_ELEMENTS);
            }
            finally
            {
                // Always dispose the stream and unmap, even if the read throws,
                // so the staging buffer is never left mapped.
                ds.Dispose();
                immctx.UnmapSubresource(cpubuf, 0);
            }

            // Print the result.
            for (int i = 0; i < outbuf.Length; i++)
            {
                Console.Write("{0} ", outbuf[i]);
            }
            Console.WriteLine();
        }
    }
}
</code>

双调排序算法(一种可并行的排序算法),微软官方示例的SharpDX移植版本(原版使用C++)。

<code class="language-cs">using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using SharpDX;
using SharpDX.Direct3D;
using SharpDX.Direct3D11;
using SharpDX.D3DCompiler;
using D3D11 = SharpDX.Direct3D11;
using DXGI = SharpDX.DXGI;
// 引用:
// SharpDX.dll
// SharpDX.Direct3D11.dll
// SharpDX.DXGI.dll
// SharpDX.D3DCompiler.dll
// 附加:
// d3dx11_43.dll(DirectX Redist June 2010)
// d3dcompiler_43.dll(DirectX Redist June 2010)
// d3dcsx_43.dll(DirectX Redist June 2010)

namespace dcomputecsharp
{
    /// <summary>
    /// SharpDX port of a bitonic-sort compute-shader sample: sorts NUM_ELEMENTS
    /// random uints on the GPU and compares the result with Array.Sort.
    /// NOTE(review): the middle of this listing was mangled by HTML extraction
    /// (tokens lower-cased, de-duplicated and partly dropped). It has been
    /// reconstructed from the surviving tokens, the sibling sample's setup code
    /// and Microsoft's ComputeShaderSort11 sample; verify against the original post.
    /// </summary>
    class Program
    {
        // HLSL source; compiled twice at run time, once per entry point
        // ("BitonicSort" and "MatrixTranspose"), with profile cs_4_0.
        static string CS = @"
//--------------------------------------------------------------------------------------
// 块大小定义
//--------------------------------------------------------------------------------------
#define BITONIC_BLOCK_SIZE 512
#define TRANSPOSE_BLOCK_SIZE 16

//--------------------------------------------------------------------------------------
// 常量缓冲区
//--------------------------------------------------------------------------------------
// b# 寄存器表示ConstantBuffer(常量缓冲区)
cbuffer CB : register( b0 )
{
    unsigned int g_iLevel;
    unsigned int g_iLevelMask;
    unsigned int g_iWidth;
    unsigned int g_iHeight;
};

//--------------------------------------------------------------------------------------
// 结构化缓冲区
//--------------------------------------------------------------------------------------
// t# 寄存器表示ShaderResourceView(Shader资源视图)
// u# 寄存器表示UnorderedAccessView(乱序访问视图)
StructuredBuffer<unsigned int> Input : register( t0 );
RWStructuredBuffer<unsigned int> Data : register( u0 );

//--------------------------------------------------------------------------------------
// 双调排序GPU程序(Compute Shader)
//--------------------------------------------------------------------------------------
groupshared unsigned int shared_data[BITONIC_BLOCK_SIZE]; // 组内共享的内部数据

[numthreads(BITONIC_BLOCK_SIZE, 1, 1)] // 组内线程数X*Y*Z,其中cs_4_0中Z必须为1,cs_5_0没有这个限制
void BitonicSort( uint3 Gid : SV_GroupID,  // 组ID
                  uint3 DTid : SV_DispatchThreadID, // 总线程ID 
                  uint3 GTid : SV_GroupThreadID, // 组内线程ID
                  uint GI : SV_GroupIndex ) // 组内线程序号
{
    // 从乱序访问视图加载组内共享的内部数据
    shared_data[GI] = Data[DTid.x];
    GroupMemoryBarrierWithGroupSync(); // 等待组内所有共享数据访问结束,且所有程序均到达此调用
    
    // 对组内共享的内部数据进行排序
    for (unsigned int j = g_iLevel >> 1 ; j > 0 ; j >>= 1)
    {
        unsigned int result = ((shared_data[GI & ~j] <= shared_data[GI | j]) == (bool)(g_iLevelMask & DTid.x))? shared_data[GI ^ j] : shared_data[GI];
        GroupMemoryBarrierWithGroupSync();
        shared_data[GI] = result;
        GroupMemoryBarrierWithGroupSync();
    }
    
    // 将组内共享的内部数据存回乱序访问视图
    Data[DTid.x] = shared_data[GI];
}

//--------------------------------------------------------------------------------------
// 矩阵转置GPU程序(Compute Shader)
//--------------------------------------------------------------------------------------
groupshared unsigned int transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE];

[numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)]
void MatrixTranspose( uint3 Gid : SV_GroupID,
                      uint3 DTid : SV_DispatchThreadID,
                      uint3 GTid : SV_GroupThreadID,
                      uint GI : SV_GroupIndex )
{
    transpose_shared_data[GI] = Input[DTid.y * g_iWidth + DTid.x];
    GroupMemoryBarrierWithGroupSync();
    uint2 XY = DTid.yx - GTid.yx + GTid.xy;
    Data[XY.y * g_iHeight + XY.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y];
}
";

        static Device dev;
        static DeviceContext immctx;
        static CompilationResult cs_sort_comp;
        static ComputeShader cs_sort;
        static CompilationResult cs_transpose_comp;
        static ComputeShader cs_transpose;
        static D3D11.Buffer constbuf;
        static D3D11.Buffer gpubuf1;
        static ShaderResourceView gpubuf1_srv;
        static UnorderedAccessView gpubuf1_uav;
        static D3D11.Buffer gpubuf2;
        static ShaderResourceView gpubuf2_srv;
        static UnorderedAccessView gpubuf2_uav;
        static D3D11.Buffer cpubuf;

        // CPU-side mirror of the shader's cbuffer CB: four uints, 16 bytes in total.
        struct ConstBuffer
        {
            public uint iLevel;
            public uint iLevelMask;
            public uint iWidth;
            public uint iHeight;
        }

        // Problem size. BITONIC_BLOCK_SIZE and TRANSPOSE_BLOCK_SIZE must match the
        // #defines of the same names in the HLSL source above.
        const uint NUM_ELEMENTS = 512 * 512;
        const uint BITONIC_BLOCK_SIZE = 512;
        const uint TRANSPOSE_BLOCK_SIZE = 16;
        const uint MATRIX_WIDTH = BITONIC_BLOCK_SIZE;
        const uint MATRIX_HEIGHT = NUM_ELEMENTS / BITONIC_BLOCK_SIZE;

        /// <summary>
        /// Creates the device, both shaders, the two GPU buffers with their views and
        /// the staging buffer, runs <see cref="DoCompute"/>, and disposes every
        /// created object whether or not an exception occurred.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            try
            {
                // Create the device and fetch its immediate context.
                // DriverType.Hardware : run on the GPU
                // DriverType.Warp     : run on the CPU (Windows 7 needs KB2670838 for WARP11)
                dev = new Device(DriverType.Hardware, DeviceCreationFlags.None, FeatureLevel.Level_11_0, FeatureLevel.Level_10_1, FeatureLevel.Level_10_0);
                immctx = dev.ImmediateContext;

                // Check whether Compute Shader 4.0 is supported.
                if (!dev.CheckFeatureSupport(Feature.D3D10XHardwareOptions))
                {
                    Console.WriteLine("No support for compute shaders.");
                    return;
                }

                // Compile the HLSL source and create both compute shader objects.
                cs_sort_comp = ShaderBytecode.Compile(CS, "BitonicSort", "cs_4_0");
                cs_sort = new ComputeShader(dev, cs_sort_comp.Bytecode, null);
                cs_transpose_comp = ShaderBytecode.Compile(CS, "MatrixTranspose", "cs_4_0");
                cs_transpose = new ComputeShader(dev, cs_transpose_comp.Bytecode, null);

                // Constant buffer (its size must be a multiple of 16 bytes).
                constbuf = new D3D11.Buffer(dev, 16, ResourceUsage.Default, BindFlags.ConstantBuffer, CpuAccessFlags.None, ResourceOptionFlags.None, 0);

                // Two GPU-side structured buffers of NUM_ELEMENTS uints each.
                gpubuf1 = new D3D11.Buffer(dev, sizeof(uint) * (int)NUM_ELEMENTS, ResourceUsage.Default, BindFlags.ShaderResource | BindFlags.UnorderedAccess, CpuAccessFlags.None, ResourceOptionFlags.BufferStructured, sizeof(uint));
                gpubuf2 = new D3D11.Buffer(dev, sizeof(uint) * (int)NUM_ELEMENTS, ResourceUsage.Default, BindFlags.ShaderResource | BindFlags.UnorderedAccess, CpuAccessFlags.None, ResourceOptionFlags.BufferStructured, sizeof(uint));

                // Shader resource views over the GPU buffers.
                ShaderResourceViewDescription srvdesc = new ShaderResourceViewDescription();
                srvdesc.Format = DXGI.Format.Unknown;
                srvdesc.Dimension = ShaderResourceViewDimension.Buffer;
                srvdesc.Buffer.ElementCount = (int)NUM_ELEMENTS;
                gpubuf1_srv = new ShaderResourceView(dev, gpubuf1, srvdesc);
                gpubuf2_srv = new ShaderResourceView(dev, gpubuf2, srvdesc);

                // Unordered access views over the GPU buffers.
                UnorderedAccessViewDescription uavdesc = new UnorderedAccessViewDescription();
                uavdesc.Format = DXGI.Format.Unknown;
                uavdesc.Dimension = UnorderedAccessViewDimension.Buffer;
                uavdesc.Buffer.ElementCount = (int)NUM_ELEMENTS;
                gpubuf1_uav = new UnorderedAccessView(dev, gpubuf1, uavdesc);
                gpubuf2_uav = new UnorderedAccessView(dev, gpubuf2, uavdesc);

                // CPU-readable staging buffer used to download the result.
                // NOTE(review): option flags assumed to match the first sample — confirm.
                cpubuf = new D3D11.Buffer(dev, sizeof(uint) * (int)NUM_ELEMENTS, ResourceUsage.Staging, BindFlags.None, CpuAccessFlags.Read, ResourceOptionFlags.BufferStructured, sizeof(uint));

                // Run the computation.
                DoCompute();
            }
            finally
            {
                // Release the objects; fields never assigned are still null and skipped.
                if (cpubuf != null) cpubuf.Dispose();
                if (gpubuf2_uav != null) gpubuf2_uav.Dispose();
                if (gpubuf2_srv != null) gpubuf2_srv.Dispose();
                if (gpubuf2 != null) gpubuf2.Dispose();
                if (gpubuf1_uav != null) gpubuf1_uav.Dispose();
                if (gpubuf1_srv != null) gpubuf1_srv.Dispose();
                if (gpubuf1 != null) gpubuf1.Dispose();
                if (constbuf != null) constbuf.Dispose();
                if (cs_transpose != null) cs_transpose.Dispose();
                if (cs_transpose_comp != null) cs_transpose_comp.Dispose();
                if (cs_sort != null) cs_sort.Dispose();
                if (cs_sort_comp != null) cs_sort_comp.Dispose();
                if (immctx != null) immctx.Dispose();
                if (dev != null) dev.Dispose();
            }
        }

        /// <summary>
        /// Writes the four shader constants into the constant buffer and binds it to b0.
        /// </summary>
        private static void SetConstants(uint iLevel, uint iLevelMask, uint iWidth, uint iHeight)
        {
            ConstBuffer cb = new ConstBuffer() { iLevel = iLevel, iLevelMask = iLevelMask, iWidth = iWidth, iHeight = iHeight };
            immctx.UpdateSubresource(ref cb, constbuf);
            immctx.ComputeShader.SetConstantBuffer(0, constbuf);
        }

        /// <summary>
        /// Generates random data, sorts it on the GPU, sorts the same data on the CPU
        /// with Array.Sort and prints whether both results are identical.
        /// </summary>
        private static void DoCompute()
        {
            Console.WriteLine("Generating random data");

            // Generate the random input.
            uint[] buf = new uint[NUM_ELEMENTS];
            Random rand = new Random(Environment.TickCount);
            for (int i = 0; i < NUM_ELEMENTS; i++)
            {
                buf[i] = (uint)rand.NextLong();
            }

            Console.WriteLine("Random data generated");
            Console.WriteLine("GPU sorting begins");

            // Upload the data and bind buffer 1 as the read/write target (u0).
            immctx.UpdateSubresource(buf, gpubuf1);
            immctx.ComputeShader.SetUnorderedAccessView(0, gpubuf1_uav);

            // Sort the data.
            // First sort the rows for levels no larger than the block size.
            for (uint level = 2; level <= BITONIC_BLOCK_SIZE; level = level * 2)
            {
                SetConstants(level, level, MATRIX_HEIGHT, MATRIX_WIDTH);

                // Sort the row data.
                immctx.ComputeShader.SetShader(cs_sort, null, 0);
                immctx.Dispatch((int)(NUM_ELEMENTS / BITONIC_BLOCK_SIZE), 1, 1);
            }

            // Then sort rows and columns for levels larger than the block size:
            // transpose, sort the columns, transpose back, sort the rows.
            for (uint level = (BITONIC_BLOCK_SIZE * 2); level <= NUM_ELEMENTS; level = level * 2)
            {
                SetConstants((level / BITONIC_BLOCK_SIZE), (level & ~NUM_ELEMENTS) / BITONIC_BLOCK_SIZE, MATRIX_WIDTH, MATRIX_HEIGHT);

                // Transpose the data from buffer 1 into buffer 2.
                // The SRV slot is cleared first so a buffer is never bound as input
                // and output at the same time.
                immctx.ComputeShader.SetShaderResource(0, null);
                immctx.ComputeShader.SetUnorderedAccessView(0, gpubuf2_uav);
                immctx.ComputeShader.SetShaderResource(0, gpubuf1_srv);
                immctx.ComputeShader.SetShader(cs_transpose, null, 0);
                immctx.Dispatch((int)(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE), (int)(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE), 1);

                // Sort the transposed column data.
                immctx.ComputeShader.SetShader(cs_sort, null, 0);
                immctx.Dispatch((int)(NUM_ELEMENTS / BITONIC_BLOCK_SIZE), 1, 1);

                SetConstants(BITONIC_BLOCK_SIZE, level, MATRIX_HEIGHT, MATRIX_WIDTH);

                // Transpose the data from buffer 2 back into buffer 1.
                immctx.ComputeShader.SetShaderResource(0, null);
                immctx.ComputeShader.SetUnorderedAccessView(0, gpubuf1_uav);
                immctx.ComputeShader.SetShaderResource(0, gpubuf2_srv);
                immctx.ComputeShader.SetShader(cs_transpose, null, 0);
                immctx.Dispatch((int)(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE), (int)(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE), 1);

                // Sort the row data.
                immctx.ComputeShader.SetShader(cs_sort, null, 0);
                immctx.Dispatch((int)(NUM_ELEMENTS / BITONIC_BLOCK_SIZE), 1, 1);
            }

            // Download: copy to the staging buffer, map it and read the elements.
            immctx.CopyResource(gpubuf1, cpubuf);
            uint[] outbuf;
            DataStream ds;
            immctx.MapSubresource(cpubuf, 0, MapMode.Read, MapFlags.None, out ds);
            try
            {
                outbuf = ds.ReadRange<uint>((int)NUM_ELEMENTS);
            }
            finally
            {
                // Always dispose the stream and unmap, even if the read throws.
                ds.Dispose();
                immctx.UnmapSubresource(cpubuf, 0);
            }

            Console.WriteLine("GPU sorting ends");
            Console.WriteLine("CPU sorting begins");

            // CPU reference sort (the author notes Array.Sort is fast enough here that
            // it is hard to tell apart from the GPU sort).
            Array.Sort(buf);

            Console.WriteLine("CPU sorting ends");
            Console.WriteLine("Comparing");

            // Check that both results are identical.
            bool same = true;
            for (int i = 0; i < NUM_ELEMENTS; i++)
            {
                if (buf[i] != outbuf[i])
                {
                    same = false;
                    break;
                }
            }
            Console.WriteLine("Result: {0}", same);
        }
    }
}
</code>

[修改于 7年5个月前 - 2016/11/06 20:05:40]

来自:计算机科学 / 软件综合
0
已屏蔽 原因:{{ notice.reason }}已屏蔽
{{notice.noticeContent}}
~~空空如也

想参与大家的讨论?现在就 登录 或者 注册

所属专业
所属分类
上级专业
同级专业
acmilan
进士 学者 笔友
文章
461
回复
2934
学术分
4
2009/05/30注册,5年2个月前活动
暂无简介
主体类型:个人
所属领域:无
认证方式:邮箱
IP归属地:未同步
文件下载
加载中...
{{errorInfo}}
{{downloadWarning}}
你在 {{downloadTime}} 下载过当前文件。
文件名称:{{resource.defaultFile.name}}
下载次数:{{resource.hits}}
上传用户:{{uploader.username}}
所需积分:{{costScores}},{{holdScores}}下载当前附件免费{{description}}
积分不足,去充值
文件已丢失

当前账号的附件下载数量限制如下:
时段 个数
{{f.startingTime}}点 - {{f.endTime}}点 {{f.fileCount}}
视频暂不能访问,请登录试试
仅供内部学术交流或培训使用,请先保存到本地。本内容不代表科创观点,未经原作者同意,请勿转载。
音频暂不能访问,请登录试试
支持的图片格式:jpg, jpeg, png
插入公式
评论控制
加载中...
文号:{{pid}}
投诉或举报
加载中...
{{tip}}
请选择违规类型:
{{reason.type}}

空空如也

加载中...
详情
详情
推送到专栏从专栏移除
设为匿名取消匿名
查看作者
回复
只看作者
加入收藏取消收藏
收藏
取消收藏
折叠回复
置顶取消置顶
评学术分
鼓励
设为精选取消精选
管理提醒
编辑
通过审核
评论控制
退修或删除
历史版本
违规记录
投诉或举报
加入黑名单移除黑名单
查看IP
{{format('YYYY/MM/DD HH:mm:ss', toc)}}