PyOpenCLでBitonicSort(備忘録)

Connected-Component Labelingの実装中にソートが必要だったのでPyOpenCL環境で記述.
pyopencl.algorithm下にあるっぽいのですが, どの道最終的にHLSLに移植する際には自分で記述しないといけないので書きました.
Connected-Component Labelingの実装の途中に記述してあるのを無理やり引っこ抜いてきたので謎の変数名が散見されますが, 備忘録ということでひとつ.

※注意※
実装方法に問題があるため基数ソートが安定ソートになっていません

<br />
import pyopencl as cl<br />
import numpy as np</p>
<p>ctx = cl.create_some_context()<br />
queue = cl.CommandQueue(ctx)</p>
<p>length = np.int32(256)<br />
labelElements = np.zeros(length * 2, dtype=np.int32)</p>
<p>length_Buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, length.nbytes, hostbuf=length)<br />
sortedElements_Buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, labelElements.nbytes)</p>
<p>sequence = np.empty(2, dtype=np.int32)<br />
sequence_Buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, sequence.nbytes)</p>
<p>code_bitonicSort = &#8221;&#8217;<br />
#define GROUP_SIZE_X 16</p>
<p>__kernel __attribute__((reqd_work_group_size(GROUP_SIZE_X, 1, 1)))<br />
void firstSort(<br />
    __constant int *input,<br />
    __constant int *length,<br />
    __global int *result,<br />
    __global int *sequence<br />
){</p>
<p>    int thid = get_global_id(0);<br />
    int grid = get_local_id(0);<br />
    int gridx = get_group_id(0);</p>
<p>    __local int local_data[(GROUP_SIZE_X * 2) * 2];<br />
    local_data[(2 * grid) * 2]          = input[(2 * thid) * 2];        //index<br />
    local_data[(2 * grid) * 2 + 1]      = input[(2 * thid) * 2 + 1];    //key<br />
    local_data[(2 * grid + 1) * 2]      = input[(2 * thid + 1) * 2];    //index<br />
    local_data[(2 * grid + 1) * 2 + 1]  = input[(2 * thid + 1) * 2 + 1];//key<br />
    barrier(CLK_LOCAL_MEM_FENCE);</p>
<p>    __local int temp[GROUP_SIZE_X * 2];<br />
    //=========================================================================<br />
    // Radix Sort<br />
    //=========================================================================<br />
    for(int k=0; k&lt;32; k++){<br />
        int bit1 = (local_data[(2 * grid) * 2 + 1] &gt;&gt; k) &amp; 1;<br />
        int bit2 = (local_data[(2 * grid + 1) * 2 + 1] &gt;&gt; k) &amp; 1;<br />
        temp[2 * grid] = bit1 ^ 1;      //0と1を反転<br />
        temp[2 * grid + 1] = bit2 ^ 1;<br />
        barrier(CLK_LOCAL_MEM_FENCE);<br />
        int total = temp[(2 * GROUP_SIZE_X) &#8211; 1];</p>
<p>        //=====================================================================<br />
        // Scan<br />
        //=====================================================================<br />
        int offset = 1;<br />
        for(int i=(2 * GROUP_SIZE_X)&gt;&gt;1; i&gt;0; i&gt;&gt;=1){<br />
            barrier(CLK_LOCAL_MEM_FENCE);<br />
            if(grid &lt; i){<br />
                int ai = offset*(2*grid+1)-1;<br />
                int bi = offset*(2*grid+2)-1;<br />
                temp[bi]+=temp[ai];<br />
            }<br />
            offset &lt;&lt;= 1;<br />
        }<br />
        if(grid==0) temp[(2 * GROUP_SIZE_X) &#8211; 1] = 0;<br />
        for(int i=1; i&lt;(2 * GROUP_SIZE_X); i&lt;&lt;=1){<br />
            offset &gt;&gt;= 1;<br />
            barrier(CLK_LOCAL_MEM_FENCE);<br />
            if(grid &lt; i){<br />
                int ai = offset*(2*grid+1)-1;<br />
                int bi = offset*(2*grid+2)-1;<br />
                int t = temp[ai];<br />
                temp[ai] = temp[bi];<br />
                temp[bi] += t;<br />
            }<br />
        }<br />
        barrier(CLK_LOCAL_MEM_FENCE);<br />
        total += temp[(2 * GROUP_SIZE_X) &#8211; 1];</p>
<p>        //=====================================================================<br />
        // Sort<br />
        //=====================================================================<br />
        int t1 = (2 * grid) &#8211; temp[(2 * grid)] + total;<br />
        int t2 = (2 * grid + 1) &#8211; temp[(2 * grid + 1)] + total;<br />
        temp[(2 * grid)] = bit1 ? t1 : temp[(2 * grid)];<br />
        temp[(2 * grid + 1)] = bit2 ? t2 : temp[(2 * grid + 1)];</p>
<p>        int2 d1 = (int2)(local_data[(2 * grid) * 2], local_data[(2 * grid) * 2 + 1]);<br />
        int2 d2 = (int2)(local_data[(2 * grid + 1) * 2], local_data[(2 * grid + 1) * 2 + 1]);<br />
        barrier(CLK_LOCAL_MEM_FENCE);</p>
<p>        local_data[temp[(2 * grid)] * 2] = d1.x;<br />
        local_data[temp[(2 * grid)] * 2 + 1] = d1.y;<br />
        local_data[temp[(2 * grid + 1)] * 2] = d2.x;<br />
        local_data[temp[(2 * grid + 1)] * 2 + 1] = d2.y;</p>
<p>        barrier(CLK_LOCAL_MEM_FENCE);<br />
    }</p>
<p>    if(gridx%2==0){<br />
        result[(2 * thid) * 2]          = local_data[(2 * grid) * 2];<br />
        result[(2 * thid) * 2 + 1]      = local_data[(2 * grid) * 2 + 1];<br />
        result[(2 * thid + 1) * 2]      = local_data[(2 * grid + 1) * 2];<br />
        result[(2 * thid + 1) * 2 + 1]  = local_data[(2 * grid + 1) * 2 + 1];<br />
    }<br />
    else{<br />
        result[(2 * thid) * 2]          = local_data[(2 * ((GROUP_SIZE_X &#8211; 1) &#8211; grid) + 1) * 2];<br />
        result[(2 * thid) * 2 + 1]      = local_data[(2 * ((GROUP_SIZE_X &#8211; 1) &#8211; grid) + 1) * 2 + 1];<br />
        result[(2 * thid + 1) * 2]      = local_data[(2 * ((GROUP_SIZE_X &#8211; 1) &#8211; grid)) * 2];<br />
        result[(2 * thid + 1) * 2 + 1]  = local_data[(2 * ((GROUP_SIZE_X &#8211; 1) &#8211; grid)) * 2 + 1];<br />
    }</p>
<p>    // シーケンスを設定<br />
    if(thid==0){<br />
        sequence[0] = 4;<br />
        sequence[1] = sequence[0] &gt;&gt; 1;<br />
    }<br />
}</p>
<p>__kernel __attribute__((reqd_work_group_size(GROUP_SIZE_X, 1, 1)))<br />
void bitonicSort(<br />
    /*__constant int *input, */<br />
    __constant int *length,<br />
    __global int *result,<br />
    __global int *sequence<br />
){</p>
<p>    int thid = get_global_id(0);<br />
    int grid = get_local_id(0);<br />
    int gridx = get_group_id(0);</p>
<p>    int dir = 1;//ソートの方向. 1で小-&gt;大, 0で大-&gt;小   </p>
<p>    __local int local_data[(GROUP_SIZE_X * 2) * 2];<br />
    __local int temp[GROUP_SIZE_X * 2];</p>
<p>    //=========================================================================<br />
    // ソートするブロックと方向を指定<br />
    //=========================================================================<br />
    // 方向<br />
    dir = ((int)(gridx / (sequence[0] &gt;&gt; 1)) &amp; 1) ^ 1;// 1グループあたり2列を担当することに注意</p>
<p>    int seq[2] = {sequence[0], sequence[1]};<br />
    barrier(CLK_LOCAL_MEM_FENCE);<br />
    // 一つ目のブロック<br />
    local_data[grid * 2]<br />
        = result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2];        //index</p>
<p>    local_data[grid * 2 + 1]<br />
        = result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1];    //key</p>
<p>    // 二つ目のブロック<br />
    local_data[(grid + GROUP_SIZE_X) * 2]<br />
        = result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2];    //index</p>
<p>    local_data[(grid + GROUP_SIZE_X) * 2 + 1]<br />
        = result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1];//key</p>
<p>    barrier(CLK_LOCAL_MEM_FENCE); </p>
<p>    for(int k=0; k&lt;32; k++){<br />
        int bit1 = (local_data[(2 * grid) * 2 + 1] &gt;&gt; k) &amp; 1;<br />
        int bit2 = (local_data[(2 * grid + 1) * 2 + 1] &gt;&gt; k) &amp; 1;<br />
        temp[2 * grid] = bit1 ^ 1;<br />
        temp[2 * grid + 1] = bit2 ^ 1;<br />
        barrier(CLK_LOCAL_MEM_FENCE);<br />
        int total = temp[(2 * GROUP_SIZE_X) &#8211; 1];</p>
<p>        //=====================================================================<br />
        // Scan<br />
        //=====================================================================<br />
        int offset = 1;<br />
        for(int i=(2 * GROUP_SIZE_X)&gt;&gt;1; i&gt;0; i&gt;&gt;=1){<br />
            barrier(CLK_LOCAL_MEM_FENCE);<br />
            if(grid &lt; i){<br />
                int ai = offset*(2*grid+1)-1;<br />
                int bi = offset*(2*grid+2)-1;<br />
                temp[bi]+=temp[ai];<br />
            }<br />
            offset &lt;&lt;= 1;<br />
        }<br />
        if(grid==0) temp[(2 * GROUP_SIZE_X) &#8211; 1] = 0;<br />
        for(int i=1; i&lt;(2 * GROUP_SIZE_X); i&lt;&lt;=1){<br />
            offset &gt;&gt;= 1;<br />
            barrier(CLK_LOCAL_MEM_FENCE);<br />
            if(grid &lt; i){<br />
                int ai = offset*(2*grid+1)-1;<br />
                int bi = offset*(2*grid+2)-1;<br />
                int t = temp[ai];<br />
                temp[ai] = temp[bi];<br />
                temp[bi] += t;<br />
            }<br />
        }<br />
        barrier(CLK_LOCAL_MEM_FENCE);<br />
        total += temp[(2 * GROUP_SIZE_X) &#8211; 1];</p>
<p>        //=====================================================================<br />
        // Sort<br />
        //=====================================================================<br />
        int t1 = (2 * grid) &#8211; temp[(2 * grid)] + total;<br />
        int t2 = (2 * grid + 1) &#8211; temp[(2 * grid + 1)] + total;<br />
        temp[(2 * grid)] = bit1 ? t1 : temp[(2 * grid)];<br />
        temp[(2 * grid + 1)] = bit2 ? t2 : temp[(2 * grid + 1)];</p>
<p>        int2 d1 = (int2)(local_data[(2 * grid) * 2], local_data[(2 * grid) * 2 + 1]);<br />
        int2 d2 = (int2)(local_data[(2 * grid + 1) * 2], local_data[(2 * grid + 1) * 2 + 1]);<br />
        barrier(CLK_LOCAL_MEM_FENCE);</p>
<p>        local_data[temp[(2 * grid)] * 2] = d1.x;<br />
        local_data[temp[(2 * grid)] * 2 + 1] = d1.y;<br />
        local_data[temp[(2 * grid + 1)] * 2] = d2.x;<br />
        local_data[temp[(2 * grid + 1)] * 2 + 1] = d2.y;</p>
<p>        barrier(CLK_LOCAL_MEM_FENCE);<br />
    }</p>
<p>    //=========================================================================<br />
    // 結果を格納<br />
    //=========================================================================<br />
    if(dir){<br />
        // 一つ目のブロック<br />
        result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2]<br />
            = local_data[grid * 2];         //index</p>
<p>        result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1]<br />
            = local_data[grid * 2 + 1];     //key</p>
<p>        // 二つ目のブロック<br />
        result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2]<br />
            = local_data[(grid + GROUP_SIZE_X) * 2];         //index</p>
<p>        result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1]<br />
            = local_data[(grid + GROUP_SIZE_X) * 2 + 1];     //key<br />
    }<br />
    else{<br />
        // 上と同様(逆順)<br />
        result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2]<br />
            = local_data[(2 * GROUP_SIZE_X &#8211; 1 &#8211; grid) * 2];</p>
<p>        result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1]<br />
            = local_data[(2 * GROUP_SIZE_X &#8211; 1 &#8211; grid) * 2 + 1];</p>
<p>        result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2]<br />
            = local_data[(GROUP_SIZE_X &#8211; 1 &#8211; grid) * 2];</p>
<p>        result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1]<br />
            = local_data[(GROUP_SIZE_X &#8211; 1 &#8211; grid) * 2 + 1];<br />
    }</p>
<p>    barrier(CLK_LOCAL_MEM_FENCE);</p>
<p>    // シーケンスの更新<br />
    if(thid == 0){<br />
        if(sequence[1] &gt; 1){<br />
            sequence[1] &gt;&gt;= 1;<br />
        }<br />
        else{<br />
            sequence[0] &lt;&lt;= 1;<br />
            sequence[1] = sequence[0] &gt;&gt; 1;<br />
        }<br />
    }</p>
<p>}</p>
<p>&#8221;&#8217;</p>
<p>prg_bitonicSort = cl.Program(ctx, code_bitonicSort).build()</p>
<p>randomArray = np.empty(2*length, dtype=np.int32)<br />
randomArray[0:length*2:2] = np.arange(1, length+1, 1)<br />
randomArray[1:length*2+1:2] = np.random.randint(0, 1000, length)<br />
print(randomArray)<br />
randomArray_Buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, randomArray.nbytes, hostbuf=randomArray)</p>
<p>event_firstSort = prg_bitonicSort.firstSort(queue,<br />
                        [length // 2], [16],<br />
                            randomArray_Buf, length_Buf, sortedElements_Buf, sequence_Buf)<br />
event_firstSort.wait()</p>
<p>sortedElements = np.empty(labelElements.shape, dtype=np.int32)<br />
cl.enqueue_copy(queue, sortedElements, sortedElements_Buf)<br />
print(sortedElements)</p>
<p>for i in range(sum(range(2, int(np.log2(length//16)) + 1))):<br />
    event_bitonicSort = prg_bitonicSort.bitonicSort(queue,<br />
                        [length // 2], [16],<br />
                            length_Buf, sortedElements_Buf, sequence_Buf)<br />
    event_bitonicSort.wait()</p>
<p>cl.enqueue_copy(queue, sortedElements, sortedElements_Buf)<br />
print(sortedElements)<br />

Unityだと文字を出力するタイプのプログラムのデバッグはやや面倒(というよりは冗長?)かと思ったのでPyOpenCL(Python+OpenCL)を使ってみました.
デバイス側のコードは結局CライクなのでHLSL触っていれば導入コストは低いと思います.
ドキュメンテーションがしっかりしているのも良い点.
PyOpenCLの方は, 私の検索が甘かったのかもしれませんが, あまり実装例が見当たらなかった印象.
Pythonを初めて触りましたが, Numpyがやや面倒でした.

参考
http://t-pot.com/program/90_BitonicSort/index.html (2016/09/26)
http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html (2016/09/26)

Posted on