Search Unity

[Idea] Unity with C# to GPU power!

Discussion in 'General Discussion' started by Arowx, Jan 7, 2015.

  1. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    Here is a Conway's Game of Life planet (needs Cekirdekler v1.1.9):


    Here is the code (you may need to change the platforms.gpus().devicesAmd(true, true); part to match your GPUs):

    here is the new camera script:
    Code (CSharp):
    1.  
    2. using UnityEngine;
    3. using System.Collections;
    4. using System;
    5. using Cekirdekler;
    6. using Cekirdekler.ClArrays;
    7. using System.IO;
    8. using System.Text;
    9. using System.Threading.Tasks;
    10. using System.Threading;
    11.  
    12. public class Kamera : MonoBehaviour
    13. {
    14.  
    15.  
    16.     public static class UnitySystemConsoleRedirector
    17.     {
    18.         // Collects Console output and hands it to Debug.Log whenever a newline-terminated chunk arrives.
    19.         private class UnityTextWriter : TextWriter
    20.         {
    21.             private StringBuilder pending = new StringBuilder();
    22.  
    23.             // Emits everything buffered so far as one log entry and empties the buffer.
    24.             public override void Flush()
    25.             {
    26.                 Debug.Log(pending.ToString());
    27.                 pending.Length = 0;
    28.             }
    29.  
    30.             // Buffers the text; flushes only when its last character is '\n'.
    31.             public override void Write(string value)
    32.             {
    33.                 pending.Append(value);
    34.                 bool endsLine = !string.IsNullOrEmpty(value) && value[value.Length - 1] == '\n';
    35.                 if (endsLine)
    36.                 {
    37.                     Flush();
    38.                 }
    39.             }
    40.  
    41.             // Buffers a single character; a newline flushes.
    42.             public override void Write(char value)
    43.             {
    44.                 pending.Append(value);
    45.                 if (value != '\n')
    46.                 {
    47.                     return;
    48.                 }
    49.                 Flush();
    50.             }
    51.  
    52.             // Converts the character range to a string and reuses the string overload.
    53.             public override void Write(char[] value, int index, int count)
    54.             {
    55.                 string text = new string(value, index, count);
    56.                 Write(text);
    57.             }
    58.  
    59.             public override Encoding Encoding
    60.             {
    61.                 get { return Encoding.Default; }
    62.             }
    63.         }
    64.         // Replaces Console's output stream with a UnityTextWriter.
    65.         public static void Redirect()
    66.         {
    67.             Console.SetOut(new UnityTextWriter());
    68.         }
    69.     }
    70.     public Vector3[] vertices = null;
    71.     public Vector3[] verticesBase = null;
    72.     public Vector3[] normals = null;
    73.     public Vector2[] colors = null;
    74.     public Mesh mesh = null;
    75.     public GameObject copy = null;
    76.     const int nConway = 2048;
    77.     byte[] data = null;
    78.     Texture2D tx = null;
    79.  
    80.  
    81.  
    82.     byte[] charLevel      = new byte[nConway*nConway];
    83.     byte[] charHitpoint   = new byte[nConway*nConway];
    84.     byte[] charAttack     = new byte[nConway*nConway];
    85.     byte[] charDefense    = new byte[nConway*nConway];
    86.     byte[] charExperience = new byte[nConway*nConway];
    87.     byte[] charClass      = new byte[nConway*nConway];
    88.     int[] charRandomBuf      = new int[nConway*nConway];
    89.  
    90.     ClArray<byte> charLevelGpu = null;
    91.     ClArray<byte> charHitpointGpu = null;
    92.     ClArray<byte> charAttackGpu = null;
    93.     ClArray<byte> charDefenseGpu = null;
    94.     ClArray<byte> charExperienceGpu = null;
    95.     ClArray<byte> charClassGpu = null;
    96.     ClArray<int> charRandomBufGpu = null;
    97.     void Start()
    98.     {
    99.         // Hand the managed per-cell arrays to their ClArray counterparts.
    100.         charLevelGpu = charLevel;
    101.         charHitpointGpu = charHitpoint;
    102.         charAttackGpu = charAttack;
    103.         charDefenseGpu = charDefense;
    104.         charExperienceGpu = charExperience;
    105.         charClassGpu = charClass;
    106.         charRandomBufGpu = charRandomBuf;
    107.         UnitySystemConsoleRedirector.Redirect();
    108.  
    109.         // Create the display sphere just in front of this object and rebuild its mesh with createSphere.
    110.         copy = GameObject.CreatePrimitive(PrimitiveType.Sphere);
    111.         copy.transform.Rotate(1, 10, 100);
    112.         Transform self = gameObject.transform;
    113.         copy.transform.position = self.position + self.forward * 0.64f;
    114.         mesh = createSphere(copy.GetComponent<MeshFilter>().mesh);
    115.         // vertices and verticesBase are read separately (presumably so they are distinct copies - confirm).
    116.         vertices = mesh.vertices;
    117.         verticesBase = mesh.vertices;
    118.         normals = mesh.normals;
    119.         tx = new Texture2D(nConway, nConway, TextureFormat.RGBA32, false);
    120.         colors = mesh.uv;
    121.         Debug.Log(colors.Length);
    122.         Debug.Log(colors[55240].x);
    123.         data = data ?? tx.GetRawTextureData();
    124.         // Random start state: each pixel is all-0 or all-255 in RGB, alpha fixed at 255.
    125.         System.Random rng = new System.Random();
    126.         for (int px = 0; px < data.Length; px += 4)
    127.         {
    128.             byte cell = (rng.Next() % 2 == 0) ? (byte)0 : (byte)255;
    129.             data[px] = cell;
    130.             data[px + 1] = cell;
    131.             data[px + 2] = cell;
    132.             data[px + 3] = 255;
    133.         }
    134.         Debug.Log(data.Length);
    135.         tx.LoadRawTextureData(data);
    136.         tx.Apply();
    137.         copy.GetComponent<Renderer>().material.mainTexture = tx;
    138.     }
    139.  
    140.     Mesh createSphere(Mesh mesh_)
    141.     {
    142.         // Clears mesh_ and refills it with a latitude/longitude sphere of radius 0.3; returns the same Mesh object.
    143.         Mesh mesh = mesh_;
    144.         mesh.Clear();
    145.         float radius = 0.3f;
    146.         // Longitude |||
    147.         int nbLong = 224;
    148.         // Latitude ---
    149.         int nbLat = 256;
    150.  
    151.         #region Vertices
    152.         Vector3[] vertices = new Vector3[(nbLong + 1) * nbLat + 2];
    153.         float _pi = Mathf.PI;
    154.         float _2pi = _pi * 2f;
    155.         // Vertex 0 is the top pole, the last vertex the bottom pole; nbLat rings lie in between.
    156.         vertices[0] = Vector3.up * radius;
    157.         for (int lat = 0; lat < nbLat; lat++)
    158.         {
    159.             float a1 = _pi * (float)(lat + 1) / (nbLat + 1);
    160.             float sin1 = Mathf.Sin(a1);
    161.             float cos1 = Mathf.Cos(a1);
    162.             // nbLong + 1 vertices per ring: the last one repeats longitude 0 but gets its own UV (u = 1).
    163.             for (int lon = 0; lon <= nbLong; lon++)
    164.             {
    165.                 float a2 = _2pi * (float)(lon == nbLong ? 0 : lon) / nbLong;
    166.                 float sin2 = Mathf.Sin(a2);
    167.                 float cos2 = Mathf.Cos(a2);
    168.  
    169.                 vertices[lon + lat * (nbLong + 1) + 1] = new Vector3(sin1 * cos2, cos1, sin1 * sin2) * radius;
    170.             }
    171.         }
    172.         vertices[vertices.Length - 1] = Vector3.up * -radius;
    173.         #endregion
    174.  
    175.         #region Normales
    176.         Vector3[] normales = new Vector3[vertices.Length];
    177.         for (int n = 0; n < vertices.Length; n++)
    178.             normales[n] = vertices[n].normalized;
    179.         #endregion
    180.  
    181.         #region UVs
    182.         Vector2[] uvs = new Vector2[vertices.Length];
    183.         uvs[0] = Vector2.up;
    184.         uvs[uvs.Length - 1] = Vector2.zero;
    185.         for (int lat = 0; lat < nbLat; lat++)
    186.             for (int lon = 0; lon <= nbLong; lon++)
    187.                 uvs[lon + lat * (nbLong + 1) + 1] = new Vector2((float)lon / nbLong, 1f - (float)(lat + 1) / (nbLat + 1));
    188.         #endregion
    189.  
    190.         #region Triangles
    191.         int nbFaces = nbLong * nbLat;
    192.         int nbTriangles = nbFaces * 2; // two caps of nbLong triangles plus (nbLat - 1) bands of 2 * nbLong triangles
    193.         int nbIndexes = nbTriangles * 3;
    194.         int[] triangles = new int[nbIndexes];
    195.         // The three loops below write exactly nbIndexes entries, so no zero-filled triangles are left over.
    196.         //Top Cap
    197.         int i = 0;
    198.         for (int lon = 0; lon < nbLong; lon++)
    199.         {
    200.             triangles[i++] = lon + 2;
    201.             triangles[i++] = lon + 1;
    202.             triangles[i++] = 0;
    203.         }
    204.  
    205.         //Middle
    206.         for (int lat = 0; lat < nbLat - 1; lat++)
    207.         {
    208.             for (int lon = 0; lon < nbLong; lon++)
    209.             {
    210.                 int current = lon + lat * (nbLong + 1) + 1;
    211.                 int next = current + nbLong + 1;
    212.  
    213.                 triangles[i++] = current;
    214.                 triangles[i++] = current + 1;
    215.                 triangles[i++] = next + 1;
    216.  
    217.                 triangles[i++] = current;
    218.                 triangles[i++] = next + 1;
    219.                 triangles[i++] = next;
    220.             }
    221.         }
    222.  
    223.         //Bottom Cap
    224.         for (int lon = 0; lon < nbLong; lon++)
    225.         {
    226.             triangles[i++] = vertices.Length - 1;
    227.             triangles[i++] = vertices.Length - (lon + 2) - 1;
    228.             triangles[i++] = vertices.Length - (lon + 1) - 1;
    229.         }
    230.         #endregion
    231.  
    232.         mesh.vertices = vertices;
    233.         mesh.normals = normales;
    234.         mesh.uv = uvs;
    235.         mesh.triangles = triangles;
    236.  
    237.         mesh.RecalculateBounds();
    238.         mesh.Optimize();
    239.         return mesh;
    240.     }
    241.  
    242.     private float t = 0;
    243.     private float ctr = 0;
    244.     private ClNumberCruncher numberCruncher = null;
    245.     private ClArray<byte> xyzGPU = null;
    246.  
    247.     private ClArray<byte> xyznGPU = null;
    248.  
    249.     private ClArray<byte> xyzoGPU = null;
    250.     private ClArray<byte> colorsGPU = null;
    251.  
    252.     private ClArray<float> arguments = null;
    253.     private ClArray<byte> bData = null;
    254.     private ClArray<byte> bData2 = null;
    255.     private ClArray<byte> bData3 = null;
    256.     void Update() // per frame: advance the timers, run the CPU or OpenCL step, then upload texture and mesh
    257.     {
    258.         if (ctr < 0.3)
    259.             ctr += 0.001f;
    260.         t += 0.001f;
    261.         if (vertices != null)
    262.         {
    263.             float x = verticesBase[0].x;
    264.             float y = verticesBase[0].y;
    265.             float x2 = verticesBase[25670].x;
    266.             float y2 = verticesBase[25670].y;
    267.             float x3 = verticesBase[52670].x;
    268.             float y3 = verticesBase[52670].y;
    269.             bool strategy = false;
    270.             // strategy == true runs the CPU ripple below; false (the hard-coded value) runs the OpenCL path.
    271.             if (strategy)
    272.             {
    273.                 // CPU start
    274.                 Parallel.For(0, vertices.Length, i =>
    275.                  {
    276.                      float dx = verticesBase[i].x - x;
    277.                      float dy = verticesBase[i].y - y;
    278.                      vertices[i] = verticesBase[i] + 0.02f * normals[i] * ctr * (float)Math.Sin(40.0f * t + 100.0f * Math.Sqrt(dx * dx + dy * dy));
    279.                  });
    280.                 // CPU end
    281.             }
    282.             else
    283.             {
    284.                 // GPGPU start
    285.                 int nGPU = 224 * 256; // number of vertices aligned to multiple of 64
    286.                 // NOTE(review): createSphere builds (224 + 1) * 256 + 2 = 57602 vertices, more than this 57344 work size - confirm the tail is meant to stay unprocessed.
    287.                 // init number cruncher start
    288.                 if (numberCruncher == null)
    289.                 {
    290.                     Cekirdekler.Hardware.ClPlatforms platforms = Cekirdekler.Hardware.ClPlatforms.all();
    291.                     var devices = platforms.gpus().devicesAmd(true, true);
    292.                     platforms.logInfo();
    293.                     devices.logInfo();
    294.                     numberCruncher = new ClNumberCruncher(devices, File.ReadAllText("assets/kernel.cl"), true);
    295.                 }
    296.                 // init number cruncher end
    297.  
    298.                 // init arrays start to optimize read/writes
    299.                 if (xyzGPU == null)
    300.                 {
    301.                     xyzGPU = ClArray<byte>.wrapArrayOfStructs(verticesBase); xyznGPU = ClArray<byte>.wrapArrayOfStructs(normals);
    302.                     xyzGPU.write = false;
    303.                     xyzoGPU = ClArray<byte>.wrapArrayOfStructs(vertices); arguments = new float[64]; xyzoGPU.read = false;
    304.                     bData = data; bData.numberOfElementsPerWorkItem = 4;
    305.                     bData2 = new ClArray<byte>(data.Length); bData2.numberOfElementsPerWorkItem = 4;
    306.                     bData3 = new ClArray<byte>(data.Length); bData3.numberOfElementsPerWorkItem = 4;
    307.                     arguments.write = false; arguments.partialRead = false;
    308.                     colorsGPU = ClArray<byte>.wrapArrayOfStructs(colors);
    309.                     colorsGPU.partialRead = true;
    310.                     colorsGPU.write = false;
    311.                     Debug.Log(colorsGPU.numberOfElementsPerWorkItem);
    312.                     // remaining transfer flags for the base-vertex, normal and output wrappers
    313.                     xyzGPU.partialRead = true;
    314.                     xyznGPU.partialRead = true;
    315.                     xyznGPU.write = false;
    316.                     xyzoGPU.partialRead = true;
    317.                 }
    318.                 // init arrays end
    319.  
    320.                 // wave parameters for all vertices
    321.                 arguments[0] = ctr;
    322.                 arguments[1] = t;
    323.                 arguments[2] = x;
    324.                 arguments[3] = y;
    325.                 arguments[4] = nGPU;
    326.                 arguments[5] = x2;
    327.                 arguments[6] = y2;
    328.                 arguments[7] = x3;
    329.                 arguments[8] = y3;
    330.  
    331.                 // compute start CPU+GPU
    332.                 bData2.read = false;
    333.                 bData2.partialRead = false;
    334.                 bData2.write = false;
    335.                 bData3.read = false;
    336.                 bData3.partialRead = false;
    337.                 bData3.write = false;
    338.                 // NOTE(review): read/partialRead/write presumably select which host<->device copies accompany a compute call - confirm against the Cekirdekler docs.
    339.  
    340.                 // Conway's game of life(2048x2048) kernel and 17x17 diffusion stencil kernel for height map(2048x2048) smoothing
    341.                 bData.nextParam(bData2,bData3).compute(numberCruncher, 2, "live live2", nConway * nConway,64);
    342.  
    343.  
    344.                 bData.read = false;
    345.                 bData.partialRead = false;
    346.                 bData.write = false;
    347.  
    348.                 // mesh deformation with heightmap, color change with life (224x256) on a sphere
    349.                 xyzGPU.nextParam(xyznGPU, xyzoGPU, arguments,colorsGPU,bData,bData3).compute(numberCruncher, 1, "waveEquation", nGPU, 64);
    350.  
    351.  
    352.  
    353.                 // Flags are reset here so the next frame's first compute call sees bData read/write enabled again.
    354.                 xyzoGPU.read = false;xyzoGPU.partialRead = false;
    355.                 bData.read = true;
    356.                 bData.write = true;
    357.                 // compute end
    358.  
    359.  
    360.                 // GPGPU end
    361.             }
    362.  
    363.             tx.wrapMode = TextureWrapMode.Clamp;
    364.             tx.LoadRawTextureData(data);
    365.             tx.Apply();
    366.             mesh.vertices = vertices;
    367.             mesh.normals = normals;
    368.             mesh.RecalculateNormals(); // just for reflections
    369.            // mesh.RecalculateBounds();
    370.             copy.transform.Rotate(t, 0, 0);
    371.         }
    372.  
    373.         if (Input.GetKey(KeyCode.Escape))
    374.         {
    375.             Application.Quit();
    376.         }
    377.  
    378.     }
    379.  
    380.     void OnDestroy()
    381.     {
    382.         // NOTE(review): empty - numberCruncher and the ClArray wrappers are never released here; confirm whether Cekirdekler needs an explicit dispose.
    383.     }
    384.  
    385.  
    386.  
    387.  
    388.  
    389. }
    390.  
    391.  
    These are the contents of the .cl file (kept in a separate file instead of one big string):
    Code (CSharp):
    1.  
    2. __constant int nConway = 2048;
    3. uint wang_hash(uint seed) // integer hash: xor/shift/multiply mixing of a 32-bit value
    4. {
    5.     uint h = (seed ^ 61) ^ (seed >> 16);
    6.     h *= 9;
    7.     h ^= h >> 4;
    8.     h *= 0x27d4eb2d;
    9.     h ^= h >> 15;
    10.     return h;
    11. }
    12.  
    13. void wang_rnd_0(__global unsigned int * rnd_buffer, int id)
    14. {
    15.     // Seeds slot `id` of the buffer with the hash of the index itself;
    16.     // wang_rnd later advances the slot from this starting value.
    17.     uint rndint = wang_hash(id);
    18.     rnd_buffer[id] = rndint;
    19. }
    20.  
    21. float wang_rnd(__global unsigned int * rnd_buffer, int id)
    22. {
    23.     // Advance this slot's state by one hash step, then scale the new state by the largest uint.
    24.     const uint maxint = 0xFFFFFFFFu; // same value as a uint 0 decremented once
    25.     uint next = wang_hash(rnd_buffer[id]);
    26.     rnd_buffer[id] = next;
    27.     return (float)next / (float)maxint;
    28. }
    29.  
    30. __kernel void rnd_init(__global unsigned int * rnd_buffer)
    31. {
    32.     // Each work-item seeds the buffer slot that matches its global id.
    33.     wang_rnd_0(rnd_buffer, get_global_id(0));
    34. }
    35.  
    36. __kernel void rnd_1(__global unsigned int * rnd_buffer)
    37. {
    38.     // Draws one number for this work-item. The value is not used further;
    39.     // only the advanced state written back to rnd_buffer persists.
    40.     float thread_private_random_number = wang_rnd(rnd_buffer, get_global_id(0));
    41. }
    42.  
    43. __constant uchar CHAR_FIGHTER = 0;
    44. __constant uchar CHAR_RANGER = 1;
    45. __constant uchar CHAR_SORCERER = 2;
    46. __constant uchar CHAR_ROGUE = 3;
    47. __constant uchar CHAR_BARBARIAN = 4;
    48. __constant uchar CHAR_BARD = 5;
    49. __constant uchar CHAR_MONK = 6;
    50. __constant uchar CHAR_WIZARD = 7;
    51. __constant uchar CHAR_CLERIC = 8;
    52. uchar dice(int n, int d, __global unsigned int *buf, int id)
    53. {
    54.     // Sums max(n, 1) draws of (int)(wang_rnd * d) and saturates the total to 0..255.
    55.     int total = 0, rolls = 0;
    56.     do { total += (int)(wang_rnd(buf, id) * d); } while (++rolls < n);
    57.     return max(min(total, 255), 0);
    58. }
    59.  
    60. uchar baseHitpoint(uchar classType, uchar charLevel,__global int *randomBuffer,int id)
    61. {
    62.     // Rolls dice(1, 10) for fighters and dice(1, 8) for rangers; other classes get 0 and draw no random number.
    63.     // dice() takes an unsigned buffer, so the signed pointer is converted explicitly. charLevel is unused.
    64.     __global unsigned int *rng = (__global unsigned int *)randomBuffer;
    65.     if (classType == CHAR_FIGHTER)
    66.         return dice(1, 10, rng, id);
    67.     return (classType == CHAR_RANGER) ? dice(1, 8, rng, id) : (uchar)0;
    68. }
    69.  
    70. uchar baseAttack(uchar classType, uchar charLevel)
    71. {
    72.     // Fighters attack at their level, rangers at level * 2 / 3 (integer math), all other classes at 0.
    73.     if (classType == CHAR_FIGHTER)
    74.         return charLevel;
    75.     if (classType == CHAR_RANGER)
    76.         return charLevel * 2 / 3;
    77.     return 0;
    78. }
    79.  
    80. uchar baseDefense(uchar classType, uchar charLevel)
    81. {
    82.     // Integer-divided level: /10 for fighters, /8 for rangers, 0 for any other class.
    83.     if (classType == CHAR_FIGHTER)
    84.         return charLevel / 10;
    85.     if (classType == CHAR_RANGER)
    86.         return charLevel / 8;
    87.     return 0;
    88. }
    89.  
    90.  
    91.  
    92. __kernel void initCharacter(__global uchar * charLevel, __global uchar * charHitpoint, __global uchar * charAttack,
    93.     __global uchar * charDefense, __global uchar * charExperience, __global uchar * charClass, __global int * randomBuffer)
    94. {
    95.     int id = get_global_id(0); // one work-item per character slot; charClass[id] is read, the other arrays are written
    96.     charLevel[id] = 1;
    97.     charHitpoint[id] = baseHitpoint(charClass[id], charLevel[id],randomBuffer,id);
    98.     charExperience[id] = dice(1,100,(__global unsigned int *)randomBuffer,id); // dice() takes an unsigned buffer
    99.     charAttack[id] = baseAttack(charClass[id], charLevel[id]);
    100.     charDefense[id] = baseDefense(charClass[id], charLevel[id]);
    101. }
    102.  
    103. __kernel void live2(__global uchar * map, __global uchar * ctr, __global uchar * diffuseMap)
    104. {
    105.     int i = get_global_id(0);
    106.     // Remap the linear id onto 8x8 tiles (the /64 and %8 arithmetic assumes a work-group size of 64).
    107.     int groupId = i / 64;
    108.     int gx = groupId % (nConway / 8);
    109.     int gy = groupId / (nConway / 8);
    110.     int localId = get_local_id(0);
    111.     int lx = localId % 8;
    112.     int ly = localId / 8;
    113.     int kx = lx + gx * 8; int jy = ly + gy * 8;
    114.     i = kx + jy*nConway;
    115.     map[i * 4] = ctr[i * 4];
    116.     map[i * 4 + 1] = ctr[i * 4 + 1];
    117.     map[i * 4 + 2] = ctr[i * 4 + 2];
    118.     map[i * 4 + 3] = ctr[i * 4 + 3];
    119.     int acc = 0;
    120.     // 24x24 local tile: this group's 8x8 block in the middle plus whichever of the 8 neighbouring blocks exist.
    121.     __local uchar ctrL[24][24]; // explicit element type; every value stored below is a uchar read from ctr
    122.     ctrL[lx + 8][ly + 8] = ctr[(gx * 8 + lx) + (gy * 8 + ly)*nConway];
    123.     if (gx>0 && gy>0)
    124.         ctrL[lx][ly] = ctr[(gx * 8 + lx - 8) + (gy * 8 + ly - 8)*nConway];
    125.     if (gx<(nConway / 8 - 1) && gy<(nConway / 8 - 1))
    126.         ctrL[lx + 16][ly + 16] = ctr[(gx * 8 + lx + 8) + (gy * 8 + ly + 8)*nConway];
    127.     if (gx>0)
    128.         ctrL[lx][ly + 8] = ctr[(gx * 8 + lx - 8) + (gy * 8 + ly)*nConway];
    129.     if (gy>0)
    130.         ctrL[lx + 8][ly] = ctr[(gx * 8 + lx) + (gy * 8 + ly - 8)*nConway];
    131.     if (gx<(nConway / 8 - 1))
    132.         ctrL[lx + 16][ly + 8] = ctr[(gx * 8 + lx + 8) + (gy * 8 + ly)*nConway];
    133.     if (gy<(nConway / 8 - 1))
    134.         ctrL[lx + 8][ly + 16] = ctr[(gx * 8 + lx) + (gy * 8 + ly + 8)*nConway];
    135.     if (gx<(nConway / 8 - 1) && gy>(0))
    136.         ctrL[lx + 16][ly] = ctr[(gx * 8 + lx + 8) + (gy * 8 + ly - 8)*nConway];
    137.     if (gy<(nConway / 8 - 1) && gx>(0))
    138.         ctrL[lx][ly + 16] = ctr[(gx * 8 + lx - 8) + (gy * 8 + ly + 8)*nConway];
    139.     barrier(CLK_LOCAL_MEM_FENCE); // NOTE(review): the tile loads above index ctr per byte, not per 4-byte pixel like the rest of this kernel - confirm intent
    140.     int minu = ((lx + gx * 8) >= 8 ? -8 : (-lx)) + 8; // NOTE(review): at grid edges these bounds can include tile cells the guarded loads skipped (e.g. gx == 0, lx == 7 gives 1) - confirm
    141.     int maxu = ((lx + gx * 8) <= (nConway - 9) ? (8) : (8 - lx)) + 8;
    142.     int minv = ((ly + gy * 8) >= 8 ? -8 : (-ly)) + 8;
    143.     int maxv = ((ly + gy * 8) <= (nConway - 9) ? (8) : (8 - ly)) + 8;
    144.     int w = maxu - minu + 1; // number of u positions the loop below visits
    145.     int h = maxv - minv + 1; // number of v positions the loop below visits
    146.     for (int u = minu; u <= maxu; u++)
    147.     {
    148.         for (int v = minv; v <= maxv; v++)
    149.         {
    150.             acc += ctrL[u][v];
    151.         }
    152.     }
    153.     diffuseMap[i * 4 + 3] = diffuseMap[i * 4 + 2];
    154.     diffuseMap[i * 4 + 2] = diffuseMap[i * 4 + 1];
    155.     diffuseMap[i * 4 + 1] = diffuseMap[i * 4];
    156.     diffuseMap[i * 4] = min(max(0, acc / (w*h)), 255);
    157. }
    158. __kernel void live(__global uchar * map, __global uchar * ctr, __global uchar * diffuseMap)
    159. {
    160.     // One work-item per cell: counts live neighbours in `map` (first byte of each 4-byte pixel > 0)
    161.     // and writes the cell's next state to `ctr`. diffuseMap is not used by this kernel.
    162.     const int cell = get_global_id(0);
    163.     const int col = cell % nConway;
    164.     const int row = cell / nConway;
    165.     const int self = row * nConway + col;
    166.  
    167.     int alive = 0;
    168.     for (int dy = -1; dy <= 1; dy++)
    169.     {
    170.         for (int dx = -1; dx <= 1; dx++)
    171.         {
    172.             const int nx = col + dx;
    173.             const int ny = row + dy;
    174.             // skip the cell itself and any neighbour outside the grid (edges do not wrap)
    175.             if ((dx == 0 && dy == 0) || nx < 0 || nx >= nConway || ny < 0 || ny >= nConway)
    176.                 continue;
    177.             if (map[(self + nConway * dy + dx) * 4] > 0)
    178.                 alive++;
    179.         }
    180.     }
    181.  
    182.     const int base = self * 4;
    183.     uchar r, g, b;
    184.     if (alive == 3 && map[base] == 0)
    185.     {
    186.         // dead cell with exactly three neighbours: becomes white
    187.         r = g = b = 255;
    188.     }
    189.     else if (alive == 2 || alive == 3)
    190.     {
    191.         // two or three neighbours otherwise: colour carried over unchanged
    192.         r = map[base];
    193.         g = map[base + 1];
    194.         b = map[base + 2];
    195.     }
    196.     else
    197.     {
    198.         // any other neighbour count: cell is cleared
    199.         r = g = b = 0;
    200.     }
    201.  
    202.     // alpha is written as 255 in every case
    203.     ctr[base] = r;
    204.     ctr[base + 1] = g;
    205.     ctr[base + 2] = b;
    206.     ctr[base + 3] = 255;
    207. }
    208.  
    209.  
    210. __kernel void calculateNormals(__global float *xyz, __global float *normals)
    211. {
    212.     // Empty stub: the kernel body performs no work.
    213. }
    214.  
    215.  
    216. __kernel void waveEquation(__global float *xyz, __global float *xyzn,
    217.     __global float *xyzo, __global float * arguments,
    218.     __global float *colors, __global uchar *texture, __global uchar * diffuseMap)
    219. {
    220.     // Writes xyzo = xyz + xyzn * height per vertex, with height sampled from diffuseMap at the vertex's (colors) coordinates.
    221.     const int vtx = get_global_id(0);
    222.     if (!(vtx < arguments[4])) // arguments[4] is the work size the host stores as nGPU
    223.         return;
    224.  
    225.     const int iX = (int)(colors[vtx * 2] * nConway);
    226.     const int iY = (int)(colors[vtx * 2 + 1] * nConway);
    227.     float height = 0;
    228.     if (iX >= 0 && iX < nConway && iY >= 0 && iY < nConway)
    229.     {
    230.         // sum the four bytes stored for this texel, then scale down
    231.         const int texel = (iY * nConway + iX) * 4;
    232.         height = (diffuseMap[texel] + diffuseMap[texel + 1] +
    233.                   diffuseMap[texel + 2] + diffuseMap[texel + 3]) / 20000.0f;
    234.     }
    235.  
    236.     for (int c = 0; c < 3; c++)
    237.     {
    238.         const int k = vtx * 3 + c;
    239.         xyzo[k] = xyz[k] + xyzn[k] * height;
    240.     }
    241. }
    242.  
    243.  
     
    ZJP and angrypenguin like this.
  2. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    This example demonstrates an R7-240 (a low-end, old AMD card) and an RX-550 (a low-end, new AMD card) computing at the same time:



    It seems that rendering time is sensitive to PCI-e bandwidth. It's sending a 57k-vertex sphere. Maybe a shader-compute version with tessellation wouldn't be affected by bandwidth, but how can one run shaders (OpenGL, DirectX) on multiple GPUs without taking on the difficulty of DX12 or Vulkan?
     
    Flurgle likes this.
  3. ZJP

    ZJP

    Joined:
    Jan 22, 2010
    Posts:
    2,649
    Thanks for this useful video. :cool:
     
  4. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    Since YouTube never asks for money, I keep filling it with GPGPU videos:

     
    Flurgle likes this.
  5. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    The API now has a "device to device pipelining" feature, which can attach a different GPU to each kernel and run consecutive kernels concurrently with the help of double buffering.


    https://github.com/tugrul512bit/Cekirdekler/wiki/Pipelining:-Device-to-Device



    Every iteration needs data to be pushed in from the left end, and it takes several steps before a result pops out of the right end. Once the first result has been taken, it takes a single step (pushData) to get each new consecutive result.

    Maybe some graphics expert can build a geometry pipeline with this? Maybe with stages doing triangulation, rasterization, and postprocessing.

    Not a Unity example but here it is:

     
    Flurgle and ippdev like this.
  6. yoonitee

    yoonitee

    Joined:
    Jun 27, 2013
    Posts:
    2,363
    So I'm still not clear. Is your system actually any faster than simply using Parallel.For and a CPU only solution? If so by how much?
     
  7. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    For just sqrt(sin(x)*sin(x)), it's 100% faster than Parallel.For, and Parallel.For is about 80% faster than LINQ because LINQ creates copies. This is for double-precision arrays with 16M elements. Going to float should be faster for both CPU and GPU, and adding more calculations per byte helps the GPU increase the performance gap.

    Algorithms with a low compute-to-data ratio can make the CPU faster or equal. For just c=a+b with device-to-device pipelining, the CPU is faster. Without that pipelining but with streaming (regardless of whether multiple GPUs are used), the GPU is faster.
     
    ZJP and yoonitee like this.
  8. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    Here is a runnable demo of pipeline feature: download

    but it gives an error (50% of the time; you may need to try 1-2 more times to make it work) related to WinForms control thread safety (I used Invoke but it didn't solve it); maybe I should create a Unity version.
     
  9. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    To further optimize single-GPU-only scenarios, an async enqueue mode was added in v1.2.10:

    https://github.com/tugrul512bit/Cekirdekler/releases

    example code fragment:

    Code (csharp):
    1.  
    2. cruncher.enqueueMode = true;
    3.  
    4. // default queue (0)
    5. dataArrayA.nextParam(dataArrayB, constant).compute(cruncher, 1, "vecAdd", 1024 * 1024);
    6.  
    7. // next concurrent queue (1)
    8. cruncher.enqueueModeAsyncEnable = true;
    9. dataArrayC.nextParam(dataArrayD, constant2).compute(cruncher, 1, "vecMul", 1024 * 1024);
    10. cruncher.enqueueModeAsyncEnable = false;
    11.  
    12. // default queue (0)
    13. dataArrayE.nextParam(dataArrayF, constant3).compute(cruncher, 1, "vecDiv", 1024 * 1024);
    14.  
    15. // next concurrent queue (2)
    16. cruncher.enqueueModeAsyncEnable = true;
    17. dataArrayG.nextParam(dataArrayH, constant4).compute(cruncher, 1, "vecAddInt", 1024 * 1024);
    18. dataArrayG.nextParam(dataArrayH, constant4).compute(cruncher, 1, "vecAddInt", 1024 * 1024);
    19. dataArrayG.nextParam(dataArrayH, constant4).compute(cruncher, 1, "vecAddInt", 1024 * 1024);
    20. dataArrayG.nextParam(dataArrayH, constant4).compute(cruncher, 1, "vecAddInt", 1024 * 1024);
    21. cruncher.enqueueModeAsyncEnable = false;
    22.  
    23. // enqueue mode is also implicitly asynchronous to host code:
    24. // e.g. a calculateGameLogic() call placed here, just before the
    25. // enqueueMode = false assignment, runs asynchronously to the GPU queues
    26.  
    27.  
    28. cruncher.enqueueMode = false;
    29.  


    This can reduce latency by up to 45% in dense real-world scenarios. This picture shows a 15% saving. If I double the work, it goes to around 30%; if I keep adding more work, it saturates at a 45% time saving because kernel compute time is roughly equal to the sum of the buffer read and write timings. All these concurrent queues are also asynchronous to host code. You can compute other things on the CPU while the queues are working.


    Also, I heard Bullet Physics has come to Unity, so AMD cards can have performant physics too. Have you tried it yet?
     
    Last edited: May 31, 2017
  10. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    Here is an image processing pipeline made with v1.2.11's "single device pipeline" feature within hours:



    Creating a pipeline for an already-known algorithm would take no more than 30 minutes if OpenCL is already known (it is a kind of constrained C99).
     
    Last edited: Jun 3, 2017
  11. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    Now OpenCL 2.0 dynamic parallelism is supported with v1.4.1 https://github.com/tugrul512bit/Cekirdekler/releases/tag/v1.4.1_update2

    "Also device-pool + task(kernels to be computed later) pool" feature can achieve a good performance with greedy scheduling for non-separable kernels(a gpu gets a new job as soon as its channels finish a task) but this is tested for only OpenCL 1.2.
     
    ZJP likes this.
  12. tinyant

    tinyant

    Joined:
    Aug 28, 2015
    Posts:
    127
    Very interesting Stuff!!:)
     
  13. ippdev

    ippdev

    Joined:
    Feb 7, 2010
    Posts:
    3,853
    How could this be used to get better performance on huge arch-viz scenes?
     
  14. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    Did you mean sketchup-like drawing/building scenes? Any info about its benchmarks? How does it render stuff?

    If they are already using GPU acceleration for rendering, it may not help much. Maybe parsing the file and building the geometry could be done with dynamic parallelism. For example, the GPU starts with 1 thread and the CPU sends it the file bytes. It starts parsing. Whenever it stumbles upon a geometry, it allocates 64 workers to build its structure in memory. It continues; whenever it stumbles upon a small geometry, it allocates 16 workers. Whenever it comes across a big geometry, it allocates 1M workers. It can even spawn other copies of itself to increase speed if the start and end points of the object data are known.

    All these can happen in GPU without waiting command from CPU.

    I searched the internet a bit: http://www.cgarchitect.com/2016/11/...sualization-rendering-engine-survey---results

    this shows vray is already in effect so rendering part is already pro-performance. Or I am mistaken and that is just a benchmark between other renderers, not Arch-viz.

    Here, https://www.redshift3d.com/blog/building-a-bigger-arch-viz-business-with-redshift-for-3ds-max, the pictures are high quality, so it should already have GPU acceleration. I wonder how long opening a file takes.

    With task/device pool, you can compute hundreds of compute requests. Maybe useful for remote rendering (a render service for clients from thousands of kilometers away with wooden PCs)
     
    Last edited: Jul 4, 2017
  15. Arowx

    Arowx

    Joined:
    Nov 12, 2009
    Posts:
    8,194
    I think he's referring to how a developer can manage a GPU with a high rendering load and GPU processing and balance the two?
     
  16. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    OpenCL can render to OpenGL textures, but this project does not support that feature yet, so it only computes stuff. But it can schedule multiple kernels to multiple GPUs, or it can separate a kernel into smaller parts and feed them to multiple GPUs, or it can work as a pipeline so that each stage (a kernel) of it runs on a different GPU. If these types of workflow are not doable, then this project cannot help, especially if there is no latency to hide behind another. The OpenCL 2.0 part is a bit complicated, but it cuts the GPU's dependency on the CPU for multiple kernel executions.
     
  17. ippdev

    ippdev

    Joined:
    Feb 7, 2010
    Posts:
    3,853
    Could this be used for an Occlusion Culling algorithm that had not been prebaked?
     
  18. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    I haven't done occlusion culling myself, but it seems to involve some sort of sorting of objects before drawing so that rendering resources are not wasted. If this sorting (with a Z-buffer? I don't know) includes massively parallel short-range sorts, then maybe it could help, but I also lack experience with detailed geometry pipelines, so the hardware-accelerated path (I mean, what is done automatically by the geometry engine of the GPU) could already be faster.

    If I were more experienced in OpenGL, I could answer your question properly. I also just took a video of dynamic parallelism example here
    it does k-means clustering. Yes, I know k-means clustering may not help clustering geometry objects to cull them(or may it?)

    If it's a ray-traced scene, then culling may be done more easily by applying some sort of acceleration structure (like a uniform grid) to cull backplane objects.
     
  19. ippdev

    ippdev

    Joined:
    Feb 7, 2010
    Posts:
    3,853
    Can it do any performance magic with something like this
    https://docs.unity3d.com/Manual/CullingGroupAPI.html
     
  20. Tugrul_512bit

    Tugrul_512bit

    Joined:
    Apr 9, 2016
    Posts:
    46
    It says

    "The API works by having you provide an array of bounding spheres. The visibility of these spheres relative to a particular camera is then calculated, along with a ‘distance band’ value that can be treated like a LOD level number."

    so this is a collision check or something similar and should be possible to accelerate, but the cost of moving arrays from GPU to CPU and then from CPU to GPU again would make it slower, imho. But if culling takes 1-2 seconds, then it may help to reduce it to sub-second timings (maybe).

    Is culling group api working on CPU already? Then GPU would help. But how many hours or days to write those codes, I don't know.
     
    Last edited: Jul 7, 2017
    Flurgle likes this.
  21. andrej-szontagh

    andrej-szontagh

    Joined:
    May 18, 2017
    Posts:
    20
    Unfortunately this forum doesn't seem to have a voting system, so I will write a reply.
    With WebCL you could do all the crazy stuff you couldn't do with WebGL .. also you can do a lot of parallel heavy number crunching .. It would be nice if we could somehow access this from unity.
     
    Flurgle likes this.
  22. sgrein

    sgrein

    Joined:
    May 9, 2018
    Posts:
    11
    tugrul_512bit:

    What is the overhead when I want to call a kernel repeatedly?

    I tried Cudafy and managedCuda and both are slow (~50 ms) when transferring data to the GPU.
     
  23. AndersMalmgren

    AndersMalmgren

    Joined:
    Aug 31, 2014
    Posts:
    5,358
    Maybe if you have a huge team that can optimize the crap out of your scene. But for many of us setpass calls are the bottleneck. I know it's the case for our game. That's why I always bitch on unity to get Vulkan and gfx job working properly.

    Edit: haha, missed that this was an old Arowx thread :p
     
    Last edited: May 4, 2019
    Flurgle likes this.