[Idea] Unity with C# to GPU power!

Tugrul_512bit · Apr 25, 2017

ZJP said: ↑

Sorry for the late response. Thanks for the link.
Click to expand...

Here is a Conway's Game of Life planet (needs Cekirdekler v1.1.9):

Here is the code (you may need to change the platforms.gpus().devicesAmd(true, true); part to match your own hardware):

here is the new camera script:

Code (CSharp):

using UnityEngine;

using System.Collections;

using System;

using Cekirdekler;

using Cekirdekler.ClArrays;

using System.IO;

using System.Text;

using System.Threading.Tasks;

using System.Threading;

public class Kamera : MonoBehaviour

{

/// <summary>
/// Routes <see cref="Console"/> output into the Unity log.
/// </summary>
public static class UnitySystemConsoleRedirector
{
    /// <summary>
    /// Writer that accumulates text and hands it to <c>Debug.Log</c> whenever a
    /// write ends with a newline character.
    /// </summary>
    private class UnityTextWriter : TextWriter
    {
        private StringBuilder pending = new StringBuilder();

        public override Encoding Encoding
        {
            get { return Encoding.Default; }
        }

        /// <summary>Logs everything accumulated so far and empties the buffer.</summary>
        public override void Flush()
        {
            string message = pending.ToString();
            Debug.Log(message);
            pending.Length = 0;
        }

        public override void Write(char value)
        {
            pending.Append(value);
            if (value != '\n')
            {
                return;
            }
            Flush();
        }

        public override void Write(string value)
        {
            // Appending null is a no-op for StringBuilder, so append first and test afterwards.
            pending.Append(value);
            bool endsWithNewline = !string.IsNullOrEmpty(value) && value[value.Length - 1] == '\n';
            if (endsWithNewline)
            {
                Flush();
            }
        }

        public override void Write(char[] value, int index, int count)
        {
            Write(new string(value, index, count));
        }
    }

    /// <summary>Replaces the console's standard output with a Unity-logging writer.</summary>
    public static void Redirect()
    {
        TextWriter unityWriter = new UnityTextWriter();
        Console.SetOut(unityWriter);
    }
}

// CPU-side copies of the sphere mesh data; Start() fills them from the generated mesh.
public Vector3[] vertices;      // positions that Update() writes back to the mesh
public Vector3[] verticesBase;  // second copy of the generated positions, used as the undeformed reference
public Vector3[] normals;
public Vector2[] colors;        // holds mesh.uv (texture coordinates), despite the name
public Mesh mesh;
public GameObject copy;         // sphere primitive created in Start()

// Side length of the square texture / cell grid.
const int nConway = 2048;

// Raw RGBA32 bytes of tx, and the texture itself.
byte[] data;
Texture2D tx;

// One entry per grid cell. In the visible code these are only wrapped by the
// ClArray fields below (in Start()) and not passed to any compute call.
byte[] charLevel = new byte[nConway * nConway];
byte[] charHitpoint = new byte[nConway * nConway];
byte[] charAttack = new byte[nConway * nConway];
byte[] charDefense = new byte[nConway * nConway];
byte[] charExperience = new byte[nConway * nConway];
byte[] charClass = new byte[nConway * nConway];
int[] charRandomBuf = new int[nConway * nConway];

// ClArray wrappers for the arrays above; assigned in Start().
ClArray<byte> charLevelGpu;
ClArray<byte> charHitpointGpu;
ClArray<byte> charAttackGpu;
ClArray<byte> charDefenseGpu;
ClArray<byte> charExperienceGpu;
ClArray<byte> charClassGpu;
ClArray<int> charRandomBufGpu;

/// <summary>
/// Creates the sphere object, builds its mesh, caches the mesh arrays and fills the
/// texture with a random black/white pattern.
/// </summary>
void Start()
{
    // Wrap the per-cell character arrays as ClArrays.
    charLevelGpu = charLevel;
    charHitpointGpu = charHitpoint;
    charAttackGpu = charAttack;
    charDefenseGpu = charDefense;
    charExperienceGpu = charExperience;
    charClassGpu = charClass;
    charRandomBufGpu = charRandomBuf;

    UnitySystemConsoleRedirector.Redirect();

    // Place a sphere primitive 0.64 units in front of this object.
    copy = GameObject.CreatePrimitive(PrimitiveType.Sphere);
    Transform sphereTransform = copy.transform;
    sphereTransform.Rotate(1, 10, 100);
    Transform self = gameObject.transform;
    sphereTransform.position = self.position + self.forward * 0.64f;

    // Replace the primitive's mesh with the generated sphere and cache its arrays.
    MeshFilter filter = copy.GetComponent<MeshFilter>();
    mesh = createSphere(filter.mesh);
    vertices = mesh.vertices;
    verticesBase = mesh.vertices;
    normals = mesh.normals;

    tx = new Texture2D(nConway, nConway, TextureFormat.RGBA32, false);

    colors = mesh.uv;
    Debug.Log(colors.Length);
    // Debug probe at a hard-coded vertex index; requires the mesh to have more than 55240 vertices.
    Debug.Log(colors[55240].x);

    if (data == null)
    {
        data = tx.GetRawTextureData();
    }

    // Each pixel becomes opaque black or opaque white (4 bytes per pixel).
    System.Random rng = new System.Random();
    for (int offset = 0; offset < data.Length; offset += 4)
    {
        byte shade = (rng.Next() % 2 == 0) ? (byte)0 : (byte)255;
        data[offset] = shade;
        data[offset + 1] = shade;
        data[offset + 2] = shade;
        data[offset + 3] = 255;
    }
    Debug.Log(data.Length);

    tx.LoadRawTextureData(data);
    tx.Apply();

    Renderer sphereRenderer = copy.GetComponent<Renderer>();
    sphereRenderer.material.mainTexture = tx;
}

/// <summary>
/// Rebuilds <paramref name="mesh_"/> in place as a UV sphere (radius 0.3, 224 longitude
/// segments, 256 latitude rings) and returns that same instance.
/// </summary>
/// <remarks>
/// Vertex layout: index 0 is the top pole, the last index is the bottom pole, and ring
/// <c>lat</c> starts at index <c>lat * (nbLong + 1) + 1</c>. Each ring has
/// <c>nbLong + 1</c> vertices because the seam column is duplicated (the last column
/// reuses angle 0) so that U can run from 0 to 1. Total: (224 + 1) * 256 + 2 = 57602 vertices.
/// </remarks>
/// <param name="mesh_">Mesh to clear and refill.</param>
/// <returns>The same mesh instance that was passed in.</returns>
Mesh createSphere(Mesh mesh_)
{
    Mesh mesh = mesh_;
    mesh.Clear();

    float radius = 0.3f;
    // Longitude |||
    int nbLong = 224;
    // Latitude ---
    int nbLat = 256;

    #region Vertices
    Vector3[] vertices = new Vector3[(nbLong + 1) * nbLat + 2];
    float _pi = Mathf.PI;
    float _2pi = _pi * 2f;

    vertices[0] = Vector3.up * radius;
    for (int lat = 0; lat < nbLat; lat++)
    {
        float a1 = _pi * (float)(lat + 1) / (nbLat + 1);
        float sin1 = Mathf.Sin(a1);
        float cos1 = Mathf.Cos(a1);

        for (int lon = 0; lon <= nbLong; lon++)
        {
            // The extra column (lon == nbLong) sits exactly on top of column 0.
            float a2 = _2pi * (float)(lon == nbLong ? 0 : lon) / nbLong;
            float sin2 = Mathf.Sin(a2);
            float cos2 = Mathf.Cos(a2);

            vertices[lon + lat * (nbLong + 1) + 1] = new Vector3(sin1 * cos2, cos1, sin1 * sin2) * radius;
        }
    }
    vertices[vertices.Length - 1] = Vector3.up * -radius;
    #endregion

    #region Normales
    // The sphere is centred on the origin, so each normal is the normalized position.
    Vector3[] normales = new Vector3[vertices.Length];
    for (int n = 0; n < vertices.Length; n++)
    {
        normales[n] = vertices[n].normalized;
    }
    #endregion

    #region UVs
    Vector2[] uvs = new Vector2[vertices.Length];
    uvs[0] = Vector2.up;
    uvs[uvs.Length - 1] = Vector2.zero;
    for (int lat = 0; lat < nbLat; lat++)
    {
        for (int lon = 0; lon <= nbLong; lon++)
        {
            uvs[lon + lat * (nbLong + 1) + 1] = new Vector2((float)lon / nbLong, 1f - (float)(lat + 1) / (nbLat + 1));
        }
    }
    #endregion

    #region Triangles
    // Exact index count: nbLong triangles per cap plus 2 * nbLong per band between
    // adjacent rings, i.e. 2 * nbLong * nbLat triangles. Sizing the array from the
    // vertex count (as before) left unused trailing entries that were all zero and
    // therefore formed degenerate triangles.
    int nbTriangles = 2 * nbLong * nbLat;
    int nbIndexes = nbTriangles * 3;
    int[] triangles = new int[nbIndexes];

    //Top Cap
    int i = 0;
    for (int lon = 0; lon < nbLong; lon++)
    {
        triangles[i++] = lon + 2;
        triangles[i++] = lon + 1;
        triangles[i++] = 0;
    }

    //Middle
    for (int lat = 0; lat < nbLat - 1; lat++)
    {
        for (int lon = 0; lon < nbLong; lon++)
        {
            int current = lon + lat * (nbLong + 1) + 1;
            int next = current + nbLong + 1;

            triangles[i++] = current;
            triangles[i++] = current + 1;
            triangles[i++] = next + 1;

            triangles[i++] = current;
            triangles[i++] = next + 1;
            triangles[i++] = next;
        }
    }

    //Bottom Cap
    for (int lon = 0; lon < nbLong; lon++)
    {
        triangles[i++] = vertices.Length - 1;
        triangles[i++] = vertices.Length - (lon + 2) - 1;
        triangles[i++] = vertices.Length - (lon + 1) - 1;
    }
    #endregion

    mesh.vertices = vertices;
    mesh.normals = normales;
    mesh.uv = uvs;
    mesh.triangles = triangles;

    mesh.RecalculateBounds();
    mesh.Optimize();

    return mesh;
}

// Per-frame animation state advanced in Update().
private float t;    // increases by 0.001 every frame
private float ctr;  // increases by 0.001 per frame until it reaches 0.3

// OpenCL state; all of it is created lazily on the first GPU pass in Update().
private ClNumberCruncher numberCruncher;
private ClArray<byte> xyzGPU;      // wraps verticesBase
private ClArray<byte> xyznGPU;     // wraps normals
private ClArray<byte> xyzoGPU;     // wraps vertices (kernel output)
private ClArray<byte> colorsGPU;   // wraps colors (the mesh UVs)
private ClArray<float> arguments;  // 64 floats of kernel parameters
private ClArray<byte> bData;       // wraps the texture bytes in data
private ClArray<byte> bData2;      // scratch buffer, same length as data
private ClArray<byte> bData3;      // scratch buffer, same length as data

// Per-frame update: advances the animation counters, runs either the (disabled) CPU
// deformation or the OpenCL kernels, uploads the texture bytes, pushes the deformed
// vertices to the mesh, rotates the sphere, and quits the application on Escape.
void Update()

{

// ctr ramps up by 0.001 per frame until it reaches 0.3; t grows without bound.
if (ctr < 0.3)

ctr += 0.001f;

t += 0.001f;

if (vertices != null)

{

// Sample points taken from hard-coded vertex indices. They are forwarded to the
// kernel through arguments[2..3] and arguments[5..8] below.
float x = verticesBase[0].x;

float y = verticesBase[0].y;

float x2 = verticesBase[25670].x;

float y2 = verticesBase[25670].y;

float x3 = verticesBase[52670].x;

float y3 = verticesBase[52670].y;

// Hard-wired switch: false selects the GPU branch, so the CPU branch below never runs.
bool strategy = false;

if (strategy)

{

// CPU start

// Radial sine wave around (x, y), displacing each vertex along its normal.
Parallel.For(0, vertices.Length, i =>

{

float dx = verticesBase[i].x - x;

float dy = verticesBase[i].y - y;

vertices[i] = verticesBase[i] + 0.02f * normals[i] * ctr * (float)Math.Sin(40.0f * t + 100.0f * Math.Sqrt(dx * dx + dy * dy));

});

// CPU end

}

else

{

// GPGPU start

// NOTE(review): 224 * 256 = 57344, while createSphere builds (224 + 1) * 256 + 2 = 57602
// vertices, so the last 258 vertices are outside the kernel range — confirm this is intended.
int nGPU = 224 * 256; // number of vertices aligned to multiple of 64

// init number cruncher start

// One-time setup: select devices and compile the kernel source read from disk.
// NOTE(review): "assets/kernel.cl" is a relative path, so it depends on the process
// working directory — verify it resolves outside the editor.
if (numberCruncher == null)

{

Cekirdekler.Hardware.ClPlatforms platforms = Cekirdekler.Hardware.ClPlatforms.all();

var devices = platforms.gpus().devicesAmd(true, true);

platforms.logInfo();

devices.logInfo();

numberCruncher = new ClNumberCruncher(devices, File.ReadAllText("assets/kernel.cl"), true);

}

// init number cruncher end

// init arrays start to optimize read/writes

// One-time setup of the ClArray wrappers and their transfer flags.
// NOTE(review): read / write / partialRead presumably control host<->device copies
// around each compute call — confirm against the Cekirdekler documentation.
if (xyzGPU == null)

{

xyzGPU = ClArray<byte>.wrapArrayOfStructs(verticesBase); xyznGPU = ClArray<byte>.wrapArrayOfStructs(normals);

xyzGPU.write = false;

xyzoGPU = ClArray<byte>.wrapArrayOfStructs(vertices); arguments = new float[64]; xyzoGPU.read = false;

// bData wraps the same byte array that is later uploaded to the texture.
bData = data; bData.numberOfElementsPerWorkItem = 4;

bData2 = new ClArray<byte>(data.Length); bData2.numberOfElementsPerWorkItem = 4;

bData3 = new ClArray<byte>(data.Length); bData3.numberOfElementsPerWorkItem = 4;

arguments.write = false; arguments.partialRead = false;

colorsGPU = ClArray<byte>.wrapArrayOfStructs(colors);

colorsGPU.partialRead = true;

colorsGPU.write = false;

Debug.Log(colorsGPU.numberOfElementsPerWorkItem);

xyzGPU.partialRead = true;

xyznGPU.partialRead = true;

// NOTE(review): the next two statements are identical; the second is redundant
// (or was meant for a different array — confirm).
xyznGPU.write = false;

xyznGPU.write = false;

xyzoGPU.partialRead = true;

}

// init arrays end

// wave parameters for all vertices

arguments[0] = ctr;

arguments[1] = t;

arguments[2] = x;

arguments[3] = y;

// Vertex count limit, stored as a float because arguments is a float array.
arguments[4] = nGPU;

arguments[5] = x2;

arguments[6] = y2;

arguments[7] = x3;

arguments[8] = y3;

// compute start CPU+GPU

// Turn off all transfer flags on the two scratch buffers before the first compute call.
bData2.read = false;

bData2.partialRead = false;

bData2.write = false;

bData3.read = false;

bData3.partialRead = false;

bData3.write = false;

// Conway's game of life(2048x2048) kernel and 17x17 diffusion stencil kernel for height map(2048x2048) smoothing

// Both kernels are named in one space-separated string and share one work-item per cell
// (nConway * nConway) with a local size of 64.
bData.nextParam(bData2,bData3).compute(numberCruncher, 2, "live live2", nConway * nConway,64);

// Transfer flags on bData are switched off for the second compute call and restored after it.
bData.read = false;

bData.partialRead = false;

bData.write = false;

// mesh deformation with heightmap, color change with life (224x256) on a sphere

xyzGPU.nextParam(xyznGPU, xyzoGPU, arguments,colorsGPU,bData,bData3).compute(numberCruncher, 1, "waveEquation", nGPU, 64);

xyzoGPU.read = false;xyzoGPU.partialRead = false;

bData.read = true;

bData.write = true;

// compute end

// GPGPU end

}

// Upload the current contents of data (the array bData wraps) to the texture.
tx.wrapMode = TextureWrapMode.Clamp;

tx.LoadRawTextureData(data);

tx.Apply();

mesh.vertices = vertices;

// NOTE(review): RecalculateNormals() on the next statement presumably replaces the
// normals assigned here — confirm whether this assignment is needed.
mesh.normals = normals;

mesh.RecalculateNormals(); // just for reflections

// mesh.RecalculateBounds();

// The per-frame rotation step is t itself, so it grows as t grows.
copy.transform.Rotate(t, 0, 0);

}

if (Input.GetKey(KeyCode.Escape))

{

Application.Quit();

}

}

// Teardown hook; currently does nothing.
// NOTE(review): numberCruncher and the ClArray fields created in Update() are not
// released here — confirm whether Cekirdekler needs explicit cleanup for them.
void OnDestroy()

{

}

}

These are the contents of the .cl file (kept in a separate file instead of one big string):

Code (CSharp):

// Side length, in cells, of the square grid used by the live, live2 and waveEquation kernels.
__constant int nConway = 2048;

// 32-bit integer mixing function (the hash commonly attributed to Thomas Wang).
// Pure function: the same input always yields the same output.
uint wang_hash(uint seed)
{
    uint h = (seed ^ 61) ^ (seed >> 16);
    h *= 9;
    h ^= h >> 4;
    h *= 0x27d4eb2d;
    h ^= h >> 15;
    return h;
}

// Seeds slot `id` of the random-state buffer with the hash of the slot index itself.
//   rnd_buffer: one 32-bit generator state per work-item
//   id:         slot to initialise
// (The unused `maxint` local of the previous version has been removed.)
void wang_rnd_0(__global unsigned int * rnd_buffer, int id)
{
    rnd_buffer[id] = wang_hash(id);
}

// Advances the generator state in slot `id` by hashing it, stores the new state, and
// returns it scaled by the largest 32-bit unsigned value.
// NOTE(review): the result range looks like [0, 1] inclusive rather than [0, 1),
// because of float rounding near the top of the range — confirm if callers depend on it.
float wang_rnd(__global unsigned int * rnd_buffer, int id)
{
    const uint largest = 0xFFFFFFFFu;   // same value the original obtained by decrementing 0

    const uint next = wang_hash(rnd_buffer[id]);
    rnd_buffer[id] = next;

    return ((float)next) / (float)largest;
}

// Kernel: each work-item seeds its own slot of the random-state buffer.
__kernel void rnd_init(__global unsigned int * rnd_buffer)
{
    wang_rnd_0(rnd_buffer, (int)get_global_id(0));
}

// Kernel: each work-item draws one random number from its own slot. The value is
// discarded; the visible effect is that the slot's state advances by one step.
__kernel void rnd_1(__global unsigned int * rnd_buffer)
{
    const int slot = get_global_id(0);
    (void)wang_rnd(rnd_buffer, slot);
}

// Character class identifiers stored in the charClass buffer.
// Only CHAR_FIGHTER and CHAR_RANGER are handled by baseHitpoint / baseAttack /
// baseDefense; every other class falls through to their `return 0` branch.
__constant uchar CHAR_FIGHTER = 0;

__constant uchar CHAR_RANGER = 1;

__constant uchar CHAR_SORCERER = 2;

__constant uchar CHAR_ROGUE = 3;

__constant uchar CHAR_BARBARIAN = 4;

__constant uchar CHAR_BARD = 5;

__constant uchar CHAR_MONK = 6;

__constant uchar CHAR_WIZARD = 7;

__constant uchar CHAR_CLERIC = 8;

// Sums `n` draws of (int)(wang_rnd * d) from generator slot `id` and limits the total
// to the 0..255 range of the return type.
// At least one draw is always made, even when n < 1.
// Each draw is truncated from a value in [0, d], so it starts at 0, not 1.
uchar dice(int n, int d, __global unsigned int *buf, int id)
{
    int total = 0;
    int rolls = 0;
    do
    {
        total += (int)(wang_rnd(buf, id) * d);
        rolls++;
    } while (rolls < n);

    return min(max(total, 0), 255);
}

// Rolls starting hit points for a character class:
//   CHAR_FIGHTER -> dice(1, 10), CHAR_RANGER -> dice(1, 8), any other class -> 0.
// `charLevel` is accepted but not used. `randomBuffer`/`id` select the generator slot.
uchar baseHitpoint(uchar classType, uchar charLevel,__global int *randomBuffer,int id)
{
    // dice() takes an unsigned buffer; convert explicitly instead of relying on an
    // implicit conversion between pointers to differently-signed types.
    __global unsigned int *rng = (__global unsigned int *)randomBuffer;

    if (classType == CHAR_FIGHTER)
        return dice(1, 10, rng, id);
    else if (classType == CHAR_RANGER)
        return dice(1, 8, rng, id);
    else
        return 0;
}

// Base attack value: the full level for fighters, two thirds of the level (integer
// division) for rangers, 0 for every other class.
uchar baseAttack(uchar classType, uchar charLevel)
{
    if (classType == CHAR_FIGHTER)
    {
        return charLevel;
    }
    if (classType == CHAR_RANGER)
    {
        return charLevel * 2 / 3;
    }
    return 0;
}

// Base defense value: level / 10 for fighters, level / 8 for rangers (integer
// division), 0 for every other class.
uchar baseDefense(uchar classType, uchar charLevel)
{
    return (classType == CHAR_FIGHTER) ? (charLevel / 10)
         : (classType == CHAR_RANGER)  ? (charLevel / 8)
         : 0;
}

// Kernel: initialises one character per work-item.
// Reads charClass[id]; writes level (always 1), hit points, experience, attack and
// defense for the same index. Two random draws are made from slot `id`, hit points
// first and experience second.
__kernel void initCharacter(__global uchar * charLevel, __global uchar * charHitpoint, __global uchar * charAttack,
    __global uchar * charDefense, __global uchar * charExperience, __global uchar * charClass, __global int * randomBuffer)
{
    int id = get_global_id(0);

    charLevel[id] = 1;
    charHitpoint[id] = baseHitpoint(charClass[id], charLevel[id], randomBuffer, id);
    // dice() takes an unsigned buffer; convert explicitly instead of relying on an
    // implicit conversion between pointers to differently-signed types.
    charExperience[id] = dice(1, 100, (__global unsigned int *)randomBuffer, id);
    charAttack[id] = baseAttack(charClass[id], charLevel[id]);
    charDefense[id] = baseDefense(charClass[id], charLevel[id]);
}

// Kernel: copies the freshly computed generation from `ctr` back into `map` and
// updates a smoothed history in `diffuseMap`.
//   map, ctr:    4 bytes per cell
//   diffuseMap:  4 bytes per cell; bytes 1..3 hold the previous values of bytes 0..2,
//                byte 0 receives the new local average
// The 1-D work-item ids are remapped so that each group of 64 consecutive ids covers
// one 8x8 tile of the grid (the code assumes a work-group size of 64).
__kernel void live2(__global uchar * map, __global uchar * ctr, __global uchar * diffuseMap)
{
    int i = get_global_id(0);
    int groupId = i / 64;
    int gx = groupId % (nConway / 8);
    int gy = groupId / (nConway / 8);
    int localId = get_local_id(0);
    int lx = localId % 8;
    int ly = localId / 8;
    int kx = lx + gx * 8; int jy = ly + gy * 8;
    i = kx + jy*nConway;

    // Copy this cell's 4 bytes from the new generation into the map.
    map[i * 4] = ctr[i * 4];
    map[i * 4 + 1] = ctr[i * 4 + 1];
    map[i * 4 + 2] = ctr[i * 4 + 2];
    map[i * 4 + 3] = ctr[i * 4 + 3];

    int acc = 0;

    // 24x24 tile in local memory: this group's 8x8 block in the middle plus the eight
    // neighbouring 8x8 blocks. The element type is now spelled out; the declaration
    // previously had no type specifier and relied on an implicit `int`.
    __local int ctrL[24][24];

    // NOTE(review): the loads below index `ctr` by cell index without the `* 4`
    // used for the copies above — confirm whether byte or cell indexing is intended.
    ctrL[lx + 8][ly + 8] = ctr[(gx * 8 + lx) + (gy * 8 + ly)*nConway];
    if (gx>0 && gy>0)
        ctrL[lx][ly] = ctr[(gx * 8 + lx - 8) + (gy * 8 + ly - 8)*nConway];
    if (gx<(nConway / 8 - 1) && gy<(nConway / 8 - 1))
        ctrL[lx + 16][ly + 16] = ctr[(gx * 8 + lx + 8) + (gy * 8 + ly + 8)*nConway];
    if (gx>0)
        ctrL[lx][ly + 8] = ctr[(gx * 8 + lx - 8) + (gy * 8 + ly)*nConway];
    if (gy>0)
        ctrL[lx + 8][ly] = ctr[(gx * 8 + lx) + (gy * 8 + ly - 8)*nConway];
    if (gx<(nConway / 8 - 1))
        ctrL[lx + 16][ly + 8] = ctr[(gx * 8 + lx + 8) + (gy * 8 + ly)*nConway];
    if (gy<(nConway / 8 - 1))
        ctrL[lx + 8][ly + 16] = ctr[(gx * 8 + lx) + (gy * 8 + ly + 8)*nConway];
    if (gx<(nConway / 8 - 1) && gy>(0))
        ctrL[lx + 16][ly] = ctr[(gx * 8 + lx + 8) + (gy * 8 + ly - 8)*nConway];
    if (gy<(nConway / 8 - 1) && gx>(0))
        ctrL[lx][ly + 16] = ctr[(gx * 8 + lx - 8) + (gy * 8 + ly + 8)*nConway];

    // All tile loads must be finished before any work-item reads the tile.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Window bounds inside the tile, shrunk at the grid borders.
    // NOTE(review): the bounds are not offset by lx / ly, so away from the borders every
    // work-item of a group sums the same ctrL[0..16][0..16] window — confirm intent.
    int minu = ((lx + gx * 8) >= 8 ? -8 : (-lx)) + 8;
    int maxu = ((lx + gx * 8) <= (nConway - 9) ? (8) : (8 - lx)) + 8;
    int minv = ((ly + gy * 8) >= 8 ? -8 : (-ly)) + 8;
    int maxv = ((ly + gy * 8) <= (nConway - 9) ? (8) : (8 - ly)) + 8;
    int w = abs(maxu) + abs(minu) + 1;
    int h = abs(maxv) + abs(minv) + 1;

    for (int u = minu; u <= maxu; u++)
    {
        for (int v = minv; v <= maxv; v++)
        {
            acc += ctrL[u][v];
        }
    }

    // Shift the history by one slot and store the new average in slot 0.
    diffuseMap[i * 4 + 3] = diffuseMap[i * 4 + 2];
    diffuseMap[i * 4 + 2] = diffuseMap[i * 4 + 1];
    diffuseMap[i * 4 + 1] = diffuseMap[i * 4];
    diffuseMap[i * 4] = min(max(0, acc / (w*h)), 255);
}

// Kernel: one Game of Life step for one cell per work-item.
// Reads the current generation from `map` (4 bytes per cell; a cell counts as alive
// when its first byte is non-zero) and writes the next generation to `ctr`.
// Cells outside the grid count as dead. `diffuseMap` is not used here.
__kernel void live(__global uchar * map, __global uchar * ctr, __global uchar * diffuseMap)
{
    const int cell = get_global_id(0);
    const int col = cell % nConway;
    const int row = cell / nConway;
    const int base = (row * nConway + col) * 4;

    // Count live neighbours among the up to eight surrounding cells.
    int alive = 0;
    for (int dy = -1; dy <= 1; dy++)
    {
        const int ny = row + dy;
        if (ny < 0 || ny >= nConway)
            continue;

        for (int dx = -1; dx <= 1; dx++)
        {
            const int nx = col + dx;
            if (nx < 0 || nx >= nConway)
                continue;
            if (dx == 0 && dy == 0)
                continue;   // the cell itself is not a neighbour

            if (map[(ny * nConway + nx) * 4] > 0)
                alive++;
        }
    }

    const int stays = (alive == 2) || (alive == 3);
    const int born = (alive == 3) && (map[base] == 0);

    if (born)
    {
        // Dead cell with exactly three neighbours comes alive.
        ctr[base] = 255;
        ctr[base + 1] = 255;
        ctr[base + 2] = 255;
    }
    else if (stays)
    {
        // Two or three neighbours: the cell keeps its current state.
        ctr[base] = map[base];
        ctr[base + 1] = map[base + 1];
        ctr[base + 2] = map[base + 2];
    }
    else
    {
        // Fewer than two or more than three neighbours: the cell is dead.
        ctr[base] = 0;
        ctr[base + 1] = 0;
        ctr[base + 2] = 0;
    }

    // The fourth byte is always written as 255.
    ctr[base + 3] = 255;
}

// Placeholder kernel: the body is empty, so neither buffer is read or written.
__kernel void calculateNormals(__global float *xyz, __global float *normals)

{

}

// Kernel: displaces each vertex along its normal by a height sampled from diffuseMap.
//   xyz:        base positions, 3 floats per vertex (read)
//   xyzn:       normals, 3 floats per vertex (read)
//   xyzo:       displaced positions, 3 floats per vertex (written)
//   arguments:  arguments[4] is the number of vertices to process; other slots are unused here
//   colors:     2 floats per vertex, used as (u, v) coordinates into the nConway x nConway grid
//   texture:    not used by this kernel
//   diffuseMap: 4 bytes per cell; the four bytes are summed and divided by 20000 to get the height
// Work-items at or beyond arguments[4] do nothing.
__kernel void waveEquation(__global float *xyz, __global float *xyzn,
    __global float *xyzo, __global float * arguments,
    __global float *colors, __global uchar *texture, __global uchar * diffuseMap)
{
    int threadId = get_global_id(0);
    if (threadId<arguments[4])
    {
        float colorX = colors[threadId * 2];
        float colorY = colors[threadId * 2 + 1];
        int iX = (int)(colorX*nConway);
        int iY = (int)(colorY*nConway);

        // A coordinate of exactly 1.0 maps to index nConway, one past the last cell.
        // Previously such vertices fell back to height 0, so the duplicated seam column
        // (u == 1) did not match its twin at u == 0 and the sphere cracked along the seam.
        // Wrap the horizontal index and clamp the vertical one so every vertex gets a
        // real sample and u == 1 reads the same cell as u == 0.
        iX = ((iX % nConway) + nConway) % nConway;
        iY = min(max(iY, 0), nConway - 1);

        int cell = (iY*nConway + iX) * 4;
        float height = (
            diffuseMap[cell] + diffuseMap[cell + 1] +
            diffuseMap[cell + 2] + diffuseMap[cell + 3]
            ) / 20000.0f;

        xyzo[threadId * 3] = xyz[threadId * 3] + xyzn[threadId * 3] * height;
        xyzo[threadId * 3 + 1] = xyz[threadId * 3 + 1] + xyzn[threadId * 3 + 1] * height;
        xyzo[threadId * 3 + 2] = xyz[threadId * 3 + 2] + xyzn[threadId * 3 + 2] * height;
    }
}

Tugrul_512bit · Apr 27, 2017

This example demonstrates R7-240(a low end old amd card) and a RX-550(a low end new amd card) computing at the same time:

It seems that rendering time is sensitive to PCIe bandwidth: the demo is sending a 57k-vertex sphere. A compute-shader version with tessellation might not be affected by bandwidth, but how can a shader (OpenGL, DirectX) be run on multiple GPUs without taking on the difficulty of DX12 or Vulkan?

ZJP · Apr 27, 2017

Thanks for this useful video.

Tugrul_512bit · Apr 27, 2017

Since youtube not asking for money ever, I keep filling it with GPGPU videos:

Tugrul_512bit · May 10, 2017

Now API has "device to device pipelining" feature which can attach a different GPU per kernel and run consecutive kernels concurrently with help of double buffering.

https://github.com/tugrul512bit/Cekirdekler/wiki/Pipelining:-Device-to-Device

Every iteration needs a data to be pushed from left end and it takes several steps before result pops out of the right end. Once first result is taken, it takes single step(pushData) to get new consecutive results.

Maybe some graphics expert can build a geometry pipeline with this? Maybe states doing triangulation, rasterization, postprocessing.

Not a Unity example but here it is:

yoonitee · May 12, 2017

Tugrul_512bit said: ↑

Now API has "device to device pipelining" feature which can attach a different GPU per kernel and run consecutive kernels concurrently with help of double buffering.

https://github.com/tugrul512bit/Cekirdekler/wiki/Pipelining:-Device-to-Device

Every iteration needs a data to be pushed from left end and it takes several steps before result pops out of the right end. Once first result is taken, it takes single step(pushData) to get new consecutive results.

Maybe some graphics expert can build a geometry pipeline with this? Maybe states doing triangulation, rasterization, postprocessing.

Not a Unity example but here it is:

Click to expand...

So I'm still not clear. Is your system actually any faster than simply using Parallel.For and a CPU only solution? If so by how much?

Tugrul_512bit · May 12, 2017

yoonitee said: ↑

So I'm still not clear. Is your system actually any faster than simply using Parallel.For and a CPU only solution? If so by how much?
Click to expand...

For just sqrt(sin(x)*sin(x)), it is 100% faster than Parallel.For, and Parallel.For is about 80% faster than LINQ because LINQ creates copies. This is for double-precision arrays with 16M elements. Switching to float should be faster for both CPU and GPU, and adding more calculations per byte helps the GPU widen the performance gap.

Algorithms with a low compute-to-data ratio can make the CPU faster or equal. For just c=a+b with device-to-device pipelining, the CPU is faster. Without that pipelining but with streaming (regardless of whether multiple GPUs are used), the GPU is faster.

Tugrul_512bit · May 13, 2017

Here is a runnable demo of pipeline feature: download

but it gives an error (about 50% of the time; you may need to try 1-2 more times to make it work) related to WinForms control thread safety (I used Invoke but it didn't solve it), so maybe I should create a Unity version.

Tugrul_512bit · May 31, 2017

To optimize for single-GPU-only scenarios more, async enqueue mode is added with v1.2.10

https://github.com/tugrul512bit/Cekirdekler/releases

example code fragment:

Code (csharp):

// Every compute call in this example uses the same number of work-items.
const int workItems = 1024 * 1024;

cruncher.enqueueMode = true;

// default queue (0)
dataArrayA.nextParam(dataArrayB, constant).compute(cruncher, 1, "vecAdd", workItems);

// next concurrent queue (1)
cruncher.enqueueModeAsyncEnable = true;
dataArrayC.nextParam(dataArrayD, constant2).compute(cruncher, 1, "vecMul", workItems);
cruncher.enqueueModeAsyncEnable = false;

// default queue (0)
dataArrayE.nextParam(dataArrayF, constant3).compute(cruncher, 1, "vecDiv", workItems);

// next concurrent queue (2): the same kernel is enqueued four times
cruncher.enqueueModeAsyncEnable = true;
for (int repeat = 0; repeat < 4; repeat++)
{
    dataArrayG.nextParam(dataArrayH, constant4).compute(cruncher, 1, "vecAddInt", workItems);
}
cruncher.enqueueModeAsyncEnable = false;

// enqueue mode is also implicitly async to host code:
// calculateGameLogic() --> runs async to the GPU queues
// when placed just before the enqueueMode = false assignment
cruncher.enqueueMode = false;

This can reduce latency by up to 45% in dense real-world scenarios. This picture shows a 15% saving. If I double the work, it goes to around 30%; if I keep adding more work, it saturates at a 45% time saving because kernel compute time is roughly equal to the sum of the buffer read and write timings. All these concurrent queues are also asynchronous to host code, so you can compute other things on the CPU while the queues are working.

Also I heard bullet physics has come to Unity so Amd cards can have performant physics too. Have you tried it yet?

Tugrul_512bit · Jun 3, 2017

Here is an image processing pipeline made with v1.2.11's "single device pipeline" feature within hours:

creating an already-known algorithm's pipeline would take no more than 30 minutes if opencl is already known (kind of C99 but constrained)

Tugrul_512bit · Jul 3, 2017

Now OpenCL 2.0 dynamic parallelism is supported with v1.4.1 https://github.com/tugrul512bit/Cekirdekler/releases/tag/v1.4.1_update2

"Also device-pool + task(kernels to be computed later) pool" feature can achieve a good performance with greedy scheduling for non-separable kernels(a gpu gets a new job as soon as its channels finish a task) but this is tested for only OpenCL 1.2.

tinyant · Jul 4, 2017

Very interesting Stuff!!

ippdev · Jul 4, 2017

How could this be used to get better performance on huge arch-viz scenes?

Tugrul_512bit · Jul 4, 2017

ippdev said: ↑

How could this be used to get better performance on huge arch-viz scenes?
Click to expand...

Did you mean sketchup-like drawing/building scenes? Any info about its benchmarks? How does it render stuff?

If they are already using gpu acceleration for rendering stuff, it may not help much. Maybe parsing the file and building geometry could be done with dynamic parallelism. For example, GPU starts with 1 thread, CPU sends it file bytes. It starts parsing. Whenever stumbles upon a geometry, allocates 64 workers for building its structure in memory. Continues, whenever stumbles upon a small geometry, allocates 16 workers. Whenever crosses a big geometry, allocates 1M workers. Even spawns other copies of itself to increase speed if object data start-end points are known.

All these can happen in GPU without waiting command from CPU.

I searched the internet a bit: http://www.cgarchitect.com/2016/11/...sualization-rendering-engine-survey---results

this shows vray is already in effect so rendering part is already pro-performance. Or I am mistaken and that is just a benchmark between other renderers, not Arch-viz.

In here, https://www.redshift3d.com/blog/building-a-bigger-arch-viz-business-with-redshift-for-3ds-max, pictures are high quality so it should have had GPU acceleration already. I wonder how long does opening a file take.

With task/device pool, you can compute hundreds of compute requests. Maybe useful for remote rendering (a render service for clients from thousands of kilometers away with wooden PCs)

Arowx · Jul 5, 2017

ippdev said: ↑

How could this be used to get better performance on huge arch-viz scenes?
Click to expand...

I think he's referring to how a developer can manage a GPU with a high rendering load and GPU processing and balance the two?

Tugrul_512bit · Jul 6, 2017

Arowx said: ↑

I think he's referring to how a developer can manage a GPU with a high rendering load and GPU processing and balance the two?
Click to expand...

OpenCL can render on OpenGL textures but this project does not support that feature yet. So it only computes stuff. But it can schedule multiple kernels to multiple GPUs, or it can separate a kernel into smaller parts and feed them to multiple GPUs, or it can work as a pipeline so each stage(a kernel) of it run on a different GPU. If these types of work flow is not doable, then this project can not help, especially if there is no latency to hide behind another. OpenCL 2.0 part is a bit complicated but it cuts GPU's dependency to CPU for multiple kernel executions.

ippdev · Jul 6, 2017

Tugrul_512bit said: ↑

OpenCL can render on OpenGL textures but this project does not support that feature yet. So it only computes stuff. But it can schedule multiple kernels to multiple GPUs, or it can separate a kernel into smaller parts and feed them to multiple GPUs, or it can work as a pipeline so each stage(a kernel) of it run on a different GPU. If these types of work flow is not doable, then this project can not help, especially if there is no latency to hide behind another. OpenCL 2.0 part is a bit complicated but it cuts GPU's dependency to CPU for multiple kernel executions.
Click to expand...

Could this be used for an Occlusion Culling algorithm that had not been prebaked?

Tugrul_512bit · Jul 6, 2017

ippdev said: ↑

Could this be used for an Occlusion Culling algorithm that had not been prebaked?
Click to expand...

I haven't done Occlusion Culling myself but it seems it has some sort of sorting objects before drawing so rendering resources are not wasted. If these sorting (with Z-buffer? idk.) includes massively parallel short range sorts, then maybe it could help but I also lack experience about detailed geometry pipelines so hardware-accelerated (I mean, automatically done by geometry engine of GPU) could be already faster.

If I were more experienced in OpenGL, I could answer your question properly. I also just took a video of dynamic parallelism example here
it does k-means clustering. Yes, I know k-means clustering may not help clustering geometry objects to cull them(or may it?)

If its ray-traced scene, then culling may be done easier with applying some sort of acceleration structure(like a uniform grid) to cull backplane objects.

ippdev · Jul 6, 2017

Tugrul_512bit said: ↑

I haven't done Occlusion Culling myself but it seems it has some sort of sorting objects before drawing so rendering resources are not wasted. If these sorting (with Z-buffer? idk.) includes massively parallel short range sorts, then maybe it could help but I also lack experience about detailed geometry pipelines so hardware-accelerated (I mean, automatically done by geometry engine of GPU) could be already faster.

If I were more experienced in OpenGL, I could answer your question properly. I also just took a video of dynamic parallelism example here
it does k-means clustering. Yes, I know k-means clustering may not help clustering geometry objects to cull them(or may it?)

If its ray-traced scene, then culling may be done easier with applying some sort of acceleration structure(like a uniform grid) to cull backplane objects.
Click to expand...

Can it do any performance magic with something like this
https://docs.unity3d.com/Manual/CullingGroupAPI.html

Tugrul_512bit · Jul 7, 2017

ippdev said: ↑

Can it do any performance magic with something like this
https://docs.unity3d.com/Manual/CullingGroupAPI.html
Click to expand...

It says

"The API works by having you provide an array of bounding spheres. These visibility of these spheres relative to a particular camera is then calculated, along with a ‘distance band’ value that can be treated like a LOD level number."

So this is a collision check or something similar and should be possible to accelerate, but the cost of moving arrays from GPU to CPU and then from CPU to GPU again would make it slower, in my opinion. But if culling takes 1-2 seconds, then it may help to reduce it to sub-second timings (maybe).

Is culling group api working on CPU already? Then GPU would help. But how many hours or days to write those codes, I don't know.

andrej-szontagh · Oct 5, 2017

Arowx said: ↑

Hey Unity's WebGL builds could also have a WebCL option!
Click to expand...

Unfortunately this forum doesn't seems to have a voting system so I will write a reply.
With WebCL you could do all the crazy stuff you couldn't do with WebGL .. also you can do a lot of parallel heavy number crunching .. It would be nice if we could somehow access this from unity.

sgrein · May 3, 2019

tugrul_512bit:

What is the overhead when I want to call a kernel repeatedly?

I tried Cudafy and managedCuda and both are slow (~50 ms) when transfering data to the GPU.

AndersMalmgren · May 4, 2019

angrypenguin said: ↑

Expanding on this, talking mostly about PC, most gamers and/or developers max out GPU time before they max out CPU time. Where possible, visual stuff is typically cranked up to the point where the system is only just managing an appropriate frame rate, and this usually puts more pressure on the GPU than the CPU. So, in the use case of a game or highly visual application, moving more stuff to the GPU when it's already under high pressure in order to reduce CPU load which is usually under less pressure doesn't make sense.

Exceptions to this are stuff that work really well on the GPU that would bog down a CPU, or less visual apps where the GPU isn't under particularly high load.
Click to expand...

Maybe if you have a huge team that can optimize the crap out of your scene. But for many of us setpass calls are the bottleneck. I know it's the case for our game. That's why I always bitch on unity to get Vulkan and gfx job working properly.

Edit: haha, missed that this was an old Arowx thread

Search Unity

Unity ID

Useful Searches

[Idea] Unity with C# to GPU power!