/// \file
/// \brief NeuroNet.cl
/// Library of OpenCL kernels
/// \author DNG
/// \copyright Copyright 2019, DNG
//---
//--- by default some GPUs don't support double precision
//--- the cl_khr_fp64 directive enables work with doubles
//#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define l1 1.0e-5f
#define l2 1.0e-2f
#define MAX_WEIGHT 1.0e-1f
//+------------------------------------------------------------------+
///\ingroup neuron_base_ff Feed forward process kernel
/// Describes the forward path process for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void FeedForward(__global float *matrix_w,///<[in] Weights matrix (m+1)*n, where m - number of neurons in layer and n - number of outputs (neurons in next layer)
__global float *matrix_i,///<[in] Input tensor
__global float *matrix_o,///<[out] Output tensor
int inputs,///< Number of inputs
int activation///< Activation type (#ENUM_ACTIVATION)
)
{
int i = get_global_id(0);
float sum = 0;
float4 inp, weight;
int shift = (inputs + 1) * i;
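//--- Vectorized dot product of the input vector with the neuron's weight row; the tail cases below append the bias term (constant input 1) when fewer than 4 elements remain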
for(int k = 0; k <= inputs; k = k + 4)
{
switch(inputs - k)
{
case 0:
inp = (float4)(1, 0, 0, 0);
weight = (float4)(matrix_w[shift + k], 0, 0, 0);
break;
case 1:
inp = (float4)(matrix_i[k], 1, 0, 0);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], 0, 0);
break;
case 2:
inp = (float4)(matrix_i[k], matrix_i[k + 1], 1, 0);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], 0);
break;
case 3:
inp = (float4)(matrix_i[k], matrix_i[k + 1], matrix_i[k + 2], 1);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
break;
default:
inp = (float4)(matrix_i[k], matrix_i[k + 1], matrix_i[k + 2], matrix_i[k + 3]);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
break;
}
float d = dot(inp, weight);
if(isnan(sum + d))
continue;
sum += d;
}
if(isnan(sum))
sum = 0;
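//--- Apply activation: 0 - tanh, 1 - sigmoid, 2 - LeakyReLU (slope 0.01), other - identity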
switch(activation)
{
case 0:
sum = tanh(sum);
break;
case 1:
sum = 1 / (1 + exp(-sum));
break;
case 2:
if(sum < 0)
sum *= 0.01f;
break;
default:
break;
}
matrix_o[i] = sum;
}
//+------------------------------------------------------------------+
///\ingroup neuron_base_gr Neuron Base Output Gradients Calculation kernel
/// Describes the process of output gradients calculation for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void CalcOutputGradient(__global float *matrix_t,///<[in] Target tensor
__global float *matrix_o,///<[in] Output tensor
__global float *matrix_ig,///<[out] Tensor of gradients
int activation,///< Activation type (#ENUM_ACTIVATION)
float error
)
{
int i = get_global_id(0);
float out = matrix_o[i];
float temp = 0;
switch(activation)
{
case 0:
//temp=clamp(matrix_t[i],-1.0,1.0)-out;
temp = 2.0f * (matrix_t[i] - out);
break;
case 1:
//temp=clamp(matrix_t[i],0.0,1.0)-out;
temp = 2 * (matrix_t[i] - out) * error;
temp = temp * out * (1 - out);
break;
case 2:
//temp=(matrix_t[i]-out)*(out>=0 ? 1.0 : 0.01);
temp = (2 * (matrix_t[i] - out) * error) * (out >= 0 ? 1.0f : 0.01f);
break;
default:
temp = 2 * (matrix_t[i] - out) * error;
break;
}
matrix_ig[i] = temp;
}
//+------------------------------------------------------------------+
///\ingroup neuron_base_gr Neuron Base Hidden Gradients Calculation kernel
/// Describes the process of hidden gradients calculation for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradient(__global float *matrix_w,///<[in] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
__global float *matrix_g,///<[in] Tensor of gradients at current layer
__global float *matrix_o,///<[in] Previous layer Output tensor
__global float *matrix_ig,///<[out] Tensor of gradients at previous layer
int outputs,///< Number of outputs
int activation///< Activation type (#ENUM_ACTIVATION)
)
{
int i = get_global_id(0);
int inputs = get_global_size(0);
float sum = 0;
float out = matrix_o[i];
float4 grad, weight;
for(int k = 0; k < outputs; k += 4)
{
switch(outputs - k)
{
case 1:
weight = (float4)(matrix_w[k * (inputs + 1) + i], 0, 0, 0);
grad = (float4)(matrix_g[k], 0, 0, 0);
break;
case 2:
grad = (float4)(matrix_g[k], matrix_g[k + 1], 0, 0);
weight = (float4)(matrix_w[k * (inputs + 1) + i], matrix_w[(k + 1) * (inputs + 1) + i], 0, 0);
break;
case 3:
grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2], 0);
weight = (float4)(matrix_w[k * (inputs + 1) + i], matrix_w[(k + 1) * (inputs + 1) + i], matrix_w[(k + 2) * (inputs + 1) + i], 0);
break;
default:
grad = (float4)(matrix_g[k], matrix_g[k + 1], matrix_g[k + 2], matrix_g[k + 3]);
weight = (float4)(matrix_w[k * (inputs + 1) + i], matrix_w[(k + 1) * (inputs + 1) + i], matrix_w[(k + 2) * (inputs + 1) + i], matrix_w[(k + 3) * (inputs + 1) + i]);
break;
}
sum += dot(grad, weight);
}
if(isnan(sum))
sum = 0;
switch(activation)
{
case 0:
out = clamp(out, -1.0f, 1.0f);
sum = clamp(sum + out, -1.0f, 1.0f) - out;
sum = sum * max(1 - pow(out, 2), 1.0e-4f);
break;
case 1:
out = clamp(out, 0.0f, 1.0f);
sum = clamp(sum + out, 0.0f, 1.0f) - out;
sum = sum * max(out * (1 - out), 1.0e-4f);
break;
case 2:
if(out < 0)
sum *= 0.01f;
break;
default:
break;
}
matrix_ig[i] = sum;
}
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt Neuron Base SGD Updating Weights Calculation kernel
/// Describes the process of SGD optimization weights for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void UpdateWeightsMomentum(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
__global float *matrix_g, ///<[in] Tensor of gradients at current layer
__global float *matrix_i, ///<[in] Input tensor
__global float *matrix_dw, ///<[in,out] Matrix of delta weights in last correction
int inputs, ///< Number of inputs
float learning_rates, ///< Learning rate
float momentum ///< Momentum multiplier
)
{
int i = get_global_id(0);
int j = get_global_id(1);
int wi = i * (inputs + 1) + j;
float delta = learning_rates * matrix_g[i] * (j < inputs ? matrix_i[j] : 1) + momentum * matrix_dw[wi];
if(!isnan(delta))
{
matrix_dw[wi] = delta;
if((delta * matrix_g[i]) > 0)
matrix_w[wi] = clamp(matrix_w[wi] + delta, -MAX_WEIGHT, MAX_WEIGHT);
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt Neuron Base Adam Updating Weights Calculation kernel
/// Describes the process of Adam optimization weights for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void UpdateWeightsAdam(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
__global const float *matrix_g, ///<[in] Tensor of gradients at current layer
__global const float *matrix_i, ///<[in] Input tensor
__global float *matrix_m, ///<[in,out] Matrix of first momentum
__global float *matrix_v, ///<[in,out] Matrix of second momentum
const int inputs, ///< Number of inputs
const float l, ///< Learning rate
const float b1, ///< First momentum multiplier
const float b2 ///< Second momentum multiplier
)
{
const int i = get_global_id(0);
const int j = get_global_id(1);
const int wi = i * (inputs + 1) + j * 4;
float4 m, v, weight, inp;
switch(inputs - j * 4)
{
case 0:
inp = (float4)(1, 0, 0, 0);
weight = (float4)(matrix_w[wi], 0, 0, 0);
m = (float4)(matrix_m[wi], 0, 0, 0);
v = (float4)(matrix_v[wi], 0, 0, 0);
break;
case 1:
inp = (float4)(matrix_i[j * 4], 1, 0, 0);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], 0, 0);
m = (float4)(matrix_m[wi], matrix_m[wi + 1], 0, 0);
v = (float4)(matrix_v[wi], matrix_v[wi + 1], 0, 0);
break;
case 2:
inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], 1, 0);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], 0);
m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], 0);
v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], 0);
break;
case 3:
inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], matrix_i[j * 4 + 2], 1);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], matrix_m[wi + 3]);
v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], matrix_v[wi + 3]);
break;
default:
inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], matrix_i[j * 4 + 2], matrix_i[j * 4 + 3]);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], matrix_m[wi + 3]);
v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], matrix_v[wi + 3]);
break;
}
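//--- Adam step: g - per-weight gradient; mt/vt - exponential moving averages of the gradient and its square; delta also applies L1/L2 regularization (macros l1, l2)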
float4 g = (float4)(matrix_g[i]) * inp;
float4 mt = b1 * m + (1 - b1) * g;
float4 vt = b2 * v + (1 - b2) * pow(g, 2);
float4 delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
switch(min(inputs - j * 4, 3))
{
case 3:
if(delta.s3 * g.s3 > 0)
matrix_w[wi + 3] = clamp(matrix_w[wi + 3] + delta.s3, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[wi + 3] = mt.s3;
matrix_v[wi + 3] = vt.s3;
case 2:
if(delta.s2 * g.s2 > 0)
matrix_w[wi + 2] = clamp(matrix_w[wi + 2] + delta.s2, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[wi + 2] = mt.s2;
matrix_v[wi + 2] = vt.s2;
case 1:
if(delta.s1 * g.s1 > 0)
matrix_w[wi + 1] = clamp(matrix_w[wi + 1] + delta.s1, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[wi + 1] = mt.s1;
matrix_v[wi + 1] = vt.s1;
case 0:
if(delta.s0 * g.s0 > 0)
matrix_w[wi] = clamp(matrix_w[wi] + delta.s0, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[wi] = mt.s0;
matrix_v[wi] = vt.s0;
break;
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt Neuron Base Least Squares Updating Weights Calculation kernel
/// Describes the process of Least Squares optimization weights for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void UpdateWeightsLS(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
__global const float *matrix_g, ///<[in] Tensor of gradients at current layer
__global const float *matrix_i, ///<[in] Input tensor
__global float *matrix_xg, ///<[in,out] Matrix of sums x*g
__global float *matrix_xx, ///<[in,out] Matrix of sums x*x
const int inputs, ///< Number of inputs
const float l, ///< Learning rate
const int update ///< Update flag
)
{
const int i = get_global_id(0);
const int j = get_global_id(1);
const int wi = i * (inputs + 1) + j * 4;
float4 xg, xx, weight, inp;
switch(inputs - j * 4)
{
case 0:
inp = (float4)(1, 0, 0, 0);
weight = (float4)(matrix_w[wi], 0, 0, 0);
break;
case 1:
inp = (float4)(matrix_i[j * 4], 1, 0, 0);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], 0, 0);
break;
case 2:
inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], 1, 0);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], 0);
break;
case 3:
inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], matrix_i[j * 4 + 2], 1);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
break;
default:
inp = (float4)(matrix_i[j * 4], matrix_i[j * 4 + 1], matrix_i[j * 4 + 2], matrix_i[j * 4 + 3]);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
break;
}
xg = (float4)(matrix_g[i]) * inp;
xx = pow(inp, 2.0f);
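//--- Accumulate x*g and x*x; when the update flag is set, adjust each weight by l * sum(x*g) / sum(x*x) and reset the accumulators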
switch(min(inputs - j * 4, 3))
{
case 3:
if(update)
{
matrix_w[wi + 3] = matrix_w[wi + 3] + l * (matrix_xg[wi + 3] + xg.s3) / (matrix_xx[wi + 3] + xx.s3 + 1.0e-37f);
matrix_xg[wi + 3] = 0;
matrix_xx[wi + 3] = 0;
}
else
{
matrix_xg[wi + 3] += xg.s3;
matrix_xx[wi + 3] += xx.s3;
}
case 2:
if(update)
{
matrix_w[wi + 2] = matrix_w[wi + 2] + l * (matrix_xg[wi + 2] + xg.s2) / (matrix_xx[wi + 2] + xx.s2 + 1.0e-37f);
matrix_xg[wi + 2] = 0;
matrix_xx[wi + 2] = 0;
}
else
{
matrix_xg[wi + 2] += xg.s2;
matrix_xx[wi + 2] += xx.s2;
}
case 1:
if(update)
{
matrix_w[wi + 1] = matrix_w[wi + 1] + l * (matrix_xg[wi + 1] + xg.s1) / (matrix_xx[wi + 1] + xx.s1 + 1.0e-37f);
matrix_xg[wi + 1] = 0;
matrix_xx[wi + 1] = 0;
}
else
{
matrix_xg[wi + 1] += xg.s1;
matrix_xx[wi + 1] += xx.s1;
}
case 0:
if(update)
{
matrix_w[wi] = matrix_w[wi] + l * (matrix_xg[wi] + xg.s0) / (matrix_xx[wi] + xx.s0 + 1.0e-37f);
matrix_xg[wi] = 0;
matrix_xx[wi] = 0;
}
else
{
matrix_xg[wi] += xg.s0;
matrix_xx[wi] += xx.s0;
}
break;
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_proof_ff
/// Kernel of the Pooling neuron for Feed forward process (#CNeuronProofOCL)
//+------------------------------------------------------------------+
__kernel void FeedForwardProof(__global float *matrix_i, ///<[in] Input tensor
__global float *matrix_o, ///<[out] Output tensor
int inputs, ///< Number of inputs
int window, ///< Size of input window
int step ///< Step size
)
{
int i = get_global_id(0);
int pos = i * step;
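//--- Max pooling: each output element is the maximum over `window` consecutive inputs taken with stride `step`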
float result = matrix_i[pos];
for(int k = 1; k < window; k = k + 1)
{
int shift = k + pos;
if(shift >= inputs)
break;
result = max(result, matrix_i[shift]);
}
matrix_o[i] = result;
}
//+------------------------------------------------------------------+
///\ingroup neuron_proof_gr
/// Kernel of the Pooling neuron to transfer gradient to previous layer (#CNeuronProofOCL)
//+------------------------------------------------------------------+
__kernel void CalcInputGradientProof(__global float *matrix_i, ///<[in] Input tensor
__global float *matrix_g, ///<[in] Tensor of gradients at current layer
__global float *matrix_o, ///<[in] Output tensor
__global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
int outputs, ///< Number of outputs
int window, ///< Size of input window
int step ///< Step size
)
{
int i = get_global_id(0);
float prev_gradient = 0;
float value = matrix_i[i];
int start = i - window + step;
start = (start - start % step) / step;
int stop = (i - i % step) / step + 1;
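//--- Route the gradient only to the inputs that equal the maximum of their pooling window; every matching output position contributes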
for(int out = max(0, start); out < min(outputs, stop); out++)
{
if(value == matrix_o[out])
prev_gradient += matrix_g[out];
}
matrix_ig[i] = prev_gradient;
}
//+------------------------------------------------------------------+
///\ingroup neuron_conv_ff
/// Kernel of the Convolution neuron for Feed forward process (#CNeuronConvOCL)
//+------------------------------------------------------------------+
__kernel void FeedForwardConv(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input window and n - output window
__global float *matrix_i, ///<[in] Input tensor
__global float *matrix_o, ///<[out] Output tensor
int inputs, ///< Number of inputs
int step, ///< Step size
int window_in, ///< Size of input window
int window_out, ///< Size of output window
int activation ///< Activation type (#ENUM_ACTIVATION)
)
{
int i = get_global_id(0);
int w_in = window_in;
int w_out = window_out;
float sum = 0;
float4 inp, weight;
int shift_out = w_out * i;
int shift_in = step * i;
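//--- Each work item processes one input patch of window_in elements (starting at step * i) and produces window_out outputs; every weight row ends with a bias term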
for(int out = 0; out < w_out; out++)
{
int shift = (w_in + 1) * out;
int stop = (w_in <= (inputs - shift_in) ? w_in : (inputs - shift_in));
for(int k = 0; k <= stop; k += 4)
{
switch(stop - k)
{
case 0:
inp = (float4)(1, 0, 0, 0);
weight = (float4)(matrix_w[shift + k], 0, 0, 0);
break;
case 1:
inp = (float4)(matrix_i[shift_in + k], 1, 0, 0);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], 0, 0);
break;
case 2:
inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], 1, 0);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], 0);
break;
case 3:
inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], matrix_i[shift_in + k + 2], 1);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
break;
default:
inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], matrix_i[shift_in + k + 2], matrix_i[shift_in + k + 3]);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
break;
}
sum += dot(inp, weight);
}
if(isnan(sum))
sum = 0;
switch(activation)
{
case 0:
sum = tanh(sum);
break;
case 1:
sum = 1 / (1 + exp(-clamp(sum, -20.0f, 20.0f)));
break;
case 2:
if(sum < 0)
sum *= 0.01f;
break;
default:
break;
}
matrix_o[out + shift_out] = sum;
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_conv_gr
/// Kernel of the Convolution neuron to transfer gradient to previous layer (#CNeuronConvOCL)
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradientConv(__global float *matrix_w, ///<[in] Weights matrix (m+1)*n, where m - input window and n - output window
__global float *matrix_g, ///<[in] Tensor of gradients at current layer
__global float *matrix_o, ///<[in] Output tensor
__global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
int outputs, ///< Number of outputs
int step, ///< Step size
int window_in, ///< Size of input window
int window_out, ///< Size of output window
int activation ///< Activation type (#ENUM_ACTIVATION)
)
{
int i = get_global_id(0);
int inputs = get_global_size(0);
float sum = 0;
float out = matrix_o[i];
int start = i - window_in + step;
start = max((start - start % step) / step, 0);
int stop = (i - i % step) / step + 1;
if(stop > (outputs / window_out))
stop = outputs / window_out;
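//--- For input element i, accumulate gradients from every output position whose receptive field covers it, across all window_out filters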
for(int h = 0; h < window_out; h += 4)
{
for(int k = start; k < stop; k++)
{
int shift_g = k * window_out + h;
int shift_w = (stop - k - 1) * step + i % step + h * (window_in + 1);
if(shift_g >= outputs || shift_w >= (window_in + 1)*window_out)
break;
float grad = matrix_g[shift_g];
if(grad == 0.0f)
continue;
sum += grad * matrix_w[shift_w];
}
}
if(isnan(sum))
sum = 0;
if(sum != 0)
switch(activation)
{
case 0:
sum = clamp(sum + out, -1.0f, 1.0f) - out;
sum = sum * (1.0f - pow(out >= 1 || out <= -1 ? 1.0f : out, 2.0f));
break;
case 1:
sum = clamp(sum + out, 0.0f, 1.0f) - out;
sum = sum * (out * (1.0f - out));
break;
case 2:
if(out < 0.0f)
sum *= 0.01f;
break;
default:
break;
}
matrix_ig[i] = sum;
}
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron SGD optimization Updating Weights Calculation kernel
/// Describes the process of SGD optimization weights for the Convolution Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvMomentum(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
__global float *matrix_g, ///<[in] Tensor of gradients at current layer
__global float *matrix_i, ///<[in] Input tensor
__global float *matrix_dw, ///<[in,out] Matrix of delta weights in last correction
int inputs, ///< Number of inputs
float learning_rates, ///< Learning rate
float momentum, ///< Momentum multiplier
int window_in, ///< Size of input window
int window_out, ///< Size of output window
int step ///< Step size
)
{
const int i = get_global_id(0);
const int shift = i % (window_in + 1);
const int shift_out = (i - shift) / (window_in + 1);
int total = (inputs - window_in) % step;
total = (inputs - window_in - total) / step + (total > 0 ? 1 : 0);
float grad = 0;
for(int t = 0; t < total; t++)
{
if(shift != window_in && (shift + t * window_in) >= inputs)
break;
grad += matrix_g[t * window_out + shift_out] * (shift == window_in ? 1 : matrix_i[shift + t * step]);
}
float delta = learning_rates * grad + momentum * matrix_dw[i];
if(!isnan(delta))
{
matrix_dw[i] = delta;
if(delta * grad > 0)
matrix_w[i] = clamp(matrix_w[i] + delta, -MAX_WEIGHT, MAX_WEIGHT);
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron Adam optimization Updating Weights Calculation kernel
/// Describes the process of Adam optimization weights for the Convolution Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvAdam(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
__global const float *matrix_g, ///<[in] Tensor of gradients at current layer
__global const float *matrix_i, ///<[in] Input tensor
__global float *matrix_m, ///<[in] Matrix of first momentum
__global float *matrix_v, ///<[in] Matrix of second momentum
const int inputs, ///< Number of inputs
const float l, ///< Learning rate
const float b1, ///< First momentum multiplier
const float b2, ///< Second momentum multiplier
int window_in, ///< Size of input window
int window_out, ///< Size of output window
int step ///< Step size
)
{
const int i = get_global_id(0);
if(i > window_in)
return;
//---
int total = (inputs - (window_in - step)) % step;
total = (inputs - (window_in - step) - total) / step + (total > 0 ? 1 : 0);
for(int out = 0; out < window_out; out++)
{
if((window_out - out) > 4)
{
float4 grad = {0, 0, 0, 0};
int shift_w = i + out * (window_in + 1);
for(int t = 0; t < total; t++)
{
if(i != window_in && (i + t * window_in) >= inputs)
break;
grad += (float4)(matrix_g[t * window_out + out], matrix_g[t * window_out + out + 1], matrix_g[t * window_out + out + 2], matrix_g[t * window_out + out + 3]) * (i == window_in ? 1 : matrix_i[i + t * step]);
}
float4 mt = clamp(b1 * (float4)(matrix_m[shift_w], matrix_m[shift_w + window_in + 1], matrix_m[shift_w + 2 * (window_in + 1)], matrix_m[shift_w + 3 * (window_in + 1)]) + (1 - b1) * grad, -1.0e5f, 1.0e5f);
float4 vt = clamp(b2 * (float4)(matrix_v[shift_w], matrix_v[shift_w + window_in + 1], matrix_v[shift_w + 2 * (window_in + 1)], matrix_v[shift_w + 3 * (window_in + 1)]) + (1 - b2) * pow(grad, 2), 1.0e-6f, 1.0e6f);
float4 delta = l * mt / sqrt(vt);
float4 weight = clamp((float4)(matrix_w[shift_w], matrix_w[shift_w + (window_in + 1)], matrix_w[shift_w + 2 * (window_in + 1)], matrix_w[shift_w + 3 * (window_in + 1)]) + delta, -MAX_WEIGHT, MAX_WEIGHT);
if(delta.s0 * grad.s0 > 0)
matrix_w[shift_w] = weight.s0;
if(delta.s1 * grad.s1 > 0)
matrix_w[shift_w + (window_in + 1)] = weight.s1;
if(delta.s2 * grad.s2 > 0)
matrix_w[shift_w + 2 * (window_in + 1)] = weight.s2;
if(delta.s3 * grad.s3 > 0)
matrix_w[shift_w + 3 * (window_in + 1)] = weight.s3;
matrix_m[shift_w] = mt.s0;
matrix_m[shift_w + (window_in + 1)] = mt.s1;
matrix_m[shift_w + 2 * (window_in + 1)] = mt.s2;
matrix_m[shift_w + 3 * (window_in + 1)] = mt.s3;
matrix_v[shift_w] = vt.s0;
matrix_v[shift_w + (window_in + 1)] = vt.s1;
matrix_v[shift_w + 2 * (window_in + 1)] = vt.s2;
matrix_v[shift_w + 3 * (window_in + 1)] = vt.s3;
out += 3;
}
else
{
float grad = 0;
int shift_w = i + out * (window_in + 1);
for(int t = 0; t < total; t++)
{
if(i != window_in && (i + t * window_in) >= inputs)
break;
grad += matrix_g[t * window_out + out] * (i == window_in ? 1 : matrix_i[i + t * step]);
}
float mt = clamp(b1 * matrix_m[shift_w] + (1 - b1) * grad, -1.0e5f, 1.0e5f);
float vt = clamp(b2 * matrix_v[shift_w] + (1 - b2) * pow(grad, 2), 1.0e-6f, 1.0e6f);
float delta = l * mt / sqrt(vt);
if(delta * grad > 0)
matrix_w[shift_w] = clamp(matrix_w[shift_w] + delta, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[shift_w] = mt;
matrix_v[shift_w] = vt;
}
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron Least Squares optimization Updating Weights Calculation kernel
/// Describes the process of Least Squares optimization weights for the Convolution Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvLS(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
__global const float *matrix_g, ///<[in] Tensor of gradients at current layer
__global const float *matrix_i, ///<[in] Input tensor
__global float *matrix_xg, ///<[in] Matrix of sums x*g
__global float *matrix_xx, ///<[in] Matrix of sums x*x
const int inputs, ///< Number of inputs
const float l, ///< Learning rate
const int update, ///< Update flag
int window_in, ///< Size of input window
int window_out, ///< Size of output window
int step ///< Step size
)
{
const int i = get_global_id(0);
if(i > window_in)
return;
//---
int total = (inputs - (window_in - step)) % step;
total = (inputs - (window_in - step) - total) / step + (total > 0 ? 1 : 0);
for(int out = 0; out < window_out; out++)
{
if((window_out - out) > 4)
{
float4 xg = {0, 0, 0, 0};
float x2 = 0;
int shift_w = i + out * (window_in + 1);
for(int t = 0; t < total; t++)
{
if(i != window_in && (i + t * window_in) >= inputs)
break;
xg += (float4)(matrix_g[t * window_out + out], matrix_g[t * window_out + out + 1], matrix_g[t * window_out + out + 2], matrix_g[t * window_out + out + 3]) * (i == window_in ? 1 : matrix_i[i + t * step]);
x2 += (i == window_in ? 1 : pow(matrix_i[i + t * step], 2.0f));
}
if(update)
{
xg = (float4)(matrix_xg[shift_w], matrix_xg[shift_w + window_in + 1], matrix_xg[shift_w + 2 * (window_in + 1)], matrix_xg[shift_w + 3 * (window_in + 1)]) + xg;
float4 xx = (float4)(matrix_xx[shift_w], matrix_xx[shift_w + window_in + 1], matrix_xx[shift_w + 2 * (window_in + 1)], matrix_xx[shift_w + 3 * (window_in + 1)]) + x2;
float4 delta = l * xg / (xx + 1.0e-37f);
float4 weight = (float4)(matrix_w[shift_w], matrix_w[shift_w + (window_in + 1)], matrix_w[shift_w + 2 * (window_in + 1)], matrix_w[shift_w + 3 * (window_in + 1)]) + delta;
matrix_w[shift_w] = weight.s0;
matrix_w[shift_w + (window_in + 1)] = weight.s1;
matrix_w[shift_w + 2 * (window_in + 1)] = weight.s2;
matrix_w[shift_w + 3 * (window_in + 1)] = weight.s3;
matrix_xg[shift_w] = 0;
matrix_xg[shift_w + (window_in + 1)] = 0;
matrix_xg[shift_w + 2 * (window_in + 1)] = 0;
matrix_xg[shift_w + 3 * (window_in + 1)] = 0;
matrix_xx[shift_w] = 0;
matrix_xx[shift_w + (window_in + 1)] = 0;
matrix_xx[shift_w + 2 * (window_in + 1)] = 0;
matrix_xx[shift_w + 3 * (window_in + 1)] = 0;
}
else
{
matrix_xg[shift_w] += xg.s0;
matrix_xg[shift_w + (window_in + 1)] += xg.s1;
matrix_xg[shift_w + 2 * (window_in + 1)] += xg.s2;
matrix_xg[shift_w + 3 * (window_in + 1)] += xg.s3;
matrix_xx[shift_w] += x2;
matrix_xx[shift_w + (window_in + 1)] += x2;
matrix_xx[shift_w + 2 * (window_in + 1)] += x2;
matrix_xx[shift_w + 3 * (window_in + 1)] += x2;
}
out += 3;
}
else
{
float xg = 0;
float xx = 0;
int shift_w = i + out * (window_in + 1);
for(int t = 0; t < total; t++)
{
if(i != window_in && (i + t * window_in) >= inputs)
break;
xg += matrix_g[t * window_out + out] * (i == window_in ? 1 : matrix_i[i + t * step]);
xx += (i == window_in ? 1 : pow(matrix_i[i + t * step], 2.0f));
}
if(update)
{
xg = matrix_xg[shift_w] + xg;
xx = matrix_xx[shift_w] + xx;
float delta = l * xg / (xx + 1.0e-37f);
matrix_w[shift_w] = matrix_w[shift_w] + delta;
matrix_xg[shift_w] = 0;
matrix_xx[shift_w] = 0;
}
else
{
matrix_xg[shift_w] += xg;
matrix_xx[shift_w] += xx;
}
}
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Attention Neuron Score calculation kernel
/// Describes the Score calculation process for the Neuron of attention layer (#CNeuronAttentionOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void AttentionScore(__global float *querys, ///<[in] Matrix of Queries
__global float *keys, ///<[in] Matrix of Keys
__global float *score, ///<[out] Matrix of Scores
int dimension, ///< Dimension of Key
int mask ///< 1 - calc only previous units, 0 - calc all
)
{
int q = get_global_id(0);
int shift_q = q * dimension;
int units = get_global_size(0);
int shift_s = q * units;
float koef = sqrt((float)dimension);
if(koef < 1)
koef = 1;
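//--- Scaled dot-product attention: score = softmax(Q*K / sqrt(dimension)); with mask > 0 positions k > q are zeroed (causal masking)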
float sum = 0;
for(int k = 0; k < units; k++)
{
if(mask > 0 && k > q)
{
score[shift_s + k] = 0;
continue;
}
float result = 0;
int shift_k = k * dimension;
for(int i = 0; i < dimension; i++)
result += (querys[shift_q + i] * keys[shift_k + i]);
result = exp(result / koef);
if(isnan(result))
result = 0;
score[shift_s + k] = result;
sum += result;
}
for(int k = 0; (k < units && sum > 0); k++)
score[shift_s + k] /= sum;
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Attention Neuron Out calculation kernel
/// Describes the Attention out calculation process for the Neuron of attention layer (#CNeuronAttentionOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void AttentionOut(__global float *scores, ///<[in] Matrix of Scores
__global float *values, ///<[in] Matrix of Values
__global float *inputs, ///<[in] Input tensor
__global float *out ///<[out] Output tensor
)
{
int units = get_global_size(0);
int u = get_global_id(0);
int d = get_global_id(1);
int dimension = get_global_size(1);
int shift = u * dimension + d;
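//--- Attention output: score-weighted sum of Values plus a residual connection to the inputs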
float result = 0;
for(int i = 0; i < units; i++)
result += scores[u * units + i] * values[i * dimension + d];
out[shift] = (isnan(result) ? 0 : result) + inputs[shift];
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Kernel for calculating the sum of 2 matrices with a multiplier.
/// Describes the calculation of the sum of 2 matrices.
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void SumMatrix(__global float *matrix1, ///<[in] First matrix
__global float *matrix2, ///<[in] Second matrix
__global float *matrix_out, ///<[out] Output matrix
int dimension, ///< Dimension of matrix
float multiplyer ///< Multiplier for output
)
{
const int i = get_global_id(0) * dimension;
for(int k = 0; k < dimension; k++)
matrix_out[i + k] = (matrix1[i + k] + matrix2[i + k]) * multiplyer;
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Kernel for calculating the sum of 5 matrices with a multiplier.
/// Describes the calculation of the sum of 5 matrices.
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void Sum5Matrix(__global float *matrix1, ///<[in] First matrix
__global float *matrix2, ///<[in] Second matrix
__global float *matrix3, ///<[in] Third matrix
__global float *matrix4, ///<[in] Fourth matrix
__global float *matrix5, ///<[in] Fifth matrix
__global float *matrix_out, ///<[out] Output matrix
int dimension, ///< Dimension of matrix
float multiplyer ///< Multiplier for output
)
{
const int i = get_global_id(0) * dimension;
for(int k = 0; k < dimension; k++)
matrix_out[i + k] = (matrix1[i + k] + matrix2[i + k] + matrix3[i + k] + matrix4[i + k] + matrix5[i + k]) * multiplyer;
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr Attention layer's neuron Gradients Calculation kernel
/// Describes the gradients calculation process for the Neuron of attention layer (#CNeuronAttentionOCL).
///\details Detailed description on the link.
/// @param[in] querys Matrix of Queries
/// @param[out] querys_g Matrix of Queries' Gradients
/// @param[in] keys Matrix of Keys
/// @param[out] keys_g Matrix of Keys' Gradients
/// @param[in] values Matrix of Values
/// @param[out] values_g Matrix of Values' Gradients
/// @param[in] scores Matrix of Scores
/// @param[in] gradient Matrix of Gradients from previous iteration
//+------------------------------------------------------------------+
__kernel void AttentionInsideGradients(__global float *querys, __global float *querys_g,
__global float *keys, __global float *keys_g,
__global float *values, __global float *values_g,
__global float *scores,
__global float *gradient)
{
int u = get_global_id(0);
int d = get_global_id(1);
int units = get_global_size(0);
int dimension = get_global_size(1);
float koef = sqrt((float)dimension);
if(koef < 1)
koef = 1;
float vg = 0;
float qg = 0;
float kg = 0;
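//--- Values receive score-weighted output gradients; Queries and Keys receive gradients through an approximate softmax derivative s * (1 - s) (floored at 0.0001) scaled by 1 / koef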
for(int iu = 0; iu < units; iu++)
{
float g = gradient[iu * dimension + d];
float sc = scores[iu * units + u];
vg += sc * g;
//---
float sqg = 0;
float skg = 0;
for(int id = 0; id < dimension; id++)
{
sqg += values[iu * dimension + id] * gradient[u * dimension + id];
skg += values[u * dimension + id] * gradient[iu * dimension + id];
}
qg += (scores[u * units + iu] == 0 || scores[u * units + iu] == 1 ? 0.0001f : scores[u * units + iu] * (1 - scores[u * units + iu])) * sqg * keys[iu * dimension + d] / koef;
//---
kg += (scores[iu * units + u] == 0 || scores[iu * units + u] == 1 ? 0.0001f : scores[iu * units + u] * (1 - scores[iu * units + u])) * skg * querys[iu * dimension + d] / koef;
}
int shift = u * dimension + d;
values_g[shift] = clamp((isnan(vg) ? 0.0f : vg), -1.0f, 1.0f);
querys_g[shift] = clamp((isnan(qg) ? 0.0f : qg), -1.0f, 1.0f);
keys_g[shift] = clamp((isnan(kg) ? 0.0f : kg), -1.0f, 1.0f);
}
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of matrix normalization process
/// Describes the process of matrix normalization.
///\details Detailed description on the link.
/// @param[in,out] buffer In/Out Matrix
/// @param[in] dimension Dimension of matrix
//+------------------------------------------------------------------+
__kernel void Normalize(__global float *buffer,
int dimension)
{
int n = get_global_id(0);
int shift = n * dimension;
if(dimension <= 0)
return;
//---
float mean = 0;
for(int i = 0; i < dimension; i++)
{
if(isnan(buffer[shift + i]))
buffer[shift + i] = 0;
else
mean += buffer[shift + i] / dimension;
}
float variance = 0;
for(int i = 0; i < dimension; i++)
variance += pow(buffer[shift + i] - mean, 2) / dimension;
variance = sqrt((isnan(variance) ? 0 : variance));
if(variance == 0)
{
for(int i = 0; i < dimension; i++)
variance = fmax(fabs(buffer[shift + i] - mean), variance);
}
for(int i = 0; i < dimension; i++)
buffer[shift + i] = (buffer[shift + i] - mean) / (variance + 1.0e-37f);
}
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of weights matrix normalization process
/// Describes the process of weights matrix normalization.
///\details Detailed description on the link.
/// @param[in,out] buffer In/Out Matrix
/// @param[in] dimension Dimension of matrix
//+------------------------------------------------------------------+
__kernel void NormalizeWeights(__global float *buffer,
int dimension)
{
int n = get_global_id(0);
int shift = n * dimension;
float sum = 0;
float k = 1;
do
{
sum = 0;
for(int i = 0; (i < dimension && !isnan(sum)); i++)
sum += pow(buffer[shift + i] / k, 2) / dimension;
if(isnan(sum))
k *= 10;
}
while(isnan(sum));
sum = sqrt(sum);
if(k * sum > 1)
for(int i = 0; i < dimension; i++)
buffer[shift + i] /= k * sum;
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff
/// Describes the process of concatenating 4 matrices.
///\details Detailed description on the link.
/// @param[in] input1, input2, input3, input4 Input buffers
/// @param[in] window1, window2, window3, window4 Windows for every buffer
/// @param[out] output Output buffer
//+------------------------------------------------------------------+
__kernel void ConcatenateBuffers(__global float *input1, int window1,
__global float *input2, int window2,
__global float *input3, int window3,
__global float *input4, int window4,
__global float *output)
{
int n = get_global_id(0);
int shift = n * (window1 + window2 + window3 + window4);
int shift_in = n * window1;
for(int i = 0; i < window1; i++)
output[shift + i] = input1[shift_in + i];
//---
shift += window1;
shift_in = n * window2;
for(int i = 0; i < window2; i++)
output[shift + i] = input2[shift_in + i];
//---
shift += window2;
shift_in = n * window3;
for(int i = 0; i < window3; i++)
output[shift + i] = input3[shift_in + i];
//---
shift += window3;
shift_in = n * window4;
for(int i = 0; i < window4; i++)
output[shift + i] = input4[shift_in + i];
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr
/// Describes the process of deconcatenating a matrix.
///\details Detailed description on the link.
/// @param[out] output1, output2, output3, output4 Output buffers
/// @param[in] window1, window2, window3, window4 Windows for every buffer
/// @param[in] inputs Input buffer
//+------------------------------------------------------------------+
__kernel void DeconcatenateBuffers(__global float *output1, int window1,
__global float *output2, int window2,
__global float *output3, int window3,
__global float *output4, int window4,
__global float *inputs)
{
int n = get_global_id(0);
//--- Head 1
int shift = n * (window1 + window2 + window3 + window4);
int shift_out = n * window1;
for(int i = 0; i < window1; i++)
output1[shift_out + i] = inputs[shift + i];
//--- Head 2
shift += window1;
shift_out = n * window2;
for(int i = 0; i < window2; i++)
output2[shift_out + i] = inputs[shift + i];
//--- Head 3
shift += window2;
shift_out = n * window3;
for(int i = 0; i < window3; i++)
output3[shift_out + i] = inputs[shift + i];
//--- Head 4
shift += window3;
shift_out = n * window4;
for(int i = 0; i < window4; i++)
output4[shift_out + i] = inputs[shift + i];
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Multi-Heads Attention Neuron Score calculation kernel
/// Describes the Score calculation process for the Neuron of multi-heads attention layer (#CNeuronMLMHAttentionOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void MHAttentionScore(__global float *qkv, ///<[in] Matrix of Queries, Keys, Values
__global float *score, ///<[out] Matrix of Scores
int dimension, ///< Dimension of Key
int mask ///< 1 - calc only previous units, 0 - calc all
)
{
int q = get_global_id(0);
int h = get_global_id(1);
int units = get_global_size(0);
int heads = get_global_size(1);
//---
int shift_q = dimension * (h + 3 * q * heads);
int shift_s = units * (h + q * heads);
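//--- qkv packs Query, Key and Value blocks per unit and head: for unit u and head h the Query starts at dimension * (h + 3 * u * heads), the Key at dimension * (h + heads * (3 * u + 1)) and the Value at dimension * (h + heads * (3 * u + 2))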
//---
float koef = sqrt((float)dimension);
if(koef < 1)
koef = 1;
float sum = 0;
for(int k = 0; k < units; k++)
{
if(mask > 0 && k > q)
{
score[shift_s + k] = 0;
continue;
}
float result = 0;
int shift_k = dimension * (h + heads * (3 * k + 1));
for(int i = 0; i < dimension; i++)
{
if((dimension - i) > 4)
{
result += dot((float4)(qkv[shift_q + i], qkv[shift_q + i + 1], qkv[shift_q + i + 2], qkv[shift_q + i + 3]),
(float4)(qkv[shift_k + i], qkv[shift_k + i + 1], qkv[shift_k + i + 2], qkv[shift_k + i + 3]));
i += 3;
}
else
result += (qkv[shift_q + i] * qkv[shift_k + i]);
}
result = exp(clamp(result / koef, -30.0f, 30.0f));
if(isnan(result))
result = 0;
score[shift_s + k] = result;
sum += result;
}
for(int k = 0; (k < units && sum > 0); k++)
score[shift_s + k] /= sum;
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Multi-heads Attention Neuron Out calculation kernel
/// Describes the Multi-heads Attention out calculation process for the Neuron of multi-heads attention layer (#CNeuronMLMHAttentionOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void MHAttentionOut(__global float *scores, ///<[in] Matrix of Scores
__global float *qkv, ///<[in] Matrix of Values
__global float *out, ///<[out] Output tensor
int dimension ///< Dimension of Value
)
{
int u = get_global_id(0);
int units = get_global_size(0);
int h = get_global_id(1);
int heads = get_global_size(1);
//---
int shift_s = units * (h + heads * u);
int shift_out = dimension * (h + heads * u);
int layer = 3 * dimension * heads;
//---
for(int d = 0; d < dimension; d++)
{
float result = 0;
for(int v = 0; v < units; v += 4)
{
int shift_v = dimension * (h + heads * (3 * v + 2)) + d;
if((units - v) > 4)
{
result += dot((float4)(scores[shift_s + v], scores[shift_s + v + 1], scores[shift_s + v + 2], scores[shift_s + v + 3]),
(float4)(qkv[shift_v], qkv[shift_v + layer], qkv[shift_v + 2 * layer], qkv[shift_v + 3 * layer]));
}
else
for(int l = 0; l < (int)fmin((float)(units - v), 4.0f); l++)
result += scores[shift_s + v + l] * qkv[shift_v + l * layer];
}
out[shift_out + d] = result;
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr Attention layer's neuron Gradients Calculation kernel
/// Describes the gradients calculation process for the Neuron of attention layer (#CNeuronMLMHAttentionOCL).
///\details Detailed description on the link.
/// @param[in] qkv Matrix of Queries, Keys and Values
/// @param[out] qkv_g Matrix of Queries', Keys' and Values' Gradients
/// @param[in] scores Matrix of Scores
/// @param[in] scores_g Matrix of Scores' Gradients
/// @param[in] gradient Matrix of Gradients from previous iteration
/// @param[in] dimension Dimension of Key vector
//+------------------------------------------------------------------+
__kernel void MHAttentionInsideGradients(__global float *qkv, __global float *qkv_g,
__global float *scores, __global float *scores_g,
__global float *gradient, int dimension)
{
int u = get_global_id(0);
int h = get_global_id(1);
int units = get_global_size(0);
int heads = get_global_size(1);
float koef = sqrt((float)dimension);
if(koef < 1)
koef = 1;
//--- Calculating score's gradients
uint shift_s = units * (h + u * heads);
for(int v = 0; v < units; v++)
{
float s = scores[shift_s + v];
if(s <= 0)
continue;
float sg = 0;
int shift_v = dimension * (h + heads * (3 * v + 2));
int shift_g = dimension * (h + heads * v);
for(int d = 0; d < dimension; d++)
sg += qkv[shift_v + d] * gradient[shift_g + d];
scores_g[shift_s + v] = sg * (s < 1 ? s * (1 - s) : 1) / koef;
}
barrier(CLK_GLOBAL_MEM_FENCE);
//--- Calculating gradients for Query, Key and Value
uint shift_qg = dimension * (h + 3 * u * heads);
uint shift_kg = dimension * (h + (3 * u + 1) * heads);
uint shift_vg = dimension * (h + (3 * u + 2) * heads);
for(int d = 0; d < dimension; d++)
{
float vg = 0;
float qg = 0;
float kg = 0;
for(int l = 0; l < units; l++)
{
float sg = scores[shift_s + l];
if(sg <= 0)
continue;
uint shift_q = dimension * (h + 3 * l * heads) + d;
uint shift_k = dimension * (h + (3 * l + 1) * heads) + d;
uint shift_g = dimension * (h + heads * l) + d;
//---
vg += gradient[shift_g] * sg;
sg = scores_g[shift_s + l];
kg += sg * qkv[shift_q];
qg += sg * qkv[shift_k];
}
qkv_g[shift_qg + d] = qg;
qkv_g[shift_kg + d] = kg;
qkv_g[shift_vg + d] = vg;
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_dropout Kernel for Dropout.
/// Describes the dropout method.
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void Dropout(__global float *inputs, ///<[in] Input matrix
__global float *map, ///<[in] Dropout map matrix
__global float *out, ///<[out] Output matrix
int dimension ///< Dimension of matrix
)
{
const int i = get_global_id(0) * 4;
if(i + 3 < dimension)
{
float4 k = (float4)(inputs[i], inputs[i + 1], inputs[i + 2], inputs[i + 3]) * (float4)(map[i], map[i + 1], map[i + 2], map[i + 3]);
out[i] = k.s0;
out[i + 1] = k.s1;
out[i + 2] = k.s2;
out[i + 3] = k.s3;
}
else
for(int k = i; k < min(dimension, i + 4); k++)
out[k] = (inputs[k] * map[k]);
}
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of Batch normalization process
/// Describes the process of Batch normalization. (#CNeuronBatchNormOCL)
///\details Detailed description on the link.
/// @param[in] inputs Input data tensor
/// @param[in,out] options Tensor of variables
/// @param[out] output Tensor of output data
/// @param[in] batch Batch size
/// @param[in] optimization Optimization type
/// @param[in] activation Activation type
//+------------------------------------------------------------------+
__kernel void BatchFeedForward(__global float *inputs,
__global float *options,
__global float *output,
int batch,
int optimization,
int activation)
{
if(batch <= 1)
return;
int n = get_global_id(0);
int shift = n * (optimization == 0 ? 7 : 9);
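//--- options layout per neuron: [0] mean, [1] variance, [2] last normalized value, [3] gamma, [4] beta, [5..] optimizer state (2 values for SGD, 4 for Adam)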
//---
for(int i = 0; i < (optimization == 0 ? 7 : 9); i++)
if(isnan(options[shift + i]))
options[shift + i] = 0;
//---
float mean = (options[shift] * ((float)batch - 1) + inputs[n]) / ((float)batch);
float delt = inputs[n] - mean;
float variance = options[shift + 1] * ((float)batch - 1.0f) + pow(delt, 2);
if(options[shift + 1] > 0)
variance /= (float)batch;
float nx = delt / sqrt(variance + 1.0e-37f);
//---
if(options[shift + 3] == 0)
options[shift + 3] = 1;
//---
float res = options[shift + 3] * nx + options[shift + 4];
switch(activation)
{
case 0:
res = tanh(clamp(res, -20.0f, 20.0f));
break;
case 1:
res = 1 / (1 + exp(-clamp(res, -20.0f, 20.0f)));
break;
case 2:
if(res < 0)
res *= 0.01f;
break;
default:
break;
}
//---
options[shift] = mean;
options[shift + 1] = variance;
options[shift + 2] = nx;
output[n] = res;
}
//+------------------------------------------------------------------+
///\ingroup neuron_gr
/// Kernel of the Batch neuron to transfer gradient to previous layer (#CNeuronBatchNormOCL)
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradientBatch(__global float *options, ///<[in] Options matrix m*(7 or 9), where m - Number of neurons in previous layer
__global float *matrix_g, ///<[in] Tensor of gradients at current layer
__global float *matrix_i, ///<[in] Tensor of previous layer output
__global float *matrix_ig, ///<[out] Tensor of gradients at previous layer
int activation, ///< Activation type (#ENUM_ACTIVATION)
int batch, ///< Batch size
int optimization ///< Optimization type
)
{
if(batch <= 1)
return;
//---
int n = get_global_id(0);
int shift = n * (optimization == 0 ? 7 : 9);
//---
float inp = matrix_i[n];
float gnx = matrix_g[n] * options[shift + 3];
float temp = 1 / sqrt(options[shift + 1] + 1e-37f);
float gmu = (-temp) * gnx;
float gvar = (options[shift] * inp) / (2 * pow(options[shift + 1] + 1.0e-37f, 1.5f)) * gnx;
float gx = temp * gnx + gmu / batch + gvar * 2 * inp / batch * pow((float)(batch - 1) / batch, 2.0f);
//---
if(isnan(gx))
gx = 0;
switch(activation)
{
case 0:
gx = clamp(gx + inp, -1.0f, 1.0f) - inp;
gx = gx * (1 - pow(inp == 1 || inp == -1 ? 0.99999999f : inp, 2));
break;
case 1:
gx = clamp(gx + inp, 0.0f, 1.0f) - inp;
gx = gx * (inp == 0 || inp == 1 ? 0.00000001f : (inp * (1 - inp)));
break;
case 2:
if(inp < 0)
gx *= 0.01f;
break;
default:
break;
}
matrix_ig[n] = gx;
}
//+------------------------------------------------------------------+
///\ingroup neuron_opt Batch normalization Neuron SGD optimization Updating options kernel
/// Describes the process of SGD optimization options for the Batch normalization Neuron (#CNeuronBatchNormOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void UpdateBatchOptionsMomentum(__global float *options, ///<[in,out] Options matrix m*7, where m - Number of neurons in previous layer
__global float *matrix_g, ///<[in] Tensor of gradients at current layer
float learning_rates, ///< Learning rate
float momentum ///< Momentum multiplier
)
{
const int n = get_global_id(0);
int inputs = get_global_size(0);
const int shift = n * 7;
float grad = matrix_g[n];
//---
float2 delta = learning_rates * grad * (float2)(options[shift + 2], 1) + momentum * (float2)(options[shift + 5], options[shift + 6]);
if(!isnan(delta.s0) && !isnan(delta.s1))
{
options[shift + 5] = delta.s0;
if(delta.s0 * grad > 0)
options[shift + 3] = clamp(options[shift + 3] + delta.s0, -MAX_WEIGHT, MAX_WEIGHT);
if(delta.s1 * grad > 0)
options[shift + 6] = delta.s1;
options[shift + 4] += delta.s1 - learning_rates * (l1 * sign(options[shift + 4]) + l2 * options[shift + 4] / inputs);
}
}
//+------------------------------------------------------------------+
///\ingroup neuron_opt Batch normalization Neuron Adam optimization Updating options kernel
/// Describes the process of Adam optimization options for the Batch normalization Neuron (#CNeuronBatchNormOCL).
///\details Detailed description on the link.
//+------------------------------------------------------------------+
__kernel void UpdateBatchOptionsAdam(__global float *options, ///<[in,out] Options matrix m*9, where m - Number of neurons in previous layer
__global float *matrix_g, ///<[in] Tensor of gradients at current layer
const float l, ///< Learning rate
const float b1, ///< First momentum multiplier
const float b2 ///< Second momentum multiplier
)
{
const int n = get_global_id(0);
int inputs = get_global_size(0);
const int shift = n * 9;
float grad = matrix_g[n];
//---
float2 mt = b1 * (float2)(options[shift + 5], options[shift + 6]) + (1 - b1) * (float2)(grad * options[shift + 2] - (l1 * sign(options[shift + 3]) + l2 * options[shift + 3]), grad - (l1 * sign(options[shift + 4]) + l2 * options[shift + 4]));
float2 vt = b2 * (float2)(options[shift + 7], options[shift + 8]) + (1 - b2) * pow((float2)(grad * options[shift + 2], grad), 2);
float2 delta = l * mt / sqrt(vt + 1.0e-37f);
if(isnan(delta.s0) || isnan(delta.s1))
return;
float2 weight = (float2)(options[shift + 3], options[shift + 4]) + delta - l * (l1 * sign((float2)(options[shift + 3], options[shift + 4])) + l2 * (float2)(options[shift + 3], options[shift + 4]) / inputs);
//---
if(!isnan(weight.s0) && !isnan(weight.s1))
{
if(delta.s0 * grad > 0)
options[shift + 3] = weight.s0;
if(delta.s1 * grad > 0)
options[shift + 4] = weight.s1;
options[shift + 5] = mt.s0;
options[shift + 6] = mt.s1;
options[shift + 7] = vt.s0;
options[shift + 8] = vt.s1;
}
};
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void VAE_FeedForward(__global float* inputs,
__global float* random,
__global float* outputs
)
{
uint i = (uint)get_global_id(0);
uint total = (uint)get_global_size(0);
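//--- Reparameterization trick: inputs holds [mean | log(variance)]; output = mean + exp(0.5 * log(variance)) * eps, with eps ~ N(0,1) taken from `random`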
outputs[i] = inputs[i] + exp(0.5f * inputs[i + total]) * random[i];
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void VAE_CalcHiddenGradient(__global float* inputs,
__global float* inp_grad,
__global float* random,
__global float* gradient,
const float kld_mult
)
{
uint i = (uint)get_global_id(0);
uint total = (uint)get_global_size(0);
float kld = kld_mult * 0.5f * (inputs[i + total] - exp(inputs[i + total]) - pow(inputs[i], 2.0f) + 1);
inp_grad[i] = gradient[i] + kld * inputs[i];
inp_grad[i + total] = 0.5f * (gradient[i] * random[i] * exp(0.5f * inputs[i + total]) - kld * (1 - exp(inputs[i + total]))) ;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void LSTM_FeedForward(__global float* inputs, int inputs_size,
__global float* weights,
__global float* concatenated,
__global float* memory,
__global float* output
)
{
uint id = (uint)get_global_id(0);
uint total = (uint)get_global_size(0);
uint id2 = (uint) get_local_id(1);
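//--- id indexes the hidden unit, id2 selects the gate: 0 - forget, 1 - input, 2 - output, 3 - new content; the first three use a sigmoid, the last one tanh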
//---
float sum = 0;
uint shift = (id + id2 * total) * (total + inputs_size + 1);
for(uint i = 0; i < total; i += 4)
{
if(total - i > 4)
sum += dot((float4)(output[i], output[i + 1], output[i + 2], output[i + 3]),
(float4)(weights[shift + i], weights[shift + i + 1], weights[shift + i + 2], weights[shift + i + 3]));
else
for(uint k = i; k < total; k++)
sum += output[k] * weights[shift + k];
}
//---
shift += total;
for(uint i = 0; i < inputs_size; i += 4)
{
if(inputs_size - i > 4)
sum += dot((float4)(inputs[i], inputs[i + 1], inputs[i + 2], inputs[i + 3]),
(float4)(weights[shift + i], weights[shift + i + 1], weights[shift + i + 2], weights[shift + i + 3]));
else
for(uint k = i; k < inputs_size; k++)
sum += inputs[k] * weights[shift + k];
}
sum += weights[shift + inputs_size];
if(id2 < 3)
concatenated[id2 * total + id] = 1.0f / (1.0f + exp(-sum));
else
concatenated[id2 * total + id] = tanh(sum);
//---
barrier(CLK_LOCAL_MEM_FENCE);
if(id2 == 0)
{
float mem = memory[id + total] = memory[id];
float fg = concatenated[id];
float ig = concatenated[id + total];
float og = concatenated[id + 2 * total];
float nc = concatenated[id + 3 * total];
//---
memory[id] = mem = mem * fg + ig * nc;
output[id] = og * tanh(mem);
}
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void LSTM_ConcatenatedGradient(__global float* gradient,
__global float* concatenated_gradient,
__global float* memory,
__global float* concatenated
)
{
uint id = get_global_id(0);
uint total = get_global_size(0);
float t = tanh(memory[id]);
concatenated_gradient[id + 2 * total] = gradient[id] * t; //output gate
float memory_gradient = gradient[id] * concatenated[id + 2 * total];
memory_gradient *= 1 - pow(t, 2.0f);
concatenated_gradient[id + 3 * total] = memory_gradient * concatenated[id + total]; //new content
concatenated_gradient[id + total] = memory_gradient * concatenated[id + 3 * total]; //input gate
concatenated_gradient[id] = memory_gradient * memory[id + total]; //forget gate
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void LSTM_HiddenGradient(__global float* concatenated_gradient,
__global float* inputs_gradient,
__global float* weights_gradient,
__global float* hidden_state,
__global float* inputs,
__global float* weights,
__global float* output,
const int hidden_size,
const int inputs_size
)
{
uint id = get_global_id(0);
uint total = get_global_size(0);
uint weights_step = hidden_size + inputs_size + 1;
for(int i = id; i < (hidden_size + inputs_size); i += total)
{
float inp = 0;
if(i < hidden_size)
{
inp = hidden_state[i];
hidden_state[i] = output[i];
}
else
{
inp = inputs[i - hidden_size];
float grad = 0;
for(uint g = 0; g < 3 * hidden_size; g++)
{
float temp = concatenated_gradient[g];
grad += temp * (1 - temp) * weights[i + g * weights_step];
}
for(uint g = 3 * hidden_size; g < 4 * hidden_size; g++)
{
float temp = concatenated_gradient[g];
grad += temp * (1 - pow(temp, 2.0f)) * weights[i + g * weights_step];
}
inputs_gradient[i - hidden_size] = grad;
}
//---
for(uint g = 0; g < 3 * hidden_size; g++)
{
float temp = concatenated_gradient[g];
weights_gradient[i + g * weights_step] = temp * (1 - temp) * inp;
}
for(uint g = 3 * hidden_size; g < 4 * hidden_size; g++)
{
float temp = concatenated_gradient[g];
weights_gradient[i + g * weights_step] = temp * (1 - pow(temp, 2.0f)) * inp;
}
}
//---
for(int i = id; i < 4 * hidden_size; i += total)
{
float temp = concatenated_gradient[i];
if(i < 3 * hidden_size)
weights_gradient[(i + 1) * weights_step - 1] = temp * (1 - temp);
else
weights_gradient[(i + 1) * weights_step - 1] = temp * (1 - pow(temp, 2.0f));
}
}
//+------------------------------------------------------------------+
///\ingroup LSTM_opt LSTM Adam Updating Weights Calculation kernel
/// Describes the process of Adam optimization weights for the Neuron LSTM (#CNeuronLSTMOCL).
//+------------------------------------------------------------------+
__kernel void LSTM_UpdateWeightsAdam(__global float *weights, ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
__global float *weights_gradient, ///<[in] Tensor of gradients at current layer
__global float *matrix_m, ///<[in,out] Matrix of first momentum
__global float *matrix_v, ///<[in,out] Matrix of second momentum
const float l, ///< Learning rate
const float b1, ///< First momentum multiplier
const float b2 ///< Second momentum multiplier
)
{
const uint id = get_global_id(0);
const uint total = get_global_size(0);
const uint id1 = get_global_id(1);
const uint wi = id1 * total + id;
float g = weights_gradient[wi];
float mt = b1 * matrix_m[wi] + (1 - b1) * g;
float vt = b2 * matrix_v[wi] + (1 - b2) * pow(g, 2);
float delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weights[wi]) + l2 * weights[wi] / total));
weights[wi] = clamp(weights[wi] + delta, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[wi] = mt;
matrix_v[wi] = vt;
}
//+------------------------------------------------------------------+
//| |
//+------------------------------------------------------------------+
__kernel void SoftMax_FeedForward(__global float *inputs,
__global float *outputs,
const int total)
{
uint i = (uint)get_global_id(0);
uint l = (uint)get_local_id(0);
uint h = (uint)get_global_id(1);
uint ls = min((uint)get_local_size(0), (uint)256);
uint shift_head = h * total;
//---
__local float temp[256];
uint count = 0;
if(l < 256)
do
{
uint shift = shift_head + count * ls + l;
temp[l] = (count > 0 ? temp[l] : 0) + (shift < ((h + 1) * total) ? exp(inputs[shift]) : 0);
count++;
}
while((count * ls + l) < total);
barrier(CLK_LOCAL_MEM_FENCE);
count = min(ls, (uint)total);
do
{
count = (count + 1) / 2;
if(l < 256)
temp[l] += (l < count && (l + count) < total ? temp[l + count] : 0);
if(l + count < 256)
temp[l + count] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
}
while(count > 1);
//---
float sum = temp[0];
if(sum != 0)
{
count = 0;
while((count * ls + l) < total)
{
uint shift = shift_head + count * ls + l;
if(shift < ((h + 1)*total))
outputs[shift] = exp(inputs[shift]) / (sum + 1e-37f);
count++;
}
}
}
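//--- Per head h the kernel computes outputs[i] = exp(inputs[i]) / sum_j exp(inputs[j]),
//--- accumulating the denominator with a work-group reduction in local memory (temp[256]).
//--- A small numeric illustration (rounded values, for intuition only):
//---   inputs  = {1, 2, 3}  ->  exp = {2.718, 7.389, 20.086},  sum = 30.193
//---   outputs = {0.090, 0.245, 0.665}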
//+------------------------------------------------------------------+
//| SoftMax Hidden Gradient                                          |
//+------------------------------------------------------------------+
__kernel void SoftMax_HiddenGradient(__global float* outputs,
__global float* output_gr,
__global float* input_gr)
{
size_t i = get_global_id(0);
size_t outputs_total = get_global_size(0);
size_t h = get_global_id(1);
uint shift = h * outputs_total;
float output = outputs[shift + i];
float result = 0;
for(int j = 0; j < outputs_total ; j++)
result += outputs[shift + j] * output_gr[shift + j] * ((float)(i == j) - output);
input_gr[shift + i] = result;
}
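//--- The loop above is the softmax Jacobian-vector product:
//---   d_input[i] = sum_j output[j] * d_output[j] * (delta(i,j) - output[i])
//--- where delta(i,j) is expressed as (float)(i == j).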
//+------------------------------------------------------------------+
//| SoftMax Output Gradient                                          |
//+------------------------------------------------------------------+
__kernel void SoftMax_OutputGradient(__global float* outputs,
__global float* targets,
__global float* output_gr)
{
size_t i = get_global_id(0);
output_gr[i] = targets[i] / (outputs[i] + 1e-37f);
}
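//--- This is the derivative of the log-likelihood sum_i targets[i]*log(outputs[i]) with
//--- respect to the softmax outputs, d_output[i] = targets[i] / outputs[i];
//--- the 1e-37f term only guards against division by zero.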
//+------------------------------------------------------------------+
//| FQF Cosine Embedding                                             |
//+------------------------------------------------------------------+
__kernel void FQF_Cosine(__global float* softmax,
__global float* output)
{
size_t i = get_global_id(0);
size_t total = get_global_size(0);
size_t action = get_global_id(1);
int shift = action * total;
//---
float result = 0;
for(int it = 0; it < i; it++)
result += softmax[shift + it];
result += softmax[shift + i] / 2.0f;
output[shift + i] = cos(i * M_PI_F * result);
}
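//--- The running sum converts softmax[] into a quantile level
//---   tau_i = sum_{k < i} softmax[k] + softmax[i] / 2
//--- (the midpoint of the i-th probability bin), and the output is the cosine embedding
//--- cos(i * pi * tau_i) used to condition the FQF quantile network.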
//+------------------------------------------------------------------+
//| FQF Output                                                       |
//+------------------------------------------------------------------+
__kernel void FQF_Output(__global float* quantiles,
__global float* delta_taus,
__global float* output,
int total)
{
size_t action = get_global_id(0);
int shift = action * total;
//---
float result = 0;
for(int i = 0; i < total; i++)
result += quantiles[shift + i] * delta_taus[shift + i];
output[action] = result;
}
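//--- The action value is the expectation over the learned quantile function:
//---   output[action] = sum_i quantiles[action*total + i] * delta_taus[action*total + i]
//--- i.e. each quantile value is weighted by the width of its probability bin.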
//+------------------------------------------------------------------+
//| FQF Output Gradient                                              |
//+------------------------------------------------------------------+
__kernel void FQF_OutputGradient(__global float* quantiles,
__global float* delta_taus,
__global float* output_gr,
__global float* quantiles_gr,
__global float* taus_gr
)
{
size_t i = get_global_id(0);
size_t total = get_global_size(0);
size_t action = get_global_id(1);
int shift = action * total;
//---
float gradient = output_gr[action];
quantiles_gr[shift + i] = gradient * delta_taus[shift + i];
taus_gr[shift + i] = gradient * quantiles[shift + i];
}
//+------------------------------------------------------------------+
//| FQF Quantile Gradient                                            |
//+------------------------------------------------------------------+
__kernel void FQF_QuantileGradient(__global float* state_embeding,
__global float* taus_embeding,
__global float* quantiles_gr,
__global float* state_gr,
__global float* taus_gr
)
{
size_t i = get_global_id(0);
size_t total = get_global_size(0);
size_t action = get_global_id(1);
int shift = action * total;
//---
float gradient = quantiles_gr[shift + i];
state_gr[shift + i] = gradient * taus_embeding[shift + i];
taus_gr[shift + i] = gradient * state_embeding[shift + i];
}
//+------------------------------------------------------------------+
//| FQF Cosine Gradient                                              |
//+------------------------------------------------------------------+
__kernel void FQF_CosineGradient(__global float* softmax,
__global float* output_gr,
__global float* softmax_gr
)
{
size_t i = get_global_id(0);
size_t total = get_global_size(0);
size_t action = get_global_id(1);
int shift = action * total;
//---
float cumul = 0;
for(int it = 0; it < i; it++)
cumul += softmax[shift + it];
float result = -M_PI_F * i * sin(M_PI_F * i * (cumul + softmax[shift + i] / 2)) * output_gr[shift + i];
for(int it = i + 1; it < total; it++)
{
cumul += softmax[shift + it - 1];
float temp = cumul + softmax[shift + it] / 2;
result += -M_PI_F * it * sin(M_PI_F * it * temp) * output_gr[shift + it] * softmax[shift + it] / temp;
}
softmax_gr[shift + i] += result;
}
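//--- Back-propagation through the cosine embedding relies on
//---   d cos(pi*j*tau_j) / d tau_j = -pi*j*sin(pi*j*tau_j),
//--- and because every tau_j with j >= i depends on softmax[i], the kernel accumulates a
//--- contribution from the own bin i plus contributions from all later bins (the later
//--- terms carry the additional softmax/tau scaling used by this implementation).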
//+------------------------------------------------------------------+
//| Sparse Attention |
//+------------------------------------------------------------------+
__kernel void MHSparseAttentionScore(__global float *qkv, ///<[in] Matrix of Queries, Keys and Values
__global float *score, ///<[out] Matrix of Scores
int dimension, ///< Dimension of Key
float sparse ///< Sparsity coefficient (less than 1.0)
)
{
int q = get_global_id(0);
int h = get_global_id(1);
int units = get_global_size(0);
int heads = get_global_size(1);
//---
int shift_q = dimension * (h + 3 * q * heads);
int shift_s = units * (h + q * heads);
int active_units = (int)max((float)(units * sparse), min((float)units, 3.0f));
//---
float koef = sqrt((float)dimension);
if(koef < 1)
koef = 1;
float sum = 0.0f;
float min_s = 0.0f;
float max_s = 0.0f;
for(int k = 0; k < units; k++)
{
float result = 0;
int shift_k = dimension * (h + heads * (3 * k + 1));
for(int i = 0; i < dimension; i++)
{
if((dimension - i) > 4)
{
result += dot((float4)(qkv[shift_q + i], qkv[shift_q + i + 1], qkv[shift_q + i + 2], qkv[shift_q + i + 3]),
(float4)(qkv[shift_k + i], qkv[shift_k + i + 1], qkv[shift_k + i + 2], qkv[shift_k + i + 3]));
i += 3;
}
else
result += (qkv[shift_q + i] * qkv[shift_k + i]);
}
score[shift_s + k] = result;
if(k == 0)
min_s = max_s = result;
else
{
max_s = max(max_s, result);
min_s = min(min_s, result);
}
}
//---
int count = units;
float temp = max_s;
while(count > active_units)
{
count = 0;
for(int k = 0; k < units; k++)
{
float value = score[shift_s + k];
if(value < min_s)
continue;
count++;
if(value < temp && value > min_s)
temp = value;
}
if(count > active_units)
min_s = temp;
}
//---
if(max_s == 0.0f)
max_s = 1.0f;
for(int k = 0; k < units; k++)
{
float value = score[shift_s + k];
if(value < min_s)
{
score[shift_s + k] = 0.0f;
continue;
}
value = exp(value / max_s / koef);
score[shift_s + k] = value;
sum += value;
}
//---
for(int k = 0; (k < units && sum > 1); k++)
{
temp = score[shift_s + k];
if(temp == 0.0f)
continue;
score[shift_s + k] = temp / sum;
}
}
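//--- Scores start as the scaled dot products Q*K^T / sqrt(dimension); the while-loop then
//--- raises the cut-off min_s until roughly active_units = max(units*sparse, 3) keys per
//--- query remain. Surviving scores are exponentiated (additionally divided by max_s to
//--- limit the range of exp) and finally normalised by their sum (when it exceeds 1), so
//--- pruned positions keep an exact zero probability in the score matrix.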
//+------------------------------------------------------------------+
//| Sparse Attention Out                                             |
//+------------------------------------------------------------------+
__kernel void MHSparseAttentionOut(__global float *scores, ///<[in] Matrix of Scores
__global float *qkv, ///<[in] Matrix of Values
__global float *out, ///<[out] Output tensor
int dimension ///< Dimension of Value
)
{
int u = get_global_id(0);
int units = get_global_size(0);
int h = get_global_id(1);
int heads = get_global_size(1);
//---
int shift_s = units * (h + heads * u);
int shift_out = dimension * (h + heads * u);
//---
for(int d = 0; d < dimension; d++)
{
float result = 0;
for(int v = 0; v < units; v ++)
{
float cur_score = scores[shift_s + v];
if(cur_score == 0)
continue;
int shift_v = dimension * (h + heads * (3 * v + 2)) + d;
result += cur_score * qkv[shift_v];
}
out[shift_out + d] = result;
}
}
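//--- Output of one head: out[u][d] = sum_v score[u][v] * Value[v][d]. Keys with a zero
//--- score (pruned by MHSparseAttentionScore above) are skipped, which is where the
//--- saving of the sparse attention output pass comes from.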
//+------------------------------------------------------------------+
//| Feed Forward Multi-Models                                        |
//+------------------------------------------------------------------+
__kernel void FeedForwardMultiModels(__global float *matrix_w,///<[in] Weights matrix (m+1)*n, where m - number of neurons in layer and n - number of outputs (neurons in next layer)
__global float *matrix_i,///<[in] Inputs tensor
__global float *matrix_o,///<[out] Output tensor
int inputs,///< Number of inputs
int activation///< Activation type (#ENUM_ACTIVATION)
)
{
int i = get_global_id(0);
int outputs = get_global_size(0);
int m = get_global_id(1);
int models = get_global_size(1);
//---
float sum = 0;
float4 inp, weight;
int shift = (inputs + 1) * (i + outputs * m);
int shift_in = inputs * m;
int shift_out = outputs * m;
for(int k = 0; k <= inputs; k = k + 4)
{
switch(inputs - k)
{
case 0:
inp = (float4)(1, 0, 0, 0);
weight = (float4)(matrix_w[shift + k], 0, 0, 0);
break;
case 1:
inp = (float4)(matrix_i[shift_in + k], 1, 0, 0);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], 0, 0);
break;
case 2:
inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], 1, 0);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], 0);
break;
case 3:
inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], matrix_i[shift_in + k + 2], 1);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
break;
default:
inp = (float4)(matrix_i[shift_in + k], matrix_i[shift_in + k + 1], matrix_i[shift_in + k + 2], matrix_i[shift_in + k + 3]);
weight = (float4)(matrix_w[shift + k], matrix_w[shift + k + 1], matrix_w[shift + k + 2], matrix_w[shift + k + 3]);
break;
}
float d = dot(inp, weight);
if(isnan(sum + d))
continue;
sum += d;
}
if(isnan(sum))
sum = 0;
switch(activation)
{
case 0:
sum = tanh(sum);
break;
case 1:
sum = 1 / (1 + exp(-sum));
break;
case 2:
if(sum < 0)
sum *= 0.01f;
break;
default:
break;
}
matrix_o[shift_out + i] = sum;
}
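//--- Work-item (i, m) computes output neuron i of model m; all per-model buffers are laid
//--- out back to back, so for model m the offsets are
//---   weights: (inputs + 1) * (i + outputs * m),  inputs: inputs * m,  outputs: outputs * m
//--- while the dot-product and activation logic repeats the single-model FeedForward kernel.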
//+------------------------------------------------------------------+
//| Hidden Gradient Multi-Models                                     |
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradientMultiModels(__global float *matrix_w,///<[in] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
__global float *matrix_g,///<[in] Tensor of gradients at current layer
__global float *matrix_o,///<[in] Previous layer Output tensor
__global float *matrix_ig,///<[out] Tensor of gradients at previous layer
int outputs,///< Number of outputs
int activation,///< Activation type (#ENUM_ACTIVATION)
int model///< Index of the model to train (negative value - all models)
)
{
int i = get_global_id(0);
int inputs = get_global_size(0);
int m = get_global_id(1);
int models = get_global_size(1);
//---
int shift_in = inputs * m;
if(model >= 0 && model != m)
{
matrix_ig[shift_in + i] = 0;
return;
}
//---
int shift_out = outputs * m;
int shift_w = (inputs + 1) * outputs * m;
float sum = 0;
float out = matrix_o[shift_in + i];
float4 grad, weight;
for(int k = 0; k < outputs; k += 4)
{
switch(outputs - k)
{
case 1:
weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i], 0, 0, 0);
grad = (float4)(matrix_g[shift_out + k], 0, 0, 0);
break;
case 2:
grad = (float4)(matrix_g[shift_out + k], matrix_g[shift_out + k + 1], 0, 0);
weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i], matrix_w[shift_w + (k + 1) * (inputs + 1) + i], 0, 0);
break;
case 3:
grad = (float4)(matrix_g[shift_out + k], matrix_g[shift_out + k + 1], matrix_g[shift_out + k + 2], 0);
weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i], matrix_w[shift_w + (k + 1) * (inputs + 1) + i], matrix_w[shift_w + (k + 2) * (inputs + 1) + i], 0);
break;
default:
grad = (float4)(matrix_g[shift_out + k], matrix_g[shift_out + k + 1], matrix_g[shift_out + k + 2], matrix_g[shift_out + k + 3]);
weight = (float4)(matrix_w[shift_w + k * (inputs + 1) + i], matrix_w[shift_w + (k + 1) * (inputs + 1) + i], matrix_w[shift_w + (k + 2) * (inputs + 1) + i], matrix_w[shift_w + (k + 3) * (inputs + 1) + i]);
break;
}
sum += dot(grad, weight);
}
if(isnan(sum))
sum = 0;
switch(activation)
{
case 0:
out = clamp(out, -1.0f, 1.0f);
sum = clamp(sum + out, -1.0f, 1.0f) - out;
sum = sum * max(1 - pow(out, 2), 1.0e-4f);
break;
case 1:
out = clamp(out, 0.0f, 1.0f);
sum = clamp(sum + out, 0.0f, 1.0f) - out;
sum = sum * max(out * (1 - out), 1.0e-4f);
break;
case 2:
if(out < 0)
sum *= 0.01f;
break;
default:
break;
}
matrix_ig[shift_in + i] = sum;
}
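//--- When model >= 0 only that model receives gradients and all other models are zeroed.
//--- The propagated sum is then passed through the derivative of the previous layer's
//--- activation, e.g. for tanh (activation 0):
//---   grad_in = (clamp(grad + out, -1, 1) - out) * max(1 - out^2, 1.0e-4f)
//--- where the clamp keeps the corrected target inside the activation's output range.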
//+------------------------------------------------------------------+
//| Update Weights Adam Multi-Models                                 |
//+------------------------------------------------------------------+
__kernel void UpdateWeightsAdamMultiModels(__global float *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
__global const float *matrix_g, ///<[in] Tensor of gradients at current layer
__global const float *matrix_i, ///<[in] Inputs tensor
__global float *matrix_m, ///<[in,out] Matrix of first momentum
__global float *matrix_v, ///<[in,out] Matrix of second momentum
const int inputs, ///< Number of inputs
const float l, ///< Learning rate
const float b1, ///< First momentum multiplier
const float b2, ///< Second momentum multiplier
const int model ///< Index of the model to update
)
{
const int outputs = get_global_size(0);
const int i = get_global_id(0);
const int j = get_global_id(1);
const int wi = (i + outputs * model) * (inputs + 1) + j * 4;
float4 m, v, weight, inp;
int shift_in = j * 4 + inputs * model;
if((inputs + 1 - j * 4) < 0)
return;
switch(inputs + 1 - j * 4)
{
case 0:
inp = (float4)(1, 0, 0, 0);
weight = (float4)(matrix_w[wi], 0, 0, 0);
m = (float4)(matrix_m[wi], 0, 0, 0);
v = (float4)(matrix_v[wi], 0, 0, 0);
break;
case 1:
inp = (float4)(matrix_i[shift_in], 1, 0, 0);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], 0, 0);
m = (float4)(matrix_m[wi], matrix_m[wi + 1], 0, 0);
v = (float4)(matrix_v[wi], matrix_v[wi + 1], 0, 0);
break;
case 2:
inp = (float4)(matrix_i[shift_in], matrix_i[shift_in + 1], 1, 0);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], 0);
m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], 0);
v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], 0);
break;
case 3:
inp = (float4)(matrix_i[shift_in], matrix_i[shift_in + 1], matrix_i[shift_in + 2], 1);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], matrix_m[wi + 3]);
v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], matrix_v[wi + 3]);
break;
default:
inp = (float4)(matrix_i[shift_in], matrix_i[shift_in + 1], matrix_i[shift_in + 2], matrix_i[shift_in + 3]);
weight = (float4)(matrix_w[wi], matrix_w[wi + 1], matrix_w[wi + 2], matrix_w[wi + 3]);
m = (float4)(matrix_m[wi], matrix_m[wi + 1], matrix_m[wi + 2], matrix_m[wi + 3]);
v = (float4)(matrix_v[wi], matrix_v[wi + 1], matrix_v[wi + 2], matrix_v[wi + 3]);
break;
}
float4 g = (float4)(matrix_g[outputs * model + i]) * inp;
float4 mt = b1 * m + (1 - b1) * g;
float4 vt = b2 * v + (1 - b2) * pow(g, 2);
float4 delta = l * (mt / (sqrt(vt) + 1.0e-37f) - (l1 * sign(weight) + l2 * weight));
switch(min(inputs + 1 - j * 4, 3))
{
case 3:
if(delta.s3 * g.s3 > 0)
matrix_w[wi + 3] = clamp(matrix_w[wi + 3] + delta.s3, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[wi + 3] = mt.s3;
matrix_v[wi + 3] = vt.s3;
case 2:
if(delta.s2 * g.s2 > 0)
matrix_w[wi + 2] = clamp(matrix_w[wi + 2] + delta.s2, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[wi + 2] = mt.s2;
matrix_v[wi + 2] = vt.s2;
case 1:
if(delta.s1 * g.s1 > 0)
matrix_w[wi + 1] = clamp(matrix_w[wi + 1] + delta.s1, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[wi + 1] = mt.s1;
matrix_v[wi + 1] = vt.s1;
case 0:
if(delta.s0 * g.s0 > 0)
matrix_w[wi] = clamp(matrix_w[wi] + delta.s0, -MAX_WEIGHT, MAX_WEIGHT);
matrix_m[wi] = mt.s0;
matrix_v[wi] = vt.s0;
break;
}
}
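//--- Unlike the single-model Adam kernels, a weight is only moved here when the proposed
//--- step agrees in sign with the gradient (delta * g > 0); the momentum buffers matrix_m
//--- and matrix_v are refreshed in every case, and the step itself follows the same
//--- m_t / v_t formulas as LSTM_UpdateWeightsAdam above.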
//+------------------------------------------------------------------+