in maga_transformer/cpp/devices/arm_impl/ArmLayerNormOp.cc [988:1464]
LayernormOutput ArmCpuDevice::layernorm(const LayernormParams& params) {
BufferPtr input = params.input;
BufferPtr norm_output = input;
const auto& weights = params.norm_weight;  // before_norm_output is used for pre-norm; not implemented yet
void* gamma = weights ? weights->get().gamma.get()->data() : nullptr;
void* beta = (weights && weights->get().beta) ? weights->get().beta.get()->data() : nullptr;
const auto eps = params.eps;
void* before_norm_output = params.before_norm_output ? params.before_norm_output->data() : nullptr;
void* residual = params.residual1 ? params.residual1->get().data() : nullptr;
void* bias = params.bias.has_value() ? params.bias->get().data() : nullptr;
bool is_output = (params.residual1.has_value() || params.bias.has_value());
int numThreads = omp_get_max_threads();  // omp_get_num_threads() returns 1 outside a parallel region
const auto norm_type = params.norm_type;
int m = input->shape()[0];
int n = input->shape()[1];
const auto data_type = input->type();
if (!params.is_inplace && params.qscheme == QScheme::NoQuantize) {
norm_output = allocateBufferLike(*params.input);
} else if (params.qscheme == QScheme::Qint8PerToken) {
throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
}
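// Note: norm_output aliases the input buffer (in-place) unless a separate output
// buffer was just allocated above for the non-inplace, unquantized case.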
int convert_gamma = 0;
int convert_beta = 0;
int convert_bias = 0;
if (data_type == DataType::TYPE_FP32) {
if (gamma) {
if (weights->get().gamma.get()->type() == DataType::TYPE_FP16) {
convert_gamma = 1;
}
}
if (beta) {
if (weights->get().beta.get()->type() == DataType::TYPE_FP16) {
convert_beta = 1;
}
}
if (bias) {
if (params.bias->get().type() == DataType::TYPE_FP16) {
convert_bias = 1;
}
}
}
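// The convert_* flags mark the case where the activation is fp32 but gamma/beta/bias
// arrived as fp16; those tensors are widened to fp32 below so the fp32 kernels can use them.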
// Dispatch table for BERT-style layernorm:
//   before_norm_output | params.return_normed_output | bias/residual present | result
//   any                | any                         | no                    | layernorm(input)               -> normed_output
//   null               | any                         | yes                   | layernorm(input+bias+residual) -> normed_output
//   set                | true                        | yes                   | layernorm(input+bias+residual) -> normed_output and before_norm_output
//   set                | false                       | yes                   | input+bias+residual            -> before_norm_output,
//                      |                             |                       | layernorm(input+bias+residual) -> normed_output
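// Rough per-row sketch of what the dispatched layerNorm_* kernels compute
// (illustrative only; the actual arithmetic lives in the kernels):
//   x[j]      = input[i][j] (+ bias[j]) (+ residual[i][j])
//   mean      = (1/n) * sum_j x[j];   var = (1/n) * sum_j (x[j] - mean)^2
//   out[i][j] = (x[j] - mean) / sqrt(var + eps) * gamma[j] (+ beta[j])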
if (norm_type == NormType::layernorm && (convert_gamma || convert_beta || convert_bias)) {
    // Widen any fp16 gamma/beta/bias to fp32 once, then dispatch exactly like the
    // pure-fp32 layernorm path further below.
    float* gamma_converted = nullptr;
    float* beta_converted  = nullptr;
    float* bias_converted  = nullptr;
    if (convert_gamma) {
        gamma_converted = new float[n];
        convert_fp16_to_float((__fp16*)gamma, gamma_converted, n);
    }
    if (convert_beta) {
        beta_converted = new float[n];
        convert_fp16_to_float((__fp16*)beta, beta_converted, n);
    }
    if (convert_bias) {
        bias_converted = new float[n];
        convert_fp16_to_float((__fp16*)bias, bias_converted, n);
    }
    float* gamma_f = convert_gamma ? gamma_converted : (float*)gamma;
    float* beta_f  = convert_beta  ? beta_converted  : (float*)beta;
    float* bias_f  = convert_bias  ? bias_converted  : (float*)bias;
    bool unit_gamma = !gamma_f || std::all_of(gamma_f, gamma_f + n, [](float value) { return value == 1.0f; });

    if (!is_output) { // no bias/residual: layernorm(input) -> normed_output
        if (unit_gamma) {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
            for (int i = 0; i < m; i++) {
                layerNorm_Nogamma(n, (float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta_f, eps);
            }
        } else {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
            for (int i = 0; i < m; i++) {
                layerNorm(n, (float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_f, beta_f, eps);
            }
        }
    } else if (!before_norm_output) { // add bias/residual, then layernorm -> normed_output only
        if (unit_gamma) {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
            for (int i = 0; i < m; i++) {
                layerNorm_Nogamma_isoutput(n, (float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta_f,
                                           (residual != nullptr) ? (float*)residual + i*n : nullptr, bias_f, eps);
            }
        } else {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
            for (int i = 0; i < m; i++) {
                layerNorm_isoutput(n, (float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_f, beta_f,
                                   (residual != nullptr) ? (float*)residual + i*n : nullptr, bias_f, eps);
            }
        }
    } else if (params.return_normed_output) { // normed result is also copied into before_norm_output
        if (unit_gamma) {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
            for (int i = 0; i < m; i++) {
                layerNorm_Nogamma_isoutput(n, (float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta_f,
                                           (residual != nullptr) ? (float*)residual + i*n : nullptr, bias_f, eps);
                if (before_norm_output != norm_output->data())
                    std::memcpy((float*)before_norm_output + i*n, (float*)norm_output->data() + i*n, n * sizeof(float));
            }
        } else {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
            for (int i = 0; i < m; i++) {
                layerNorm_isoutput(n, (float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_f, beta_f,
                                   (residual != nullptr) ? (float*)residual + i*n : nullptr, bias_f, eps);
                if (before_norm_output != norm_output->data())
                    std::memcpy((float*)before_norm_output + i*n, (float*)norm_output->data() + i*n, n * sizeof(float));
            }
        }
    } else { // unnormed sum -> before_norm_output, normed result -> normed_output
        if (unit_gamma) {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
            for (int i = 0; i < m; i++) {
                layerNorm_Nogamma_isoutput_unnormedout(n, (float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta_f,
                                                       (residual != nullptr) ? (float*)residual + i*n : nullptr, bias_f,
                                                       (float*)before_norm_output + i*n, eps);
            }
        } else {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
            for (int i = 0; i < m; i++) {
                layerNorm_isoutput_unnormedout(n, (float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_f, beta_f,
                                               (residual != nullptr) ? (float*)residual + i*n : nullptr, bias_f,
                                               (float*)before_norm_output + i*n, eps);
            }
        }
    }
    // Free the temporary fp32 copies (delete[] on nullptr is a no-op).
    delete[] gamma_converted;
    delete[] beta_converted;
    delete[] bias_converted;
    return LayernormOutput({norm_output, params.before_norm_output});
}
// fp16 arithmetic accumulates rounding error, so fp16 inputs are first widened to fp32 and the fp32 kernels are used.
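// Rough per-row sketch of the RMSNorm_* kernels (illustrative only; the actual
// arithmetic lives in the kernels):
//   x[j]      = input[i][j] (+ bias[j]) (+ residual[i][j])   // *_isoutput variants
//   rms       = sqrt((1/n) * sum_j x[j]^2 + eps)
//   out[i][j] = x[j] / rms * gamma[j] (+ beta[j])
// The *_isoutput variants can also store the unnormed sum x into before_norm_output.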
if(norm_type == NormType::rmsnorm){
if (!weights.has_value()) { // no norm weight: norm_output = input + residual (+ bias)
if (data_type == DataType::TYPE_FP32){
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i = 0 ; i<m ; i++){
add_residual_bias_float((float*)norm_output->data()+i*n, (float*)input->data()+i*n, (bool)residual ? (float*)residual+i*n : nullptr, (float*)bias, n);
}
}
else if (data_type == DataType::TYPE_FP16){
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i = 0 ; i<m ; i++){
add_residual_bias_fp16((__fp16*)norm_output->data()+i*n, (__fp16*)input->data()+i*n, (bool)residual ? (__fp16*)residual+i*n : nullptr, (__fp16*)bias, n);
}
}
else {
throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
}
return LayernormOutput({norm_output, params.before_norm_output});
}
if (data_type == DataType::TYPE_FP32 || data_type == DataType::TYPE_FP16) {
if(!is_output && (!before_norm_output || before_norm_output != norm_output->data())){ // plain RMSNorm: no bias/residual to add and no separate pre-norm output requested
if ((data_type == DataType::TYPE_FP32&&(!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })))
||(data_type == DataType::TYPE_FP16&&(!gamma || std::all_of((__fp16 *)gamma, (__fp16 *)gamma + n, [](__fp16 value) { return value == 1.0; })))) {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
if(data_type == DataType::TYPE_FP16){ // fp16 input: widen to fp32, normalize, then narrow the result back to fp16
float* input_converted = new float[n];
float* output_converted = new float[n];
float* beta_converted = new float[n];
convert_fp16_to_float((__fp16*)input->data()+i*n,input_converted,n);
convert_fp16_to_float((__fp16*)norm_output->data()+i*n,output_converted,n);
if(beta) convert_fp16_to_float((__fp16*)beta,beta_converted,n);
RMSNorm_Nogamma(n,input_converted, output_converted, beta!= nullptr ? beta_converted : nullptr, eps);
convert_float_to_fp16(output_converted,(__fp16*)norm_output->data()+i*n,n);
delete[] input_converted;
delete[] output_converted;
delete[] beta_converted;
}
else if(data_type == DataType::TYPE_FP32){
RMSNorm_Nogamma(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta!= nullptr ? (float*)beta : nullptr, eps);
}
else throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
}
return LayernormOutput({norm_output, params.before_norm_output});
}
else{
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
if(data_type == DataType::TYPE_FP16){ // fp16 input: widen to fp32, normalize, then narrow the result back to fp16
float* input_converted = new float[n];
float* output_converted = new float[n];
float* beta_converted = new float[n];
float* gamma_converted = new float[n];
convert_fp16_to_float((__fp16*)input->data()+i*n,input_converted,n);
convert_fp16_to_float((__fp16*)norm_output->data()+i*n,output_converted,n);
if(beta) convert_fp16_to_float((__fp16*)beta,beta_converted,n);
convert_fp16_to_float((__fp16*)gamma,gamma_converted,n);
RMSNorm(n, input_converted, output_converted, gamma_converted, beta!= nullptr ? beta_converted : nullptr, eps);
convert_float_to_fp16(output_converted,(__fp16*)norm_output->data()+i*n,n);
delete[] input_converted;
delete[] output_converted;
delete[] gamma_converted;
delete[] beta_converted;
}
else if(data_type == DataType::TYPE_FP32){
RMSNorm(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*) gamma ,beta!= nullptr ? (float*)beta : nullptr, eps);
}
else throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
}
return LayernormOutput({norm_output, params.before_norm_output});
}
}
else{
if ((data_type == DataType::TYPE_FP32&&(!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })))
||(data_type == DataType::TYPE_FP16&&(!gamma || std::all_of((__fp16 *)gamma, (__fp16 *)gamma + n, [](__fp16 value) { return value == 1.0; })))) {
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
if(data_type == DataType::TYPE_FP16){
float* input_converted = new float[n];
float* output_converted = new float[n];
float* beta_converted = new float[n];
float* bias_converted = new float[n];
float* before_norm_output_converted = new float[n];
float* residual_converted = new float[n];
convert_fp16_to_float((__fp16*)input->data()+i*n,input_converted,n);
convert_fp16_to_float((__fp16*)norm_output->data()+i*n,output_converted,n);
if(beta) convert_fp16_to_float((__fp16*)beta,beta_converted,n);
if(before_norm_output && before_norm_output != norm_output->data())convert_fp16_to_float((__fp16*)before_norm_output+i*n,before_norm_output_converted,n);
if(residual)convert_fp16_to_float((__fp16*)residual + i*n,residual_converted,n);
if(bias)convert_fp16_to_float((__fp16*)bias,bias_converted,n); // bias is a per-channel [n] vector, no row offset
RMSNorm_Nogamma_isoutput(n,
                         (before_norm_output && before_norm_output != norm_output->data()) ? before_norm_output_converted : nullptr,
                         input_converted,
                         output_converted,
                         beta != nullptr ? beta_converted : nullptr,
                         (residual != nullptr) ? residual_converted : nullptr,
                         (bias != nullptr) ? bias_converted : nullptr,
                         eps);
convert_float_to_fp16(output_converted,(__fp16*)norm_output->data()+i*n,n);
if(before_norm_output && before_norm_output != norm_output->data()) convert_float_to_fp16(before_norm_output_converted,(__fp16*)before_norm_output+i*n,n);
delete[] input_converted;
delete[] output_converted;
delete[] beta_converted ;
delete[] bias_converted ;
delete[] before_norm_output_converted ;
delete[] residual_converted;
}
else{
RMSNorm_Nogamma_isoutput(n,
                         (before_norm_output && before_norm_output != norm_output->data()) ? (float*)before_norm_output + i*n : nullptr,
                         (float*)input->data()+i*n,
                         (float*)norm_output->data()+i*n,
                         (float*)beta,
                         (residual != nullptr) ? (float*)residual + i*n : nullptr,
                         (bias != nullptr) ? (float*)bias : nullptr,
                         eps);
}
}
return LayernormOutput({norm_output, params.before_norm_output});
}
else{
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
if(data_type == DataType::TYPE_FP16){
float* input_converted = new float[n];
float* output_converted = new float[n];
float* gamma_converted = new float[n];
float* beta_converted = new float[n];
float* bias_converted = new float[n];
float* before_norm_output_converted = new float[n];
float* residual_converted = new float[n];
convert_fp16_to_float((__fp16*)input->data()+i*n,input_converted,n);
convert_fp16_to_float((__fp16*)gamma,gamma_converted,n);
if(beta) convert_fp16_to_float((__fp16*)beta,beta_converted,n);
if(before_norm_output && before_norm_output != norm_output->data()) convert_fp16_to_float((__fp16*)before_norm_output+i*n,before_norm_output_converted,n);
if(residual)convert_fp16_to_float((__fp16*)residual + i*n,residual_converted,n);
if(bias)convert_fp16_to_float((__fp16*)bias,bias_converted,n);
RMSNorm_isoutput(n,
                 (before_norm_output && before_norm_output != norm_output->data()) ? before_norm_output_converted : nullptr,
                 input_converted,
                 output_converted,
                 gamma_converted,
                 (beta != nullptr) ? beta_converted : nullptr,
                 (residual != nullptr) ? residual_converted : nullptr,
                 (bias != nullptr) ? bias_converted : nullptr,
                 eps);
convert_float_to_fp16(output_converted,(__fp16*)norm_output->data()+i*n,n);
if(before_norm_output && before_norm_output != norm_output->data()) convert_float_to_fp16(before_norm_output_converted,(__fp16*)before_norm_output+i*n,n);
delete[] input_converted;
delete[] output_converted;
delete[] gamma_converted;
delete[] beta_converted;
delete[] bias_converted;
delete[] before_norm_output_converted;
delete[] residual_converted;
}
else{
    // fp32 path: write the unnormed sum straight into before_norm_output when requested,
    // instead of going through a temporary row buffer and memcpy.
    RMSNorm_isoutput(n,
                     (before_norm_output && before_norm_output != norm_output->data()) ? (float*)before_norm_output + i*n : nullptr,
                     (float*)input->data()+i*n,
                     (float*)norm_output->data()+i*n,
                     (float*)gamma,
                     (beta != nullptr) ? (float*)beta : nullptr,
                     (residual != nullptr) ? (float*)residual + i*n : nullptr,
                     (bias != nullptr) ? (float*)bias : nullptr,
                     eps);
}
}
return LayernormOutput({norm_output, params.before_norm_output});
}
}
}
else throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
}
// **********************************************
// Dispatch table (same as above), repeated for the pure fp32 layernorm path:
//   before_norm_output | params.return_normed_output | bias/residual present | result
//   any                | any                         | no                    | layernorm(input)               -> normed_output
//   null               | any                         | yes                   | layernorm(input+bias+residual) -> normed_output
//   set                | true                        | yes                   | layernorm(input+bias+residual) -> normed_output and before_norm_output
//   set                | false                       | yes                   | input+bias+residual            -> before_norm_output,
//                      |                             |                       | layernorm(input+bias+residual) -> normed_output
// **********************************************
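// In this branch the fp16-weight conversions above did not apply, so input, gamma,
// beta and bias are used directly with no temporary fp32 copies.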
else if (norm_type == NormType::layernorm && data_type == DataType::TYPE_FP32){
if(!is_output){ // no bias/residual: layernorm(input) -> normed_output
if (!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })){
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
layerNorm_Nogamma(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta, eps);
}
return LayernormOutput({norm_output, params.before_norm_output});
} // gamma is absent or all-ones
else{
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
layerNorm(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)gamma, (float*)beta, eps);
}
return LayernormOutput({norm_output, params.before_norm_output});
}
}
else if(!before_norm_output){ // add bias/residual, then layernorm -> normed_output only
if (!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })){
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
layerNorm_Nogamma_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta,(residual != nullptr) ? (float*)residual + i*n : (float*)residual,(float*)bias, eps);
}
return LayernormOutput({norm_output, params.before_norm_output});
} // gamma is absent or all-ones
else{
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
layerNorm_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)gamma, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : nullptr ,(float*)bias, eps);
}
return LayernormOutput({norm_output, params.before_norm_output});
}
}
else if(params.return_normed_output){ // normed result is also copied into before_norm_output
if (!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })){
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
layerNorm_Nogamma_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual,(float*)bias, eps);
if(before_norm_output != norm_output->data())std::memcpy((float*)before_norm_output + i*n,(float*)norm_output->data() + i*n,n * sizeof(float));
}
return LayernormOutput({norm_output, params.before_norm_output});
} // gamma is absent or all-ones
else{
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
layerNorm_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)gamma, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual ,(float*)bias, eps);
if(before_norm_output != norm_output->data()) std::memcpy((float*)before_norm_output + i*n,(float*)norm_output->data() + i*n,n * sizeof(float));
}
return LayernormOutput({norm_output, params.before_norm_output});
}
}
else{ // unnormed sum -> before_norm_output, normed result -> normed_output
if (!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })){
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
layerNorm_Nogamma_isoutput_unnormedout(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual,(float*)bias, ((float*)before_norm_output + i*n), eps);
}
return LayernormOutput({norm_output, params.before_norm_output});
} // gamma is absent or all-ones
else{
#pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
for(int i=0;i<m;i++){
layerNorm_isoutput_unnormedout(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)gamma, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual ,(float*)bias, ((float*)before_norm_output+ i*n),eps);
}
return LayernormOutput({norm_output, params.before_norm_output});
}
}
}
else throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
}