LayernormOutput ArmCpuDevice::layernorm()

in maga_transformer/cpp/devices/arm_impl/ArmLayerNormOp.cc [988:1464]


LayernormOutput ArmCpuDevice::layernorm(const LayernormParams& params) {
    BufferPtr   input       = params.input;
    BufferPtr   norm_output = input;
    const auto& weights     = params.norm_weight;
    void*       gamma       = weights ? weights->get().gamma.get()->data() : nullptr;
    void*       beta        = (weights && weights->get().beta) ? weights->get().beta.get()->data() : nullptr;
    const auto  eps         = params.eps;

    // before_norm_output is used for pre-norm; currently not implemented.
    void*       before_norm_output = params.before_norm_output ? params.before_norm_output->data() : nullptr;
    void*       residual    = params.residual1 ? params.residual1->get().data() : nullptr;
    void*       bias        = params.bias.has_value() ? params.bias->get().data() : nullptr;
    bool        is_output   = (params.residual1.has_value() || params.bias.has_value());  // residual/bias must be added before normalizing
    int         numThreads  = omp_get_max_threads();  // upper bound on threads for the parallel loops below
    const auto  norm_type   = params.norm_type;
    int         m           = input->shape()[0];
    int         n           = input->shape()[1];
    const auto data_type = input->type();
    if (!params.is_inplace && params.qscheme == QScheme::NoQuantize) {
        norm_output = allocateBufferLike(*params.input);
    } else if (params.qscheme == QScheme::Qint8PerToken) {
        throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
    }
   
    // The fp32 layernorm kernels below need fp32 gamma/beta/bias. When the activation is fp32
    // but a weight is stored as fp16, mark it for conversion first.
    int convert_gamma = 0;
    int convert_beta  = 0;
    int convert_bias  = 0;
    if (data_type == DataType::TYPE_FP32) {
        if (gamma) {
            if (weights->get().gamma.get()->type() == DataType::TYPE_FP16) {
                convert_gamma = 1;
            }
        }
        if (beta) {
            if (weights->get().beta.get()->type() == DataType::TYPE_FP16) {
                convert_beta = 1;
            }
        }
        if (bias) {
            if (params.bias->get().type() == DataType::TYPE_FP16) {
                convert_bias = 1;
            }
        }
    }
    // Dispatch table (for BERT); "." means "don't care":
    // before_norm_output   params.return_normed_output   bias/residual present
    // .  .  F :
    //     layernorm(input) -> normed_output
    // F  .  T :
    //     layernorm(input + bias + residual) -> normed_output
    // T  T  T :
    //     layernorm(input + bias + residual) -> before_norm_output
    //     layernorm(input + bias + residual) -> normed_output
    // T  F  T :
    //     (input + bias + residual) -> before_norm_output
    //     layernorm(input + bias + residual) -> normed_output
    if (norm_type == NormType::layernorm && (convert_gamma || convert_beta || convert_bias)) {
        // Layernorm with fp16-stored weights: convert gamma once up front and beta/bias per row,
        // then reuse the fp32 kernels. Note: in the bias/residual branches below, beta and bias
        // are converted only when both are stored as fp16.
        std::vector<float> gamma_converted_buf(n);  // owning fp32 copy of gamma, freed automatically
        float*             gamma_converted = gamma_converted_buf.data();
        if (gamma) {
            if (convert_gamma) {
                convert_fp16_to_float((__fp16*)gamma,gamma_converted,n);
            } else {
                for (int d = 0; d < n; ++d) {
                    gamma_converted[d] = static_cast<float>(((float*)gamma)[d]);
                }
            }
        }
        if(!is_output){//. .  F
            if (!gamma || std::all_of((float *)gamma_converted, (float *)gamma_converted + n, [](float value) { return value == 1.0f; })){
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    if(convert_beta) {
                        float* beta_converted   = new float[n];
                        convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        layerNorm_Nogamma(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta_converted, eps);
                        delete[] beta_converted;
                    } else {
                        layerNorm_Nogamma(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta, eps);
                    }
                }
                return LayernormOutput({norm_output, params.before_norm_output});          
            }  // gamma is all ones or absent
            else{
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    if(convert_beta) {
                        float* beta_converted   = new float[n];
                        convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        layerNorm(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_converted, beta_converted, eps);
                        delete[] beta_converted;
                    } else {
                        layerNorm(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_converted, (float*)beta, eps);
                    }     
                }
                return LayernormOutput({norm_output, params.before_norm_output});              
            }          
        }
        else if(!before_norm_output){  // F . T: add bias/residual, then normalize
            if (!gamma || std::all_of((float *)gamma_converted, (float *)gamma_converted + n, [](float value) { return value == 1.0f; })){
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    if (convert_beta && convert_bias) {
                        float* beta_converted   = new float[n];
                        float* bias_converted   = new float[n];
                        convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        convert_fp16_to_float((__fp16*)bias,bias_converted,n);
                        layerNorm_Nogamma_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta_converted,(residual != nullptr) ? (float*)residual + i*n : (float*)residual,bias_converted, eps);
                        delete[] beta_converted;
                        delete[] bias_converted;
                    } else {
                        layerNorm_Nogamma_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta,(residual != nullptr) ? (float*)residual + i*n : (float*)residual,(float*)bias, eps);
                    }
                }
                return LayernormOutput({norm_output, params.before_norm_output});          
            }  // gamma is all ones or absent
            else{
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    if (convert_beta && convert_bias) {
                        float* beta_converted   = new float[n];
                        float* bias_converted   = new float[n];
                        convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        convert_fp16_to_float((__fp16*)bias,bias_converted,n);
                        layerNorm_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_converted, beta_converted,(residual != nullptr) ? ((float*)residual + i*n) : nullptr ,bias_converted, eps);
                        delete[] beta_converted;
                        delete[] bias_converted;
                    } else {
                        layerNorm_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_converted, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : nullptr ,(float*)bias, eps);  // gamma may be fp16; use its fp32 copy
                    }
                }
                return LayernormOutput({norm_output, params.before_norm_output});              
            }               
        }
        else if(params.return_normed_output){// T  T  T
            if (!gamma || std::all_of((float *)gamma_converted, (float *)gamma_converted + n, [](float value) { return value == 1.0f; })){
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    if (convert_beta && convert_bias) {
                        float* beta_converted   = new float[n];
                        float* bias_converted   = new float[n];
                        convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        convert_fp16_to_float((__fp16*)bias,bias_converted,n);
                        layerNorm_Nogamma_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta_converted,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual,bias_converted, eps);  // pass the fp32-converted beta/bias
                        if(before_norm_output != norm_output->data())std::memcpy((float*)before_norm_output + i*n,(float*)norm_output->data() + i*n,n * sizeof(float));
                        delete[] beta_converted;
                        delete[] bias_converted;
                    } else {
                        layerNorm_Nogamma_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual,(float*)bias, eps);
                        if(before_norm_output != norm_output->data())std::memcpy((float*)before_norm_output + i*n,(float*)norm_output->data() + i*n,n * sizeof(float)); 
                    }
                }
                return LayernormOutput({norm_output, params.before_norm_output});          
            }  // gamma is all ones or absent
            else{
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    if (convert_beta && convert_bias) {
                        float* beta_converted   = new float[n];
                        float* bias_converted   = new float[n];
                        convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        convert_fp16_to_float((__fp16*)bias,bias_converted,n);
                        layerNorm_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_converted, beta_converted,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual ,bias_converted, eps);
                        if(before_norm_output != norm_output->data()) std::memcpy((float*)before_norm_output + i*n,(float*)norm_output->data() + i*n,n * sizeof(float)); 
                        delete[] beta_converted;
                        delete[] bias_converted;
                    } else {
                        layerNorm_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_converted, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual ,(float*)bias, eps);  // gamma may be fp16; use its fp32 copy
                        if(before_norm_output != norm_output->data()) std::memcpy((float*)before_norm_output + i*n,(float*)norm_output->data() + i*n,n * sizeof(float)); 
                    }
                }
                return LayernormOutput({norm_output, params.before_norm_output});              
            } 
        }
        else{ //T F T
            if (!gamma || std::all_of((float *)gamma_converted, (float *)gamma_converted + n, [](float value) { return value == 1.0f; })){
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    if (convert_beta && convert_bias) {
                        float* beta_converted   = new float[n];
                        float* bias_converted   = new float[n];
                        convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        convert_fp16_to_float((__fp16*)bias,bias_converted,n);
                        layerNorm_Nogamma_isoutput_unnormedout(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta_converted, (residual != nullptr) ? ((float*)residual + i*n) : (float*)residual,bias_converted, ((float*)before_norm_output + i*n), eps); 
                        delete[] beta_converted;
                        delete[] bias_converted;
                    } else {
                        layerNorm_Nogamma_isoutput_unnormedout(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual,(float*)bias, ((float*)before_norm_output + i*n), eps); 
                    }
                }
                return LayernormOutput({norm_output, params.before_norm_output});          
            }  // gamma is all ones or absent
            else{
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    if (convert_beta && convert_bias) {
                        float* beta_converted   = new float[n];
                        float* bias_converted   = new float[n];
                        convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        convert_fp16_to_float((__fp16*)bias,bias_converted,n);
                        layerNorm_isoutput_unnormedout(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_converted, beta_converted,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual ,bias_converted, ((float*)before_norm_output+ i*n),eps);
                        delete[] beta_converted;
                        delete[] bias_converted;
                    } else {
                        layerNorm_isoutput_unnormedout(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, gamma_converted, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual ,(float*)bias, ((float*)before_norm_output+ i*n),eps);  // gamma may be fp16; use its fp32 copy
                    }
                }
                return LayernormOutput({norm_output, params.before_norm_output});              
            }
        }
    } 

    // Due to the cumulative errors caused by using fp16 precision calculations, the fp16 input is first converted to fp32 before using the fp32 kernel.
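    // (convert_fp16_to_float / convert_float_to_fp16 are assumed to be element-wise casts
    //  between __fp16 and float buffers of length n, defined elsewhere in this file.)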
    if(norm_type == NormType::rmsnorm){
        if (!weights.has_value()) {//In this case, norm_output = input+residual
            if (data_type == DataType::TYPE_FP32){
            #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i = 0 ; i<m ; i++){
                    add_residual_bias_float((float*)norm_output->data()+i*n, (float*)input->data()+i*n, (bool)residual ? (float*)residual+i*n : nullptr, (float*)bias, n);
                }  
            }
            else if (data_type == DataType::TYPE_FP16){                 
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i = 0 ; i<m ; i++){
                    add_residual_bias_fp16((__fp16*)norm_output->data()+i*n, (__fp16*)input->data()+i*n, (bool)residual ? (__fp16*)residual+i*n : nullptr, (__fp16*)bias, n);
                }                
            }
            else { 
                throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
            }
            return LayernormOutput({norm_output, params.before_norm_output});
        }   
        
        if(data_type == DataType::TYPE_FP32||data_type == DataType::TYPE_FP16){
            // fp16 rows are upconverted to fp32 inside the loops so the fp32 kernels can be reused.
            if(!is_output && (!before_norm_output || before_norm_output != norm_output->data())){  // plain RMSNorm: no residual/bias to add
                if ((data_type == DataType::TYPE_FP32&&(!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })))
                  ||(data_type == DataType::TYPE_FP16&&(!gamma || std::all_of((__fp16 *)gamma, (__fp16 *)gamma + n, [](__fp16 value) { return value == 1.0; })))) {
                    #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                    for(int i=0;i<m;i++){
                        if(data_type == DataType::TYPE_FP16){//convert_float_to_fp16
                            float* input_converted  = new float[n];
                            float* output_converted = new float[n];
                            float* beta_converted   = new float[n];
                            convert_fp16_to_float((__fp16*)input->data()+i*n,input_converted,n);
                            convert_fp16_to_float((__fp16*)norm_output->data()+i*n,output_converted,n);
                            if(beta) convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                            RMSNorm_Nogamma(n,input_converted, output_converted, beta!= nullptr ? beta_converted : nullptr, eps);  
                            convert_float_to_fp16(output_converted,(__fp16*)norm_output->data()+i*n,n);
                            delete[] input_converted;
                            delete[] output_converted;
                            delete[] beta_converted;             
                        }
                        else if(data_type == DataType::TYPE_FP32){
                            RMSNorm_Nogamma(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, beta!= nullptr ? (float*)beta : nullptr, eps);                             
                        }
                        else throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);                            
                    }
                    return LayernormOutput({norm_output, params.before_norm_output}); 
                }
                else{
                    #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                    for(int i=0;i<m;i++){                  
                        if(data_type == DataType::TYPE_FP16){//convert_float_to_fp16
                            float* input_converted  = new float[n];
                            float* output_converted = new float[n];
                            float* beta_converted   = new float[n];
                            float* gamma_converted  = new float[n];                        
                            convert_fp16_to_float((__fp16*)input->data()+i*n,input_converted,n);
                            convert_fp16_to_float((__fp16*)norm_output->data()+i*n,output_converted,n);
                            if(beta) convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                            convert_fp16_to_float((__fp16*)gamma,gamma_converted,n);
                            RMSNorm(n, input_converted, output_converted, gamma_converted, beta!= nullptr ? beta_converted : nullptr, eps);
                            convert_float_to_fp16(output_converted,(__fp16*)norm_output->data()+i*n,n);
                            delete[] input_converted;
                            delete[] output_converted;
                            delete[] gamma_converted;       
                            delete[] beta_converted;                    
                        }
                        else if(data_type == DataType::TYPE_FP32){   // beta!= nullptr ? (float*)beta : nullptr
                            RMSNorm(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*) gamma ,beta!= nullptr ? (float*)beta : nullptr, eps); 
                        }
                        else throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);  
                    }    
                    return LayernormOutput({norm_output, params.before_norm_output});                 
                }
            }
            else{
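                // residual/bias must be added, or before_norm_output aliases norm_output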
                if ((data_type == DataType::TYPE_FP32&&(!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })))
                  ||(data_type == DataType::TYPE_FP16&&(!gamma || std::all_of((__fp16 *)gamma, (__fp16 *)gamma + n, [](__fp16 value) { return value == 1.0; })))) {
                    #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                    for(int i=0;i<m;i++){
                        if(data_type == DataType::TYPE_FP16){
                        float* input_converted  = new float[n];
                        float* output_converted = new float[n];
                        float* beta_converted   = new float[n];
                        float* bias_converted   = new float[n];
                        float* before_norm_output_converted = new float[n];
                        float* residual_converted = new float[n];
                        convert_fp16_to_float((__fp16*)input->data()+i*n,input_converted,n);
                        convert_fp16_to_float((__fp16*)norm_output->data()+i*n,output_converted,n);
                        if(beta) convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        if(before_norm_output && before_norm_output != norm_output->data())convert_fp16_to_float((__fp16*)before_norm_output+i*n,before_norm_output_converted,n);
                        if(residual)convert_fp16_to_float((__fp16*)residual + i*n,residual_converted,n);
                        if(bias)convert_fp16_to_float((__fp16*)bias,bias_converted,n);  // bias is per-feature (length n), not per-row
                        RMSNorm_Nogamma_isoutput(n,\
                                                (before_norm_output && before_norm_output != norm_output->data())? before_norm_output_converted : nullptr,\
                                                input_converted,\
                                                output_converted,\
                                                beta!= nullptr ? beta_converted : nullptr,\
                                                (residual != nullptr) ? residual_converted : nullptr,\
                                                (bias != nullptr) ? bias_converted:nullptr,\
                                                eps); 
                        convert_float_to_fp16(output_converted,(__fp16*)norm_output->data()+i*n,n);
                        if(before_norm_output && before_norm_output != norm_output->data()) convert_float_to_fp16(before_norm_output_converted,(__fp16*)before_norm_output+i*n,n);
                        delete[] input_converted;
                        delete[] output_converted;
                        delete[] beta_converted ;
                        delete[] bias_converted ;
                        delete[] before_norm_output_converted ;
                        delete[] residual_converted;
                        }
                        else{
                        RMSNorm_Nogamma_isoutput(n,\
                                                (before_norm_output && before_norm_output != norm_output->data())?(float*)before_norm_output+i*n : nullptr,\
                                                (float*)input->data()+i*n, \
                                                (float*)norm_output->data()+i*n, \
                                                (float*)beta, \
                                                (residual != nullptr) ? (float*)residual + i*n : nullptr,\
                                                (bias != nullptr) ?(float*)bias:nullptr, \
                                                eps);                                 
                        }
                    }
                    return LayernormOutput({norm_output, params.before_norm_output});            
                }
                else{
                    #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                    for(int i=0;i<m;i++){
                        if(data_type == DataType::TYPE_FP16){
                        float* input_converted  = new float[n];
                        float* output_converted = new float[n];
                        float* gamma_converted  = new float[n];
                        float* beta_converted   = new float[n];
                        float* bias_converted   = new float[n];
                        float* before_norm_output_converted = new float[n];
                        float* residual_converted = new float[n];
                        convert_fp16_to_float((__fp16*)input->data()+i*n,input_converted,n);
                        convert_fp16_to_float((__fp16*)gamma,gamma_converted,n);
                        if(beta) convert_fp16_to_float((__fp16*)beta,beta_converted,n);
                        if(before_norm_output && before_norm_output != norm_output->data()) convert_fp16_to_float((__fp16*)before_norm_output+i*n,before_norm_output_converted,n);
                        if(residual)convert_fp16_to_float((__fp16*)residual + i*n,residual_converted,n);
                        if(bias)convert_fp16_to_float((__fp16*)bias,bias_converted,n);

                        RMSNorm_isoutput(n,\
                                        (before_norm_output && before_norm_output != norm_output->data())? before_norm_output_converted : nullptr,\
                                        input_converted,\
                                        output_converted,\
                                        gamma_converted,\
                                        (beta!= nullptr) ? beta_converted : nullptr,\
                                        (residual != nullptr) ? residual_converted : nullptr,\
                                        (bias != nullptr) ? bias_converted:nullptr,\
                                        eps); 
                        convert_float_to_fp16(output_converted,(__fp16*)norm_output->data()+i*n,n);
                        if(before_norm_output && before_norm_output != norm_output->data()) convert_float_to_fp16(before_norm_output_converted,(__fp16*)before_norm_output+i*n,n);
                        delete[] input_converted;
                        delete[] output_converted;
                        delete[] gamma_converted;
                        delete[] beta_converted;
                        delete[] bias_converted;
                        delete[] before_norm_output_converted;
                        delete[] residual_converted;
                        }
                        else{
                        float* before_norm_output_converted = new float[n];
                        RMSNorm_isoutput(n,\
                                        (before_norm_output && before_norm_output != norm_output->data())?before_norm_output_converted : nullptr,\
                                        (float*)input->data()+i*n, \
                                        (float*)norm_output->data()+i*n, \
                                        (float*) gamma ,\
                                        (beta!= nullptr) ? (float*)beta : nullptr, \
                                        (residual != nullptr) ? (float*)residual + i*n : (float*)residual,\
                                        (bias != nullptr) ? (float*)bias:nullptr,\
                                        eps);   
                        if(before_norm_output && before_norm_output != norm_output->data()) std::memcpy((float*)before_norm_output+i*n, before_norm_output_converted, n * sizeof(float));
                        delete[] before_norm_output_converted;
                        }
                    }
                    return LayernormOutput({norm_output, params.before_norm_output});                 
                }
            }
        }
        else throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
    }
// **********************************************
//
// Dispatch table (same as above); "." means "don't care":
// before_norm_output   params.return_normed_output   bias/residual present
// .  .  F :
//     layernorm(input) -> normed_output
// F  .  T :
//     layernorm(input + bias + residual) -> normed_output
// T  T  T :
//     layernorm(input + bias + residual) -> before_norm_output
//     layernorm(input + bias + residual) -> normed_output
// T  F  T :
//     (input + bias + residual) -> before_norm_output
//     layernorm(input + bias + residual) -> normed_output
//
// **********************************************
    else if (norm_type == NormType::layernorm && data_type == DataType::TYPE_FP32){
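        // Layernorm with fp32 activations and fp32 (or absent) weights; dispatch follows the table above.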
        if(!is_output){//. .  F  
            if (!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })){

                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    layerNorm_Nogamma(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta, eps);
                }

                return LayernormOutput({norm_output, params.before_norm_output});          
            }  // gamma is all ones or absent
            else{
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    layerNorm(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)gamma, (float*)beta, eps);              
                }
                return LayernormOutput({norm_output, params.before_norm_output});              
            }          
        }
        else if(!before_norm_output){  // F . T: add bias/residual, then normalize
            if (!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })){
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    layerNorm_Nogamma_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta,(residual != nullptr) ? (float*)residual + i*n : (float*)residual,(float*)bias, eps);                 
                }
                return LayernormOutput({norm_output, params.before_norm_output});          
            }  // gamma is all ones or absent
            else{
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    layerNorm_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)gamma, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : nullptr ,(float*)bias, eps);          
                }
                return LayernormOutput({norm_output, params.before_norm_output});              
            }               
        }
        else if(params.return_normed_output){// T  T  T
            if (!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })){
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    layerNorm_Nogamma_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual,(float*)bias, eps);
                    if(before_norm_output != norm_output->data())std::memcpy((float*)before_norm_output + i*n,(float*)norm_output->data() + i*n,n * sizeof(float));                 
                }
                return LayernormOutput({norm_output, params.before_norm_output});          
            }  // gamma is all ones or absent
            else{
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    layerNorm_isoutput(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)gamma, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual ,(float*)bias, eps);
                    if(before_norm_output != norm_output->data()) std::memcpy((float*)before_norm_output + i*n,(float*)norm_output->data() + i*n,n * sizeof(float));                 
                }
                return LayernormOutput({norm_output, params.before_norm_output});              
            } 
        }
        else{ //T F T
            if (!gamma || std::all_of((float *)gamma, (float *)gamma + n, [](float value) { return value == 1.0f; })){
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    layerNorm_Nogamma_isoutput_unnormedout(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual,(float*)bias, ((float*)before_norm_output + i*n), eps);                
                }
                return LayernormOutput({norm_output, params.before_norm_output});          
            }  // gamma is all ones or absent
            else{
                #pragma omp parallel for num_threads(std::min(m,numThreads)) if(m>=2)
                for(int i=0;i<m;i++){
                    layerNorm_isoutput_unnormedout(n,(float*)input->data()+i*n, (float*)norm_output->data()+i*n, (float*)gamma, (float*)beta,(residual != nullptr) ? ((float*)residual + i*n) : (float*)residual ,(float*)bias, ((float*)before_norm_output+ i*n),eps);                
                }
                return LayernormOutput({norm_output, params.before_norm_output});              
            }
        }
    }
    else throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
}
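
The per-row kernels invoked above (layerNorm, layerNorm_isoutput, the *_unnormedout and *_Nogamma variants, RMSNorm, RMSNorm_isoutput) are defined elsewhere in ArmLayerNormOp.cc and are not part of this excerpt. As a rough guide to what they are assumed to compute, here is a minimal scalar sketch; the _ref names are illustrative only, since the real kernels are NEON-vectorized and their exact signatures may differ.

#include <cmath>

// Minimal scalar reference for the assumed row kernels (real ones are NEON-optimized).
static void layerNorm_ref(int n, const float* in, float* out,
                          const float* gamma, const float* beta, float eps) {
    float mean = 0.f;
    for (int d = 0; d < n; ++d) mean += in[d];
    mean /= n;
    float var = 0.f;
    for (int d = 0; d < n; ++d) { float diff = in[d] - mean; var += diff * diff; }
    var /= n;
    const float inv_std = 1.f / std::sqrt(var + eps);
    for (int d = 0; d < n; ++d) {
        float v = (in[d] - mean) * inv_std;
        if (gamma) v *= gamma[d];
        if (beta)  v += beta[d];
        out[d] = v;
    }
}

// The "_isoutput" variants first form x = input + residual + bias and normalize x;
// the "_unnormedout" variants additionally write the un-normalized x to before_norm_output.
static void layerNorm_isoutput_ref(int n, const float* in, float* out,
                                   const float* gamma, const float* beta,
                                   const float* residual, const float* bias,
                                   float* before_norm_output, float eps) {
    for (int d = 0; d < n; ++d) {
        float x = in[d];
        if (residual) x += residual[d];
        if (bias)     x += bias[d];
        if (before_norm_output) before_norm_output[d] = x;
        out[d] = x;                                 // use out as scratch for the pre-norm sum
    }
    layerNorm_ref(n, out, out, gamma, beta, eps);   // normalize in place
}

// RMSNorm: scale by the reciprocal root-mean-square instead of subtracting the mean.
static void RMSNorm_ref(int n, const float* in, float* out,
                        const float* gamma, const float* beta, float eps) {
    float sum_sq = 0.f;
    for (int d = 0; d < n; ++d) sum_sq += in[d] * in[d];
    const float inv_rms = 1.f / std::sqrt(sum_sq / n + eps);
    for (int d = 0; d < n; ++d) {
        float v = in[d] * inv_rms;
        if (gamma) v *= gamma[d];
        if (beta)  v += beta[d];
        out[d] = v;
    }
}

Each OpenMP loop in the function above applies one such kernel to a single row of the (m, n) input, so rows are independent and can be processed in parallel.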