/* build-rockchip.git */

/* 
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. 
 * 
 * SPDX-License-Identifier: Apache-2.0 
 * 
 * Licensed under the Apache License, Version 2.0 (the License); you may 
 * not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 * www.apache.org/licenses/LICENSE-2.0 
 * 
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and 
 * limitations under the License. 
 */ 
 
/* ---------------------------------------------------------------------- 
 * Project:      CMSIS NN Library 
 * Title:        arm_nn_mat_mult_kernel_q7_q15.c 
 * Description:  Matrix-multiplication function for convolution 
 * 
 * $Date:        17. January 2018 
 * $Revision:    V.1.0.0 
 * 
 * Target Processor:  Cortex-M cores 
 * -------------------------------------------------------------------- */ 
 
#include "arm_math.h" 
#include "arm_nnfunctions.h" 
 
  /** 
   * @brief Matrix-multiplication function for convolution 
   * @param[in]       pA          pointer to operand A 
   * @param[in]       pInBuffer   pointer to operand B, always conssists of 2 vectors 
   * @param[in]       ch_im_out   numRow of A 
   * @param[in]       numCol_A    numCol of A 
   * @param[in]       bias_shift  amount of left-shift for bias 
   * @param[in]       out_shift   amount of right-shift for output 
   * @param[in]       bias        the bias 
   * @param[in,out]   pOut        pointer to output 
   * @return     The function returns the incremented output pointer 
   * 
   * @details 
   * 
   * This function does the matrix multiplication with weight matrix 
   * and 2 columns from im2col.  
   */ 
 
q7_t     *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA, 
                                        const q15_t * pInBuffer, 
                                        const uint16_t ch_im_out, 
                                        const uint16_t numCol_A, 
                                        const uint16_t bias_shift, 
                                        const uint16_t out_shift,  
                                        const q7_t * bias,  
                                        q7_t * pOut) 
{ 
#if defined (ARM_MATH_DSP) 
    /* set up the second output pointers */ 
    q7_t     *pOut2 = pOut + ch_im_out; 
    const q7_t *pBias = bias; 
 
    uint16_t  rowCnt = ch_im_out >> 1; 
    /* this loop over rows in A */ 
    while (rowCnt) 
    { 
        /* setup pointers for B */ 
        const q15_t *pB = pInBuffer; 
        const q15_t *pB2 = pB + numCol_A; 
 
        /* align the second pointer for A */ 
        const q7_t *pA2 = pA + numCol_A; 
 
        /* init the sum with bias */ 
        q31_t     sum =  ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); 
        q31_t     sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); 
        q31_t     sum3 = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); 
        q31_t     sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); 
 
        uint16_t  colCnt = numCol_A >> 2; 
        /* accumulate over the vector */ 
        while (colCnt) 
        { 
            q31_t     inA11, inA12, inA21, inA22; 
            q31_t     inB1 = *__SIMD32(pB)++; 
            q31_t     inB2 = *__SIMD32(pB2)++; 
 
            pA = (q7_t *) read_and_pad((void *)pA, &inA11, &inA12); 
            pA2 = (q7_t *) read_and_pad((void *)pA2, &inA21, &inA22); 
 
            sum = __SMLAD(inA11, inB1, sum); 
            sum2 = __SMLAD(inA11, inB2, sum2); 
            sum3 = __SMLAD(inA21, inB1, sum3); 
            sum4 = __SMLAD(inA21, inB2, sum4); 
 
            inB1 = *__SIMD32(pB)++; 
            inB2 = *__SIMD32(pB2)++; 
 
            sum = __SMLAD(inA12, inB1, sum); 
            sum2 = __SMLAD(inA12, inB2, sum2); 
            sum3 = __SMLAD(inA22, inB1, sum3); 
            sum4 = __SMLAD(inA22, inB2, sum4); 
 
            colCnt--; 
        }                       /* while over colCnt */ 
        colCnt = numCol_A & 0x3; 
        while (colCnt) 
        { 
            q7_t      inA1 = *pA++; 
            q15_t     inB1 = *pB++; 
            q7_t      inA2 = *pA2++; 
            q15_t     inB2 = *pB2++; 
 
            sum += inA1 * inB1; 
            sum2 += inA1 * inB2; 
            sum3 += inA2 * inB1; 
            sum4 += inA2 * inB2; 
            colCnt--; 
        }                       /* while over colCnt */ 
        *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); 
        *pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8); 
        *pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8); 
        *pOut2++ = (q7_t) __SSAT((sum4 >> out_shift), 8); 
 
        /* skip the row computed with A2 */ 
        pA += numCol_A; 
        rowCnt--; 
    }                           /* for over ch_im_out */ 
 
    /* compute left-over row if any */ 
    if (ch_im_out & 0x1) 
    { 
        /* setup pointers for B */ 
        const q15_t *pB = pInBuffer; 
        const q15_t *pB2 = pB + numCol_A; 
 
        /* load the bias */ 
        q31_t     sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); 
        q31_t     sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); 
 
        uint16_t  colCnt = numCol_A >> 2; 
        while (colCnt) 
        { 
            q31_t     inA11, inA12; 
            q31_t     inB1 = *__SIMD32(pB)++; 
            q31_t     inB2 = *__SIMD32(pB2)++; 
 
            pA = (q7_t *) read_and_pad((void *)pA, &inA11, &inA12); 
 
            sum = __SMLAD(inA11, inB1, sum); 
            sum2 = __SMLAD(inA11, inB2, sum2); 
 
            inB1 = *__SIMD32(pB)++; 
            inB2 = *__SIMD32(pB2)++; 
            sum = __SMLAD(inA12, inB1, sum); 
            sum2 = __SMLAD(inA12, inB2, sum2); 
 
            colCnt--; 
        } 
        colCnt = numCol_A & 0x3; 
        while (colCnt) 
        { 
            q7_t      inA1 = *pA++; 
            q15_t     inB1 = *pB++; 
            q15_t     inB2 = *pB2++; 
 
            sum += inA1 * inB1; 
            sum2 += inA1 * inB2; 
            colCnt--; 
        } 
 
        *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); 
        *pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8); 
    } 
 
    pOut += ch_im_out; 
 
    /* return the new output pointer with offset */ 
    return pOut; 
#else 
    /* To be completed */ 
    return NULL; 
#endif                          /* ARM_MATH_DSP */ 
 
}