ONE/optimized_2_depthwise_conv_u_int8_8h_source.html

/*

 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved

 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.

 *

 * Licensed under the Apache License, Version 2.0 (the "License");

 * you may not use this file except in compliance with the License.

 * You may obtain a copy of the License at

 *

 *      http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__

#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__


#include "cker/Shape.h"

#include "cker/Types.h"

#include "cker/Utils.h"

#include "cker/neon/neon_check.h"


#include <fixedpoint/fixedpoint.h>

#include <public/gemmlowp.h>


namespace nnfw

{

namespace cker

{

namespace optimized

{


namespace depthwise_conv

{


// Implementation of quantized DepthwiseConv


template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>


struct QuantizedDepthwiseConvKernel

{

};


#ifdef USE_NEON

template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Load the filters, add filter_offset.

    uint8x8x2_t filter_u8;

    filter_u8.val[0] = vld1_u8(filter_ptr);

    filter_u8.val[1] = vld1_u8(filter_ptr + 8);

    int16x8_t filter[2];

    for (int i = 0; i < 2; i++)

    {

      filter[i] =

        vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset));

    }

    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer

      int32x4x2_t acc[2];

      for (int i = 0; i < 2; i++)

      {

        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);

        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);

      }

      // Load the inputs, add input_offset.

      const uint8x8_t input_u8 = vld1_u8(input_ptr);

      input_ptr += input_ptr_increment;

      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Duplicate the input values, 2-fold

      const int16x8x2_t input_dup2 = vzipq_s16(input, input);

      // Multiply-accumulate

      for (int i = 0; i < 2; i++)

      {

        acc[0].val[i] =

          vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));

        acc[1].val[i] =

          vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));

      }

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 2; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);

        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);

      }

      acc_buffer_ptr += 16;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    (void)input_ptr_increment;

    // Load the filters, add filter_offset.

    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);

    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));

    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));


    int outp = 0;

    // Handle 2 output pixels at a time.

    for (; outp <= num_output_pixels - 2; outp += 2)

    {

      // Load the accumulators from acc_buffer.

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Load the inputs, add input_offset.

      uint8x8_t input_u8[2];

      for (int i = 0; i < 2; i++)

      {

        input_u8[i] = vld1_u8(input_ptr + 8 * i);

      }

      input_ptr += 16;

      int16x8_t input[2];

      for (int i = 0; i < 2; i++)

      {

        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));

      }

      for (int i = 0; i < 2; i++)

      {

        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));

      }

      // Multiply-accumulate.

      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));

      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));

      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));

      acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

    // Handle 1 output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer.

      int32x4_t acc[2];

      acc[0] = vld1q_s32(acc_buffer_ptr);

      acc[1] = vld1q_s32(acc_buffer_ptr + 4);


      // Load the inputs, add input_offset.

      const uint8x8_t input_u8 = vld1_u8(input_ptr);

      input_ptr += 8;

      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate.

      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));

      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));

      // Store the accumulators back to acc_buffer

      vst1q_s32(acc_buffer_ptr, acc[0]);

      vst1q_s32(acc_buffer_ptr + 4, acc[1]);

      acc_buffer_ptr += 8;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    (void)input_ptr_increment;

    // Load the filters, add filter_offset.

    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);

    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));

    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));


    int outp = 0;

    // Handle 2 output pixels at a time.

    for (; outp <= num_output_pixels - 2; outp += 2)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Load the inputs, add input_offset.

      const uint8x8_t input_u8 = vld1_u8(input_ptr);

      input_ptr += 8;

      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Duplicate the input values, 2-fold

      const int16x8x2_t input_dup2 = vzipq_s16(input, input);

      // Multiply-accumulate

      for (int i = 0; i < 2; i++)

      {

        acc[2 * i + 0] =

          vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));

        acc[2 * i + 1] =

          vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));

      }

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

    // Handle one output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc[2];

      for (int i = 0; i < 2; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);

      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);

      input_ptr += 4;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Duplicate the input values, 2-fold

      const int16x4x2_t input_dup2 = vzip_s16(input, input);

      // Multiply-accumulate

      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);

      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 2; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 8;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    (void)input_ptr_increment;

    // Load the filters, add filter_offset.

    int16x8_t filter[2];

    for (int i = 0; i < 2; i++)

    {

      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);

      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));

      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    }

    int outp = 0;

    // Handle two output pixels at a time.

    for (; outp <= num_output_pixels - 2; outp += 2)

    {

      // Load the accumulators from acc_buffer.

      int32x4_t acc[8];

      for (int i = 0; i < 8; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);

      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);

      input_ptr += 4;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.

      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);

      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);

      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);

      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);

      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);

      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);

      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);

      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);

      // Store the accumulators back to acc_buffer.

      for (int i = 0; i < 8; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 32;

    }

    // Handle one output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer.

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_ptr += 2;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));


      // Multiply-accumulate.

      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);

      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);

      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);

      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);


      // Store the accumulators back to acc_buffer.

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    (void)input_ptr_increment;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8 = vdup_n_u8(0);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);

    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);

    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);

    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));

    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));


    int outp = 0;

    // Handle 4 output pixels at a time.

    for (; outp <= num_output_pixels - 4; outp += 4)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }


      // Load the inputs, add input_offset.

      const uint8x8_t input_u8 = vld1_u8(input_ptr);

      input_ptr += 8;

      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Duplicate the input values, 2-fold

      const int16x8x2_t input_dup2 = vzipq_s16(input, input);

      // Multiply-accumulate

      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));

      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));

      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));

      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

    // Handle one output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc = vld1q_s32(acc_buffer_ptr);


      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_ptr += 2;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Duplicate the input values, 2-fold

      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];

      // Multiply-accumulate

      acc = vmlal_s16(acc, filter, input_dup2);

      // Store the accumulators back to acc_buffer

      vst1q_s32(acc_buffer_ptr, acc);

      acc_buffer_ptr += 4;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    (void)input_ptr_increment;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8 = vdup_n_u8(0);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);

    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));

    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));


    int outp = 0;

    // Handle 8 output pixels at a time.

    for (; outp <= num_output_pixels - 8; outp += 8)

    {

      // Load the accumulators from acc_buffer.

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Load the inputs, add input_offset.

      uint8x8_t input_u8[2];

      for (int i = 0; i < 2; i++)

      {

        input_u8[i] = vld1_u8(input_ptr + 8 * i);

      }

      input_ptr += 16;

      int16x8_t input[2];

      for (int i = 0; i < 2; i++)

      {

        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));

      }

      for (int i = 0; i < 2; i++)

      {

        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));

      }


      // Multiply-accumulate.

      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));

      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));

      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));

      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));

      // Store the accumulators back to acc_buffer.

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

    // Handle 4 output pixels at a time.

    for (; outp <= num_output_pixels - 4; outp += 4)

    {

      // Load the accumulators from acc_buffer.

      int32x4_t acc[2];

      for (int i = 0; i < 2; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Load the inputs, add input_offset.

      const uint8x8_t input_u8 = vld1_u8(input_ptr);

      input_ptr += 8;

      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));


      // Multiply-accumulate.

      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));

      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));

      // Store the accumulators back to acc_buffer.

      for (int i = 0; i < 2; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 8;

    }

    // Handle 2 output pixels at a time.

    for (; outp <= num_output_pixels - 2; outp += 2)

    {

      // Load the accumulators from acc_buffer.

      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);

      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);

      input_ptr += 4;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));


      // Multiply-accumulate.

      acc = vmlal_s16(acc, filter, input);

      // Store the accumulators back to acc_buffer.

      vst1q_s32(acc_buffer_ptr, acc);

      acc_buffer_ptr += 4;

    }

    // Handle 1 output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer.

      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_ptr += 2;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));


      // Multiply-accumulate.

      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));

      // Store the accumulators back to acc_buffer.

      vst1_s32(acc_buffer_ptr, acc);

      acc_buffer_ptr += 2;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    (void)input_ptr_increment;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8 = vdup_n_u8(0);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);

    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));

    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));


    int outp = 0;

    // Handle 8 output pixels at a time.

    for (; outp <= num_output_pixels - 8; outp += 8)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }


      // Load the inputs, add input_offset.

      const uint8x8_t input_u8 = vld1_u8(input_ptr);

      input_ptr += 8;

      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Duplicate the input values, 2-fold

      const int16x8x2_t input_dup2 = vzipq_s16(input, input);

      // Multiply-accumulate

      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));

      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));

      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));

      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

    // Handle one output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer

      int32x2_t acc = vld1_s32(acc_buffer_ptr);


      // Load the inputs, add input_offset.

      const uint32_t input = *input_ptr++ + input_offset;


      // Multiply-accumulate

      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));

      // Store the accumulators back to acc_buffer

      vst1_s32(acc_buffer_ptr, acc);

      acc_buffer_ptr += 2;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    (void)input_ptr_increment;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8 = vdup_n_u8(0);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);

    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);

    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);

    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));

    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));


    int outp = 0;

    // Handle 8 output pixels at a time.

    for (; outp <= num_output_pixels - 8; outp += 8)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc[8];

      for (int i = 0; i < 8; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }


      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vld1_u8(input_ptr);

      input_ptr += 8;

      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));


      // Multiply-accumulate

      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);

      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);

      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);

      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);

      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);

      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);

      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);

      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);


      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 8; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 32;

    }

    // Handle 4 output pixels at a time.

    for (; outp <= num_output_pixels - 4; outp += 4)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }


      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);

      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);

      input_ptr += 4;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));


      // Multiply-accumulate

      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);

      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);

      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);

      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);


      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

    // Handle one output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc = vld1q_s32(acc_buffer_ptr);


      // Load the inputs, add input_offset.

      const uint32_t input = *input_ptr++ + input_offset;


      // Multiply-accumulate

      acc = vmlal_n_s16(acc, filter, input);

      // Store the accumulators back to acc_buffer

      vst1q_s32(acc_buffer_ptr, acc);

      acc_buffer_ptr += 4;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    (void)input_ptr_increment;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8 = vdup_n_u8(0);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);

    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);

    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);

    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));

    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));


    int outp = 0;

    // Handle 4 output pixels at a time.

    for (; outp <= num_output_pixels - 4; outp += 4)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Load the inputs, add input_offset.

      int16x8_t input[2];

      for (int i = 0; i < 2; i++)

      {

        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);

        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      }

      input_ptr += 16;

      // Multiply-accumulate

      for (int i = 0; i < 2; i++)

      {

        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));

        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));

      }

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

    // Handle one output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc;

      acc = vld1q_s32(acc_buffer_ptr);


      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);

      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);

      input_ptr += 4;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate

      acc = vmlal_s16(acc, filter, input);

      // Store the accumulators back to acc_buffer

      vst1q_s32(acc_buffer_ptr, acc);

      acc_buffer_ptr += 4;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    (void)input_ptr_increment;

    // Load the filters, add filter_offset.

    int16x8_t filter[2];

    for (int i = 0; i < 2; i++)

    {

      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);

      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));

      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    }


    int outp = 0;

    // Handle 2 output pixels at a time.

    for (; outp <= num_output_pixels - 2; outp += 2)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc[8];

      for (int i = 0; i < 8; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }


      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vld1_u8(input_ptr);

      input_ptr += 8;

      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));


      // Multiply-accumulate

      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0);

      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1);

      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2);

      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3);

      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0);

      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1);

      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2);

      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3);

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 8; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 32;

    }

    // Handle one output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }


      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);

      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);

      input_ptr += 4;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));


      // Multiply-accumulate

      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);

      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);

      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);

      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // We will have to duplicate bytes in a NEON register, 3-fold.

    // We will do that by register-level table-look-up using VTBL instructions.

    // Here we prepare the registers containing the table-lookup indices.

    static const uint8_t dup3_indices_array[3][8] = {

      {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};

    uint8x8_t dup3_indices[3];

    for (int i = 0; i < 3; i++)

    {

      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);

    }


    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      const uint8_t *local_filter_ptr = filter_ptr;

      const uint8_t *local_input_ptr = input_ptr;

      int ic = 0;

      // Handle 8 input channels at a time.

      for (; ic <= input_depth - 8; ic += 8)

      {

        // Load the filters, add filter_offset.

        int16x8_t filter[3];

        uint8x8x3_t filter_u8;

        filter_u8.val[0] = vld1_u8(local_filter_ptr);

        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);

        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);

        local_filter_ptr += 24;

        for (int i = 0; i < 3; i++)

        {

          const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));

          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

        }

        // Load the inputs, duplicate 3-fold, add input_offset.

        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);

        local_input_ptr += 8;


        uint8x8_t input_u8_dup3[3];

        for (int i = 0; i < 3; i++)

        {

          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);

        }

        int16x8_t input_dup3[3];

        for (int i = 0; i < 3; i++)

        {

          const int16x8_t input_s16_dup3 = vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));

          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));

        }

        // Load the accumulators from acc_buffer

        int32x4x3_t acc[2];

        for (int i = 0; i < 2; i++)

        {

          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);

          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);

          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);

        }

        // Multiply-accumulate

        for (int j = 0; j < 3; j++)

        {

          acc[0].val[j] =

            vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));

          acc[1].val[j] =

            vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));

        }

        // Store the accumulators back to acc_buffer

        for (int i = 0; i < 2; i++)

        {

          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);

          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);

          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);

        }

        acc_buffer_ptr += 24;

      }

      // Handle one input channel at a time.

      for (; ic < input_depth; ic++)

      {

        const int16_t input_val = *local_input_ptr++ + input_offset;

        for (int i = 0; i < 3; i++)

        {

          const int16_t filter_val = local_filter_ptr[i] + filter_offset;

          *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;

        }

        local_filter_ptr += 3;

      }

      input_ptr += input_ptr_increment;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      const uint8_t *local_filter_ptr = filter_ptr;

      const uint8_t *local_input_ptr = input_ptr;

      int ic = 0;

      // Handle 8 input channels at a time.

      for (; ic <= input_depth - 8; ic += 8)

      {

        // Load the filters, add filter_offset.

        int16x8_t filter[2];

        uint8x8x2_t filter_u8;

        filter_u8.val[0] = vld1_u8(local_filter_ptr);

        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);

        local_filter_ptr += 16;

        for (int i = 0; i < 2; i++)

        {

          const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));

          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

        }

        // Load the inputs, add input_offset, duplicate 2-fold.

        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);

        local_input_ptr += 8;

        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

        const int16x8x2_t input_dup2 = vzipq_s16(input, input);

        // Load the accumulators from acc_buffer.

        int32x4x2_t acc[2];

        for (int i = 0; i < 2; i++)

        {

          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);

          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);

        }

        // Multiply-accumulate.

        for (int j = 0; j < 2; j++)

        {

          acc[0].val[j] =

            vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));

          acc[1].val[j] =

            vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));

        }

        // Store the accumulators back to acc_buffer.

        for (int i = 0; i < 2; i++)

        {

          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);

          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);

        }

        acc_buffer_ptr += 16;

      }

      // Handle one input channel at a time.

      for (; ic < input_depth; ic++)

      {

        // Load the inputs.

        const int16_t input_val = *local_input_ptr++ + input_offset;

        for (int i = 0; i < 2; i++)

        {

          const int16_t filter_val = local_filter_ptr[i] + filter_offset;

          *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;

        }

        local_filter_ptr += 2;

      }

      input_ptr += input_ptr_increment;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      const uint8_t *local_filter_ptr = filter_ptr;

      const uint8_t *local_input_ptr = input_ptr;

      int ic = 0;

      // Handle 16 input channels at a time.

      for (; ic <= input_depth - 16; ic += 16)

      {

        // Load the filters, add filter_offset.

        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);

        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);

        local_filter_ptr += 16;

        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));

        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));

        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));

        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));

        // Load the inputs, add input_offset.

        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);

        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);

        local_input_ptr += 16;

        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));

        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));

        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));

        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));

        // Load the accumulators from acc_buffer

        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);

        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);

        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);

        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);

        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));

        acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));

        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));

        acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));

        // Store the accumulators back to acc_buffer

        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);

        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);

        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);

        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);

        acc_buffer_ptr += 16;

      }

      // Handle 8 input channels at a time.

      for (; ic <= input_depth - 8; ic += 8)

      {

        // Load the filters, add filter_offset.

        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);

        local_filter_ptr += 8;

        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));

        const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

        // Load the inputs, add input_offset.

        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);

        local_input_ptr += 8;

        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

        // Load the accumulators from acc_buffer

        int32x4_t acc[2];

        for (int i = 0; i < 2; i++)

        {

          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

        }

        // Multiply-accumulate

        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));

        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));

        // Store the accumulators back to acc_buffer

        for (int i = 0; i < 2; i++)

        {

          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

        }

        acc_buffer_ptr += 8;

      }

      // Handle one input channel at a time.

      for (; ic < input_depth; ic++)

      {

        const int16_t input_val = *local_input_ptr++ + input_offset;

        const int16_t filter_val = *local_filter_ptr++ + filter_offset;

        *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;

      }

      input_ptr += input_ptr_increment;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8[2];

    for (int i = 0; i < 2; i++)

    {

      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);

    }

    int16x8_t filter[2];

    for (int i = 0; i < 2; i++)

    {

      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));

    }

    for (int i = 0; i < 2; i++)

    {

      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));

    }

    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      // Load the inputs, add input_offset.

      uint8x8_t input_u8[2];

      for (int i = 0; i < 2; i++)

      {

        input_u8[i] = vld1_u8(input_ptr + 8 * i);

      }

      input_ptr += input_ptr_increment;

      int16x8_t input[2];

      for (int i = 0; i < 2; i++)

      {

        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));

      }

      for (int i = 0; i < 2; i++)

      {

        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));

      }

      // Load the accumulators from acc_buffer

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Multiply-accumulate

      for (int i = 0; i < 2; i++)

      {

        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i]));

        acc[2 * i + 1] =

          vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));

      }

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Load the filters, add filter_offset.

    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);

    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));

    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      // Load the inputs, add input_offset.

      const uint8x8_t input_u8 = vld1_u8(input_ptr);

      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));

      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Load the accumulators from acc_buffer

      int32x4_t acc[2];

      for (int i = 0; i < 2; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Multiply-accumulate

      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));

      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 2; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 8;

      input_ptr += input_ptr_increment;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8[2];

    for (int i = 0; i < 2; i++)

    {

      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);

    }

    int16x8_t filter[2];

    for (int i = 0; i < 2; i++)

    {

      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));

    }

    for (int i = 0; i < 2; i++)

    {

      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));

    }

    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      uint8_t input_u8 = *input_ptr;

      input_ptr += input_ptr_increment;

      int16_t input = static_cast<int16_t>(input_u8) + input_offset;

      // Load the accumulators from acc_buffer

      int32x4_t acc[4];

      for (int i = 0; i < 4; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Multiply-accumulate

      for (int i = 0; i < 2; i++)

      {

        acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);

        acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);

      }

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 4; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 16;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);

    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);

    uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);

    uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);

    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));

    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));

    int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));

    int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));

    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));

    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));

    filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));

    filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));

    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      uint8_t input_u8 = *input_ptr;

      input_ptr += input_ptr_increment;

      int16_t input = static_cast<int16_t>(input_u8) + input_offset;

      // Load the accumulators from acc_buffer

      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);

      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);

      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);

      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);

      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);

      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);

      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);

      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);

      // Multiply-accumulate

      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);

      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);

      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);

      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);

      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);

      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);

      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);

      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);

      // Store the accumulators back to acc_buffer

      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);

      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);

      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);

      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);

      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);

      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);

      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);

      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);

      acc_buffer_ptr += 32;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Load the filters, add filter_offset.

    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.

    // We load the first 16 bytes into filter_u8_{0,1} as usual.

    // Then we load the 8 last bytes into filter_u8_x  (x for 'extra').

    // This is redundant: the first 4 bytes of filter_u8_x are the same

    // as the last 4 bytes of filter_u8_x.

    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);

    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);

    uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);

    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));

    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));

    int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));

    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));

    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));

    filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));

    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      uint8_t input_u8 = *input_ptr;

      input_ptr += input_ptr_increment;

      int16_t input = static_cast<int16_t>(input_u8) + input_offset;

      // Load the accumulators from acc_buffer

      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);

      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);

      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);

      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);

      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);

      // Multiply-accumulate

      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);

      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);

      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);

      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);

      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);

      // Store the accumulators back to acc_buffer

      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);

      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);

      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);

      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);

      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);

      acc_buffer_ptr += 20;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Load the filters, add filter_offset.

    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);

    const int16x8_t filter =

      vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));

    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      uint8_t input_u8 = *input_ptr;

      input_ptr += input_ptr_increment;

      int16_t input = static_cast<int16_t>(input_u8) + input_offset;

      // Load the accumulators from acc_buffer

      int32x4_t acc[2];

      for (int i = 0; i < 2; i++)

      {

        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);

      }

      // Multiply-accumulate

      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);

      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);

      // Store the accumulators back to acc_buffer

      for (int i = 0; i < 2; i++)

      {

        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);

      }

      acc_buffer_ptr += 8;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8 = vdup_n_u8(0);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);

    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));

    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));


    int outp = 0;


    // Handle 2 output pixels at a time.

    for (; outp <= num_output_pixels - 2; outp += 2)

    {

      // Load the accumulators from acc_buffer.

      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.

      uint16x4_t input_u16 = vdup_n_u16(0);

      input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 0);

      input_ptr += input_ptr_increment;

      input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1);

      input_ptr += input_ptr_increment;

      const int16x4_t input_s16 =

        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));


      // Multiply-accumulate.

      acc = vmlal_s16(acc, filter, input);

      // Store the accumulators back to acc_buffer.

      vst1q_s32(acc_buffer_ptr, acc);

      acc_buffer_ptr += 4;

    }


    // Handle 1 output pixel at a time.

    for (; outp < num_output_pixels; outp++)

    {

      // Load the accumulators from acc_buffer.

      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vdup_n_u8(0);

      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

      input_ptr += input_ptr_increment;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));


      // Multiply-accumulate.

      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));

      // Store the accumulators back to acc_buffer.

      vst1_s32(acc_buffer_ptr, acc);

      acc_buffer_ptr += 2;

    }

  }

};


template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    if (num_output_pixels <= 0)

    {

      return;

    }


    // Load the filters, add filter_offset.

    uint8x8_t filter_u8 = vdup_n_u8(0);

    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);

    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);

    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);

    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);

    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));

    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));


    int outp = 0;


    // Handle one output pixel at a time until second to the last pixel. Second

    // to the last because we read eight input pixels while only processing

    // four.

    for (; outp < num_output_pixels - 1; outp++)

    {

      // Load the accumulators from acc_buffer

      int32x4_t acc;

      acc = vld1q_s32(acc_buffer_ptr);


      // Load the inputs, add input_offset.

      uint8x8_t input_u8 = vld1_u8(input_ptr);

      input_ptr += input_ptr_increment;

      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate

      acc = vmlal_s16(acc, filter, input);

      // Store the accumulators back to acc_buffer

      vst1q_s32(acc_buffer_ptr, acc);

      acc_buffer_ptr += 4;

    }


    // Handle the last output pixel.

    // Load the accumulators from acc_buffer

    int32x4_t acc;

    acc = vld1q_s32(acc_buffer_ptr);


    // Load the inputs, add input_offset.

    uint8x8_t input_u8 = vdup_n_u8(0);

    input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);

    input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);

    input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);

    input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);

    const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));

    const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

    // Multiply-accumulate

    acc = vmlal_s16(acc, filter, input);

    // Store the accumulators back to acc_buffer

    vst1q_s32(acc_buffer_ptr, acc);

  }

};


template <> struct QuantizedDepthwiseConvKernel<false, 12, 1>

{

  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,

                  const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,

                  const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)

  {

    (void)input_depth;

    (void)depth_multiplier;

    // Load the filters, add filter_offset.

    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);

    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);

    int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));

    int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));

    filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));

    filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));

    int16x4_t filter_0 = vget_low_s16(filter_s16_0);

    int16x4_t filter_1 = vget_high_s16(filter_s16_0);

    int16x4_t filter_2 = vget_high_s16(filter_s16_1);


    // Handle one output pixel at a time.

    for (int outp = 0; outp < num_output_pixels; outp++)

    {

      // Load the inputs, add input_offset.

      uint8x8_t input_u8_0 = vld1_u8(input_ptr);

      uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);

      input_ptr += input_ptr_increment;

      int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));

      int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));

      input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));

      input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));


      // Load the accumulators from acc_buffer

      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);

      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);

      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);


      // Multiply-accumulate

      acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);

      acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);

      acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);


      // Store the accumulators back to acc_buffer

      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);

      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);

      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);


      acc_buffer_ptr += 12;

    }

  }

};

#endif


// Accumulates the effect of one row of the filter, on a segment of one row

// of the output, accessing the corresponding one row of the input.

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>


void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth,

                                    int input_width, const uint8_t *input_data,

                                    int16_t input_offset, int pad_width, int depth_multiplier,

                                    int filter_width, const uint8_t *filter_data,

                                    int16_t filter_offset, int out_x_buffer_start,

                                    int out_x_buffer_end, int output_depth, int32_t *acc_buffer)

{

  // Sanity check parameters. This is important in particular to ensure

  // that we keep the number of template instantiations minimal, so we don't

  // increase binary size unnecessarily.

  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");

  static_assert(kFixedInputDepth || kAllowStrided, "");

  assert(stride == 1 || kAllowStrided);

  if (kFixedInputDepth)

  {

    assert(input_depth == kFixedInputDepth);

  }

  if (kFixedDepthMultiplier)

  {

    assert(depth_multiplier == kFixedDepthMultiplier);

  }

  assert(output_depth == input_depth * depth_multiplier);

  const int input_ptr_increment = stride * input_depth;

  const uint8_t *filter_base_ptr = filter_data;

  for (int filter_x = 0; filter_x < filter_width; ++filter_x)

  {

    // For the current (filter_x, filter_y) point in the filter,

    // compute the boundaries of the corresponding output row segment.

    int out_x_loop_start_unclampled = 0;

    int out_x_loop_end_unclampled = 0;

    if (kAllowStrided)

    {

      if (stride == 2)

      {

        out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 1) / 2;

        out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;

      }

      else if (stride == 4)

      {

        out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 3) / 4;

        out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;

      }

      else

      {

        out_x_loop_start_unclampled =

          (pad_width - dilation_factor * filter_x + stride - 1) / stride;

        out_x_loop_end_unclampled =

          (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;

      }

    }

    else

    {

      out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;

      out_x_loop_end_unclampled = pad_width + input_width - dilation_factor * filter_x;

    }

    // The kernel will have to iterate on the segment of the

    // output row that starts at out_x_loop_start and out_x_loop_end.

    const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclampled);

    const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclampled);


    int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;

    const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;

    const uint8_t *input_ptr = input_data + in_x_origin * input_depth;

    const int num_output_pixels = out_x_loop_end - out_x_loop_start;

    QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(

      num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,

      input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);

    filter_base_ptr += output_depth;

  }

}


// generic fallback of DepthwiseConvAccumRow, portable, non-templatized.


inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,

                                                  int input_width, const uint8_t *input_data,

                                                  int16_t input_offset, int pad_width,

                                                  int depth_multiplier, int filter_width,

                                                  const uint8_t *filter_data, int16_t filter_offset,

                                                  int out_x_buffer_start, int out_x_buffer_end,

                                                  int output_depth, int32_t *acc_buffer)

{

  const uint8_t *filter_base_ptr = filter_data;

  for (int filter_x = 0; filter_x < filter_width; ++filter_x)

  {

    const int out_x_loop_start =

      std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);

    const int out_x_loop_end =

      std::min(out_x_buffer_end,

               (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);


    int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;

    const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;

    const uint8_t *input_ptr = input_data + in_x_origin * input_depth;

    const int input_ptr_increment = (stride - 1) * input_depth;

    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)

    {

      const uint8_t *filter_ptr = filter_base_ptr;

      for (int ic = 0; ic < input_depth; ++ic)

      {

        const int16_t input_val = *input_ptr++ + input_offset;

        for (int m = 0; m < depth_multiplier; m++)

        {

          const int16_t filter_val = *filter_ptr++ + filter_offset;

          *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;

        }

      }

      input_ptr += input_ptr_increment;

    }

    filter_base_ptr += output_depth;

  }

}


// Initializes the accumulator buffer with bias values.


inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,

                                       const int32_t *bias_data, int32_t *acc_buffer)

{

  int i = 0;

#ifdef USE_NEON

  if (output_depth == 1)

  {

    const int32x4_t b = vdupq_n_s32(bias_data[0]);

    for (; i <= num_output_pixels - 16; i += 16)

    {

      vst1q_s32(acc_buffer + i + 0, b);

      vst1q_s32(acc_buffer + i + 4, b);

      vst1q_s32(acc_buffer + i + 8, b);

      vst1q_s32(acc_buffer + i + 12, b);

    }

    for (; i <= num_output_pixels - 4; i += 4)

    {

      vst1q_s32(acc_buffer + i, b);

    }

  }

  else if (output_depth == 2)

  {

    int32x4_t b = vdupq_n_s32(bias_data[0]);

    b = vsetq_lane_s32(bias_data[1], b, 1);

    b = vsetq_lane_s32(bias_data[1], b, 3);

    for (; i <= num_output_pixels - 8; i += 8)

    {

      vst1q_s32(acc_buffer + 2 * i + 0, b);

      vst1q_s32(acc_buffer + 2 * i + 4, b);

      vst1q_s32(acc_buffer + 2 * i + 8, b);

      vst1q_s32(acc_buffer + 2 * i + 12, b);

    }

    for (; i <= num_output_pixels - 2; i += 2)

    {

      vst1q_s32(acc_buffer + 2 * i, b);

    }

  }

  else if (output_depth == 4)

  {

    const int32x4_t b = vld1q_s32(bias_data);

    for (; i <= num_output_pixels - 4; i += 4)

    {

      vst1q_s32(acc_buffer + 4 * i + 0, b);

      vst1q_s32(acc_buffer + 4 * i + 4, b);

      vst1q_s32(acc_buffer + 4 * i + 8, b);

      vst1q_s32(acc_buffer + 4 * i + 12, b);

    }

    for (; i < num_output_pixels; i++)

    {

      vst1q_s32(acc_buffer + 4 * i, b);

    }

  }

  else if (output_depth == 8)

  {

    const int32x4_t b0 = vld1q_s32(bias_data);

    const int32x4_t b1 = vld1q_s32(bias_data + 4);

    for (; i <= num_output_pixels - 2; i += 2)

    {

      vst1q_s32(acc_buffer + 8 * i + 0, b0);

      vst1q_s32(acc_buffer + 8 * i + 4, b1);

      vst1q_s32(acc_buffer + 8 * i + 8, b0);

      vst1q_s32(acc_buffer + 8 * i + 12, b1);

    }

    for (; i < num_output_pixels; i++)

    {

      vst1q_s32(acc_buffer + 8 * i + 0, b0);

      vst1q_s32(acc_buffer + 8 * i + 4, b1);

    }

  }

  else if (output_depth == 16)

  {

    const int32x4_t b0 = vld1q_s32(bias_data);

    const int32x4_t b1 = vld1q_s32(bias_data + 4);

    const int32x4_t b2 = vld1q_s32(bias_data + 8);

    const int32x4_t b3 = vld1q_s32(bias_data + 12);

    for (; i < num_output_pixels; i++)

    {

      vst1q_s32(acc_buffer + 16 * i + 0, b0);

      vst1q_s32(acc_buffer + 16 * i + 4, b1);

      vst1q_s32(acc_buffer + 16 * i + 8, b2);

      vst1q_s32(acc_buffer + 16 * i + 12, b3);

    }

  }

#endif

  for (; i < num_output_pixels; i++)

  {

    memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);

  }

}


inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape &input_shape,

                                 const uint8_t *input_data, const Shape &filter_shape,

                                 const uint8_t *filter_data, const Shape &bias_shape,

                                 const int32_t *bias_data, const Shape &output_shape,

                                 uint8_t *output_data, int thread_start, int thread_end,

                                 int thread_dim)

{

  (void)bias_shape;

  const int stride_width = params.stride_width;

  const int stride_height = params.stride_height;

  const int pad_width = params.padding_values.width;

  const int pad_height = params.padding_values.height;

  const int depth_multiplier = params.depth_multiplier;

  const int32_t output_activation_min = params.quantized_activation_min;

  const int32_t output_activation_max = params.quantized_activation_max;

  const int32_t input_offset = params.input_offset;

  const int32_t filter_offset = params.weights_offset;

  const int32_t output_offset = params.output_offset;

  const int32_t output_multiplier = params.output_multiplier;

  const int output_shift = params.output_shift;

  const int dilation_width_factor = params.dilation_width_factor;

  const int dilation_height_factor = params.dilation_height_factor;

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);

  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);

  const int input_height = input_shape.Dims(1);

  const int input_width = input_shape.Dims(2);

  const int input_depth = input_shape.Dims(3);

  const int filter_height = filter_shape.Dims(1);

  const int filter_width = filter_shape.Dims(2);

  const int output_height = output_shape.Dims(1);

  const int output_width = output_shape.Dims(2);

#ifdef USE_NEON

  const bool shift_left = (output_shift > 0);

  const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;

#endif


  static const int kAccBufferMaxSize = 2048;

  int32_t acc_buffer[kAccBufferMaxSize];

  assert(kAccBufferMaxSize >= output_depth);

  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;

  [[maybe_unused]] const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;

  assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);

  assert(kAccBufferActualSize <= kAccBufferMaxSize);

  assert(kOutputPixelsInAccBuffer >= 1);

  assert(thread_dim == 0 || thread_dim == 1);


  // row_accum_func will point to the core accumulation function to be used

  // for this DepthwiseConv op.

  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);

  row_accum_func_t row_accum_func = nullptr;


#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \

  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&                                  \

      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&                             \

      depth_multiplier == FIXED_DEPTH_MULTIPLIER)                                                 \

  {                                                                                               \

    row_accum_func =                                                                              \

      QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>;   \

  }


#ifdef USE_NEON

  // We go over our list of kernels by decreasing order of preference

  // for the cases where multiple kernels could apply.


  // Start with the fastest kernels: AllowStrided=false, fixed input depth.


  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)


  // Next come the strided kernels: AllowStrided=true, fixed input depth.

  // They are a bit less efficient, but allow stride!=1.


  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)


  // Finally, the kernels allowing a variable input depth,

  // these are the least efficient but most general kernels.


  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)

#endif // USE_NEON


  // No matching fast kernel found, use slow fallback.

  if (!row_accum_func)

  {

    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;

  }


#undef TFMINI_USE_DEPTHWISECONV_KERNEL


  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);

  const int input_batch_stride = input_height_stride * input_shape.Dims(1);

  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);


  // Now that we have determined row_accum_func, we can start work.

  int batch_start = 0;

  int batch_end = batches;

  int row_start = 0;

  int row_end = output_height;

  int output_ptr_offset = 0;


  switch (thread_dim)

  {

    case 0:

      // Multithread along with the batch axis

      assert(thread_start >= 0);

      assert(thread_end <= batches);

      batch_start = thread_start;

      batch_end = thread_end;

      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);

      break;

    case 1:

      // Multithread along with the row axis

      assert(thread_start >= 0);

      assert(thread_end <= output_height);

      row_start = thread_start;

      row_end = thread_end;

      output_ptr_offset = row_start * output_width * output_depth;

      break;

  }


  uint8_t *output_ptr = output_data + output_ptr_offset;

  int batch_step = (output_height + row_start - row_end) * output_width * output_depth;

  for (int b = batch_start; b < batch_end; ++b)

  {

    for (int out_y = row_start; out_y < row_end; ++out_y)

    {

      const int in_y_origin = (out_y * stride_height) - pad_height;

      const int filter_y_start =

        std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);

      const int filter_y_end =

        std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /

                                  dilation_height_factor);

      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;

           out_x_buffer_start += kOutputPixelsInAccBuffer)

      {

        const int out_x_buffer_end =

          std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);

        // We call a 'pixel' a group of activation that share all but the

        // 'depth'/'channel' coordinate. num_output_pixels is the number of

        // output pixels that we will accumulate in this loop iteration.

        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;

        // Initialize our local accumulator with the bias values, so we don't

        // have to add them later.

        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);

        // Accumulation loop. Most of the time should be spent in here.

        for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)

        {

          const int in_y = in_y_origin + dilation_height_factor * filter_y;

          row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,

                         input_data + in_y * input_height_stride + b * input_batch_stride,

                         input_offset, pad_width, depth_multiplier, filter_width,

                         filter_data + filter_y * filter_height_stride, filter_offset,

                         out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);

        }

        // Finished accumulating int32_t values. Now need to convert them to

        // the final 8bit form and store them.

        const int num_output_values = output_depth * num_output_pixels;

        int i = 0;

#ifdef USE_NEON

        using gemmlowp::RoundingDivideByPOT;

        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);

        const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);

        const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);

        // Handle 16 values at once.

        // This allows us to issue 4 mutually independent int32

        // multiplications (vqrdmulh), which should alleviate most of their

        // high latency.

        for (; i <= num_output_values - 16; i += 16)

        {

          int32x4_t acc[4];

          for (int j = 0; j < 4; j++)

          {

            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);

          }


          if (!shift_left)

          {

            // Fixed-point multiplication.

            for (int j = 0; j < 4; j++)

            {

              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);

            }

            for (int j = 0; j < 4; j++)

            {

              acc[j] = RoundingDivideByPOT(acc[j], -output_shift);

            }

          }

          else

          {

            // Fixed-point multiplication.

            for (int j = 0; j < 4; j++)

            {

              acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);

              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);

            }

          }

          // Add the output offset.

          for (int j = 0; j < 4; j++)

          {

            acc[j] = vaddq_s32(acc[j], output_offset_vec);

          }

          // Apply the activation function.

          for (int j = 0; j < 4; j++)

          {

            acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);

          }

          for (int j = 0; j < 4; j++)

          {

            acc[j] = vminq_s32(acc[j], output_activation_max_vec);

          }

          // Saturating cast to uint8_t and store to destination.

          int16x4_t acc_s16[4];

          for (int j = 0; j < 4; j++)

          {

            acc_s16[j] = vqmovn_s32(acc[j]);

          }

          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);

          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);

          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);

          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);

          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));

          output_ptr += 16;

        }

        // Handle 8 values at once.

        // Not as good as 16 (now we're only issuing 2 mutually independent

        // vqrdmulh instructions, so we're probably paying for their high

        // latency).

        for (; i <= num_output_values - 8; i += 8)

        {

          int32x4_t acc0 = vld1q_s32(acc_buffer + i);

          int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);

          if (!shift_left)

          {

            // Fixed-point multiplication.

            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);

            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);

            // Rounding right shift.

            acc0 = RoundingDivideByPOT(acc0, -output_shift);

            acc1 = RoundingDivideByPOT(acc1, -output_shift);

          }

          else

          {

            // Fixed-point multiplication.

            acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);

            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);


            acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);

            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);

          }

          // Add the output offset.

          acc0 = vaddq_s32(acc0, output_offset_vec);

          acc1 = vaddq_s32(acc1, output_offset_vec);

          // Apply the activation function.

          acc0 = vmaxq_s32(acc0, output_activation_min_vec);

          acc1 = vmaxq_s32(acc1, output_activation_min_vec);

          acc0 = vminq_s32(acc0, output_activation_max_vec);

          acc1 = vminq_s32(acc1, output_activation_max_vec);

          // Saturating cast to uint8_t and store to destination.

          const int16x4_t acc0_s16 = vqmovn_s32(acc0);

          const int16x4_t acc1_s16 = vqmovn_s32(acc1);

          const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);

          const uint8x8_t res_u8 = vqmovun_s16(res_s16);

          vst1_u8(output_ptr, res_u8);

          output_ptr += 8;

        }

        // Handle 4 values at once. Now we're paying the full price of the

        // high latency of vqrdmulh. Also, storing only 4 bytes at the end

        // (without any alignment) can only be done 1 byte at a time.

        // Yet, that is still worth doing to minimize the amount of leftover

        // that will have to go through the very slow scalar code.

        for (; i <= num_output_values - 4; i += 4)

        {

          int32x4_t acc = vld1q_s32(acc_buffer + i);

          if (!shift_left)

          {

            // Fixed-point multiplication.

            acc = vqrdmulhq_n_s32(acc, output_multiplier);

            // Rounding right shift.

            acc = RoundingDivideByPOT(acc, -output_shift);

          }

          else

          {

            // Fixed-point multiplication.

            acc = vmulq_n_s32(acc, multiplier_power_of_two);

            acc = vqrdmulhq_n_s32(acc, output_multiplier);

          }

          // Add the output offset.

          acc = vaddq_s32(acc, output_offset_vec);

          // Apply the activation function.

          acc = vmaxq_s32(acc, output_activation_min_vec);

          acc = vminq_s32(acc, output_activation_max_vec);

          // Saturating cast to uint8_t and store to destination.

          const int16x4_t acc_s16 = vqmovn_s32(acc);

          const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);

          const uint8x8_t res_u8 = vqmovun_s16(res_s16);

          vst1_lane_u8(output_ptr + 0, res_u8, 0);

          vst1_lane_u8(output_ptr + 1, res_u8, 1);

          vst1_lane_u8(output_ptr + 2, res_u8, 2);

          vst1_lane_u8(output_ptr + 3, res_u8, 3);

          output_ptr += 4;

        }

#endif // USE_NEON


        // Handle leftover values, one by one. This is very slow.

        for (; i < num_output_values; i++)

        {

          int32_t acc = acc_buffer[i];

          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);

          acc += output_offset;

          acc = std::max(acc, output_activation_min);

          acc = std::min(acc, output_activation_max);

          *output_ptr++ = static_cast<uint8_t>(acc);

        }

      }

    }

    output_ptr += batch_step;

  }

}


} // namespace depthwise_conv


// template <DepthwiseConvOutputRounding kOutputRounding>


inline void DepthwiseConvWithRounding(const DepthwiseConvParams &params, const Shape &input_shape,

                                      const uint8_t *input_data, const Shape &filter_shape,

                                      const uint8_t *filter_data, const Shape &bias_shape,

                                      const int32_t *bias_data, const Shape &output_shape,

                                      uint8_t *output_data, int thread_start, int thread_end,

                                      int thread_dim)

{

  [[maybe_unused]] const int depth_multiplier = params.depth_multiplier;

  [[maybe_unused]] const int32_t output_activation_min = params.quantized_activation_min;

  [[maybe_unused]] const int32_t output_activation_max = params.quantized_activation_max;

  [[maybe_unused]] const int dilation_width_factor = params.dilation_width_factor;

  [[maybe_unused]] const int dilation_height_factor = params.dilation_height_factor;

  assert(dilation_width_factor >= 1);

  assert(dilation_height_factor >= 1);

  assert(input_shape.DimensionsCount() == 4);

  assert(filter_shape.DimensionsCount() == 4);

  assert(output_shape.DimensionsCount() == 4);

  assert(output_activation_min <= output_activation_max);

  [[maybe_unused]] const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);

  [[maybe_unused]] const int input_depth = input_shape.Dims(3);

  assert(output_depth == input_depth * depth_multiplier);

  assert(bias_shape.FlatSize() == output_depth);


// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on

// Jetson TX-2. This compiler does not support the offsetof() macro.

#if defined(__aarch64__) && !defined(GOOGLE_L4T)

//  TODO Use below codes

//  // Dispatch to dot-product 3x3 kernels when supported.

//

//  ruy::Context *ruy_context = cpu_backend_context->ruy_context();

//  const bool has_dot_product_instructions =

//      ruy_context != nullptr &&

//      (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone;

//  if (has_dot_product_instructions)

//  {

//    using optimized_ops::depthwise_conv::DotProduct3x3KernelType;

//    DotProduct3x3KernelType kernel_type =

//    optimized_ops::depthwise_conv::CategorizeDotProductKernel(

//        input_shape, filter_shape, params);

//    if (kernel_type != DotProduct3x3KernelType::kNone)

//    {

//      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<

//          DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data,

//                                                              filter_shape, filter_data,

//                                                              bias_shape,

//                                                              bias_data, output_shape,

//                                                              output_data);

//      return;

//    }

//  }

//

//  // Dispatch to non-dot-product 3x3 kernels when supported.

//

//  const int stride_width = params.stride_width;

//  const int stride_height = params.stride_height;

//  const int pad_width = params.padding_values.width;

//  const int pad_height = params.padding_values.height;

//  const int output_shift = params.output_shift;

//

//  // Call kernel optimized for depthwise convolutions using 3x3 filters if

//  // parameters are supported.

//  if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width,

//                                                   stride_height, dilation_width_factor,

//                                                   dilation_height_factor, pad_width, pad_height,

//                                                   depth_multiplier, output_shape, output_shift))

//  {

//    depthwise_conv::DepthwiseConv3x3Filter<kOutputRounding>(

//        params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,

//        output_shape, output_data, thread_start, thread_end, thread_dim);

//    return;

//  }

#endif


  depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data,

                                       bias_shape, bias_data, output_shape, output_data,

                                       thread_start, thread_end, thread_dim);

}


inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &input_shape,

                              const uint8_t *input_data, const Shape &filter_shape,

                              const uint8_t *filter_data, const Shape &bias_shape,

                              const int32_t *bias_data, const Shape &output_shape,

                              uint8_t *output_data, int thread_start, int thread_end,

                              int thread_dim)

{

  return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data,

                                   bias_shape, bias_data, output_shape, output_data, thread_start,

                                   thread_end, thread_dim);

}


} // namespace optimized

} // namespace cker

} // namespace nnfw


#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__

TFMINI_USE_DEPTHWISECONV_KERNEL
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER)

m
std::mutex m
Definition NEGEMMMatrixAccumulateBiasesKernel.cpp:135

neon_check.h

nnfw::cker::Shape
Definition Shape.h:32

nnfw::cker::Shape::DimensionsCount
int32_t DimensionsCount() const
Definition Shape.h:91

nnfw::cker::Shape::Dims
int32_t Dims(int i) const
Definition Shape.h:92

nnfw::cker::Shape::FlatSize
int FlatSize() const
Definition Shape.h:181

Shape.h

Types.h

Utils.h

output_shape
const luci_interpreter::RuntimeShape output_shape
Definition PALComparisons.h:32

mir_caffe::CaffeOpType::input
@ input

mir_caffe::CaffeOpType::filter
@ filter

nnfw::cker::optimized::depthwise_conv::DepthwiseConvInitAccBuffer
void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, const int32_t *bias_data, int32_t *acc_buffer)
Definition DepthwiseConvUint8.h:1724

nnfw::cker::optimized::depthwise_conv::DepthwiseConvGeneral
void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data, int thread_start, int thread_end, int thread_dim)
Definition DepthwiseConvUint8.h:1814

nnfw::cker::optimized::depthwise_conv::QuantizedDepthwiseConvAccumRowGeneric
void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth, int input_width, const uint8_t *input_data, int16_t input_offset, int pad_width, int depth_multiplier, int filter_width, const uint8_t *filter_data, int16_t filter_offset, int out_x_buffer_start, int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
Definition DepthwiseConvUint8.h:1684

nnfw::cker::optimized::depthwise_conv::QuantizedDepthwiseConvAccumRow
void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, int input_width, const uint8_t *input_data, int16_t input_offset, int pad_width, int depth_multiplier, int filter_width, const uint8_t *filter_data, int16_t filter_offset, int out_x_buffer_start, int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
Definition DepthwiseConvUint8.h:1612

nnfw::cker::optimized::DepthwiseConvWithRounding
void DepthwiseConvWithRounding(const DepthwiseConvParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data, int thread_start, int thread_end, int thread_dim)
Definition DepthwiseConvUint8.h:2153

nnfw::cker::optimized::DepthwiseConvImpl
void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, int thread_start, int thread_end, int thread_dim)
Definition DepthwiseConvFloat.h:1033

nnfw::cker::MatchingDim
int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2)
Definition Shape.h:220

nnfw::cker::FlatSizeSkipDim
int FlatSizeSkipDim(const Shape &shape, int skip_dim)
Definition Shape.h:253

nnfw::cker::MultiplyByQuantizedMultiplier
int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
Definition Utils.h:96

nnfw
Definition topk_v2.h:30

nnfw::cker::DepthwiseConvParams
Definition Types.h:232

nnfw::cker::DepthwiseConvParams::output_offset
int32_t output_offset
Definition Types.h:244

nnfw::cker::DepthwiseConvParams::input_offset
int32_t input_offset
Definition Types.h:242

nnfw::cker::DepthwiseConvParams::dilation_height_factor
int16_t dilation_height_factor
Definition Types.h:238

nnfw::cker::DepthwiseConvParams::output_multiplier
int32_t output_multiplier
Definition Types.h:245

nnfw::cker::DepthwiseConvParams::dilation_width_factor
int16_t dilation_width_factor
Definition Types.h:237

nnfw::cker::DepthwiseConvParams::quantized_activation_max
int32_t quantized_activation_max
Definition Types.h:249

nnfw::cker::DepthwiseConvParams::stride_width
int16_t stride_width
Definition Types.h:235

nnfw::cker::DepthwiseConvParams::quantized_activation_min
int32_t quantized_activation_min
Definition Types.h:248

nnfw::cker::DepthwiseConvParams::weights_offset
int32_t weights_offset
Definition Types.h:243

nnfw::cker::DepthwiseConvParams::depth_multiplier
int16_t depth_multiplier
Definition Types.h:239

nnfw::cker::DepthwiseConvParams::padding_values
PaddingValues padding_values
Definition Types.h:234

nnfw::cker::DepthwiseConvParams::stride_height
int16_t stride_height
Definition Types.h:236

nnfw::cker::DepthwiseConvParams::output_shift
int output_shift
Definition Types.h:246

nnfw::cker::PaddingValues::height
int16_t height
Definition Types.h:75

nnfw::cker::PaddingValues::width
int16_t width
Definition Types.h:74

nnfw::cker::optimized::depthwise_conv::QuantizedDepthwiseConvKernel
Definition DepthwiseConvUint8.h:42