// Concatenates the input tensors into `output` along `axis` by copying raw bytes with memcpy.
// NOTE(review): this view of the function is fragmentary -- brace-only lines, several loop
// bodies and part of the parameter list are not visible -- so the comments below describe only
// the statements that are actually shown.
34void ConcatImpl<T>::run(
const std::vector<std::reference_wrapper<const mir::TensorVariant>> &inputs,
38 const size_t inputs_count = inputs.size();
// Running sum of every input's extent along the concatenation axis.
40 int64_t concat_size = 0;
41 for (
size_t i = 0; i < inputs_count; i++)
43 const auto &input_shape = inputs[i].get().getShape();
// Each input must have the expected rank (concat_dims is defined outside the visible lines).
44 assert(input_shape.rank() == concat_dims);
// Per-dimension loop whose body is not visible here -- presumably it validates that the
// non-axis dimensions agree with the output; TODO confirm.
45 for (int32_t j = 0; j < concat_dims; j++)
52 concat_size += input_shape.dim(axis);
// outer_size is the trip count of the outer copy loop below. The body of the loop over the
// dimensions before `axis` is not visible -- presumably it accumulates their product; TODO confirm.
56 int32_t outer_size = 1;
57 for (int32_t i = 0; i < axis; i++)
// base_inner_size is multiplied by an input's axis extent to get its per-slice copy size. The
// body of the loop over the dimensions after `axis` is not visible -- presumably a product as
// well; TODO confirm.
60 int32_t base_inner_size = 1;
61 for (int32_t i = axis + 1; i < concat_dims; i++)
// Per-input bookkeeping: how many elements to copy per outer slice, and a read cursor that
// starts at the beginning of each input's data.
64 std::vector<int32_t> copy_sizes;
65 std::vector<char *> input_ptrs;
66 for (
size_t i = 0; i < inputs_count; i++)
// Note: this takes the shape by value (no reference), unlike the first loop above.
68 const auto input_shape = inputs[i].get().getShape();
69 copy_sizes.push_back(input_shape.dim(axis) * base_inner_size);
70 input_ptrs.push_back(inputs[i].get().atOffset(0));
73 char *output_ptr = output.atOffset(0);
// The element size is read from the first input only and applied to every input; this
// assumes all inputs share one element type and that `inputs` is non-empty -- verify against caller.
74 const size_t elem_size = inputs[0].get().getElementSize();
// For each outer slice, append one chunk from every input in order. copy_sizes[j] is in
// elements, so it is scaled by elem_size to get bytes.
75 for (int32_t i = 0; i < outer_size; i++)
77 for (
size_t j = 0; j < inputs_count; j++)
79 std::memcpy(output_ptr, input_ptrs[j], copy_sizes[j] * elem_size);
// Advance both cursors so the next outer iteration resumes where this chunk ended.
80 output_ptr += copy_sizes[j] * elem_size;
81 input_ptrs[j] += copy_sizes[j] * elem_size;
93 const std::vector<std::reference_wrapper<const mir::TensorVariant>> &inputs,
int axis,
// Quantized concatenation: inputs are asserted to be quantized UINT8 tensors, and each input
// is either copied verbatim or requantized into the output's scale / zero point.
// NOTE(review): the line naming this function, the brace-only lines, some loop bodies and the
// definition of `input` are not visible in this view; comments cover only what is shown.
96 const size_t inputs_count = inputs.size();
// Quantization parameters for each input, indexed like `inputs`.
97 std::vector<int32_t> input_zeropoints(inputs_count);
98 std::vector<float> input_scales(inputs_count);
// Running sum of every input's extent along the concatenation axis.
101 int64_t concat_size = 0;
102 for (
size_t i = 0; i < inputs_count; i++)
104 const auto &input_type = inputs[i].get().getType();
// Only quantized inputs with UINT8 elements are accepted; the byte-wise pointer arithmetic
// further down depends on one-byte elements.
105 assert(input_type.isQuantized());
106 assert(input_type.getElementType() == mir::DataType::UINT8);
107 const auto &input_shape = input_type.getShape();
108 assert(input_shape.rank() == concat_dimensions);
// Per-dimension loop whose body is not visible here -- presumably it validates that the
// non-axis dimensions agree with the output; TODO confirm.
110 for (int32_t j = 0; j < concat_dimensions; j++)
114 concat_size += input_shape.dim(axis);
115 input_zeropoints[i] = input_type.getQuantization().getZeroPoint();
116 input_scales[i] = input_type.getQuantization().getScale();
// Target quantization parameters, read from the output tensor's type.
120 const auto &output_type = output.getType();
121 assert(output_type.isQuantized());
122 int32_t output_zeropoint = output_type.getQuantization().getZeroPoint();
123 float output_scale = output_type.getQuantization().getScale();
// outer_size is the trip count of the outer loop below. The body of the loop over the
// dimensions before `axis` is not visible -- presumably it accumulates their product; TODO confirm.
126 int32_t outer_size = 1;
127 for (int32_t i = 0; i < axis; i++)
// base_inner_size is multiplied by an input's axis extent to get its per-slice copy size. The
// body of the loop over the dimensions after `axis` is not visible -- presumably a product as
// well; TODO confirm.
130 int32_t base_inner_size = 1;
131 for (int32_t i = axis + 1; i < concat_dimensions; i++)
135 uint8_t *output_ptr =
reinterpret_cast<uint8_t *
>(output.atOffset(0));
// Computed once so the rescaling factor below is formed with a multiplication, not a division.
137 const float inverse_output_scale = 1.f / output_scale;
138 for (
int k = 0; k < outer_size; k++)
140 for (
size_t i = 0; i < inputs_count; ++i)
// `input` is defined on a line not visible here -- presumably inputs[i].get(); TODO confirm.
// copy_size is an element count, which equals a byte count because elements are UINT8.
143 const int copy_size = input.getShape().dim(axis) * base_inner_size;
// Start of the k-th outer slice of this input (byte offset k * copy_size).
144 const char *input_data = input.atOffset(0) + k * copy_size;
145 const uint8_t *input_ptr =
reinterpret_cast<const uint8_t *
>(input_data);
// When the input's zero point and scale exactly equal the output's, the raw bytes are copied.
146 if (input_zeropoints[i] == output_zeropoint && input_scales[i] == output_scale)
148 std::memcpy(output_ptr, input_ptr, copy_size);
// Requantization path (the `else` and braces are not visible here). With
// scale = in_scale / out_scale and bias = -in_zeropoint * scale, each element becomes
// round(in * scale + bias) + out_zeropoint, i.e. round((in - in_zp) * in_scale / out_scale) + out_zp.
152 const float scale = input_scales[i] * inverse_output_scale;
153 const float bias = -input_zeropoints[i] * scale;
154 for (
int j = 0; j < copy_size; ++j)
156 const int32_t value =
157 static_cast<int32_t
>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
// Clamp to the representable uint8_t range [0, 255] before narrowing.
158 output_ptr[j] =
static_cast<uint8_t
>(std::max(std::min(255, value), 0));
// Advance the write cursor past this input's chunk (presumably executed after either branch;
// the enclosing braces are not visible).
161 output_ptr += copy_size;