void casadi_blazing_3d_boor_eval(T1* f, T1* J, T1* H,
    const T1* all_knots,
    const casadi_int* offset,
    const T1* c,
    const T1* dc,
    const T1* ddc,
    const T1* all_x,
    const casadi_int* lookup_mode, casadi_int* iw, T1* w) {
  casadi_int n_dims = 3;
  casadi_int n_iter, k, i, pivot;
  casadi_int *boor_offset, *starts, *index, *coeff_offset;
  T1 *cumprod;
  // Partition the integer and real work vectors
  boor_offset = iw; iw+=n_dims+1;
  starts = iw; iw+=n_dims;
  index = iw; iw+=n_dims;
  coeff_offset = iw;
  cumprod = w; w+= n_dims+1;
  boor_offset[0] = 0;
  cumprod[n_dims] = 1;
  coeff_offset[n_dims] = 0;
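  // stride1/stride2: strides of the value-coefficient tensor c. Each
  // dimension has offset[i+1]-offset[i] knots and, for the cubic basis,
  // four fewer coefficients than knots.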
  casadi_int stride1 = offset[1]-offset[0]-4;
  casadi_int stride2 = (offset[2]-offset[1]-4)*stride1;

  simde__m256d zero = simde_mm256_set1_pd(0.0);
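  // Seed vectors for the de Boor recursion: lane l of boor_start_* holds the
  // degree-0 basis value of interval l. Note that simde_mm256_set_pd lists
  // lanes from high to low, so boor_start_0001 puts the 1.0 in lane 3.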
  simde__m256d boor_start_0000 = zero;
  simde__m256d boor_start_1111 = simde_mm256_set1_pd(1.0);
  simde__m256d boor_start_0001 = simde_mm256_set_pd(1.0, 0.0, 0.0, 0.0);
  simde__m256d boor_start_0010 = simde_mm256_set_pd(0.0, 1.0, 0.0, 0.0);
  simde__m256d boor0_d3;
  simde__m256d boor0_d2;
  simde__m256d boor0_d1;
  simde__m256d boor0_d0;

  simde__m256d boor1_d3;
  simde__m256d boor1_d2;
  simde__m256d boor1_d1;
  simde__m256d boor1_d0;

  simde__m256d boor2_d3;
  simde__m256d boor2_d2;
  simde__m256d boor2_d1;
  simde__m256d boor2_d0;
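  // One set of basis registers per dimension. boor*_d3 is seeded below and
  // casadi_blazing_de_boor fills the remaining registers; judging by how they
  // are consumed later, boor*_d0 holds the cubic basis (values), boor*_d1 the
  // quadratic basis (first derivatives) and boor*_d2 the linear basis
  // (second derivatives).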
  const T1* knots;
  T1 x;
  casadi_int degree, n_knots, n_b, L, start;
  degree = 3;

  // Dimension 0
  knots = all_knots + offset[0];
  n_knots = offset[0+1]-offset[0];
  n_b = n_knots-degree-1;
  x = all_x[0];
  L = casadi_low(x, knots+degree, n_knots-2*degree, lookup_mode[0]);
  start = L;
  if (start>n_b-degree-1) start = n_b-degree-1;
  starts[0] = start;
  boor0_d3 = boor_start_0000;
  if (x>=knots[0] && x<=knots[n_knots-1]) {
    if (x==knots[1]) {
      boor0_d3 = boor_start_1111;
    } else if (x==knots[n_knots-1]) {
      boor0_d3 = boor_start_0001;
    } else if (knots[L+degree]==x) {
      boor0_d3 = boor_start_0010;
    } else {
      boor0_d3 = boor_start_0001;
    }
  }
  casadi_blazing_de_boor(x, knots+start, &boor0_d0, &boor0_d1, &boor0_d2, &boor0_d3);
  // Dimension 1
  knots = all_knots + offset[1];
  n_knots = offset[1+1]-offset[1];
  n_b = n_knots-degree-1;
  x = all_x[1];
  L = casadi_low(x, knots+degree, n_knots-2*degree, lookup_mode[1]);
  start = L;
  if (start>n_b-degree-1) start = n_b-degree-1;
  starts[1] = start;
  boor1_d3 = boor_start_0000;
  if (x>=knots[0] && x<=knots[n_knots-1]) {
    if (x==knots[1]) {
      boor1_d3 = boor_start_1111;
    } else if (x==knots[n_knots-1]) {
      boor1_d3 = boor_start_0001;
    } else if (knots[L+degree]==x) {
      boor1_d3 = boor_start_0010;
    } else {
      boor1_d3 = boor_start_0001;
    }
  }
  casadi_blazing_de_boor(x, knots+start, &boor1_d0, &boor1_d1, &boor1_d2, &boor1_d3);
  // Dimension 2
  knots = all_knots + offset[2];
  n_knots = offset[2+1]-offset[2];
  n_b = n_knots-degree-1;
  x = all_x[2];
  L = casadi_low(x, knots+degree, n_knots-2*degree, lookup_mode[2]);
  start = L;
  if (start>n_b-degree-1) start = n_b-degree-1;
  starts[2] = start;
  boor2_d3 = boor_start_0000;
  if (x>=knots[0] && x<=knots[n_knots-1]) {
    if (x==knots[1]) {
      boor2_d3 = boor_start_1111;
    } else if (x==knots[n_knots-1]) {
      boor2_d3 = boor_start_0001;
    } else if (knots[L+degree]==x) {
      boor2_d3 = boor_start_0010;
    } else {
      boor2_d3 = boor_start_0001;
    }
  }
  casadi_blazing_de_boor(x, knots+start, &boor2_d0, &boor2_d1, &boor2_d2, &boor2_d3);
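  // Evaluation: gather the active 4x4x4 coefficient block and contract it with
  // the three basis vectors. Dimension 0 runs along the SIMD lanes; dimensions
  // 1 and 2 enter through the broadcasts b0..b3 and c0..c3, so that
  // f = sum_l a[l] * sum_j b[j] * sum_k c[k] * C[l,j,k].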
  simde__m256d C[16];
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(c+(starts[1]+j)*stride1+(starts[2]+k)*stride2+starts[0]);
    }
  }

  simde__m256d a, b0, b1, b2, b3, c0, c1, c2, c3, r;
  simde__m256d ab[4], cab[4];
  simde__m128d r0, r1;

  a = boor0_d0;

  b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  c0 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  c1 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  c2 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  c3 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  // Horizontal sum of the four dimension-0 lanes
  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  f[0] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
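  // Jacobian: the derivative along dimension d is a spline on the derivative
  // coefficients in dc, which stores one tensor per dimension back to back
  // (hence the dc += advances). Each tensor is one entry shorter along the
  // differentiated dimension and is contracted with the quadratic basis there.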
  // J[0]: derivative along dimension 0
  stride1 = offset[1]-offset[0]-4-1;
  stride2 = (offset[2]-offset[1]-4)*stride1;
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(
        dc+(starts[1]+j)*stride1+(starts[2]+k)*stride2+starts[0]-1);
    }
  }
  // Advance dc to the dimension-1 derivative tensor
  dc += stride2*(offset[3]-offset[2]-4);

  a = boor0_d1;

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  J[0] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
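  // The broadcasts persist between blocks: only the factors that change are
  // recomputed (in the next block, b0..b3 switch to the dimension-1
  // quadratic basis while a and c0..c3 keep their values).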
  // J[1]: derivative along dimension 1
  stride1 = offset[1]-offset[0]-4;
  stride2 = (offset[2]-offset[1]-4-1)*stride1;
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(
        dc+(starts[1]+j-1)*stride1+(starts[2]+k)*stride2+starts[0]);
    }
  }
  // Advance dc to the dimension-2 derivative tensor
  dc += stride2*(offset[3]-offset[2]-4);

  a = boor0_d0;

  b0 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  b1 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  b2 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  b3 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  J[1] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
  // J[2]: derivative along dimension 2
  stride1 = offset[1]-offset[0]-4;
  stride2 = (offset[2]-offset[1]-4)*stride1;
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(
        dc+(starts[1]+j)*stride1+(starts[2]+k-1)*stride2+starts[0]);
    }
  }

  b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  c0 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  c1 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  c2 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  c3 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  J[2] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
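  // Hessian: H is filled as a row-major 3x3 matrix from the six
  // second-derivative tensors stored back to back in ddc. Diagonal entries
  // use a pure second-derivative tensor (two entries shorter along one
  // dimension, linear basis there); mixed entries use a mixed-derivative
  // tensor (one entry shorter along each of two dimensions, quadratic basis
  // there) and are written to both symmetric positions.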
  // H[0]: second derivative along dimension 0
  stride1 = offset[1]-offset[0]-4-2;
  stride2 = (offset[2]-offset[1]-4)*stride1;
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(
        ddc+(starts[1]+j)*stride1+(starts[2]+k)*stride2+starts[0]-2);
    }
  }
  // Advance ddc to the d^2/dx1^2 tensor
  ddc += stride2*(offset[3]-offset[2]-4);

  a = boor0_d2;

  b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  c0 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  c1 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  c2 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  c3 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  H[0] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
  // H[4]: second derivative along dimension 1
  stride1 = offset[1]-offset[0]-4;
  stride2 = (offset[2]-offset[1]-4-2)*stride1;
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(
        ddc+(starts[1]+j-2)*stride1+(starts[2]+k)*stride2+starts[0]);
    }
  }
  // Advance ddc to the d^2/dx2^2 tensor
  ddc += stride2*(offset[3]-offset[2]-4);

  a = boor0_d0;

  b0 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  b1 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  b2 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  b3 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  c0 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  c1 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  c2 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  c3 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  H[4] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
  // H[8]: second derivative along dimension 2
  stride1 = offset[1]-offset[0]-4;
  stride2 = (offset[2]-offset[1]-4)*stride1;
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(
        ddc+(starts[1]+j)*stride1+(starts[2]+k-2)*stride2+starts[0]);
    }
  }
  // Advance ddc to the mixed d^2/dx0dx1 tensor
  ddc += stride2*(offset[3]-offset[2]-4-2);

  b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  c0 = simde_mm256_permute4x64_pd(boor2_d2, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  c1 = simde_mm256_permute4x64_pd(boor2_d2, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  c2 = simde_mm256_permute4x64_pd(boor2_d2, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  c3 = simde_mm256_permute4x64_pd(boor2_d2, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  H[8] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
  // H[1], H[3]: mixed derivative along dimensions 0 and 1
  stride1 = offset[1]-offset[0]-5;
  stride2 = (offset[2]-offset[1]-5)*stride1;
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(
        ddc+(starts[1]+j-1)*stride1+(starts[2]+k)*stride2+starts[0]-1);
    }
  }
  // Advance ddc to the mixed d^2/dx1dx2 tensor
  ddc += stride2*(offset[3]-offset[2]-4);

  a = boor0_d1;

  b0 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  b1 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  b2 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  b3 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  c0 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  c1 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  c2 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  c3 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  H[1] = H[3] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
  // H[5], H[7]: mixed derivative along dimensions 1 and 2
  stride1 = offset[1]-offset[0]-4;
  stride2 = (offset[2]-offset[1]-5)*stride1;
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(
        ddc+(starts[1]+j-1)*stride1+(starts[2]+k-1)*stride2+starts[0]);
    }
  }
  // Advance ddc to the mixed d^2/dx0dx2 tensor
  ddc += stride2*(offset[3]-offset[2]-5);

  a = boor0_d0;

  b0 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  b1 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  b2 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  b3 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  c0 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  c1 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  c2 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  c3 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  H[5] = H[7] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
  // H[2], H[6]: mixed derivative along dimensions 0 and 2
  stride1 = offset[1]-offset[0]-5;
  stride2 = (offset[2]-offset[1]-4)*stride1;
  for (int j=0;j<4;++j) {
    for (int k=0;k<4;++k) {
      C[j+4*k] = simde_mm256_loadu_pd(
        ddc+(starts[1]+j)*stride1+(starts[2]+k-1)*stride2+starts[0]-1);
    }
  }
  ddc += stride2*(offset[3]-offset[2]-5);

  a = boor0_d1;

  b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  c0 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
  c1 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
  c2 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
  c3 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

  ab[0] = simde_mm256_mul_pd(a, b0);
  ab[1] = simde_mm256_mul_pd(a, b1);
  ab[2] = simde_mm256_mul_pd(a, b2);
  ab[3] = simde_mm256_mul_pd(a, b3);

  for (int i=0;i<4;++i) {
    cab[i] = simde_mm256_set1_pd(0);
    cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
    cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
  }

  r = simde_mm256_set1_pd(0);
  r = simde_mm256_fmadd_pd(cab[0], c0, r);
  r = simde_mm256_fmadd_pd(cab[1], c1, r);
  r = simde_mm256_fmadd_pd(cab[2], c2, r);
  r = simde_mm256_fmadd_pd(cab[3], c3, r);

  r0 = simde_mm256_castpd256_pd128(r);
  r1 = simde_mm256_extractf128_pd(r, 1);
  r0 = simde_mm_add_pd(r0, r1);
  H[2] = H[6] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
}