22 void casadi_blazing_2d_boor_eval(T1* f, T1* J, T1* H,
const T1* all_knots,
const casadi_int* offset,
const T1* c,
const T1* dc,
const T1* ddc,
const T1* all_x,
const casadi_int* lookup_mode, casadi_int* iw, T1* w) {
23 casadi_int n_dims = 2;
25 casadi_int n_iter, i, pivot;
26 casadi_int *boor_offset, *starts, *index, *coeff_offset;
28 boor_offset = iw; iw+=n_dims+1;
29 starts = iw; iw+=n_dims;
30 index = iw; iw+=n_dims;
32 cumprod = w; w+= n_dims+1;
35 coeff_offset[n_dims] = 0;
37 casadi_int stride1 = offset[1]-offset[0]-4;
39 simde__m256d zero = simde_mm256_set1_pd(0.0);
41 simde__m256d boor_start_0000 = zero;
42 simde__m256d boor_start_1111 = simde_mm256_set1_pd(1.0);
43 simde__m256d boor_start_0001 = simde_mm256_set_pd(1.0, 0.0, 0.0, 0.0);
44 simde__m256d boor_start_0010 = simde_mm256_set_pd(0.0, 1.0, 0.0, 0.0);
46 simde__m256d boor0_d3;
47 simde__m256d boor0_d2;
48 simde__m256d boor0_d1;
49 simde__m256d boor0_d0;
51 simde__m256d boor1_d3;
52 simde__m256d boor1_d2;
53 simde__m256d boor1_d1;
54 simde__m256d boor1_d0;
58 casadi_int degree, n_knots, n_b, L, start;
60 knots = all_knots + offset[0];
61 n_knots = offset[0+1]-offset[0];
62 n_b = n_knots-degree-1;
64 L = casadi_low(x, knots+degree, n_knots-2*degree, lookup_mode[0]);
66 if (start>n_b-degree-1) start = n_b-degree-1;
68 boor0_d3 = boor_start_0000;
69 if (x>=knots[0] && x<=knots[n_knots-1]) {
71 boor0_d3 = boor_start_1111;
72 }
else if (x==knots[n_knots-1]) {
73 boor0_d3 = boor_start_0001;
74 }
else if (knots[L+degree]==x) {
75 boor0_d3 = boor_start_0010;
77 boor0_d3 = boor_start_0001;
80 casadi_blazing_de_boor(x, knots+start, &boor0_d0, &boor0_d1, &boor0_d2, &boor0_d3);
82 knots = all_knots + offset[1];
83 n_knots = offset[1+1]-offset[1];
84 n_b = n_knots-degree-1;
86 L = casadi_low(x, knots+degree, n_knots-2*degree, lookup_mode[1]);
88 if (start>n_b-degree-1) start = n_b-degree-1;
90 boor1_d3 = boor_start_0000;
91 if (x>=knots[0] && x<=knots[n_knots-1]) {
93 boor1_d3 = boor_start_1111;
94 }
else if (x==knots[n_knots-1]) {
95 boor1_d3 = boor_start_0001;
96 }
else if (knots[L+degree]==x) {
97 boor1_d3 = boor_start_0010;
99 boor1_d3 = boor_start_0001;
102 casadi_blazing_de_boor(x, knots+start, &boor1_d0, &boor1_d1, &boor1_d2, &boor1_d3);
106 for (
int j=0;j<4;++j) {
107 C[j] = simde_mm256_loadu_pd(c+(starts[1]+j)*stride1+starts[0]);
110 simde__m256d a, b0, b1, b2, b3, c0, c1, c2, c3, r;
115 b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
116 b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
117 b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
118 b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
123 ab[0] = simde_mm256_mul_pd(a, b0);
124 ab[1] = simde_mm256_mul_pd(a, b1);
125 ab[2] = simde_mm256_mul_pd(a, b2);
126 ab[3] = simde_mm256_mul_pd(a, b3);
130 r = simde_mm256_set1_pd(0);
131 r = simde_mm256_fmadd_pd(ab[0], C[0], r);
132 r = simde_mm256_fmadd_pd(ab[1], C[1], r);
133 r = simde_mm256_fmadd_pd(ab[2], C[2], r);
134 r = simde_mm256_fmadd_pd(ab[3], C[3], r);
138 r0 = simde_mm256_castpd256_pd128(r);
139 r1 = simde_mm256_extractf128_pd(r, 1);
140 r0 = simde_mm_add_pd(r0, r1);
141 f[0] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
146 stride1 = offset[1]-offset[0]-4-1;
147 for (
int j=0;j<4;++j) {
148 C[j] = simde_mm256_loadu_pd(dc+(starts[1]+j)*stride1+starts[0]-1);
150 dc += stride1*(offset[2]-offset[1]-4);
153 ab[0] = simde_mm256_mul_pd(a, b0);
154 ab[1] = simde_mm256_mul_pd(a, b1);
155 ab[2] = simde_mm256_mul_pd(a, b2);
156 ab[3] = simde_mm256_mul_pd(a, b3);
160 r = simde_mm256_set1_pd(0);
161 r = simde_mm256_fmadd_pd(ab[0], C[0], r);
162 r = simde_mm256_fmadd_pd(ab[1], C[1], r);
163 r = simde_mm256_fmadd_pd(ab[2], C[2], r);
164 r = simde_mm256_fmadd_pd(ab[3], C[3], r);
167 r0 = simde_mm256_castpd256_pd128(r);
168 r1 = simde_mm256_extractf128_pd(r, 1);
169 r0 = simde_mm_add_pd(r0, r1);
170 J[0] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
173 stride1 = offset[1]-offset[0]-4;
174 for (
int j=0;j<4;++j) {
178 C[j] = simde_mm256_loadu_pd(dc+(starts[1]+j-1)*stride1+starts[0]);
184 b0 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
185 b1 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
186 b2 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
187 b3 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
189 ab[0] = simde_mm256_mul_pd(a, b0);
190 ab[1] = simde_mm256_mul_pd(a, b1);
191 ab[2] = simde_mm256_mul_pd(a, b2);
192 ab[3] = simde_mm256_mul_pd(a, b3);
196 r = simde_mm256_set1_pd(0);
197 r = simde_mm256_fmadd_pd(ab[0], C[0], r);
198 r = simde_mm256_fmadd_pd(ab[1], C[1], r);
199 r = simde_mm256_fmadd_pd(ab[2], C[2], r);
200 r = simde_mm256_fmadd_pd(ab[3], C[3], r);
203 r0 = simde_mm256_castpd256_pd128(r);
204 r1 = simde_mm256_extractf128_pd(r, 1);
205 r0 = simde_mm_add_pd(r0, r1);
206 J[1] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
210 stride1 = offset[1]-offset[0]-4-2;
211 for (
int j=0;j<4;++j) {
212 C[j] = simde_mm256_loadu_pd(ddc+(starts[1]+j)*stride1+starts[0]-2);
214 ddc += stride1*(offset[2]-offset[1]-4);
217 b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
218 b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
219 b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
220 b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
222 ab[0] = simde_mm256_mul_pd(a, b0);
223 ab[1] = simde_mm256_mul_pd(a, b1);
224 ab[2] = simde_mm256_mul_pd(a, b2);
225 ab[3] = simde_mm256_mul_pd(a, b3);
228 r = simde_mm256_set1_pd(0);
229 r = simde_mm256_fmadd_pd(ab[0], C[0], r);
230 r = simde_mm256_fmadd_pd(ab[1], C[1], r);
231 r = simde_mm256_fmadd_pd(ab[2], C[2], r);
232 r = simde_mm256_fmadd_pd(ab[3], C[3], r);
235 r0 = simde_mm256_castpd256_pd128(r);
236 r1 = simde_mm256_extractf128_pd(r, 1);
237 r0 = simde_mm_add_pd(r0, r1);
238 H[0] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
240 stride1 = offset[1]-offset[0]-4;
241 for (
int j=0;j<4;++j) {
245 C[j] = simde_mm256_loadu_pd(ddc+(starts[1]+j-2)*stride1+starts[0]);
248 ddc += stride1*(offset[2]-offset[1]-4-2);
251 b0 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
252 b1 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
253 b2 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
254 b3 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
256 ab[0] = simde_mm256_mul_pd(a, b0);
257 ab[1] = simde_mm256_mul_pd(a, b1);
258 ab[2] = simde_mm256_mul_pd(a, b2);
259 ab[3] = simde_mm256_mul_pd(a, b3);
262 r = simde_mm256_set1_pd(0);
263 r = simde_mm256_fmadd_pd(ab[0], C[0], r);
264 r = simde_mm256_fmadd_pd(ab[1], C[1], r);
265 r = simde_mm256_fmadd_pd(ab[2], C[2], r);
266 r = simde_mm256_fmadd_pd(ab[3], C[3], r);
269 r0 = simde_mm256_castpd256_pd128(r);
270 r1 = simde_mm256_extractf128_pd(r, 1);
271 r0 = simde_mm_add_pd(r0, r1);
272 H[3] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
274 stride1 = offset[1]-offset[0]-5;
275 for (
int j=0;j<4;++j) {
279 C[j] = simde_mm256_loadu_pd(ddc+(starts[1]+j-1)*stride1+starts[0]-1);
282 ddc += stride1*(offset[3]-offset[2]-5);
286 b0 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
287 b1 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
288 b2 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
289 b3 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
291 ab[0] = simde_mm256_mul_pd(a, b0);
292 ab[1] = simde_mm256_mul_pd(a, b1);
293 ab[2] = simde_mm256_mul_pd(a, b2);
294 ab[3] = simde_mm256_mul_pd(a, b3);
297 r = simde_mm256_set1_pd(0);
298 r = simde_mm256_fmadd_pd(ab[0], C[0], r);
299 r = simde_mm256_fmadd_pd(ab[1], C[1], r);
300 r = simde_mm256_fmadd_pd(ab[2], C[2], r);
301 r = simde_mm256_fmadd_pd(ab[3], C[3], r);
304 r0 = simde_mm256_castpd256_pd128(r);
305 r1 = simde_mm256_extractf128_pd(r, 1);
306 r0 = simde_mm_add_pd(r0, r1);
307 H[1] = H[2] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));