22 void casadi_blazing_printvec(
const simde__m256d* e) {
24 simde_mm256_storeu_pd(elements, *e);
25 printf(
"mm256d: <%.4f %.4f %.4f %.4f>\n", elements[0], elements[1], elements[2], elements[3]);
// Three-stage basis-function recursion, evaluated on four SIMD lanes at once.
//
// Starting from *boor_d3, each stage d = 1, 2, 3 computes, for lane i = 0..3,
//
//   out[i] = (x - k[i])     / (k[i+d]   - k[i])   * in[i]
//          + (k[i+d+1] - x) / (k[i+d+1] - k[i+1]) * in[i+1]
//
// where k = knots, in[4] is taken as 0, and a ratio whose denominator compares
// equal to zero is replaced by 0. Stage 1 reads *boor_d3 and writes *boor_d2,
// stage 2 reads *boor_d2 and writes *boor_d1, stage 3 reads *boor_d1 and writes
// *boor_d0. This has the shape of the Cox-de Boor recurrence.
//
// x        Evaluation point; broadcast to all four lanes.
// knots    Knot array; the five unaligned 4-wide loads below read knots[0..7],
//          so at least 8 doubles must be readable.
// boor_d0  Output of stage 3 (overwritten).
// boor_d1  Output of stage 2 (overwritten), also the input of stage 3.
// boor_d2  Output of stage 1 (overwritten), also the input of stage 2.
// boor_d3  Input to stage 1; only read.
//
// NOTE(review): T1 is not declared in the visible code; it is passed straight
// to simde_mm256_set1_pd / simde_mm256_loadu_pd, so presumably double - confirm.
30 void casadi_blazing_de_boor(T1 x,
const T1* knots, simde__m256d* boor_d0, simde__m256d* boor_d1, simde__m256d* boor_d2,
const simde__m256d* boor_d3) {
31 simde__m256d x_ = simde_mm256_set1_pd(x);
32 simde__m256d zero = simde_mm256_set1_pd(0.0);
// set_pd lists lanes from highest to lowest: lane 3 is 0.0 (sign bit clear),
// lanes 0..2 are -1.0 (sign bit set). blendv_pd picks its second operand where
// the mask's sign bit is set, so this mask keeps lanes 0..2 and zeroes lane 3.
33 simde__m256d mask_end = simde_mm256_set_pd(0.0, -1.0, -1.0, -1.0);
// Shift the input down by one lane (lane j <- lane j+1 for j = 0..2), then
// clear the duplicated top lane: boor_d3i_1[i] = boor_d3[i+1], with lane 3 = 0.
36 simde__m256d boor_d3i_1 = simde_mm256_permute4x64_pd(*boor_d3, SIMDE_MM_SHUFFLE(3, 3, 2, 1));
37 boor_d3i_1 = simde_mm256_blendv_pd(zero, boor_d3i_1, mask_end);
// Five overlapping knot windows: knotsi_m holds knots[i+m] in lane i.
39 simde__m256d knotsi = simde_mm256_loadu_pd(knots);
40 simde__m256d knotsi_1 = simde_mm256_loadu_pd(knots+1);
41 simde__m256d knotsi_2 = simde_mm256_loadu_pd(knots+2);
42 simde__m256d knotsi_3 = simde_mm256_loadu_pd(knots+3);
43 simde__m256d knotsi_4 = simde_mm256_loadu_pd(knots+4);
// ---- Stage 1: *boor_d3 -> *boor_d2 ----
// First term: (x - k[i]) / (k[i+1] - k[i]) * boor_d3[i].
45 simde__m256d bottom = simde_mm256_sub_pd(knotsi_1, knotsi);
// All-ones in lanes whose denominator is exactly zero.
46 simde__m256d bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ);
// The division may yield inf/NaN in zero-denominator lanes; those lanes are
// overwritten with 0 on the next line.
49 simde__m256d r = simde_mm256_div_pd(simde_mm256_sub_pd(x_, knotsi), bottom);
50 r = simde_mm256_blendv_pd(r, zero, bottom_mask);
51 *boor_d2 = simde_mm256_mul_pd(r, *boor_d3);
// Force the product itself to 0 in zero-denominator lanes. r is already 0
// there, so this only matters if the input lane is non-finite (0 * inf = NaN).
53 *boor_d2 = simde_mm256_blendv_pd(*boor_d2, zero, bottom_mask);
// Second term: (k[i+2] - x) / (k[i+2] - k[i+1]) * boor_d3[i+1].
55 bottom = simde_mm256_sub_pd(knotsi_2, knotsi_1);
56 bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ);
57 r = simde_mm256_div_pd(simde_mm256_sub_pd(knotsi_2, x_), bottom);
58 r = simde_mm256_blendv_pd(r, zero, bottom_mask);
// Accumulate: *boor_d2 = r * boor_d3i_1 + *boor_d2.
60 *boor_d2 = simde_mm256_fmadd_pd(r, boor_d3i_1, *boor_d2);
// ---- Stage 2: *boor_d2 -> *boor_d1 ----
// Lane-shifted copy of the stage-1 result, top lane cleared (same as above).
63 simde__m256d boor_d2i_1 = simde_mm256_permute4x64_pd(*boor_d2, SIMDE_MM_SHUFFLE(3, 3, 2, 1));
64 boor_d2i_1 = simde_mm256_blendv_pd(zero, boor_d2i_1, mask_end);
// First term: (x - k[i]) / (k[i+2] - k[i]) * boor_d2[i].
66 bottom = simde_mm256_sub_pd(knotsi_2, knotsi);
67 bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ);
69 r = simde_mm256_div_pd(simde_mm256_sub_pd(x_, knotsi), bottom);
70 r = simde_mm256_blendv_pd(r, zero, bottom_mask);
71 *boor_d1 = simde_mm256_mul_pd(r, *boor_d2);
73 *boor_d1 = simde_mm256_blendv_pd(*boor_d1, zero, bottom_mask);
// Second term: (k[i+3] - x) / (k[i+3] - k[i+1]) * boor_d2[i+1].
75 bottom = simde_mm256_sub_pd(knotsi_3, knotsi_1);
76 bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ);
77 r = simde_mm256_div_pd(simde_mm256_sub_pd(knotsi_3, x_), bottom);
78 r = simde_mm256_blendv_pd(r, zero, bottom_mask);
80 *boor_d1 = simde_mm256_fmadd_pd(r, boor_d2i_1, *boor_d1);
// ---- Stage 3: *boor_d1 -> *boor_d0 ----
// Lane-shifted copy of the stage-2 result, top lane cleared (same as above).
83 simde__m256d boor_d1i_1 = simde_mm256_permute4x64_pd(*boor_d1, SIMDE_MM_SHUFFLE(3, 3, 2, 1));
84 boor_d1i_1 = simde_mm256_blendv_pd(zero, boor_d1i_1, mask_end);
// First term: (x - k[i]) / (k[i+3] - k[i]) * boor_d1[i].
86 bottom = simde_mm256_sub_pd(knotsi_3, knotsi);
87 bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ);
89 r = simde_mm256_div_pd(simde_mm256_sub_pd(x_, knotsi), bottom);
90 r = simde_mm256_blendv_pd(r, zero, bottom_mask);
91 *boor_d0 = simde_mm256_mul_pd(r, *boor_d1);
93 *boor_d0 = simde_mm256_blendv_pd(*boor_d0, zero, bottom_mask);
// Second term: (k[i+4] - x) / (k[i+4] - k[i+1]) * boor_d1[i+1].
95 bottom = simde_mm256_sub_pd(knotsi_4, knotsi_1);
96 bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ);
97 r = simde_mm256_div_pd(simde_mm256_sub_pd(knotsi_4, x_), bottom);
98 r = simde_mm256_blendv_pd(r, zero, bottom_mask);
100 *boor_d0 = simde_mm256_fmadd_pd(r, boor_d1i_1, *boor_d0);