casadi_blazing_de_boor.hpp
1 //
2 // MIT No Attribution
3 //
4 // Copyright (C) 2010-2023 Joel Andersson, Joris Gillis, Moritz Diehl, KU Leuven.
5 //
6 // Permission is hereby granted, free of charge, to any person obtaining a copy of this
7 // software and associated documentation files (the "Software"), to deal in the Software
8 // without restriction, including without limitation the rights to use, copy, modify,
9 // merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
10 // permit persons to whom the Software is furnished to do so.
11 //
12 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
13 // INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
14 // PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
15 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
16 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
17 // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
18 //
19 
20 // SYMBOL "blazing_printvec"
21 template<typename T1>
22 void casadi_blazing_printvec(const simde__m256d* e) {
23  double elements[4];
24  simde_mm256_storeu_pd(elements, *e);
25  printf("mm256d: <%.4f %.4f %.4f %.4f>\n", elements[0], elements[1], elements[2], elements[3]);
26 }
27 
// SYMBOL "blazing_de_boor"
// Vectorized de Boor recursion for a cubic B-spline segment.
//
// Performs three Cox-de Boor lifting steps on four adjacent basis-function
// lanes held in 256-bit SIMD registers:
//   boor_d3 (input)  ->  boor_d2  ->  boor_d1  ->  boor_d0 (output)
// Each step mirrors the scalar recurrence
//   b[i] = (x - knots[i]) / (knots[i+j] - knots[i])       * b[i]
//        + (knots[i+j+1] - x) / (knots[i+j+1] - knots[i+1]) * b[i+1]
// with the scalar `if (bottom)` divide-by-zero guard (repeated knots)
// replaced by a compare mask + blend.
//
// x        evaluation point
// knots    pointer into the knot vector; knots[0..7] are read
//          (loads at offsets 0..4, each 4 wide)
// boor_d0  out: highest-degree basis values produced by the last step
// boor_d1  out: intermediate basis values after the second step
// boor_d2  out: intermediate basis values after the first step
// boor_d3  in:  seed basis values for the recursion
template<typename T1>
void casadi_blazing_de_boor(T1 x, const T1* knots, simde__m256d* boor_d0, simde__m256d* boor_d1, simde__m256d* boor_d2, const simde__m256d* boor_d3) { // NOLINT(whitespace/line_length)
  simde__m256d x_ = simde_mm256_set1_pd(x); // broadcast x to all four lanes
  simde__m256d zero = simde_mm256_set1_pd(0.0);
  // blendv selects by sign bit: -1.0 (sign set) keeps lanes 0-2, 0.0 forces
  // lane 3 to zero — used to zero the lane shifted in from "past the end".
  simde__m256d mask_end = simde_mm256_set_pd(0.0, -1.0, -1.0, -1.0);

  // shift one up: [e0 e1 e2 e3] -> [e1 e2 e3 0], i.e. lane i holds boor[i+1]
  simde__m256d boor_d3i_1 = simde_mm256_permute4x64_pd(*boor_d3, SIMDE_MM_SHUFFLE(3, 3, 2, 1));
  boor_d3i_1 = simde_mm256_blendv_pd(zero, boor_d3i_1, mask_end);

  // Unaligned loads of five overlapping knot windows: lane i of knotsi_k
  // holds knots[i + k].
  simde__m256d knotsi = simde_mm256_loadu_pd(knots);
  simde__m256d knotsi_1 = simde_mm256_loadu_pd(knots+1);
  simde__m256d knotsi_2 = simde_mm256_loadu_pd(knots+2);
  simde__m256d knotsi_3 = simde_mm256_loadu_pd(knots+3);
  simde__m256d knotsi_4 = simde_mm256_loadu_pd(knots+4);

  // ---- Step 1: boor_d3 -> boor_d2 ----
  simde__m256d bottom = simde_mm256_sub_pd(knotsi_1, knotsi); // bottom = knots[i + 1] - knots[i];
  simde__m256d bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ); // if (bottom)

  // (x - knots[i]) / bottom;
  simde__m256d r = simde_mm256_div_pd(simde_mm256_sub_pd(x_, knotsi), bottom);
  r = simde_mm256_blendv_pd(r, zero, bottom_mask); // zero lanes where bottom == 0 (r was inf/NaN)
  *boor_d2 = simde_mm256_mul_pd(r, *boor_d3);

  // Re-blend the product: guards against non-finite boor_d3 lanes turning
  // 0 * inf/NaN into NaN where the scalar code would have left 0.
  *boor_d2 = simde_mm256_blendv_pd(*boor_d2, zero, bottom_mask);

  bottom = simde_mm256_sub_pd(knotsi_2, knotsi_1); // bottom = knots[i + 2] - knots[i + 1];
  bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ);
  r = simde_mm256_div_pd(simde_mm256_sub_pd(knotsi_2, x_), bottom); // (knots[i + 2] - x) / bottom
  r = simde_mm256_blendv_pd(r, zero, bottom_mask);

  // boor_d2[i] += r * boor_d3[i + 1]
  *boor_d2 = simde_mm256_fmadd_pd(r, boor_d3i_1, *boor_d2);

  // ---- Step 2: boor_d2 -> boor_d1 ----
  // shift one up: lane i holds boor_d2[i + 1], last lane zeroed
  simde__m256d boor_d2i_1 = simde_mm256_permute4x64_pd(*boor_d2, SIMDE_MM_SHUFFLE(3, 3, 2, 1));
  boor_d2i_1 = simde_mm256_blendv_pd(zero, boor_d2i_1, mask_end);

  bottom = simde_mm256_sub_pd(knotsi_2, knotsi); // bottom = knots[i + 2] - knots[i];
  bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ); // if (bottom)

  r = simde_mm256_div_pd(simde_mm256_sub_pd(x_, knotsi), bottom); // (x - knots[i]) / bottom;
  r = simde_mm256_blendv_pd(r, zero, bottom_mask);
  *boor_d1 = simde_mm256_mul_pd(r, *boor_d2);

  *boor_d1 = simde_mm256_blendv_pd(*boor_d1, zero, bottom_mask);

  bottom = simde_mm256_sub_pd(knotsi_3, knotsi_1); // bottom = knots[i + 3] - knots[i + 1];
  bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ);
  r = simde_mm256_div_pd(simde_mm256_sub_pd(knotsi_3, x_), bottom); // (knots[i + 3] - x) / bottom
  r = simde_mm256_blendv_pd(r, zero, bottom_mask);

  // boor_d1[i] += r * boor_d2[i + 1]
  *boor_d1 = simde_mm256_fmadd_pd(r, boor_d2i_1, *boor_d1);

  // ---- Step 3: boor_d1 -> boor_d0 ----
  // shift one up: lane i holds boor_d1[i + 1], last lane zeroed
  simde__m256d boor_d1i_1 = simde_mm256_permute4x64_pd(*boor_d1, SIMDE_MM_SHUFFLE(3, 3, 2, 1));
  boor_d1i_1 = simde_mm256_blendv_pd(zero, boor_d1i_1, mask_end);

  bottom = simde_mm256_sub_pd(knotsi_3, knotsi); // bottom = knots[i + 3] - knots[i];
  bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ); // if (bottom)

  r = simde_mm256_div_pd(simde_mm256_sub_pd(x_, knotsi), bottom); // (x - knots[i]) / bottom;
  r = simde_mm256_blendv_pd(r, zero, bottom_mask);
  *boor_d0 = simde_mm256_mul_pd(r, *boor_d1);

  *boor_d0 = simde_mm256_blendv_pd(*boor_d0, zero, bottom_mask);

  bottom = simde_mm256_sub_pd(knotsi_4, knotsi_1); // bottom = knots[i + 4] - knots[i + 1];
  bottom_mask = simde_mm256_cmp_pd(bottom, zero, SIMDE_CMP_EQ_OQ);
  r = simde_mm256_div_pd(simde_mm256_sub_pd(knotsi_4, x_), bottom); // (knots[i + 4] - x) / bottom
  r = simde_mm256_blendv_pd(r, zero, bottom_mask);

  // boor_d0[i] += r * boor_d1[i + 1]
  *boor_d0 = simde_mm256_fmadd_pd(r, boor_d1i_1, *boor_d0);
}
101 }