Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
macros.h
1 /*
2  * Copyright 2011-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the <organization> nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM Limited and Contributors. BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : common/macros.h
30  */
31 
32 #include "factor.h"
33 
34 // Macros used in actual implementations
35 
37 
/*
 * Plain-C wrapper for the "DstSrcCst" family.
 * Expands to a braced block that passes NE10_TEMPLATE_XC_OPERATION_X_C the
 * DstSrcCst pointer check followed by the caller-supplied loop body.
 */
#define NE10_XC_OPERATION_X_C(loopBody)                 \
    {                                                   \
        NE10_TEMPLATE_XC_OPERATION_X_C(                 \
            NE10_CHECKPOINTER_DstSrcCst_OPERATION; ,    \
            loopBody);                                  \
    }
43 
/*
 * NEON wrapper for the "DstSrcCst" family operating on floats.
 * Declares n_cst with all four lanes set to `cst`; `cst` is not declared
 * here and must be visible where the macro is expanded.  The pointer check,
 * the main loop (wrapping mainLoopCode) and the second loop (wrapping
 * secondLoopCode) are then handed to NE10_DstSrcCst_OPERATION_FLOAT_NEON.
 */
#define NE10_XC_OPERATION_FLOAT_NEON(mainLoopCode, secondLoopCode)          \
    {                                                                       \
        float32x4_t n_cst = { cst, cst, cst, cst };                         \
        NE10_DstSrcCst_OPERATION_FLOAT_NEON(                                \
            NE10_CHECKPOINTER_DstSrcCst_OPERATION; ,                        \
            NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(mainLoopCode); ,             \
            NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(secondLoopCode);           \
        );                                                                  \
    }
52 
/*
 * NEON wrapper for the "DstSrcCst" family operating on 2-float vectors.
 * Forwards the pointer check, the main loop (wrapping mainLoopCode) and the
 * second loop (wrapping secondLoopCode) to
 * NE10_DstSrcCst_OPERATION_VEC2F_NEON inside a braced block.
 */
#define NE10_XC_OPERATION_VEC2F_NEON(mainLoopCode, secondLoopCode)          \
    {                                                                       \
        NE10_DstSrcCst_OPERATION_VEC2F_NEON(                                \
            NE10_CHECKPOINTER_DstSrcCst_OPERATION; ,                        \
            NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(mainLoopCode); ,             \
            NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(secondLoopCode);           \
        );                                                                  \
    }
60 
/*
 * NEON wrapper for the "DstSrcCst" family operating on 3-float vectors.
 * This macro uses interleaving to boost the performance.
 * Forwards the pointer check, the main loop (wrapping mainLoopCode) and the
 * second loop (wrapping secondLoopCode) to
 * NE10_DstSrcCst_OPERATION_VEC3F_NEON inside a braced block.
 */
#define NE10_XC_OPERATION_VEC3F_NEON(mainLoopCode, secondLoopCode)          \
    {                                                                       \
        NE10_DstSrcCst_OPERATION_VEC3F_NEON(                                \
            NE10_CHECKPOINTER_DstSrcCst_OPERATION; ,                        \
            NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(mainLoopCode); ,             \
            NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(secondLoopCode);           \
        );                                                                  \
    }
69 
/*
 * NEON wrapper for the "DstSrcCst" family operating on 4-float vectors.
 * Only a main loop is supplied: the pointer check and the main loop
 * (wrapping loopBody) are forwarded to NE10_DstSrcCst_OPERATION_VEC4F_NEON.
 */
#define NE10_XC_OPERATION_VEC4F_NEON(loopBody)                  \
    {                                                           \
        NE10_DstSrcCst_OPERATION_VEC4F_NEON(                    \
            NE10_CHECKPOINTER_DstSrcCst_OPERATION; ,            \
            NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopBody);       \
        );                                                      \
    }
76 
78 
/*
 * Plain-C wrapper for the "DstAccSrcCst" family.
 * Expands to a braced block that passes NE10_TEMPLATE_XC_OPERATION_X_C the
 * DstAccSrcCst pointer check followed by the caller-supplied loop body.
 */
#define NE10_MLAC_OPERATION_X_C(loopBody)                   \
    {                                                       \
        NE10_TEMPLATE_XC_OPERATION_X_C(                     \
            NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; ,     \
            loopBody);                                      \
    }
84 
/*
 * NEON wrapper for the "DstAccSrcCst" family operating on floats.
 * Declares n_acc (left uninitialised here) and n_cst, whose four lanes are
 * all set to `cst`; `cst` must be visible where the macro is expanded.
 * The pointer check, the main loop (wrapping mainLoopCode) and the second
 * loop (wrapping secondLoopCode) are then handed to
 * NE10_DstAccSrcCst_OPERATION_FLOAT_NEON.
 */
#define NE10_MLAC_OPERATION_FLOAT_NEON(mainLoopCode, secondLoopCode)        \
    {                                                                       \
        float32x4_t n_acc;                                                  \
        float32x4_t n_cst = { cst, cst, cst, cst };                         \
        NE10_DstAccSrcCst_OPERATION_FLOAT_NEON(                             \
            NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; ,                     \
            NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(mainLoopCode); ,          \
            NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(secondLoopCode);        \
        );                                                                  \
    }
94 
/*
 * NEON wrapper for the "DstAccSrcCst" family operating on 2-float vectors.
 * Declares n_acc (left uninitialised here), then forwards the pointer
 * check, the main loop (wrapping mainLoopCode) and the second loop
 * (wrapping secondLoopCode) to NE10_DstAccSrcCst_OPERATION_VEC2F_NEON.
 */
#define NE10_MLAC_OPERATION_VEC2F_NEON(mainLoopCode, secondLoopCode)        \
    {                                                                       \
        float32x4_t n_acc;                                                  \
        NE10_DstAccSrcCst_OPERATION_VEC2F_NEON(                             \
            NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; ,                     \
            NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(mainLoopCode); ,          \
            NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(secondLoopCode);        \
        );                                                                  \
    }
103 
/*
 * NEON wrapper for the "DstAccSrcCst" family operating on 3-float vectors.
 * Declares three accumulator vectors (n_acc1, n_acc2, n_acc3; left
 * uninitialised here), then forwards the pointer check, the main loop
 * (wrapping mainLoopCode) and the second loop (wrapping secondLoopCode) to
 * NE10_DstAccSrcCst_OPERATION_VEC3F_NEON.
 */
#define NE10_MLAC_OPERATION_VEC3F_NEON(mainLoopCode, secondLoopCode)        \
    {                                                                       \
        float32x4_t n_acc1, n_acc2, n_acc3;                                 \
        NE10_DstAccSrcCst_OPERATION_VEC3F_NEON(                             \
            NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; ,                     \
            NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(mainLoopCode); ,          \
            NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(secondLoopCode);        \
        );                                                                  \
    }
112 
/*
 * NEON wrapper for the "DstAccSrcCst" family operating on 4-float vectors.
 * Declares n_acc (left uninitialised here); only a main loop is supplied,
 * so the pointer check and the main loop (wrapping loopBody) are forwarded
 * to NE10_DstAccSrcCst_OPERATION_VEC4F_NEON.
 */
#define NE10_MLAC_OPERATION_VEC4F_NEON(loopBody)                    \
    {                                                               \
        float32x4_t n_acc;                                          \
        NE10_DstAccSrcCst_OPERATION_VEC4F_NEON(                     \
            NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; ,             \
            NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopBody);        \
        );                                                          \
    }
120 
122 
/*
 * Plain-C wrapper for the "DstCst" family.
 * Expands to a braced block that passes NE10_TEMPLATE_XC_OPERATION_X_C the
 * DstCst pointer check followed by the caller-supplied loop body.
 */
#define NE10_SETC_OPERATION_X_C(loopBody)               \
    {                                                   \
        NE10_TEMPLATE_XC_OPERATION_X_C(                 \
            NE10_CHECKPOINTER_DstCst_OPERATION; ,       \
            loopBody);                                  \
    }
128 
/*
 * NEON wrapper for the "DstCst" family operating on floats.
 * Declares n_cst with all four lanes set to `cst`; `cst` must be visible
 * where the macro is expanded.  The pointer check, the main loop (wrapping
 * mainLoopCode) and the second loop (wrapping secondLoopCode) are then
 * handed to NE10_DstCst_OPERATION_FLOAT_NEON.
 */
#define NE10_SETC_OPERATION_FLOAT_NEON(mainLoopCode, secondLoopCode)    \
    {                                                                   \
        float32x4_t n_cst = { cst, cst, cst, cst };                     \
        NE10_DstCst_OPERATION_FLOAT_NEON(                               \
            NE10_CHECKPOINTER_DstCst_OPERATION; ,                       \
            NE10_DstCst_MAINLOOP_FLOAT_NEON(mainLoopCode); ,            \
            NE10_DstCst_SECONDLOOP_FLOAT_NEON(secondLoopCode);          \
        );                                                              \
    }
137 
/*
 * NEON wrapper for the "DstCst" family operating on 2-float vectors.
 * Forwards the pointer check, the main loop (wrapping mainLoopCode) and the
 * second loop (wrapping secondLoopCode) to NE10_DstCst_OPERATION_VEC2F_NEON
 * inside a braced block.
 */
#define NE10_SETC_OPERATION_VEC2F_NEON(mainLoopCode, secondLoopCode)    \
    {                                                                   \
        NE10_DstCst_OPERATION_VEC2F_NEON(                               \
            NE10_CHECKPOINTER_DstCst_OPERATION; ,                       \
            NE10_DstCst_MAINLOOP_VEC2F_NEON(mainLoopCode); ,            \
            NE10_DstCst_SECONDLOOP_VEC2F_NEON(secondLoopCode);          \
        );                                                              \
    }
145 
/*
 * NEON wrapper for the "DstCst" family operating on 3-float vectors.
 * This macro uses interleaving to boost the performance.
 * Forwards the pointer check, the main loop (wrapping mainLoopCode) and the
 * second loop (wrapping secondLoopCode) to NE10_DstCst_OPERATION_VEC3F_NEON
 * inside a braced block.
 */
#define NE10_SETC_OPERATION_VEC3F_NEON(mainLoopCode, secondLoopCode)    \
    {                                                                   \
        NE10_DstCst_OPERATION_VEC3F_NEON(                               \
            NE10_CHECKPOINTER_DstCst_OPERATION; ,                       \
            NE10_DstCst_MAINLOOP_VEC3F_NEON(mainLoopCode); ,            \
            NE10_DstCst_SECONDLOOP_VEC3F_NEON(secondLoopCode);          \
        );                                                              \
    }
154 
/*
 * NEON wrapper for the "DstCst" family operating on 4-float vectors.
 * Only a main loop is supplied: the pointer check and the main loop
 * (wrapping loopBody) are forwarded to NE10_DstCst_OPERATION_VEC4F_NEON.
 */
#define NE10_SETC_OPERATION_VEC4F_NEON(loopBody)                \
    {                                                           \
        NE10_DstCst_OPERATION_VEC4F_NEON(                       \
            NE10_CHECKPOINTER_DstCst_OPERATION; ,               \
            NE10_DstCst_MAINLOOP_VEC4F_NEON(loopBody);          \
        );                                                      \
    }
161 
163 
/*
 * Plain-C wrapper for the "DstSrc1Src2" family.
 * Expands to a braced block that passes NE10_TEMPLATE_XC_OPERATION_X_C the
 * DstSrc1Src2 pointer check followed by the caller-supplied loop body.
 */
#define NE10_X_OPERATION_FLOAT_C(loopBody)                  \
    {                                                       \
        NE10_TEMPLATE_XC_OPERATION_X_C(                     \
            NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; ,      \
            loopBody);                                      \
    }
169 
/*
 * NEON wrapper for the "DstSrc1Src2" family operating on floats.
 * Declares n_src2 (left uninitialised here), then forwards the pointer
 * check, the main loop (wrapping mainLoopCode) and the second loop
 * (wrapping secondLoopCode) to NE10_DstSrc1Src2_OPERATION_FLOAT_NEON.
 */
#define NE10_X_OPERATION_FLOAT_NEON(mainLoopCode, secondLoopCode)           \
    {                                                                       \
        float32x4_t n_src2;                                                 \
        NE10_DstSrc1Src2_OPERATION_FLOAT_NEON(                              \
            NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; ,                      \
            NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(mainLoopCode); ,           \
            NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(secondLoopCode);         \
        );                                                                  \
    }
178 
/* Alias: the C "dot" wrapper is the DstSrc1Src2 C wrapper under another name. */
#define NE10_DOT_OPERATION_X_C      NE10_X_OPERATION_FLOAT_C
180 
182 
/*
 * Plain-C wrapper for the "DstSrc" family.
 * Expands to a braced block that passes NE10_TEMPLATE_XC_OPERATION_X_C the
 * DstSrc pointer check followed by the caller-supplied loop body.
 */
#define NE10_ABS_OPERATION_X_C(loopBody)                \
    {                                                   \
        NE10_TEMPLATE_XC_OPERATION_X_C(                 \
            NE10_CHECKPOINTER_DstSrc_OPERATION,         \
            loopBody);                                  \
    }
188 
/* Alias: the float C "abs" wrapper is the generic DstSrc C wrapper. */
#define NE10_ABS_OPERATION_FLOAT_C  NE10_ABS_OPERATION_X_C
190 
/*
 * NEON wrapper for the "DstSrc" family operating on floats.
 * Declares a local `cst` fixed at 0.0f and n_cst with all four lanes set to
 * it, then forwards the pointer check, the main loop (wrapping
 * mainLoopCode) and the second loop (wrapping secondLoopCode) to
 * NE10_DstSrc_OPERATION_FLOAT_NEON.
 */
#define NE10_ABS_OPERATION_FLOAT_NEON(mainLoopCode, secondLoopCode)         \
    {                                                                       \
        arm_float_t cst = 0.0f; /* reference value to compare against */    \
        float32x4_t n_cst = { cst, cst, cst, cst };                         \
        NE10_DstSrc_OPERATION_FLOAT_NEON(                                   \
            NE10_CHECKPOINTER_DstSrc_OPERATION; ,                           \
            NE10_DstSrc_MAINLOOP_FLOAT_NEON(mainLoopCode); ,                \
            NE10_DstSrc_SECONDLOOP_FLOAT_NEON(secondLoopCode);              \
        );                                                                  \
    }
200 
/*
 * Aliases: the C "len" and "cmatvec" wrappers reuse the generic DstSrc C
 * wrapper unchanged.  Each alias is defined exactly once.
 */
#define NE10_LEN_OPERATION_X_C      NE10_ABS_OPERATION_X_C

#define NE10_CMATVEC_OPERATION_X_C  NE10_ABS_OPERATION_X_C
206 
/*
 * NEON wrapper for the "DstSrc" family operating on 2-float vectors.
 * Forwards the DstSrc pointer check, the main loop (wrapping mainLoopCode)
 * and the second loop (wrapping secondLoopCode) to
 * NE10_DstSrc_OPERATION_VEC2F_NEON inside a braced block.
 *
 * The pointer check is the DstSrc one, matching the DstSrc template family
 * expanded here and the check used by NE10_ABS_OPERATION_FLOAT_NEON.
 * NOTE(review): confirm in factor.h that NE10_CHECKPOINTER_DstSrc_OPERATION
 * performs the intended dst/src overlap check for this operation.
 */
#define NE10_LEN_OPERATION_VEC2F_NEON(mainLoopCode, secondLoopCode)     \
    {                                                                   \
        NE10_DstSrc_OPERATION_VEC2F_NEON(                               \
            NE10_CHECKPOINTER_DstSrc_OPERATION,                         \
            NE10_DstSrc_MAINLOOP_VEC2F_NEON(mainLoopCode),              \
            NE10_DstSrc_SECONDLOOP_VEC2F_NEON(secondLoopCode)           \
        );                                                              \
    }
214 
/*
 * NEON wrapper for the "DstSrc" family operating on 3-float vectors.
 * Forwards the DstSrc pointer check, the main loop (wrapping mainLoopCode)
 * and the second loop (wrapping secondLoopCode) to
 * NE10_DstSrc_OPERATION_VEC3F_NEON inside a braced block.
 *
 * The pointer check is the DstSrc one, matching the DstSrc template family
 * expanded here and the check used by NE10_ABS_OPERATION_FLOAT_NEON.
 * NOTE(review): confirm in factor.h that NE10_CHECKPOINTER_DstSrc_OPERATION
 * performs the intended dst/src overlap check for this operation.
 */
#define NE10_LEN_OPERATION_VEC3F_NEON(mainLoopCode, secondLoopCode)     \
    {                                                                   \
        NE10_DstSrc_OPERATION_VEC3F_NEON(                               \
            NE10_CHECKPOINTER_DstSrc_OPERATION,                         \
            NE10_DstSrc_MAINLOOP_VEC3F_NEON(mainLoopCode),              \
            NE10_DstSrc_SECONDLOOP_VEC3F_NEON(secondLoopCode)           \
        );                                                              \
    }
222 
/*
 * NEON wrapper for the "DstSrc" family operating on 4-float vectors.
 * Only a main loop is supplied: the DstSrc pointer check and the main loop
 * (wrapping loopBody) are forwarded to NE10_DstSrc_OPERATION_VEC4F_NEON.
 *
 * The pointer check is the DstSrc one, matching the DstSrc template family
 * expanded here and the check used by NE10_ABS_OPERATION_FLOAT_NEON.
 * NOTE(review): confirm in factor.h that NE10_CHECKPOINTER_DstSrc_OPERATION
 * performs the intended dst/src overlap check for this operation.
 */
#define NE10_LEN_OPERATION_VEC4F_NEON(loopBody)                 \
    {                                                           \
        NE10_DstSrc_OPERATION_VEC4F_NEON(                       \
            NE10_CHECKPOINTER_DstSrc_OPERATION,                 \
            NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopBody)           \
        );                                                      \
    }
229 
/* Alias: the C "detmat" wrapper is the generic DstSrc C wrapper. */
#define NE10_DETMAT_OPERATION_X_C   NE10_ABS_OPERATION_X_C
231 
233 
/*
 * NEON wrapper for the "DstAccSrc1Src2" family operating on floats.
 * Declares n_acc and n_src2 (both left uninitialised here), then forwards
 * the pointer check, the main loop (wrapping mainLoopCode) and the second
 * loop (wrapping secondLoopCode) to
 * NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON.
 */
#define NE10_MLA_OPERATION_FLOAT_NEON(mainLoopCode, secondLoopCode)         \
    {                                                                       \
        float32x4_t n_acc;                                                  \
        float32x4_t n_src2;                                                 \
        NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON(                           \
            NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION; ,                   \
            NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(mainLoopCode); ,        \
            NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(secondLoopCode);      \
        );                                                                  \
    }