// Copyright (c) the JPEG XL Project Authors. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #![allow(clippy::type_complexity)] #![allow(clippy::erasing_op)] #![allow(clippy::identity_op)] use jxl_simd::{F32SimdVec, SimdDescriptor}; #[allow(clippy::too_many_arguments)] #[allow(clippy::excessive_precision)] #[inline(always)] pub(super) fn reinterpreting_dct_32( d: D, v0: D::F32Vec, v1: D::F32Vec, v2: D::F32Vec, v3: D::F32Vec, v4: D::F32Vec, v5: D::F32Vec, v6: D::F32Vec, v7: D::F32Vec, v8: D::F32Vec, v9: D::F32Vec, v10: D::F32Vec, v11: D::F32Vec, v12: D::F32Vec, v13: D::F32Vec, v14: D::F32Vec, v15: D::F32Vec, v16: D::F32Vec, v17: D::F32Vec, v18: D::F32Vec, v19: D::F32Vec, v20: D::F32Vec, v21: D::F32Vec, v22: D::F32Vec, v23: D::F32Vec, v24: D::F32Vec, v25: D::F32Vec, v26: D::F32Vec, v27: D::F32Vec, v28: D::F32Vec, v29: D::F32Vec, v30: D::F32Vec, v31: D::F32Vec, ) -> ( D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, D::F32Vec, ) { let v32 = v0 + v31; let v33 = v1 + v30; let v34 = v2 + v29; let v35 = v3 + v28; let v36 = v4 + v27; let v37 = v5 + v26; let v38 = v6 + v25; let v39 = v7 + v24; let v40 = v8 + v23; let v41 = v9 + v22; let v42 = v10 + v21; let v43 = v11 + v20; let v44 = v12 + v19; let v45 = v13 + v18; let v46 = v14 + v17; let v47 = v15 + v16; let v48 = v32 + v47; let v49 = v33 + v46; let v50 = v34 + v45; let v51 = v35 + v44; let v52 = v36 + v43; let v53 = v37 + v42; let v54 = v38 + v41; let v55 = v39 + v40; let v56 = v48 + v55; let v57 = v49 + v54; let v58 = v50 + v53; let v59 = v51 + v52; let v60 = v56 + v59; let v61 = v57 + v58; let v62 = v60 + v61; let v63 = v60 - v61; let v64 = v56 - v59; let v65 = v57 - v58; let mul = D::F32Vec::splat(d, 0.5411961001461970); let v66 = v64 * mul; let mul = D::F32Vec::splat(d, 1.3065629648763764); let v67 = v65 * mul; let v68 = v66 + v67; let v69 = v66 - v67; let v70 = v68.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v69); let v71 = v48 - v55; let v72 = v49 - v54; let v73 = v50 - v53; let v74 = v51 - v52; let mul = D::F32Vec::splat(d, 0.5097955791041592); let v75 = v71 * mul; let mul = D::F32Vec::splat(d, 0.6013448869350453); let v76 = v72 * mul; let mul = D::F32Vec::splat(d, 0.8999762231364156); let v77 = v73 * mul; let mul = D::F32Vec::splat(d, 2.5629154477415055); let v78 = v74 * mul; let v79 = v75 + v78; let v80 = v76 + v77; let v81 = v79 + v80; let v82 = v79 - v80; let v83 = v75 - v78; let v84 = v76 - v77; let mul = D::F32Vec::splat(d, 0.5411961001461970); let v85 = v83 * mul; let mul = D::F32Vec::splat(d, 1.3065629648763764); let v86 = v84 * mul; let v87 = v85 + v86; let v88 = v85 - v86; let v89 = v87.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v88); let v90 = v81.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v89); let v91 = v89 + v82; let v92 = v82 + v88; let v93 = v32 - v47; let v94 = v33 - v46; let v95 = v34 - v45; let v96 = v35 - v44; let v97 = v36 - v43; let v98 = v37 - v42; let v99 = v38 - v41; let v100 = v39 - v40; let mul = D::F32Vec::splat(d, 0.5024192861881557); let v101 = v93 * mul; let mul = D::F32Vec::splat(d, 0.5224986149396889); let v102 = v94 * mul; let mul = D::F32Vec::splat(d, 0.5669440348163577); let v103 = v95 * mul; let mul = D::F32Vec::splat(d, 0.6468217833599901); let v104 = v96 * mul; let mul = D::F32Vec::splat(d, 0.7881546234512502); let v105 = v97 * mul; let mul = D::F32Vec::splat(d, 1.0606776859903471); let v106 = v98 * mul; let mul = D::F32Vec::splat(d, 1.7224470982383342); let v107 = v99 * mul; let mul = D::F32Vec::splat(d, 5.1011486186891553); let v108 = v100 * mul; let v109 = v101 + v108; let v110 = v102 + v107; let v111 = v103 + v106; let v112 = v104 + v105; let v113 = v109 + v112; let v114 = v110 + v111; let v115 = v113 + v114; let v116 = v113 - v114; let v117 = v109 - v112; let v118 = v110 - v111; let mul = D::F32Vec::splat(d, 0.5411961001461970); let v119 = v117 * mul; let mul = D::F32Vec::splat(d, 1.3065629648763764); let v120 = v118 * mul; let v121 = v119 + v120; let v122 = v119 - v120; let v123 = v121.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v122); let v124 = v101 - v108; let v125 = v102 - v107; let v126 = v103 - v106; let v127 = v104 - v105; let mul = D::F32Vec::splat(d, 0.5097955791041592); let v128 = v124 * mul; let mul = D::F32Vec::splat(d, 0.6013448869350453); let v129 = v125 * mul; let mul = D::F32Vec::splat(d, 0.8999762231364156); let v130 = v126 * mul; let mul = D::F32Vec::splat(d, 2.5629154477415055); let v131 = v127 * mul; let v132 = v128 + v131; let v133 = v129 + v130; let v134 = v132 + v133; let v135 = v132 - v133; let v136 = v128 - v131; let v137 = v129 - v130; let mul = D::F32Vec::splat(d, 0.5411961001461970); let v138 = v136 * mul; let mul = D::F32Vec::splat(d, 1.3065629648763764); let v139 = v137 * mul; let v140 = v138 + v139; let v141 = v138 - v139; let v142 = v140.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v141); let v143 = v134.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v142); let v144 = v142 + v135; let v145 = v135 + v141; let v146 = v115.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v143); let v147 = v143 + v123; let v148 = v123 + v144; let v149 = v144 + v116; let v150 = v116 + v145; let v151 = v145 + v122; let v152 = v122 + v141; let v153 = v0 - v31; let v154 = v1 - v30; let v155 = v2 - v29; let v156 = v3 - v28; let v157 = v4 - v27; let v158 = v5 - v26; let v159 = v6 - v25; let v160 = v7 - v24; let v161 = v8 - v23; let v162 = v9 - v22; let v163 = v10 - v21; let v164 = v11 - v20; let v165 = v12 - v19; let v166 = v13 - v18; let v167 = v14 - v17; let v168 = v15 - v16; let mul = D::F32Vec::splat(d, 0.5006029982351963); let v169 = v153 * mul; let mul = D::F32Vec::splat(d, 0.5054709598975436); let v170 = v154 * mul; let mul = D::F32Vec::splat(d, 0.5154473099226246); let v171 = v155 * mul; let mul = D::F32Vec::splat(d, 0.5310425910897841); let v172 = v156 * mul; let mul = D::F32Vec::splat(d, 0.5531038960344445); let v173 = v157 * mul; let mul = D::F32Vec::splat(d, 0.5829349682061339); let v174 = v158 * mul; let mul = D::F32Vec::splat(d, 0.6225041230356648); let v175 = v159 * mul; let mul = D::F32Vec::splat(d, 0.6748083414550057); let v176 = v160 * mul; let mul = D::F32Vec::splat(d, 0.7445362710022986); let v177 = v161 * mul; let mul = D::F32Vec::splat(d, 0.8393496454155268); let v178 = v162 * mul; let mul = D::F32Vec::splat(d, 0.9725682378619608); let v179 = v163 * mul; let mul = D::F32Vec::splat(d, 1.1694399334328847); let v180 = v164 * mul; let mul = D::F32Vec::splat(d, 1.4841646163141662); let v181 = v165 * mul; let mul = D::F32Vec::splat(d, 2.0577810099534108); let v182 = v166 * mul; let mul = D::F32Vec::splat(d, 3.4076084184687190); let v183 = v167 * mul; let mul = D::F32Vec::splat(d, 10.1900081235480329); let v184 = v168 * mul; let v185 = v169 + v184; let v186 = v170 + v183; let v187 = v171 + v182; let v188 = v172 + v181; let v189 = v173 + v180; let v190 = v174 + v179; let v191 = v175 + v178; let v192 = v176 + v177; let v193 = v185 + v192; let v194 = v186 + v191; let v195 = v187 + v190; let v196 = v188 + v189; let v197 = v193 + v196; let v198 = v194 + v195; let v199 = v197 + v198; let v200 = v197 - v198; let v201 = v193 - v196; let v202 = v194 - v195; let mul = D::F32Vec::splat(d, 0.5411961001461970); let v203 = v201 * mul; let mul = D::F32Vec::splat(d, 1.3065629648763764); let v204 = v202 * mul; let v205 = v203 + v204; let v206 = v203 - v204; let v207 = v205.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v206); let v208 = v185 - v192; let v209 = v186 - v191; let v210 = v187 - v190; let v211 = v188 - v189; let mul = D::F32Vec::splat(d, 0.5097955791041592); let v212 = v208 * mul; let mul = D::F32Vec::splat(d, 0.6013448869350453); let v213 = v209 * mul; let mul = D::F32Vec::splat(d, 0.8999762231364156); let v214 = v210 * mul; let mul = D::F32Vec::splat(d, 2.5629154477415055); let v215 = v211 * mul; let v216 = v212 + v215; let v217 = v213 + v214; let v218 = v216 + v217; let v219 = v216 - v217; let v220 = v212 - v215; let v221 = v213 - v214; let mul = D::F32Vec::splat(d, 0.5411961001461970); let v222 = v220 * mul; let mul = D::F32Vec::splat(d, 1.3065629648763764); let v223 = v221 * mul; let v224 = v222 + v223; let v225 = v222 - v223; let v226 = v224.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v225); let v227 = v218.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v226); let v228 = v226 + v219; let v229 = v219 + v225; let v230 = v169 - v184; let v231 = v170 - v183; let v232 = v171 - v182; let v233 = v172 - v181; let v234 = v173 - v180; let v235 = v174 - v179; let v236 = v175 - v178; let v237 = v176 - v177; let mul = D::F32Vec::splat(d, 0.5024192861881557); let v238 = v230 * mul; let mul = D::F32Vec::splat(d, 0.5224986149396889); let v239 = v231 * mul; let mul = D::F32Vec::splat(d, 0.5669440348163577); let v240 = v232 * mul; let mul = D::F32Vec::splat(d, 0.6468217833599901); let v241 = v233 * mul; let mul = D::F32Vec::splat(d, 0.7881546234512502); let v242 = v234 * mul; let mul = D::F32Vec::splat(d, 1.0606776859903471); let v243 = v235 * mul; let mul = D::F32Vec::splat(d, 1.7224470982383342); let v244 = v236 * mul; let mul = D::F32Vec::splat(d, 5.1011486186891553); let v245 = v237 * mul; let v246 = v238 + v245; let v247 = v239 + v244; let v248 = v240 + v243; let v249 = v241 + v242; let v250 = v246 + v249; let v251 = v247 + v248; let v252 = v250 + v251; let v253 = v250 - v251; let v254 = v246 - v249; let v255 = v247 - v248; let mul = D::F32Vec::splat(d, 0.5411961001461970); let v256 = v254 * mul; let mul = D::F32Vec::splat(d, 1.3065629648763764); let v257 = v255 * mul; let v258 = v256 + v257; let v259 = v256 - v257; let v260 = v258.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v259); let v261 = v238 - v245; let v262 = v239 - v244; let v263 = v240 - v243; let v264 = v241 - v242; let mul = D::F32Vec::splat(d, 0.5097955791041592); let v265 = v261 * mul; let mul = D::F32Vec::splat(d, 0.6013448869350453); let v266 = v262 * mul; let mul = D::F32Vec::splat(d, 0.8999762231364156); let v267 = v263 * mul; let mul = D::F32Vec::splat(d, 2.5629154477415055); let v268 = v264 * mul; let v269 = v265 + v268; let v270 = v266 + v267; let v271 = v269 + v270; let v272 = v269 - v270; let v273 = v265 - v268; let v274 = v266 - v267; let mul = D::F32Vec::splat(d, 0.5411961001461970); let v275 = v273 * mul; let mul = D::F32Vec::splat(d, 1.3065629648763764); let v276 = v274 * mul; let v277 = v275 + v276; let v278 = v275 - v276; let v279 = v277.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v278); let v280 = v271.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v279); let v281 = v279 + v272; let v282 = v272 + v278; let v283 = v252.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v280); let v284 = v280 + v260; let v285 = v260 + v281; let v286 = v281 + v253; let v287 = v253 + v282; let v288 = v282 + v259; let v289 = v259 + v278; let v290 = v199.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v283); let v291 = v283 + v227; let v292 = v227 + v284; let v293 = v284 + v207; let v294 = v207 + v285; let v295 = v285 + v228; let v296 = v228 + v286; let v297 = v286 + v200; let v298 = v200 + v287; let v299 = v287 + v229; let v300 = v229 + v288; let v301 = v288 + v206; let v302 = v206 + v289; let v303 = v289 + v225; let v304 = v225 + v278; ( v62 * D::F32Vec::splat(d, 0.031250), v290 * D::F32Vec::splat(d, 0.031262), v146 * D::F32Vec::splat(d, 0.031299), v291 * D::F32Vec::splat(d, 0.031361), v90 * D::F32Vec::splat(d, 0.031449), v292 * D::F32Vec::splat(d, 0.031561), v147 * D::F32Vec::splat(d, 0.031699), v293 * D::F32Vec::splat(d, 0.031864), v70 * D::F32Vec::splat(d, 0.032055), v294 * D::F32Vec::splat(d, 0.032274), v148 * D::F32Vec::splat(d, 0.032521), v295 * D::F32Vec::splat(d, 0.032797), v91 * D::F32Vec::splat(d, 0.033103), v296 * D::F32Vec::splat(d, 0.033441), v149 * D::F32Vec::splat(d, 0.033811), v297 * D::F32Vec::splat(d, 0.034215), v63 * D::F32Vec::splat(d, 0.034654), v298 * D::F32Vec::splat(d, 0.035131), v150 * D::F32Vec::splat(d, 0.035647), v299 * D::F32Vec::splat(d, 0.036204), v92 * D::F32Vec::splat(d, 0.036806), v300 * D::F32Vec::splat(d, 0.037453), v151 * D::F32Vec::splat(d, 0.038150), v301 * D::F32Vec::splat(d, 0.038899), v69 * D::F32Vec::splat(d, 0.039705), v302 * D::F32Vec::splat(d, 0.040571), v152 * D::F32Vec::splat(d, 0.041502), v303 * D::F32Vec::splat(d, 0.042502), v88 * D::F32Vec::splat(d, 0.043578), v304 * D::F32Vec::splat(d, 0.044735), v141 * D::F32Vec::splat(d, 0.045981), v278 * D::F32Vec::splat(d, 0.047324), ) } #[inline(always)] pub(super) fn do_reinterpreting_dct_32( d: D, data: &mut [::UnderlyingArray], stride: usize, ) { assert!(data.len() > 31 * stride); let mut v0 = D::F32Vec::load_array(d, &data[0 * stride]); let mut v1 = D::F32Vec::load_array(d, &data[1 * stride]); let mut v2 = D::F32Vec::load_array(d, &data[2 * stride]); let mut v3 = D::F32Vec::load_array(d, &data[3 * stride]); let mut v4 = D::F32Vec::load_array(d, &data[4 * stride]); let mut v5 = D::F32Vec::load_array(d, &data[5 * stride]); let mut v6 = D::F32Vec::load_array(d, &data[6 * stride]); let mut v7 = D::F32Vec::load_array(d, &data[7 * stride]); let mut v8 = D::F32Vec::load_array(d, &data[8 * stride]); let mut v9 = D::F32Vec::load_array(d, &data[9 * stride]); let mut v10 = D::F32Vec::load_array(d, &data[10 * stride]); let mut v11 = D::F32Vec::load_array(d, &data[11 * stride]); let mut v12 = D::F32Vec::load_array(d, &data[12 * stride]); let mut v13 = D::F32Vec::load_array(d, &data[13 * stride]); let mut v14 = D::F32Vec::load_array(d, &data[14 * stride]); let mut v15 = D::F32Vec::load_array(d, &data[15 * stride]); let mut v16 = D::F32Vec::load_array(d, &data[16 * stride]); let mut v17 = D::F32Vec::load_array(d, &data[17 * stride]); let mut v18 = D::F32Vec::load_array(d, &data[18 * stride]); let mut v19 = D::F32Vec::load_array(d, &data[19 * stride]); let mut v20 = D::F32Vec::load_array(d, &data[20 * stride]); let mut v21 = D::F32Vec::load_array(d, &data[21 * stride]); let mut v22 = D::F32Vec::load_array(d, &data[22 * stride]); let mut v23 = D::F32Vec::load_array(d, &data[23 * stride]); let mut v24 = D::F32Vec::load_array(d, &data[24 * stride]); let mut v25 = D::F32Vec::load_array(d, &data[25 * stride]); let mut v26 = D::F32Vec::load_array(d, &data[26 * stride]); let mut v27 = D::F32Vec::load_array(d, &data[27 * stride]); let mut v28 = D::F32Vec::load_array(d, &data[28 * stride]); let mut v29 = D::F32Vec::load_array(d, &data[29 * stride]); let mut v30 = D::F32Vec::load_array(d, &data[30 * stride]); let mut v31 = D::F32Vec::load_array(d, &data[31 * stride]); ( v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, ) = reinterpreting_dct_32( d, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, ); v0.store_array(&mut data[0 * stride]); v1.store_array(&mut data[1 * stride]); v2.store_array(&mut data[2 * stride]); v3.store_array(&mut data[3 * stride]); v4.store_array(&mut data[4 * stride]); v5.store_array(&mut data[5 * stride]); v6.store_array(&mut data[6 * stride]); v7.store_array(&mut data[7 * stride]); v8.store_array(&mut data[8 * stride]); v9.store_array(&mut data[9 * stride]); v10.store_array(&mut data[10 * stride]); v11.store_array(&mut data[11 * stride]); v12.store_array(&mut data[12 * stride]); v13.store_array(&mut data[13 * stride]); v14.store_array(&mut data[14 * stride]); v15.store_array(&mut data[15 * stride]); v16.store_array(&mut data[16 * stride]); v17.store_array(&mut data[17 * stride]); v18.store_array(&mut data[18 * stride]); v19.store_array(&mut data[19 * stride]); v20.store_array(&mut data[20 * stride]); v21.store_array(&mut data[21 * stride]); v22.store_array(&mut data[22 * stride]); v23.store_array(&mut data[23 * stride]); v24.store_array(&mut data[24 * stride]); v25.store_array(&mut data[25 * stride]); v26.store_array(&mut data[26 * stride]); v27.store_array(&mut data[27 * stride]); v28.store_array(&mut data[28 * stride]); v29.store_array(&mut data[29 * stride]); v30.store_array(&mut data[30 * stride]); v31.store_array(&mut data[31 * stride]); } #[inline(always)] pub(super) fn do_reinterpreting_dct_32_rowblock( d: D, data: &mut [::UnderlyingArray], ) { assert!(data.len() >= 32); const { assert!(32usize.is_multiple_of(D::F32Vec::LEN)) }; let row_stride = 32 / D::F32Vec::LEN; let mut v0 = D::F32Vec::load_array( d, &data[row_stride * (0 % D::F32Vec::LEN) + (0 / D::F32Vec::LEN)], ); let mut v1 = D::F32Vec::load_array( d, &data[row_stride * (1 % D::F32Vec::LEN) + (1 / D::F32Vec::LEN)], ); let mut v2 = D::F32Vec::load_array( d, &data[row_stride * (2 % D::F32Vec::LEN) + (2 / D::F32Vec::LEN)], ); let mut v3 = D::F32Vec::load_array( d, &data[row_stride * (3 % D::F32Vec::LEN) + (3 / D::F32Vec::LEN)], ); let mut v4 = D::F32Vec::load_array( d, &data[row_stride * (4 % D::F32Vec::LEN) + (4 / D::F32Vec::LEN)], ); let mut v5 = D::F32Vec::load_array( d, &data[row_stride * (5 % D::F32Vec::LEN) + (5 / D::F32Vec::LEN)], ); let mut v6 = D::F32Vec::load_array( d, &data[row_stride * (6 % D::F32Vec::LEN) + (6 / D::F32Vec::LEN)], ); let mut v7 = D::F32Vec::load_array( d, &data[row_stride * (7 % D::F32Vec::LEN) + (7 / D::F32Vec::LEN)], ); let mut v8 = D::F32Vec::load_array( d, &data[row_stride * (8 % D::F32Vec::LEN) + (8 / D::F32Vec::LEN)], ); let mut v9 = D::F32Vec::load_array( d, &data[row_stride * (9 % D::F32Vec::LEN) + (9 / D::F32Vec::LEN)], ); let mut v10 = D::F32Vec::load_array( d, &data[row_stride * (10 % D::F32Vec::LEN) + (10 / D::F32Vec::LEN)], ); let mut v11 = D::F32Vec::load_array( d, &data[row_stride * (11 % D::F32Vec::LEN) + (11 / D::F32Vec::LEN)], ); let mut v12 = D::F32Vec::load_array( d, &data[row_stride * (12 % D::F32Vec::LEN) + (12 / D::F32Vec::LEN)], ); let mut v13 = D::F32Vec::load_array( d, &data[row_stride * (13 % D::F32Vec::LEN) + (13 / D::F32Vec::LEN)], ); let mut v14 = D::F32Vec::load_array( d, &data[row_stride * (14 % D::F32Vec::LEN) + (14 / D::F32Vec::LEN)], ); let mut v15 = D::F32Vec::load_array( d, &data[row_stride * (15 % D::F32Vec::LEN) + (15 / D::F32Vec::LEN)], ); let mut v16 = D::F32Vec::load_array( d, &data[row_stride * (16 % D::F32Vec::LEN) + (16 / D::F32Vec::LEN)], ); let mut v17 = D::F32Vec::load_array( d, &data[row_stride * (17 % D::F32Vec::LEN) + (17 / D::F32Vec::LEN)], ); let mut v18 = D::F32Vec::load_array( d, &data[row_stride * (18 % D::F32Vec::LEN) + (18 / D::F32Vec::LEN)], ); let mut v19 = D::F32Vec::load_array( d, &data[row_stride * (19 % D::F32Vec::LEN) + (19 / D::F32Vec::LEN)], ); let mut v20 = D::F32Vec::load_array( d, &data[row_stride * (20 % D::F32Vec::LEN) + (20 / D::F32Vec::LEN)], ); let mut v21 = D::F32Vec::load_array( d, &data[row_stride * (21 % D::F32Vec::LEN) + (21 / D::F32Vec::LEN)], ); let mut v22 = D::F32Vec::load_array( d, &data[row_stride * (22 % D::F32Vec::LEN) + (22 / D::F32Vec::LEN)], ); let mut v23 = D::F32Vec::load_array( d, &data[row_stride * (23 % D::F32Vec::LEN) + (23 / D::F32Vec::LEN)], ); let mut v24 = D::F32Vec::load_array( d, &data[row_stride * (24 % D::F32Vec::LEN) + (24 / D::F32Vec::LEN)], ); let mut v25 = D::F32Vec::load_array( d, &data[row_stride * (25 % D::F32Vec::LEN) + (25 / D::F32Vec::LEN)], ); let mut v26 = D::F32Vec::load_array( d, &data[row_stride * (26 % D::F32Vec::LEN) + (26 / D::F32Vec::LEN)], ); let mut v27 = D::F32Vec::load_array( d, &data[row_stride * (27 % D::F32Vec::LEN) + (27 / D::F32Vec::LEN)], ); let mut v28 = D::F32Vec::load_array( d, &data[row_stride * (28 % D::F32Vec::LEN) + (28 / D::F32Vec::LEN)], ); let mut v29 = D::F32Vec::load_array( d, &data[row_stride * (29 % D::F32Vec::LEN) + (29 / D::F32Vec::LEN)], ); let mut v30 = D::F32Vec::load_array( d, &data[row_stride * (30 % D::F32Vec::LEN) + (30 / D::F32Vec::LEN)], ); let mut v31 = D::F32Vec::load_array( d, &data[row_stride * (31 % D::F32Vec::LEN) + (31 / D::F32Vec::LEN)], ); ( v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, ) = reinterpreting_dct_32( d, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, ); v0.store_array(&mut data[row_stride * (0 % D::F32Vec::LEN) + (0 / D::F32Vec::LEN)]); v1.store_array(&mut data[row_stride * (1 % D::F32Vec::LEN) + (1 / D::F32Vec::LEN)]); v2.store_array(&mut data[row_stride * (2 % D::F32Vec::LEN) + (2 / D::F32Vec::LEN)]); v3.store_array(&mut data[row_stride * (3 % D::F32Vec::LEN) + (3 / D::F32Vec::LEN)]); v4.store_array(&mut data[row_stride * (4 % D::F32Vec::LEN) + (4 / D::F32Vec::LEN)]); v5.store_array(&mut data[row_stride * (5 % D::F32Vec::LEN) + (5 / D::F32Vec::LEN)]); v6.store_array(&mut data[row_stride * (6 % D::F32Vec::LEN) + (6 / D::F32Vec::LEN)]); v7.store_array(&mut data[row_stride * (7 % D::F32Vec::LEN) + (7 / D::F32Vec::LEN)]); v8.store_array(&mut data[row_stride * (8 % D::F32Vec::LEN) + (8 / D::F32Vec::LEN)]); v9.store_array(&mut data[row_stride * (9 % D::F32Vec::LEN) + (9 / D::F32Vec::LEN)]); v10.store_array(&mut data[row_stride * (10 % D::F32Vec::LEN) + (10 / D::F32Vec::LEN)]); v11.store_array(&mut data[row_stride * (11 % D::F32Vec::LEN) + (11 / D::F32Vec::LEN)]); v12.store_array(&mut data[row_stride * (12 % D::F32Vec::LEN) + (12 / D::F32Vec::LEN)]); v13.store_array(&mut data[row_stride * (13 % D::F32Vec::LEN) + (13 / D::F32Vec::LEN)]); v14.store_array(&mut data[row_stride * (14 % D::F32Vec::LEN) + (14 / D::F32Vec::LEN)]); v15.store_array(&mut data[row_stride * (15 % D::F32Vec::LEN) + (15 / D::F32Vec::LEN)]); v16.store_array(&mut data[row_stride * (16 % D::F32Vec::LEN) + (16 / D::F32Vec::LEN)]); v17.store_array(&mut data[row_stride * (17 % D::F32Vec::LEN) + (17 / D::F32Vec::LEN)]); v18.store_array(&mut data[row_stride * (18 % D::F32Vec::LEN) + (18 / D::F32Vec::LEN)]); v19.store_array(&mut data[row_stride * (19 % D::F32Vec::LEN) + (19 / D::F32Vec::LEN)]); v20.store_array(&mut data[row_stride * (20 % D::F32Vec::LEN) + (20 / D::F32Vec::LEN)]); v21.store_array(&mut data[row_stride * (21 % D::F32Vec::LEN) + (21 / D::F32Vec::LEN)]); v22.store_array(&mut data[row_stride * (22 % D::F32Vec::LEN) + (22 / D::F32Vec::LEN)]); v23.store_array(&mut data[row_stride * (23 % D::F32Vec::LEN) + (23 / D::F32Vec::LEN)]); v24.store_array(&mut data[row_stride * (24 % D::F32Vec::LEN) + (24 / D::F32Vec::LEN)]); v25.store_array(&mut data[row_stride * (25 % D::F32Vec::LEN) + (25 / D::F32Vec::LEN)]); v26.store_array(&mut data[row_stride * (26 % D::F32Vec::LEN) + (26 / D::F32Vec::LEN)]); v27.store_array(&mut data[row_stride * (27 % D::F32Vec::LEN) + (27 / D::F32Vec::LEN)]); v28.store_array(&mut data[row_stride * (28 % D::F32Vec::LEN) + (28 / D::F32Vec::LEN)]); v29.store_array(&mut data[row_stride * (29 % D::F32Vec::LEN) + (29 / D::F32Vec::LEN)]); v30.store_array(&mut data[row_stride * (30 % D::F32Vec::LEN) + (30 / D::F32Vec::LEN)]); v31.store_array(&mut data[row_stride * (31 % D::F32Vec::LEN) + (31 / D::F32Vec::LEN)]); } #[inline(always)] pub(super) fn do_reinterpreting_dct_32_trh( d: D, data: &mut [::UnderlyingArray], ) { let row_stride = 16 / D::F32Vec::LEN; assert!(data.len() > 31 * row_stride); const { assert!(16usize.is_multiple_of(D::F32Vec::LEN)) }; let mut v0 = D::F32Vec::load_array(d, &data[row_stride * 0]); let mut v1 = D::F32Vec::load_array(d, &data[row_stride * 1]); let mut v2 = D::F32Vec::load_array(d, &data[row_stride * 2]); let mut v3 = D::F32Vec::load_array(d, &data[row_stride * 3]); let mut v4 = D::F32Vec::load_array(d, &data[row_stride * 4]); let mut v5 = D::F32Vec::load_array(d, &data[row_stride * 5]); let mut v6 = D::F32Vec::load_array(d, &data[row_stride * 6]); let mut v7 = D::F32Vec::load_array(d, &data[row_stride * 7]); let mut v8 = D::F32Vec::load_array(d, &data[row_stride * 8]); let mut v9 = D::F32Vec::load_array(d, &data[row_stride * 9]); let mut v10 = D::F32Vec::load_array(d, &data[row_stride * 10]); let mut v11 = D::F32Vec::load_array(d, &data[row_stride * 11]); let mut v12 = D::F32Vec::load_array(d, &data[row_stride * 12]); let mut v13 = D::F32Vec::load_array(d, &data[row_stride * 13]); let mut v14 = D::F32Vec::load_array(d, &data[row_stride * 14]); let mut v15 = D::F32Vec::load_array(d, &data[row_stride * 15]); let mut v16 = D::F32Vec::load_array(d, &data[row_stride * 16]); let mut v17 = D::F32Vec::load_array(d, &data[row_stride * 17]); let mut v18 = D::F32Vec::load_array(d, &data[row_stride * 18]); let mut v19 = D::F32Vec::load_array(d, &data[row_stride * 19]); let mut v20 = D::F32Vec::load_array(d, &data[row_stride * 20]); let mut v21 = D::F32Vec::load_array(d, &data[row_stride * 21]); let mut v22 = D::F32Vec::load_array(d, &data[row_stride * 22]); let mut v23 = D::F32Vec::load_array(d, &data[row_stride * 23]); let mut v24 = D::F32Vec::load_array(d, &data[row_stride * 24]); let mut v25 = D::F32Vec::load_array(d, &data[row_stride * 25]); let mut v26 = D::F32Vec::load_array(d, &data[row_stride * 26]); let mut v27 = D::F32Vec::load_array(d, &data[row_stride * 27]); let mut v28 = D::F32Vec::load_array(d, &data[row_stride * 28]); let mut v29 = D::F32Vec::load_array(d, &data[row_stride * 29]); let mut v30 = D::F32Vec::load_array(d, &data[row_stride * 30]); let mut v31 = D::F32Vec::load_array(d, &data[row_stride * 31]); ( v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, ) = reinterpreting_dct_32( d, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, ); v0.store_array(&mut data[row_stride * 0]); v16.store_array(&mut data[row_stride * 1]); v1.store_array(&mut data[row_stride * 2]); v17.store_array(&mut data[row_stride * 3]); v2.store_array(&mut data[row_stride * 4]); v18.store_array(&mut data[row_stride * 5]); v3.store_array(&mut data[row_stride * 6]); v19.store_array(&mut data[row_stride * 7]); v4.store_array(&mut data[row_stride * 8]); v20.store_array(&mut data[row_stride * 9]); v5.store_array(&mut data[row_stride * 10]); v21.store_array(&mut data[row_stride * 11]); v6.store_array(&mut data[row_stride * 12]); v22.store_array(&mut data[row_stride * 13]); v7.store_array(&mut data[row_stride * 14]); v23.store_array(&mut data[row_stride * 15]); v8.store_array(&mut data[row_stride * 16]); v24.store_array(&mut data[row_stride * 17]); v9.store_array(&mut data[row_stride * 18]); v25.store_array(&mut data[row_stride * 19]); v10.store_array(&mut data[row_stride * 20]); v26.store_array(&mut data[row_stride * 21]); v11.store_array(&mut data[row_stride * 22]); v27.store_array(&mut data[row_stride * 23]); v12.store_array(&mut data[row_stride * 24]); v28.store_array(&mut data[row_stride * 25]); v13.store_array(&mut data[row_stride * 26]); v29.store_array(&mut data[row_stride * 27]); v14.store_array(&mut data[row_stride * 28]); v30.store_array(&mut data[row_stride * 29]); v15.store_array(&mut data[row_stride * 30]); v31.store_array(&mut data[row_stride * 31]); }