  1. // Copyright (C) 2004-2024 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. /* This file is included from deskew.c if SSE cores are allowed. */
  23. #include <emmintrin.h>
  24. #include <smmintrin.h>
/* Horizontally scale one row of 1-channel (greyscale) pixels using SSE.
 *
 * tmp      output row; dst_w bytes are written.
 * src      input row of src_w single-byte pixels.
 * index    one entry per output pixel: first source pixel of the filter
 *          footprint, tap count (n), offset into the weights table, and
 *          a 'slow' flag marking pixels whose footprint may touch either
 *          edge of the row.
 * weights  fixed-point filter weights (WEIGHT_SHIFT fractional bits).
 * dst_w    number of output pixels to produce.
 * src_w    number of input pixels available.
 * channels unused by this core (greyscale, so effectively 1).
 * bg       background pixel value used for taps that fall outside the row.
 */
static void
zoom_x1_sse(uint8_t * FZ_RESTRICT tmp,
	const uint8_t * FZ_RESTRICT src,
	const index_t * FZ_RESTRICT index,
	const int32_t * FZ_RESTRICT weights,
	uint32_t dst_w,
	uint32_t src_w,
	uint32_t channels,
	const uint8_t * FZ_RESTRICT bg)
{
	__m128i round = _mm_set1_epi32(WEIGHT_ROUND);

	/* This block is only ever entered via 'goto slow' from the main loop
	 * below; the if (0) stops us falling into it on function entry. When
	 * it finishes, control drops straight back into the main loop. */
	if (0)
	slow:
	{
		/* Do any where we might index off the edge of the source */
		int pix_num = index->first_pixel;
		const uint8_t *s = &src[pix_num];
		const int32_t *w = &weights[index->index];
		uint32_t j = index->n;
		int32_t pixel0 = WEIGHT_ROUND;
		if (pix_num < 0)
		{
			/* Leading tap is off the left edge; take it from bg. */
			int32_t wt = *w++;
			assert(pix_num == -1);
			pixel0 += bg[0] * wt;
			s++;
			j--;
			pix_num = 0;
		}
		/* pix_num = number of taps that can safely read from the row. */
		pix_num = (int)src_w - pix_num;
		if (pix_num > (int)j)
			pix_num = j;
		j -= pix_num;
		while (pix_num > 0)
		{
			pixel0 += *s++ * *w++;
			pix_num--;
		}
		if (j > 0)
		{
			/* Trailing tap is off the right edge; take it from bg. */
			assert(j == 1);
			pixel0 += bg[0] * *w;
		}
		pixel0 >>= WEIGHT_SHIFT;
		*tmp++ = CLAMP(pixel0, 0, 255);
		index++;
		dst_w--;
	}
	while (dst_w > 0)
	{
		const uint8_t *s;
		uint32_t j;
		const int32_t *w;
		/* Jump out of band to do the (rare) slow (edge) pixels */
		if (index->slow)
			goto slow;
		s = &src[index->first_pixel];
		j = index->n;
		w = &weights[index->index];
		if (j <= 4)
		{
			/* Up to 4 taps in a single multiply/accumulate pass.
			 * NOTE(review): this reads 16 bytes from s and loads the
			 * weights with an aligned load; assumes the weight table is
			 * 16-byte aligned and zero-padded past the j real taps -
			 * confirm against the table builder. */
			__m128i mw0, mm0;
			mw0 = _mm_load_si128((const __m128i *)w);
			mm0 = _mm_loadu_si128((const __m128i *)s);
			// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
			mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
			mm0 = _mm_mullo_epi32(mm0,mw0); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
			mm0 = _mm_hadd_epi32(mm0,mm0);
			mm0 = _mm_hadd_epi32(mm0,mm0);
			mm0 = _mm_add_epi32(mm0, round);// Add round
			/* Shift by WEIGHT_SHIFT-8 then extract byte 1 below: together
			 * this is a shift by WEIGHT_SHIFT, with packus clamping. */
			mm0 = _mm_srai_epi32(mm0, WEIGHT_SHIFT-8); // Shift down
			mm0 = _mm_packus_epi32(mm0,mm0);// Clamp to 0 to 65535 range.
			*tmp++ = _mm_extract_epi8(mm0,1);
		}
		else if (j <= 8)
		{
			/* 5..8 taps: two multiply passes summed together. */
			__m128i mw0, mw1, mm0, mm1;
			mw0 = _mm_load_si128((const __m128i *)w);
			mm0 = _mm_loadu_si128((const __m128i *)s);
			// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
			mw1 = _mm_load_si128((const __m128i *)(w+4));
			mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
			mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
			mm1 = _mm_mullo_epi32(mm1,mw0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
			mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000hh000000gg000000ff000000ee SSE4.1
			mm0 = _mm_mullo_epi32(mm0,mw1); // mm1 = 0000whwh0000wgwg0000wfwf0000wewe SSE4.1
			mm1 = _mm_add_epi32(mm1, mm0);
			mm1 = _mm_hadd_epi32(mm1,mm1);
			mm1 = _mm_hadd_epi32(mm1,mm1);
			mm1 = _mm_add_epi32(mm1, round); // Add round
			mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT-8); // Shift down
			mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range.
			*tmp++ = _mm_extract_epi8(mm1,1);
		}
		else
		{
			/* More than 8 taps: plain scalar accumulation. */
			int32_t pixel0 = WEIGHT_ROUND;
			for (j = index->n; j > 0; j--)
			{
				pixel0 += *s++ * *w++;
			}
			pixel0 >>= WEIGHT_SHIFT;
			*tmp++ = CLAMP(pixel0, 0, 255);
		}
		index++;
		dst_w--;
	}
}
  133. static void
  134. zoom_x3_sse(uint8_t * FZ_RESTRICT tmp,
  135. const uint8_t * FZ_RESTRICT src,
  136. const index_t * FZ_RESTRICT index,
  137. const int32_t * FZ_RESTRICT weights,
  138. uint32_t dst_w,
  139. uint32_t src_w,
  140. uint32_t channels,
  141. const uint8_t * FZ_RESTRICT bg)
  142. {
  143. __m128i round = _mm_set1_epi32(WEIGHT_ROUND);
  144. if (0)
  145. slow:
  146. {
  147. /* Do any where we might index off the edge of the source */
  148. int pix_num = index->first_pixel;
  149. const uint8_t *s = &src[pix_num * 3];
  150. const int32_t *w = &weights[index->index];
  151. uint32_t j = index->n;
  152. int32_t pixel0 = WEIGHT_ROUND;
  153. int32_t pixel1 = WEIGHT_ROUND;
  154. int32_t pixel2 = WEIGHT_ROUND;
  155. if (pix_num < 0)
  156. {
  157. int32_t wt = *w++;
  158. assert(pix_num == -1);
  159. pixel0 += bg[0] * wt;
  160. pixel1 += bg[1] * wt;
  161. pixel2 += bg[2] * wt;
  162. s += 3;
  163. j--;
  164. pix_num = 0;
  165. }
  166. pix_num = (int)src_w - pix_num;
  167. if (pix_num > (int)j)
  168. pix_num = j;
  169. j -= pix_num;
  170. while (pix_num > 0)
  171. {
  172. int32_t wt = *w++;
  173. pixel0 += *s++ * wt;
  174. pixel1 += *s++ * wt;
  175. pixel2 += *s++ * wt;
  176. pix_num--;
  177. }
  178. if (j > 0)
  179. {
  180. int32_t wt = *w++;
  181. assert(j == 1);
  182. pixel0 += bg[0] * wt;
  183. pixel1 += bg[1] * wt;
  184. pixel2 += bg[2] * wt;
  185. }
  186. pixel0 >>= WEIGHT_SHIFT;
  187. pixel1 >>= WEIGHT_SHIFT;
  188. pixel2 >>= WEIGHT_SHIFT;
  189. *tmp++ = CLAMP(pixel0, 0, 255);
  190. *tmp++ = CLAMP(pixel1, 0, 255);
  191. *tmp++ = CLAMP(pixel2, 0, 255);
  192. index++;
  193. dst_w--;
  194. }
  195. while (dst_w > 0)
  196. {
  197. const uint8_t *s;
  198. int j;
  199. const int32_t *w;
  200. __m128i mm0, mm1, mm4, mw0, mw1;
  201. /* Jump out of band to do the (rare) slow (edge) pixels */
  202. if (index->slow)
  203. goto slow;
  204. s = &src[index->first_pixel * 3];
  205. j = (int)index->n;
  206. w = &weights[index->index];
  207. mm4 = round;
  208. mm0 = _mm_loadu_si128((const __m128i *)s); // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  209. if (j == 4)
  210. {
  211. mw0 = _mm_load_si128((const __m128i *)w);
  212. mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
  213. mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
  214. mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  215. mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  216. mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
  217. mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
  218. mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
  219. mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  220. mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  221. mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
  222. mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
  223. mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
  224. mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  225. mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  226. mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
  227. mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
  228. mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
  229. mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  230. mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  231. }
  232. else
  233. {
  234. int off = j & 3;
  235. w -= (4 - j) & 3;
  236. s += (off ? off : 4) * 3;
  237. mw0 = _mm_loadu_si128((const __m128i *)w);
  238. w += 4;
  239. /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
  240. switch (off)
  241. {
  242. do
  243. {
  244. mm0 = _mm_loadu_si128((const __m128i *)s);
  245. // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  246. s += 4 * 3;
  247. mw0 = _mm_load_si128((const __m128i *)w);
  248. w += 4;
  249. case 0:
  250. mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
  251. mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
  252. mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  253. mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  254. mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
  255. case 3:
  256. mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
  257. mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
  258. mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  259. mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  260. mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
  261. case 2:
  262. mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
  263. mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
  264. mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  265. mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  266. mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
  267. case 1:
  268. mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
  269. mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
  270. mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  271. mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  272. j -= 4;
  273. } while (j > 0);
  274. }
  275. }
  276. #if 0
  277. mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT - 8); // Shift down
  278. mm4 = _mm_packus_epi32(mm4, mm4); // Clamp to 0 to 65535 range.
  279. *tmp++ = _mm_extract_epi8(mm4, 1);
  280. *tmp++ = _mm_extract_epi8(mm4, 3);
  281. *tmp++ = _mm_extract_epi8(mm4, 5);
  282. #else
  283. mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT); // Shift down
  284. mm4 = _mm_packus_epi32(mm4, mm4); // Clamp to 0 to 65535 range.
  285. mm4 = _mm_packus_epi16(mm4, mm4); // Clamp to 0 to 65535 range.
  286. j = _mm_extract_epi32(mm4, 0);
  287. *(int16_t *)tmp = j;
  288. ((int8_t *)tmp)[2] = j>>16;
  289. tmp += 3;
  290. #endif
  291. index++;
  292. dst_w--;
  293. }
  294. while (dst_w > 0)
  295. {
  296. const uint8_t *s;
  297. /* Jump out of band to do the (rare) slow (edge) pixels */
  298. if (index->slow)
  299. goto slow;
  300. s = &src[index->first_pixel * 3];
  301. {
  302. const int32_t *w = &weights[index->index];
  303. uint32_t j = index->n;
  304. int32_t pixel0 = WEIGHT_ROUND;
  305. int32_t pixel1 = WEIGHT_ROUND;
  306. int32_t pixel2 = WEIGHT_ROUND;
  307. for (j = index->n; j > 0; j--)
  308. {
  309. int32_t wt = *w++;
  310. pixel0 += *s++ * wt;
  311. pixel1 += *s++ * wt;
  312. pixel2 += *s++ * wt;
  313. }
  314. pixel0 >>= WEIGHT_SHIFT;
  315. pixel1 >>= WEIGHT_SHIFT;
  316. pixel2 >>= WEIGHT_SHIFT;
  317. *tmp++ = CLAMP(pixel0, 0, 255);
  318. *tmp++ = CLAMP(pixel1, 0, 255);
  319. *tmp++ = CLAMP(pixel2, 0, 255);
  320. }
  321. index++;
  322. dst_w--;
  323. }
  324. }
/* Horizontally scale one row of 4-channel (e.g. RGBA/CMYK) pixels using SSE.
 *
 * tmp      output row; dst_w * 4 bytes are written.
 * src      input row of src_w pixels, 4 bytes each.
 * index    one entry per output pixel: first source pixel of the filter
 *          footprint, tap count (n), offset into the weights table, and
 *          a 'slow' flag marking pixels whose footprint may touch either
 *          edge of the row.
 * weights  fixed-point filter weights (WEIGHT_SHIFT fractional bits).
 * dst_w    number of output pixels to produce.
 * src_w    number of input pixels available.
 * channels unused by this core (always 4).
 * bg       background pixel (4 bytes) used for taps outside the row.
 */
static void
zoom_x4_sse(uint8_t * FZ_RESTRICT tmp,
	const uint8_t * FZ_RESTRICT src,
	const index_t * FZ_RESTRICT index,
	const int32_t * FZ_RESTRICT weights,
	uint32_t dst_w,
	uint32_t src_w,
	uint32_t channels,
	const uint8_t * FZ_RESTRICT bg)
{
	__m128i round = _mm_set1_epi32(WEIGHT_ROUND);

	/* Only entered via 'goto slow' from the main loop; the if (0) stops
	 * us falling into it on entry. Falls through into the main loop. */
	if (0)
	slow:
	{
		/* Do any where we might index off the edge of the source */
		int pn = index->first_pixel;
		const uint8_t *s = &src[pn * 4];
		const int32_t *w = &weights[index->index];
		uint32_t j = index->n;
		int32_t pixel0 = WEIGHT_ROUND;
		int32_t pixel1 = WEIGHT_ROUND;
		int32_t pixel2 = WEIGHT_ROUND;
		int32_t pixel3 = WEIGHT_ROUND;
		int pix_num = pn;
		if (pix_num < 0)
		{
			/* Leading tap is off the left edge; take it from bg. */
			int32_t wt = *w++;
			assert(pix_num == -1);
			pixel0 += bg[0] * wt;
			pixel1 += bg[1] * wt;
			pixel2 += bg[2] * wt;
			pixel3 += bg[3] * wt;
			s += 4;
			j--;
			pix_num = 0;
		}
		/* pix_num = number of taps that can safely read from the row. */
		pix_num = (int)src_w - pix_num;
		if (pix_num > (int)j)
			pix_num = j;
		j -= pix_num;
		while (pix_num > 0)
		{
			int32_t wt = *w++;
			pixel0 += *s++ * wt;
			pixel1 += *s++ * wt;
			pixel2 += *s++ * wt;
			pixel3 += *s++ * wt;
			pix_num--;
		}
		if (j > 0)
		{
			/* Trailing tap is off the right edge; take it from bg. */
			int32_t wt = *w;
			assert(j == 1);
			pixel0 += bg[0] * wt;
			pixel1 += bg[1] * wt;
			pixel2 += bg[2] * wt;
			pixel3 += bg[3] * wt;
		}
		pixel0 >>= WEIGHT_SHIFT;
		pixel1 >>= WEIGHT_SHIFT;
		pixel2 >>= WEIGHT_SHIFT;
		pixel3 >>= WEIGHT_SHIFT;
		*tmp++ = CLAMP(pixel0, 0, 255);
		*tmp++ = CLAMP(pixel1, 0, 255);
		*tmp++ = CLAMP(pixel2, 0, 255);
		*tmp++ = CLAMP(pixel3, 0, 255);
		index++;
		dst_w--;
	}
	while (dst_w > 0)
	{
		const uint8_t *s;
		int j;
		const int32_t *w;
		__m128i mm0, mm1, mm4, mw0, mw1;
		/* Jump out of band to do the (rare) slow (edge) pixels */
		if (index->slow)
			goto slow;
		s = &src[index->first_pixel * 4];
		j = (int)index->n;
		w = &weights[index->index];
		mm4 = round;
		mm0 = _mm_loadu_si128((const __m128i *)s); // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
		if (j == 4)
		{
			/* Common case: exactly 4 taps. Each step broadcasts one
			 * weight across all 4 lanes, multiplies it against the 4
			 * channel bytes of the current tap, and shifts the pixel
			 * data down by one 4-byte pixel for the next tap. */
			mw0 = _mm_load_si128((const __m128i *)w);
			mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
			mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
			mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
			mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
			mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
			mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
			mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
			mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
			mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
			mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
			mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
			mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
			mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
			mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
			mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
			mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
			mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
			mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
			mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
		}
		else
		{
			/* General case: taps processed 4 at a time. The weight
			 * pointer is rewound so that the shuffle lane selected by
			 * each switch entry point lines up with the first real tap
			 * of a partial group. */
			int off = j & 3;
			w -= (4 - j) & 3;
			s += (off ? off : 4) * 4;
			mw0 = _mm_loadu_si128((const __m128i *)w);
			w += 4;
			/* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
			switch (off)
			{
				do
				{
					mm0 = _mm_loadu_si128((const __m128i *)s);
					// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
					s += 4 * 4;
					mw0 = _mm_load_si128((const __m128i *)w);
					w += 4;
			case 0:
					mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
					mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
					mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
					mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
					mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
			case 3:
					mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
					mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
					mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
					mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
					mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
			case 2:
					mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
					mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
					mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
					mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
					mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
			case 1:
					mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
					mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
					mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
					mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
					j -= 4;
				} while (j > 0);
			}
		}
#if 0
		mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT - 8); // Shift down
		mm4 = _mm_packus_epi32(mm4,mm4); // Clamp to 0 to 65535 range.
		*tmp++ = _mm_extract_epi8(mm4,1);
		*tmp++ = _mm_extract_epi8(mm4,3);
		*tmp++ = _mm_extract_epi8(mm4,5);
		*tmp++ = _mm_extract_epi8(mm4,7);
#else
		/* Shift down, clamp to bytes, and store all 4 channels at once. */
		mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT);
		mm4 = _mm_packus_epi32(mm4,mm4);
		mm4 = _mm_packus_epi16(mm4,mm4);
		*(int32_t *)tmp = _mm_extract_epi32(mm4,0);
		tmp += 4;
#endif
		index++;
		dst_w--;
	}
}
/* Vertically scale one output row of 1-channel pixels using SSE.
 *
 * dst      output row; width bytes are written.
 * tmp      buffer of horizontally-scaled rows, treated as circular:
 *          offsets into it wrap at mod.
 * index    one entry per output column. NOTE: for this core the 'slow'
 *          field holds the number of consecutive columns (1..4) that can
 *          be processed together, not an edge flag - see comment in the
 *          main loop.
 * weights  filter weights; each tap's weight occupies four consecutive
 *          int32 slots (splatted), so it loads directly as an __m128i.
 * width    number of output pixels.
 * channels unused by this core (always 1).
 * mod      size in bytes of the circular tmp buffer.
 * y        row number used to locate the first contributing row in tmp.
 */
static void
zoom_y1_sse(uint8_t * dst,
	const uint8_t * FZ_RESTRICT tmp,
	const index_t * FZ_RESTRICT index,
	const int32_t * FZ_RESTRICT weights,
	uint32_t width,
	uint32_t channels,
	uint32_t mod,
	int32_t y)
{
	uint32_t stride = width;
	uint32_t offset = 0;
	const __m128i *mm_weights = (const __m128i *)weights;
	const __m128i mm_weight_round = _mm_set1_epi32(WEIGHT_ROUND);

	/* Scalar fallback for a single column; only entered via 'goto slow'
	 * from the main loop, then falls back into the main loop. */
	if (0)
	slow:
	{
		uint32_t off = (index->first_pixel + y) * stride + offset;
		offset++;
		if (off >= mod)
			off -= mod;
		{
			const int32_t *w = (const int32_t *)&mm_weights[index->index];
			uint32_t j;
			int32_t pixel0 = WEIGHT_ROUND;
			for (j = index->n; j > 0; j--)
			{
				pixel0 += tmp[off] * *w;
				w += 4; /* each weight is stored splatted x4 */
				off += stride;
				if (off >= mod)
					off -= mod;
			}
			pixel0 >>= WEIGHT_SHIFT;
			*dst++ = CLAMP(pixel0, 0, 255);
		}
		index++;
		width--;
	}
	while ((int)width > 0)
	{
		uint32_t off;
		/* The slow flag stops us accessing off the end of the source row.
		 * It also tells us how many pixels we can do at once. This usage
		 * is different for zoom_y1 than for all other cores. */
		uint8_t n = index->slow;
		if (n <= 1 || n > width)
			goto slow;
		off = (index->first_pixel + y) * stride + offset;
		offset += n;
		if (off >= mod)
			off -= mod;
		{
			const __m128i *w = &mm_weights[index->index];
			uint32_t j = index->n;
			__m128i mm_pixels = mm_weight_round;
			if (j == 4)
			{
				/* Common case: 4 taps, manually software-pipelined so
				 * loads, widens and multiplies interleave. */
				__m128i pix0, pix1, pix2;
				__m128i w0, w1, w2;
				pix0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
				off += stride;
				if (off >= mod)
					off -= mod;
				w0 = _mm_load_si128(w++);
				pix0 = _mm_cvtepu8_epi32(pix0);
				pix1 = _mm_loadu_si128((const __m128i *)&tmp[off]);
				off += stride;
				if (off >= mod)
					off -= mod;
				pix0 = _mm_mullo_epi32(pix0, w0);
				w1 = _mm_load_si128(w++);
				pix1 = _mm_cvtepu8_epi32(pix1);
				pix2 = _mm_loadu_si128((const __m128i *)&tmp[off]);
				off += stride;
				if (off >= mod)
					off -= mod;
				mm_pixels = _mm_add_epi32(mm_pixels, pix0);
				pix1 = _mm_mullo_epi32(pix1, w1);
				w2 = _mm_load_si128(w++);
				pix2 = _mm_cvtepu8_epi32(pix2);
				pix0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
				off += stride;
				if (off >= mod)
					off -= mod;
				mm_pixels = _mm_add_epi32(mm_pixels, pix1);
				pix2 = _mm_mullo_epi32(pix2, w2);
				w0 = _mm_load_si128(w++);
				pix0 = _mm_cvtepu8_epi32(pix0);
				pix0 = _mm_mullo_epi32(pix0, w0);
				mm_pixels = _mm_add_epi32(mm_pixels, pix2);
				mm_pixels = _mm_add_epi32(mm_pixels, pix0);
			}
			else
				/* General tap count: one row per iteration. */
				for ( ; j > 0; j--)
				{
					__m128i pix0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
					__m128i w0 = _mm_load_si128(w++);
					pix0 = _mm_cvtepu8_epi32(pix0);
					off += stride;
					pix0 = _mm_mullo_epi32(pix0, w0);
					if (off >= mod)
						off -= mod;
					mm_pixels = _mm_add_epi32(mm_pixels, pix0);
				}
			mm_pixels = _mm_srli_epi32(mm_pixels, WEIGHT_SHIFT);
			mm_pixels = _mm_packus_epi32(mm_pixels, mm_pixels); // Clamp to 0 to 65535 range.
			mm_pixels = _mm_packus_epi16(mm_pixels, mm_pixels); // Clamp to 0 to 255 range.
			j = _mm_extract_epi32(mm_pixels, 0);
			/* Store only the n output pixels we are allowed to write. */
			switch (n)
			{
			default:
			case 4:
				*(int32_t *)dst = j;
				dst += 4;
				break;
			case 3:
				*(int16_t *)dst = j;
				((uint8_t *)dst)[2] = j >> 16;
				dst += 3;
				break;
			case 2:
				*(int16_t *)dst = j;
				dst += 2;
				break;
			case 1:
				*(int8_t *)dst = j;
				dst += 1;
				break;
			}
		}
		index += n;
		width -= n;
	}
}
/* Vertically scale one output row of 3-channel pixels using SSE.
 *
 * dst      output row; width * 3 bytes are written.
 * tmp      buffer of horizontally-scaled rows, treated as circular:
 *          offsets into it wrap at mod.
 * index    one entry per output column: first contributing row, tap
 *          count (n) and offset into the weights table.
 * weights  fixed-point filter weights (one int32 per tap here, unlike
 *          the splatted layout used by zoom_y1_sse).
 * width    number of output pixels.
 * channels unused by this core (always 3).
 * mod      size in bytes of the circular tmp buffer.
 * y        row number used to locate the first contributing row in tmp.
 */
static void
zoom_y3_sse(uint8_t * dst,
	const uint8_t * FZ_RESTRICT tmp,
	const index_t * FZ_RESTRICT index,
	const int32_t * FZ_RESTRICT weights,
	uint32_t width,
	uint32_t channels,
	uint32_t mod,
	int32_t y)
{
	uint32_t stride = width * 3;
	uint32_t offset = 0;
	__m128i round = _mm_set1_epi32(WEIGHT_ROUND);
	while (width--)
	{
		uint32_t off = (index->first_pixel + y) * stride + offset;
		offset += 3;
		if (off >= mod)
			off -= mod;
		{
			const int32_t *w = &weights[index->index];
			int32_t j = (int32_t)index->n;
			__m128i mm0, mm1, mm2, mw0, mw1;
			if (j == 4)
			{
				/* Common case: exactly 4 taps. Each step broadcasts one
				 * weight across all lanes and accumulates one row's 3
				 * channel bytes (a 4th lane is discarded on store). */
				mw0 = _mm_load_si128((const __m128i *)w);
				mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
				// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
				mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
				mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
				mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
				mm1 = _mm_add_epi32(round, mm0);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
				off += stride;
				if (off >= mod)
					off -= mod;
				mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
				// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
				mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
				mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
				mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
				mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
				off += stride;
				if (off >= mod)
					off -= mod;
				mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
				// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
				mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
				mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
				mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
				mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
				off += stride;
				if (off >= mod)
					off -= mod;
				mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
				// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
				mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
				mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
				mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
				mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
			}
			else
			{
				/* General case: taps processed 4 at a time; the weight
				 * pointer is rewound so the lane picked by each switch
				 * entry point matches the first real tap of a partial
				 * group. */
				int duff = j & 3;
				w -= (4 - j) & 3;
				mw0 = _mm_loadu_si128((const __m128i *)w);
				w += 4;
				mm1 = round;
				/* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
				switch (duff)
				{
					do
					{
						off += stride;
						if (off >= mod)
							off -= mod;
						mw0 = _mm_load_si128((const __m128i *)w);
						w += 4;
				case 0:
						mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
						// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
						mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
						off += stride;
						if (off >= mod)
							off -= mod;
						mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
						mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
						mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
				case 3:
						mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
						// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
						mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
						off += stride;
						if (off >= mod)
							off -= mod;
						mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
						mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
						mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
				case 2:
						mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
						// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
						mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
						off += stride;
						if (off >= mod)
							off -= mod;
						mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
						mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
						mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
				case 1:
						mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
						// mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
						mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
						mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
						mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
						mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
						j -= 4;
					} while (j > 0);
				}
			}
#if 0
			mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT - 8); // Shift down
			mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range.
			*dst++ = _mm_extract_epi8(mm1,1);
			*dst++ = _mm_extract_epi8(mm1,3);
			*dst++ = _mm_extract_epi8(mm1,5);
#else
			mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT); // Shift down
			mm1 = _mm_packus_epi32(mm1, mm1); // Clamp to 0 to 65535 range.
			mm1 = _mm_packus_epi16(mm1, mm1); // Clamp to 0 to 255 range.
			/* Store the low 3 clamped channel bytes. */
			j = _mm_extract_epi32(mm1, 0);
			*(int16_t *)dst = j;
			((uint8_t *)dst)[2] = j >> 16;
			dst += 3;
#endif
		}
		index++;
	}
}
  765. static void
  766. zoom_y4_sse(uint8_t * dst,
  767. const uint8_t * FZ_RESTRICT tmp,
  768. const index_t * FZ_RESTRICT index,
  769. const int32_t * FZ_RESTRICT weights,
  770. uint32_t width,
  771. uint32_t channels,
  772. uint32_t mod,
  773. int32_t y)
  774. {
  775. uint32_t stride = width * 4;
  776. uint32_t offset = 0;
  777. __m128i round = _mm_set1_epi32(WEIGHT_ROUND);
  778. while (width--)
  779. {
  780. uint32_t off = (index->first_pixel + y) * stride + offset;
  781. offset += 4;
  782. if (off >= mod)
  783. off -= mod;
  784. {
  785. const int32_t *w = &weights[index->index];
  786. int32_t j = (int32_t)index->n;
  787. __m128i mm0, mm1, mm2, mw0, mw1;
  788. if (j == 4)
  789. {
  790. mw0 = _mm_load_si128((const __m128i *)w);
  791. mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
  792. // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  793. mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
  794. mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
  795. mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  796. mm1 = _mm_add_epi32(round, mm0);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  797. off += stride;
  798. if (off >= mod)
  799. off -= mod;
  800. mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
  801. // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  802. mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
  803. mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
  804. mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  805. mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  806. off += stride;
  807. if (off >= mod)
  808. off -= mod;
  809. mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
  810. // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  811. mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
  812. mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
  813. mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  814. mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  815. off += stride;
  816. if (off >= mod)
  817. off -= mod;
  818. mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
  819. // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  820. mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
  821. mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
  822. mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  823. mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  824. }
  825. else
  826. {
  827. int duff = j & 3;
  828. w -= (4 - j) & 3;
  829. mw0 = _mm_loadu_si128((const __m128i *)w);
  830. w += 4;
  831. mm1 = round;
  832. /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
  833. switch (duff)
  834. {
  835. do
  836. {
  837. off += stride;
  838. if (off >= mod)
  839. off -= mod;
  840. mw0 = _mm_load_si128((const __m128i *)w);
  841. w += 4;
  842. case 0:
  843. mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
  844. // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  845. mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
  846. off += stride;
  847. if (off >= mod)
  848. off -= mod;
  849. mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
  850. mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  851. mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  852. case 3:
  853. mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
  854. // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  855. mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
  856. off += stride;
  857. if (off >= mod)
  858. off -= mod;
  859. mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
  860. mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  861. mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  862. case 2:
  863. mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
  864. // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  865. mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
  866. off += stride;
  867. if (off >= mod)
  868. off -= mod;
  869. mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
  870. mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  871. mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  872. case 1:
  873. mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
  874. // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
  875. mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
  876. mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
  877. mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
  878. mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
  879. j -= 4;
  880. } while (j > 0);
  881. }
  882. }
  883. #if 0
  884. mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT - 8); // Shift down
  885. mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range.
  886. *dst++ = _mm_extract_epi8(mm1,1);
  887. *dst++ = _mm_extract_epi8(mm1,3);
  888. *dst++ = _mm_extract_epi8(mm1,5);
  889. *dst++ = _mm_extract_epi8(mm1,7);
  890. #else
  891. mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT); // Shift down
  892. mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range.
  893. mm1 = _mm_packus_epi16(mm1,mm1); // Clamp to 0 to 255 range.
  894. *(int32_t *)dst = _mm_extract_epi32(mm1, 0);
  895. dst += 4;
  896. #endif
  897. }
  898. index++;
  899. }
  900. }