You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
author = {Tao, Xiaohan and Zhu, Yu and Wang, Boyang and Xu, Jinlong and Pang, Jianmin and Zhao, Jie},
5
-
booktitle = {Proceedings of the 51st International Conference on Parallel Processing},
6
-
isbn = {9781450397339},
7
-
numpages = {12},
8
-
pages = {23:1-23:12},
9
-
publisher = {Association for Computing Machinery},
10
-
series = {ICPP'22},
11
-
title = {Automatically Generating High-performance Matrix Multiplication Kernels on the Latest Sunway Processor},
12
-
doi = {https://doi.org/10.1145/3545008.3545031},
13
-
year = {2022}
1
+
@inproceedings{10.1145/3330345.3331059,
2
+
author = {Sun, Huihui and Fey, Florian and Zhao, Jie and Gorlatch, Sergei},
3
+
title = {WCCV: improving the vectorization of IF-statements with warp-coherent conditions},
4
+
year = {2019},
5
+
isbn = {9781450360791},
6
+
publisher = {Association for Computing Machinery},
7
+
address = {New York, NY, USA},
8
+
url = {https://doi.org/10.1145/3330345.3331059},
9
+
doi = {10.1145/3330345.3331059},
10
+
abstract = {When vectorizing programs for modern processors with SIMD extensions, IF-statements pose a challenge: existing vectorization approaches often introduce redundant computations or they resort to inefficient masked instructions. In this paper, we introduce a new notion of warp-coherence for conditions that exhibit coherent run-time behavior on different lanes of a vector register. We demonstrate that warp-coherent conditions appear frequently in practice. We present Warp-Coherent Condition Vectorization (WCCV) - an approach to detecting and optimizing IF-statements with warp-coherent conditions - to efficiently vectorize programs with IF-statements while avoiding the overhead of existing methods. WCCV detects warp-coherent conditions via the affine analysis of conditional boolean expressions and branch predication of IF-statements; the runtime code generated by WCCV avoids redundant computations and masked instructions. We employ auto-tuning to find the optimal benefit-overhead ratio for WCCV. We implement WCCV on top of Region Vectorizer (RV) - an LLVM-based vectorizing compiler, and we conduct experiments on the Rodinia benchmark suite, achieving a mean speedup of 1.14\texttimes{} over the original vectorized and optimized code, and speedup between 0.98\texttimes{} and 7.02\texttimes{} over the scalar code on Skylake with AVX512.},
11
+
booktitle = {Proceedings of the ACM International Conference on Supercomputing},
0 commit comments