From 7ea66cf937393f96198aa6ccb4d0a379c9e36f16 Mon Sep 17 00:00:00 2001 From: genshen Date: Wed, 20 Sep 2023 14:49:53 +0800 Subject: [PATCH] add more articles --- .../publication/chu-efficient-2023/cite.bib | 19 ++++++ .../publication/chu-efficient-2023/index.md | 67 +++++++++++++++++++ content/zh/publication/chu-md-2021/cite.bib | 17 +++++ content/zh/publication/chu-md-2021/index.md | 46 +++++++++++++ .../zh/publication/he-testing-2020/cite.bib | 1 + .../zh/publication/he-testing-2020/index.md | 15 ++++- .../zh/publication/hu-kernel-2017/cite.bib | 15 +++++ .../zh/publication/hu-kernel-2017/index.md | 58 ++++++++++++++++ 8 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 content/zh/publication/chu-efficient-2023/cite.bib create mode 100644 content/zh/publication/chu-efficient-2023/index.md create mode 100644 content/zh/publication/chu-md-2021/cite.bib create mode 100644 content/zh/publication/chu-md-2021/index.md create mode 100644 content/zh/publication/hu-kernel-2017/cite.bib create mode 100644 content/zh/publication/hu-kernel-2017/index.md diff --git a/content/zh/publication/chu-efficient-2023/cite.bib b/content/zh/publication/chu-efficient-2023/cite.bib new file mode 100644 index 0000000..7e35ae9 --- /dev/null +++ b/content/zh/publication/chu-efficient-2023/cite.bib @@ -0,0 +1,19 @@ +@inproceedings{chu_efficient_2023, + abstract = {Sparse matrix-vector multiplication (SpMV) is a fundamental building block for various numerical computing applications. However, most existing GPU-SpMV approaches may suffer from either long preprocessing overhead, load imbalance, format conversion, bad memory access patterns. In this paper, we proposed two new SpMV algorithms: flat and line-enhance, as well as their implementations, for GPU systems to overcome the above shortcomings. Our algorithms work directly on the CSR sparse matrix format. 
To achieve high performance: 1) for load balance, the flat algorithm uses non-zero splitting and line-enhance uses a mix of row and non-zero splitting; 2) memory access patterns are designed for both algorithms for data loading, storing and reduction steps; and 3) an adaptive approach is proposed to select appropriate algorithm and parameters based on matrix characteristics. +We evaluate our methods using the SuiteSparse Matrix Collection on AMD and NVIDIA GPU platforms. Average performance improvements of 424%, 741%, 49%, 46%, 72% are achieved when comparing our adaptive approach with CSR-Vector, CSR-Adaptive, HOLA, cuSparse and merge-based SpMV, respectively. In bandwidth tests, our approach can also achieve a high memory bandwidth, which is very close to the peak memory bandwidth.}, + address = {Orlando, Florida}, + author = {Chu, Genshen and He, Yuanjie and Dong, Lingyu and Ding, Zhezhao and Chen, Dandan and Bai, He and Wang, Xuesong and Hu, Changjun}, + booktitle = {Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing (HPDC '23), June 16–23, 2023, Orlando, FL, USA}, + copyright = {All rights reserved}, + doi = {10.1145/3588195.3593002}, + isbn = {979-8-4007-0155-9/23/06}, + language = {en}, + note = {event-place: Orlando, FL, USA}, + pages = {1--14}, + publisher = {ACM Press}, + series = {HPDC '23}, + title = {Efficient Algorithm Design of Optimizing SpMV on GPU}, + url = {http://doi.org/10.1145/3588195.3593002}, + year = {2023} +} + diff --git a/content/zh/publication/chu-efficient-2023/index.md b/content/zh/publication/chu-efficient-2023/index.md new file mode 100644 index 0000000..3bcb57e --- /dev/null +++ b/content/zh/publication/chu-efficient-2023/index.md @@ -0,0 +1,67 @@ +--- +# Documentation: https://wowchemy.com/docs/managing-content/ + +title: Efficient Algorithm Design of Optimizing SpMV on GPU +subtitle: '' +summary: '' +authors: +- Genshen Chu +- Yuanjie He +- Lingyu Dong +- Zhezhao Ding +- Dandan 
Chen +- He Bai +- Xuesong Wang +- Changjun Hu +tags: [] +categories: [] +date: '2023-01-01' +lastmod: 2023-09-20T14:45:21+08:00 +featured: false +draft: false + +# Featured image +# To use, add an image named `featured.jpg/png` to your page's folder. +# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight. +image: + caption: '' + focal_point: '' + preview_only: false + +# Projects (optional). +# Associate this post with one or more of your projects. +# Simply enter your project's folder or file name without extension. +# E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`. +# Otherwise, set `projects = []`. +projects: [] +publishDate: '2023-09-20T06:47:53.258484Z' +publication_types: +- '1' +abstract: 'Sparse matrix-vector multiplication (SpMV) is a fundamental building block + for various numerical computing applications. However, most existing GPU-SpMV approaches + may suffer from either long preprocessing overhead, load imbalance, format conversion, + bad memory access patterns. In this paper, we proposed two new SpMV algorithms: flat + and line-enhance, as well as their implementations, for GPU systems to overcome the + above shortcomings. Our algorithms work directly on the CSR sparse matrix format. + To achieve high performance: 1) for load balance, the flat algorithm uses non-zero + splitting and line-enhance uses a mix of row and non-zero splitting; 2) memory access + patterns are designed for both algorithms for data loading, storing and reduction + steps; and 3) an adaptive approach is proposed to select appropriate algorithm and + parameters based on matrix characteristics. We evaluate our methods using the SuiteSparse + Matrix Collection on AMD and NVIDIA GPU platforms. Average performance improvements + of 424%, 741%, 49%, 46%, 72% are achieved when comparing our adaptive approach with + CSR-Vector, CSR-Adaptive, HOLA, cuSparse and merge-based SpMV, respectively. 
In + bandwidth tests, our approach can also achieve a high memory bandwidth, which is + very close to the peak memory bandwidth.' +publication: "*Proceedings of the 32nd International Symposium on High-Performance\ + \ Parallel and Distributed Computing (HPDC '23), June 16–23, 2023, Orlando, FL,\ + \ USA*" +doi: 10.1145/3588195.3593002 +links: +- name: URL + url: http://doi.org/10.1145/3588195.3593002 +- name: "Conference Site" + url: https://www.hpdc.org/2022/program/technical-sessions/ +url_code: 'https://github.com/hpcde/spmv-acc/' +url_slides: https://drive.google.com/file/d/1U4iajY9dF0QfSTCL1pW7L0TTsqs85KaS/view?usp=share_link +--- diff --git a/content/zh/publication/chu-md-2021/cite.bib b/content/zh/publication/chu-md-2021/cite.bib new file mode 100644 index 0000000..9597c20 --- /dev/null +++ b/content/zh/publication/chu-md-2021/cite.bib @@ -0,0 +1,17 @@ +@article{chu_md_2021, + author = {Chu, Genshen and Li, Yang and Zhao, Runchu and Ren, Shuai and Yang, Wen and He, Xinfu and Hu, Changjun and Wang, Jue}, + copyright = {All rights reserved}, + doi = {10.1016/j.cpc.2021.108128}, + issn = {00104655}, + journal = {Computer Physics Communications}, + language = {en}, + month = {August}, + number = {1}, + pages = {108128}, + title = {MD Simulation of Hundred-Billion-Metal-Atom Cascade Collision on Sunway Taihulight}, + url = {https://linkinghub.elsevier.com/retrieve/pii/S001046552100240X}, + urldate = {2021-08-08}, + volume = {269}, + year = {2021} +} + diff --git a/content/zh/publication/chu-md-2021/index.md b/content/zh/publication/chu-md-2021/index.md new file mode 100644 index 0000000..e08a7d9 --- /dev/null +++ b/content/zh/publication/chu-md-2021/index.md @@ -0,0 +1,46 @@ +--- +# Documentation: https://wowchemy.com/docs/managing-content/ + +title: MD Simulation of Hundred-Billion-Metal-Atom Cascade Collision on Sunway Taihulight +subtitle: '' +summary: '' +authors: +- Genshen Chu +- Yang Li +- Runchu Zhao +- Shuai Ren +- Wen Yang +- Xinfu He +- Changjun Hu +- 
Jue Wang +tags: [] +categories: [] +date: '2021-08-01' +lastmod: 2023-01-04T22:24:34+08:00 +featured: false +draft: false + +# Featured image +# To use, add an image named `featured.jpg/png` to your page's folder. +# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight. +image: + caption: '' + focal_point: '' + preview_only: false + +# Projects (optional). +# Associate this post with one or more of your projects. +# Simply enter your project's folder or file name without extension. +# E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`. +# Otherwise, set `projects = []`. +projects: [] +publishDate: '2023-01-04T14:29:12.088506Z' +publication_types: +- '2' +abstract: '' +publication: '*Computer Physics Communications*' +doi: 10.1016/j.cpc.2021.108128 +links: +- name: URL + url: https://linkinghub.elsevier.com/retrieve/pii/S001046552100240X +--- diff --git a/content/zh/publication/he-testing-2020/cite.bib b/content/zh/publication/he-testing-2020/cite.bib index 17797ce..eba1a10 100644 --- a/content/zh/publication/he-testing-2020/cite.bib +++ b/content/zh/publication/he-testing-2020/cite.bib @@ -1,4 +1,5 @@ @inproceedings{he_testing_2020, + abstract = {High performance numerical simulation programs are widely used to simulate actual physical processes on high performance computers for the analysis of various physical and engineering problems. They are usually regarded as non-testable due to their high complexity. This paper reports our real experience and lessons learned from testing five simulation programs that will be used to design and analyze nuclear power plants. We applied five testing approaches and found 33 bugs. We found that property-based testing and metamorphic testing are two effective methods. Nevertheless, we suffered from the lack of domain knowledge, the high test costs, the shortage of test cases, severe oracle issues, and inadequate automation support. 
Consequently, the five programs are not exhaustively tested from the perspective of software testing, and many existing software testing techniques and tools are not fully applicable due to scalability and portability issues. We need more collaboration and communication with other communities to promote the research and application of software testing techniques.}, address = {Virtual Event USA}, author = {He, Xiao and Wang, Xingwei and Shi, Jia and Liu, Yi}, booktitle = {Proceedings of the 29th ACM SIGSOFT International Symposium on Software Testing and Analysis}, diff --git a/content/zh/publication/he-testing-2020/index.md b/content/zh/publication/he-testing-2020/index.md index c8abd13..70a5b13 100644 --- a/content/zh/publication/he-testing-2020/index.md +++ b/content/zh/publication/he-testing-2020/index.md @@ -34,7 +34,20 @@ projects: [] publishDate: '2023-01-04T14:29:12.338426Z' publication_types: - '1' -abstract: '' +abstract: High performance numerical simulation programs are widely used to simulate + actual physical processes on high performance computers for the analysis of various + physical and engineering problems. They are usually regarded as non-testable due + to their high complexity. This paper reports our real experience and lessons learned + from testing five simulation programs that will be used to design and analyze nuclear + power plants. We applied five testing approaches and found 33 bugs. We found that + property-based testing and metamorphic testing are two effective methods. Nevertheless, + we suffered from the lack of domain knowledge, the high test costs, the shortage + of test cases, severe oracle issues, and inadequate automation support. Consequently, + the five programs are not exhaustively tested from the perspective of software testing, + and many existing software testing techniques and tools are not fully applicable + due to scalability and portability issues. 
We need more collaboration and communication + with other communities to promote the research and application of software testing + techniques. publication: '*Proceedings of the 29th ACM SIGSOFT International Symposium on Software Testing and Analysis*' doi: 10.1145/3395363.3397382 diff --git a/content/zh/publication/hu-kernel-2017/cite.bib b/content/zh/publication/hu-kernel-2017/cite.bib new file mode 100644 index 0000000..25058d3 --- /dev/null +++ b/content/zh/publication/hu-kernel-2017/cite.bib @@ -0,0 +1,15 @@ +@article{hu_kernel_2017, + abstract = {To optimize short-range force computations in Molecular Dynamics (MD) simulations, multi-threading and SIMD optimizations are presented in this paper. With respect to multi-threading optimization, a Partition-and-Separate-Calculation (PSC) method is designed to avoid write conflicts caused by using Newton’s third law. Serial bottlenecks are eliminated with no additional memory usage. The method is implemented by using the OpenMP model. Furthermore, the PSC method is employed on Intel Xeon Phi coprocessors in both native and offload models. We also evaluate the performance of the PSC method under different thread affinities on the MIC architecture. In the SIMD execution, we explain the performance influence in the PSC method, considering the ‘‘if-clause’’ of the cutoff radius check. The experiment results show that our PSC method is relatively more efficient compared to some traditional methods. 
In double precision, our 256-bit SIMD implementation is about 3 times faster than the scalar version.}, + author = {Hu, Changjun and Wang, Xianmeng and Li, Jianjiang and He, Xinfu and Li, Shigang and Feng, Yangde and Yang, Shaofeng and Bai, He}, + doi = {10.1016/j.cpc.2016.07.010}, + issn = {00104655}, + journal = {Computer Physics Communications}, + language = {en}, + month = {February}, + pages = {31--40}, + title = {Kernel optimization for short-range molecular dynamics}, + url = {https://linkinghub.elsevier.com/retrieve/pii/S0010465516301928}, + urldate = {2019-01-22}, + volume = {211}, + year = {2017} +} diff --git a/content/zh/publication/hu-kernel-2017/index.md b/content/zh/publication/hu-kernel-2017/index.md new file mode 100644 index 0000000..e6ce7e2 --- /dev/null +++ b/content/zh/publication/hu-kernel-2017/index.md @@ -0,0 +1,58 @@ +--- +# Documentation: https://wowchemy.com/docs/managing-content/ + +title: Kernel optimization for short-range molecular dynamics +subtitle: '' +summary: '' +authors: +- Changjun Hu +- Xianmeng Wang +- Jianjiang Li +- Xinfu He +- Shigang Li +- Yangde Feng +- Shaofeng Yang +- He Bai +tags: [] +categories: [] +date: '2017-02-01' +lastmod: 2023-01-04T22:29:03+08:00 +featured: false +draft: false + +# Featured image +# To use, add an image named `featured.jpg/png` to your page's folder. +# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight. +image: + caption: '' + focal_point: '' + preview_only: false + +# Projects (optional). +# Associate this post with one or more of your projects. +# Simply enter your project's folder or file name without extension. +# E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`. +# Otherwise, set `projects = []`. 
+projects: [] +publishDate: '2023-01-04T14:29:11.845539Z' +publication_types: +- '2' +abstract: To optimize short-range force computations in Molecular Dynamics (MD) simulations, + multi-threading and SIMD optimizations are presented in this paper. With respect + to multi-threading optimization, a Partition-and-Separate-Calculation (PSC) method + is designed to avoid write conflicts caused by using Newton’s third law. Serial + bottlenecks are eliminated with no additional memory usage. The method is implemented + by using the OpenMP model. Furthermore, the PSC method is employed on Intel Xeon + Phi coprocessors in both native and offload models. We also evaluate the performance + of the PSC method under different thread affinities on the MIC architecture. In + the SIMD execution, we explain the performance influence in the PSC method, considering + the ‘‘if-clause’’ of the cutoff radius check. The experiment results show that our + PSC method is relatively more efficient compared to some traditional methods. In + double precision, our 256-bit SIMD implementation is about 3 times faster than the + scalar version. +publication: '*Computer Physics Communications*' +doi: 10.1016/j.cpc.2016.07.010 +links: +- name: URL + url: https://linkinghub.elsevier.com/retrieve/pii/S0010465516301928 +---