add more articles

hpcde · Sep 20, 2023 · 1edb8d3 · 1edb8d3 · github-actions · Sep 20, 2023
1 parent 9fc3495
commit 1edb8d3
Show file tree

Hide file tree

Showing 8 changed files with 244 additions and 1 deletion.
diff --git a/content/zh/publication/chu-efficient-2023/cite.bib b/content/zh/publication/chu-efficient-2023/cite.bib
@@ -0,0 +1,19 @@
+@inproceedings{chu_efficient_2023,
+ abstract = {Sparse matrix-vector multiplication (SpMV) is a fundamental building block for various numerical computing applications. However, most existing GPU-SpMV approaches may suffer from either long preprocessing overhead, load imbalance, format conversion, bad memory access patterns. In this paper, we proposed two new SpMV algorithms: flat and line-enhance, as well as their implementations, for GPU systems to overcome the above shortcomings. Our algorithms work directly on the CSR sparse matrix format. To achieve high performance: 1) for load balance, the flat algorithm uses non-zero splitting and line-enhance uses a mix of row and non-zero splitting; 2) memory access patterns are designed for both algorithms for data loading, storing and reduction steps; and 3) an adaptive approach is proposed to select appropriate algorithm and parameters based on matrix characteristics.
+We evaluate our methods using the SuiteSparse Matrix Collection on AMD and NVIDIA GPU platforms. Average performance improvements of 424%, 741%, 49%, 46%, 72% are achieved when comparing our adaptive approach with CSR-Vector, CSR-Adaptive, HOLA, cuSparse and merge-based SpMV, respectively. In bandwidth tests, our approach can also achieve a high memory bandwidth, which is very close to the peak memory bandwidth.},
+ address = {New York, NY, USA},
+ author = {Chu, Genshen and He, Yuanjie and Dong, Lingyu and Ding, Zhezhao and Chen, Dandan and Bai, He and Wang, Xuesong and Hu, Changjun},
+ booktitle = {Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing (HPDC '23), June 16–23, 2023, Orlando, FL, USA},
+ copyright = {All rights reserved},
+ doi = {10.1145/3588195.3593002},
+ isbn = {979-8-4007-0155-9},
+ language = {en},
+ note = {event-place: Orlando, FL, USA},
+ pages = {1--14},
+ publisher = {ACM Press},
+ series = {HPDC '23},
+ title = {Efficient Algorithm Design of Optimizing {SpMV} on {GPU}},
+ url = {https://doi.org/10.1145/3588195.3593002},
+ year = {2023}
+}
+
diff --git a/content/zh/publication/chu-efficient-2023/index.md b/content/zh/publication/chu-efficient-2023/index.md
@@ -0,0 +1,72 @@
+---
+# Documentation: https://wowchemy.com/docs/managing-content/
+
+title: Efficient Algorithm Design of Optimizing SpMV on GPU
+subtitle: ''
+summary: ''
+authors:
+- Genshen Chu
+- Yuanjie He
+- Lingyu Dong
+- Zhezhao Ding
+- Dandan Chen
+- He Bai
+- Xuesong Wang
+- Changjun Hu
+tags: []
+categories: []
+date: '2023-01-01'
+lastmod: 2023-09-20T14:45:21+08:00
+featured: false
+draft: false
+
+# Featured image
+# To use, add an image named `featured.jpg/png` to your page's folder.
+# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight.
+image:
+  caption: ''
+  focal_point: ''
+  preview_only: false
+
+# Projects (optional).
+#   Associate this post with one or more of your projects.
+#   Simply enter your project's folder or file name without extension.
+#   E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`.
+#   Otherwise, set `projects = []`.
+projects: []
+publishDate: '2023-09-20T06:47:53.258484Z'
+publication_types:
+- '1'
+abstract: 'Sparse matrix-vector multiplication (SpMV) is a fundamental building block
+  for various numerical computing applications. However, most existing GPU-SpMV approaches
+  may suffer from either long preprocessing overhead, load imbalance, format conversion,
+  bad memory access patterns. In this paper, we proposed two new SpMV algorithms: flat
+  and line-enhance, as well as their implementations, for GPU systems to overcome the
+  above shortcomings. Our algorithms work directly on the CSR sparse matrix format.
+  To achieve high performance: 1) for load balance, the flat algorithm uses non-zero
+  splitting and line-enhance uses a mix of row and non-zero splitting; 2) memory access
+  patterns are designed for both algorithms for data loading, storing and reduction
+  steps; and 3) an adaptive approach is proposed to select appropriate algorithm and
+  parameters based on matrix characteristics. We evaluate our methods using the SuiteSparse
+  Matrix Collection on AMD and NVIDIA GPU platforms. Average performance improvements
+  of 424%, 741%, 49%, 46%, 72% are achieved when comparing our adaptive approach with
+  CSR-Vector, CSR-Adaptive, HOLA, cuSparse and merge-based SpMV, respectively. In
+  bandwidth tests, our approach can also achieve a high memory bandwidth, which is
+  very close to the peak memory bandwidth.'
+publication: "*Proceedings of the 32nd International Symposium on High-Performance\
+  \ Parallel and Distributed Computing (HPDC '23), June 16–23, 2023, Orlando, FL,\
+  \ USA*"
+doi: 10.1145/3588195.3593002
+links:
+- name: URL
+  url: https://doi.org/10.1145/3588195.3593002
+- name: "Conference Site"
+  url: https://www.hpdc.org/2023/program/technical-sessions/
+- name: "CCF B"
+  url: "#"
+- name: "News"
+  url: "/post/2023-hpdc-spmv-paper/"
+
+url_code: 'https://github.com/hpcde/spmv-acc/'
+url_slides: https://drive.google.com/file/d/1U4iajY9dF0QfSTCL1pW7L0TTsqs85KaS/view?usp=share_link
+---
diff --git a/content/zh/publication/chu-md-2021/cite.bib b/content/zh/publication/chu-md-2021/cite.bib
@@ -0,0 +1,17 @@
+@article{chu_md_2021,
+ author = {Chu, Genshen and Li, Yang and Zhao, Runchu and Ren, Shuai and Yang, Wen and He, Xinfu and Hu, Changjun and Wang, Jue},
+ copyright = {All rights reserved},
+ doi = {10.1016/j.cpc.2021.108128},
+ issn = {0010-4655},
+ journal = {Computer Physics Communications},
+ language = {en},
+ month = aug,
+ number = {1},
+ pages = {108128},
+ title = {{MD} Simulation of Hundred-Billion-Metal-Atom Cascade Collision on {Sunway} {Taihulight}},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S001046552100240X},
+ urldate = {2021-08-08},
+ volume = {269},
+ year = {2021}
+}
+
diff --git a/content/zh/publication/chu-md-2021/index.md b/content/zh/publication/chu-md-2021/index.md
@@ -0,0 +1,46 @@
+---
+# Documentation: https://wowchemy.com/docs/managing-content/
+
+title: MD Simulation of Hundred-Billion-Metal-Atom Cascade Collision on Sunway Taihulight
+subtitle: ''
+summary: ''
+authors:
+- Genshen Chu
+- Yang Li
+- Runchu Zhao
+- Shuai Ren
+- Wen Yang
+- Xinfu He
+- Changjun Hu
+- Jue Wang
+tags: []
+categories: []
+date: '2021-08-01'
+lastmod: 2023-01-04T22:24:34+08:00
+featured: false
+draft: false
+
+# Featured image
+# To use, add an image named `featured.jpg/png` to your page's folder.
+# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight.
+image:
+  caption: ''
+  focal_point: ''
+  preview_only: false
+
+# Projects (optional).
+#   Associate this post with one or more of your projects.
+#   Simply enter your project's folder or file name without extension.
+#   E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`.
+#   Otherwise, set `projects = []`.
+projects: []
+publishDate: '2023-01-04T14:29:12.088506Z'
+publication_types:
+- '2'
+abstract: ''
+publication: '*Computer Physics Communications*'
+doi: 10.1016/j.cpc.2021.108128
+links:
+- name: URL
+  url: https://linkinghub.elsevier.com/retrieve/pii/S001046552100240X
+---
diff --git a/content/zh/publication/he-testing-2020/cite.bib b/content/zh/publication/he-testing-2020/cite.bib
@@ -1,4 +1,5 @@
 @inproceedings{he_testing_2020,
+ abstract = {High performance numerical simulation programs are widely used to simulate actual physical processes on high performance computers for the analysis of various physical and engineering problems. They are usually regarded as non-testable due to their high complexity. This paper reports our real experience and lessons learned from testing five simulation programs that will be used to design and analyze nuclear power plants. We applied five testing approaches and found 33 bugs. We found that property-based testing and metamorphic testing are two effective methods. Nevertheless, we suffered from the lack of domain knowledge, the high test costs, the shortage of test cases, severe oracle issues, and inadequate automation support. Consequently, the five programs are not exhaustively tested from the perspective of software testing, and many existing software testing techniques and tools are not fully applicable due to scalability and portability issues. We need more collaboration and communication with other communities to promote the research and application of software testing techniques.},
  address = {Virtual Event USA},
  author = {He, Xiao and Wang, Xingwei and Shi, Jia and Liu, Yi},
  booktitle = {Proceedings of the 29th ACM SIGSOFT International Symposium on Software Testing and Analysis},

diff --git a/content/zh/publication/he-testing-2020/index.md b/content/zh/publication/he-testing-2020/index.md
@@ -34,11 +34,26 @@ projects: []
 publishDate: '2023-01-04T14:29:12.338426Z'
 publication_types:
 - '1'
-abstract: ''
+abstract: High performance numerical simulation programs are widely used to simulate
+  actual physical processes on high performance computers for the analysis of various
+  physical and engineering problems. They are usually regarded as non-testable due
+  to their high complexity. This paper reports our real experience and lessons learned
+  from testing five simulation programs that will be used to design and analyze nuclear
+  power plants. We applied five testing approaches and found 33 bugs. We found that
+  property-based testing and metamorphic testing are two effective methods. Nevertheless,
+  we suffered from the lack of domain knowledge, the high test costs, the shortage
+  of test cases, severe oracle issues, and inadequate automation support. Consequently,
+  the five programs are not exhaustively tested from the perspective of software testing,
+  and many existing software testing techniques and tools are not fully applicable
+  due to scalability and portability issues. We need more collaboration and communication
+  with other communities to promote the research and application of software testing
+  techniques.
 publication: '*Proceedings of the 29th ACM SIGSOFT International Symposium on Software
   Testing and Analysis*'
 doi: 10.1145/3395363.3397382
 links:
 - name: URL
   url: https://dl.acm.org/doi/10.1145/3395363.3397382
+- name: "CCF A"
+  url: ""
 ---
diff --git a/content/zh/publication/hu-kernel-2017/cite.bib b/content/zh/publication/hu-kernel-2017/cite.bib
@@ -0,0 +1,15 @@
+@article{hu_kernel_2017,
+ abstract = {To optimize short-range force computations in Molecular Dynamics (MD) simulations, multi-threading and SIMD optimizations are presented in this paper. With respect to multi-threading optimization, a Partition-and-Separate-Calculation (PSC) method is designed to avoid write conflicts caused by using Newton's third law. Serial bottlenecks are eliminated with no additional memory usage. The method is implemented by using the OpenMP model. Furthermore, the PSC method is employed on Intel Xeon Phi coprocessors in both native and offload models. We also evaluate the performance of the PSC method under different thread affinities on the MIC architecture. In the SIMD execution, we explain the performance influence in the PSC method, considering the ``if-clause'' of the cutoff radius check. The experiment results show that our PSC method is relatively more efficient compared to some traditional methods. In double precision, our 256-bit SIMD implementation is about 3 times faster than the scalar version.},
+ author = {Hu, Changjun and Wang, Xianmeng and Li, Jianjiang and He, Xinfu and Li, Shigang and Feng, Yangde and Yang, Shaofeng and Bai, He},
+ doi = {10.1016/j.cpc.2016.07.010},
+ issn = {0010-4655},
+ journal = {Computer Physics Communications},
+ language = {en},
+ month = feb,
+ pages = {31--40},
+ title = {Kernel optimization for short-range molecular dynamics},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S0010465516301928},
+ urldate = {2019-01-22},
+ volume = {211},
+ year = {2017}
+}
diff --git a/content/zh/publication/hu-kernel-2017/index.md b/content/zh/publication/hu-kernel-2017/index.md
@@ -0,0 +1,58 @@
+---
+# Documentation: https://wowchemy.com/docs/managing-content/
+
+title: Kernel optimization for short-range molecular dynamics
+subtitle: ''
+summary: ''
+authors:
+- Changjun Hu
+- Xianmeng Wang
+- Jianjiang Li
+- Xinfu He
+- Shigang Li
+- Yangde Feng
+- Shaofeng Yang
+- He Bai
+tags: []
+categories: []
+date: '2017-02-01'
+lastmod: 2023-01-04T22:29:03+08:00
+featured: false
+draft: false
+
+# Featured image
+# To use, add an image named `featured.jpg/png` to your page's folder.
+# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight.
+image:
+  caption: ''
+  focal_point: ''
+  preview_only: false
+
+# Projects (optional).
+#   Associate this post with one or more of your projects.
+#   Simply enter your project's folder or file name without extension.
+#   E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`.
+#   Otherwise, set `projects = []`.
+projects: []
+publishDate: '2023-01-04T14:29:11.845539Z'
+publication_types:
+- '2'
+abstract: To optimize short-range force computations in Molecular Dynamics (MD) simulations,
+  multi-threading and SIMD optimizations are presented in this paper. With respect
+  to multi-threading optimization, a Partition-and-Separate-Calculation (PSC) method
+  is designed to avoid write conflicts caused by using Newton’s third law. Serial
+  bottlenecks are eliminated with no additional memory usage. The method is implemented
+  by using the OpenMP model. Furthermore, the PSC method is employed on Intel Xeon
+  Phi coprocessors in both native and offload models. We also evaluate the performance
+  of the PSC method under different thread affinities on the MIC architecture. In
+  the SIMD execution, we explain the performance influence in the PSC method, considering
+  the “if-clause” of the cutoff radius check. The experiment results show that our
+  PSC method is relatively more efficient compared to some traditional methods. In
+  double precision, our 256-bit SIMD implementation is about 3 times faster than the
+  scalar version.
+publication: '*Computer Physics Communications*'
+doi: 10.1016/j.cpc.2016.07.010
+links:
+- name: URL
+  url: https://linkinghub.elsevier.com/retrieve/pii/S0010465516301928
+---