Skip to content

Commit

Permalink
add more articles
Browse files Browse the repository at this point in the history
  • Loading branch information
genshen committed Sep 20, 2023
1 parent 9fc3495 commit 1edb8d3
Show file tree
Hide file tree
Showing 8 changed files with 244 additions and 1 deletion.
19 changes: 19 additions & 0 deletions content/zh/publication/chu-efficient-2023/cite.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
@inproceedings{chu_efficient_2023,
abstract = {Sparse matrix-vector multiplication (SpMV) is a fundamental building block for various numerical computing applications. However, most existing GPU-SpMV approaches may suffer from either long preprocessing overhead, load imbalance, format conversion, bad memory access patterns. In this paper, we proposed two new SpMV algorithms: flat and line-enhance, as well as their implementations, for GPU systems to overcome the above shortcomings. Our algorithms work directly on the CSR sparse matrix format. To achieve high performance: 1) for load balance, the flat algorithm uses non-zero splitting and line-enhance uses a mix of row and non-zero splitting; 2) memory access patterns are designed for both algorithms for data loading, storing and reduction steps; and 3) an adaptive approach is proposed to select appropriate algorithm and parameters based on matrix characteristics.
We evaluate our methods using the SuiteSparse Matrix Collection on AMD and NVIDIA GPU platforms. Average performance improvements of 424%, 741%, 49%, 46%, 72% are achieved when comparing our adaptive approach with CSR-Vector, CSR-Adaptive, HOLA, cuSparse and merge-based SpMV, respectively. In bandwidth tests, our approach can also achieve a high memory bandwidth, which is very close to the peak memory bandwidth.},
address = {Orlando, Florida},
author = {Chu, Genshen and He, Yuanjie and Dong, Lingyu and Ding, Zhezhao and Chen, Dandan and Bai, He and Wang, Xuesong and Hu, Changjun},
booktitle = {Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing (HPDC '23), June 16–23, 2023, Orlando, FL, USA},
copyright = {All rights reserved},
doi = {10.1145/3588195.3593002},
isbn = {979-8-4007-0155-9},
language = {en},
note = {event-place: Orlando, FL, USA},
pages = {1--14},
publisher = {ACM Press},
series = {HPDC '23},
title = {Efficient Algorithm Design of Optimizing {SpMV} on {GPU}},
url = {https://doi.org/10.1145/3588195.3593002},
year = {2023}
}

72 changes: 72 additions & 0 deletions content/zh/publication/chu-efficient-2023/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
---
# Documentation: https://wowchemy.com/docs/managing-content/

title: Efficient Algorithm Design of Optimizing SpMV on GPU
subtitle: ''
summary: ''
authors:
- Genshen Chu
- Yuanjie He
- Lingyu Dong
- Zhezhao Ding
- Dandan Chen
- He Bai
- Xuesong Wang
- Changjun Hu
tags: []
categories: []
date: '2023-01-01'
lastmod: 2023-09-20T14:45:21+08:00
featured: false
draft: false

# Featured image
# To use, add an image named `featured.jpg/png` to your page's folder.
# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight.
image:
caption: ''
focal_point: ''
preview_only: false

# Projects (optional).
# Associate this post with one or more of your projects.
# Simply enter your project's folder or file name without extension.
# E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`.
# Otherwise, set `projects = []`.
projects: []
publishDate: '2023-09-20T06:47:53.258484Z'
publication_types:
- '1'
abstract: 'Sparse matrix-vector multiplication (SpMV) is a fundamental building block
for various numerical computing applications. However, most existing GPU-SpMV approaches
may suffer from either long preprocessing overhead, load imbalance, format conversion,
bad memory access patterns. In this paper, we proposed two new SpMV algorithms: flat
and line-enhance, as well as their implementations, for GPU systems to overcome the
above shortcomings. Our algorithms work directly on the CSR sparse matrix format.
To achieve high performance: 1) for load balance, the flat algorithm uses non-zero
splitting and line-enhance uses a mix of row and non-zero splitting; 2) memory access
patterns are designed for both algorithms for data loading, storing and reduction
steps; and 3) an adaptive approach is proposed to select appropriate algorithm and
parameters based on matrix characteristics. We evaluate our methods using the SuiteSparse
Matrix Collection on AMD and NVIDIA GPU platforms. Average performance improvements
of 424%, 741%, 49%, 46%, 72% are achieved when comparing our adaptive approach with
CSR-Vector, CSR-Adaptive, HOLA, cuSparse and merge-based SpMV, respectively. In
bandwidth tests, our approach can also achieve a high memory bandwidth, which is
very close to the peak memory bandwidth.'
publication: "*Proceedings of the 32nd International Symposium on High-Performance\
\ Parallel and Distributed Computing (HPDC '23), June 16–23, 2023, Orlando, FL,\
\ USA*"
doi: 10.1145/3588195.3593002
links:
- name: URL
url: https://doi.org/10.1145/3588195.3593002
- name: "Conference Site"
url: https://www.hpdc.org/2023/program/technical-sessions/
- name: "CCF B"
url: "#"
- name: "News"
url: "/post/2023-hpdc-spmv-paper/"

url_code: 'https://github.com/hpcde/spmv-acc/'
url_slides: https://drive.google.com/file/d/1U4iajY9dF0QfSTCL1pW7L0TTsqs85KaS/view?usp=share_link
---
17 changes: 17 additions & 0 deletions content/zh/publication/chu-md-2021/cite.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
@article{chu_md_2021,
author = {Chu, Genshen and Li, Yang and Zhao, Runchu and Ren, Shuai and Yang, Wen and He, Xinfu and Hu, Changjun and Wang, Jue},
copyright = {All rights reserved},
doi = {10.1016/j.cpc.2021.108128},
issn = {0010-4655},
journal = {Computer Physics Communications},
language = {en},
month = aug,
number = {1},
pages = {108128},
title = {{MD} Simulation of Hundred-Billion-Metal-Atom Cascade Collision on {Sunway} {Taihulight}},
url = {https://linkinghub.elsevier.com/retrieve/pii/S001046552100240X},
urldate = {2021-08-08},
volume = {269},
year = {2021}
}

46 changes: 46 additions & 0 deletions content/zh/publication/chu-md-2021/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
---
# Documentation: https://wowchemy.com/docs/managing-content/

title: MD Simulation of Hundred-Billion-Metal-Atom Cascade Collision on Sunway Taihulight
subtitle: ''
summary: ''
authors:
- Genshen Chu
- Yang Li
- Runchu Zhao
- Shuai Ren
- Wen Yang
- Xinfu He
- Changjun Hu
- Jue Wang
tags: []
categories: []
date: '2021-08-01'
lastmod: 2023-01-04T22:24:34+08:00
featured: false
draft: false

# Featured image
# To use, add an image named `featured.jpg/png` to your page's folder.
# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight.
image:
caption: ''
focal_point: ''
preview_only: false

# Projects (optional).
# Associate this post with one or more of your projects.
# Simply enter your project's folder or file name without extension.
# E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`.
# Otherwise, set `projects = []`.
projects: []
publishDate: '2023-01-04T14:29:12.088506Z'
publication_types:
- '2'
abstract: ''
publication: '*Computer Physics Communications*'
doi: 10.1016/j.cpc.2021.108128
links:
- name: URL
url: https://linkinghub.elsevier.com/retrieve/pii/S001046552100240X
---
1 change: 1 addition & 0 deletions content/zh/publication/he-testing-2020/cite.bib
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
@inproceedings{he_testing_2020,
abstract = {High performance numerical simulation programs are widely used to simulate actual physical processes on high performance computers for the analysis of various physical and engineering problems. They are usually regarded as non-testable due to their high complexity. This paper reports our real experience and lessons learned from testing five simulation programs that will be used to design and analyze nuclear power plants. We applied five testing approaches and found 33 bugs. We found that property-based testing and metamorphic testing are two effective methods. Nevertheless, we suffered from the lack of domain knowledge, the high test costs, the shortage of test cases, severe oracle issues, and inadequate automation support. Consequently, the five programs are not exhaustively tested from the perspective of software testing, and many existing software testing techniques and tools are not fully applicable due to scalability and portability issues. We need more collaboration and communication with other communities to promote the research and application of software testing techniques.},
address = {Virtual Event USA},
author = {He, Xiao and Wang, Xingwei and Shi, Jia and Liu, Yi},
booktitle = {Proceedings of the 29th ACM SIGSOFT International Symposium on Software Testing and Analysis},
Expand Down
17 changes: 16 additions & 1 deletion content/zh/publication/he-testing-2020/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,26 @@ projects: []
publishDate: '2023-01-04T14:29:12.338426Z'
publication_types:
- '1'
abstract: ''
abstract: High performance numerical simulation programs are widely used to simulate
actual physical processes on high performance computers for the analysis of various
physical and engineering problems. They are usually regarded as non-testable due
to their high complexity. This paper reports our real experience and lessons learned
from testing five simulation programs that will be used to design and analyze nuclear
power plants. We applied five testing approaches and found 33 bugs. We found that
property-based testing and metamorphic testing are two effective methods. Nevertheless,
we suffered from the lack of domain knowledge, the high test costs, the shortage
of test cases, severe oracle issues, and inadequate automation support. Consequently,
the five programs are not exhaustively tested from the perspective of software testing,
and many existing software testing techniques and tools are not fully applicable
due to scalability and portability issues. We need more collaboration and communication
with other communities to promote the research and application of software testing
techniques.
publication: '*Proceedings of the 29th ACM SIGSOFT International Symposium on Software
Testing and Analysis*'
doi: 10.1145/3395363.3397382
links:
- name: URL
url: https://dl.acm.org/doi/10.1145/3395363.3397382
- name: "CCF A"
url: ""
---
15 changes: 15 additions & 0 deletions content/zh/publication/hu-kernel-2017/cite.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
@article{hu_kernel_2017,
abstract = {To optimize short-range force computations in Molecular Dynamics (MD) simulations, multi-threading and SIMD optimizations are presented in this paper. With respect to multi-threading optimization, a Partition-and-Separate-Calculation (PSC) method is designed to avoid write conflicts caused by using Newton’s third law. Serial bottlenecks are eliminated with no additional memory usage. The method is implemented by using the OpenMP model. Furthermore, the PSC method is employed on Intel Xeon Phi coprocessors in both native and offload models. We also evaluate the performance of the PSC method under different thread affinities on the MIC architecture. In the SIMD execution, we explain the performance influence in the PSC method, considering the “if-clause” of the cutoff radius check. The experiment results show that our PSC method is relatively more efficient compared to some traditional methods. In double precision, our 256-bit SIMD implementation is about 3 times faster than the scalar version.},
author = {Hu, Changjun and Wang, Xianmeng and Li, Jianjiang and He, Xinfu and Li, Shigang and Feng, Yangde and Yang, Shaofeng and Bai, He},
doi = {10.1016/j.cpc.2016.07.010},
issn = {0010-4655},
journal = {Computer Physics Communications},
language = {en},
month = feb,
pages = {31--40},
title = {Kernel optimization for short-range molecular dynamics},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0010465516301928},
urldate = {2019-01-22},
volume = {211},
year = {2017}
}
58 changes: 58 additions & 0 deletions content/zh/publication/hu-kernel-2017/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
---
# Documentation: https://wowchemy.com/docs/managing-content/

title: Kernel optimization for short-range molecular dynamics
subtitle: ''
summary: ''
authors:
- Changjun Hu
- Xianmeng Wang
- Jianjiang Li
- Xinfu He
- Shigang Li
- Yangde Feng
- Shaofeng Yang
- He Bai
tags: []
categories: []
date: '2017-02-01'
lastmod: 2023-01-04T22:29:03+08:00
featured: false
draft: false

# Featured image
# To use, add an image named `featured.jpg/png` to your page's folder.
# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight.
image:
caption: ''
focal_point: ''
preview_only: false

# Projects (optional).
# Associate this post with one or more of your projects.
# Simply enter your project's folder or file name without extension.
# E.g. `projects = ["internal-project"]` references `content/project/deep-learning/index.md`.
# Otherwise, set `projects = []`.
projects: []
publishDate: '2023-01-04T14:29:11.845539Z'
publication_types:
- '2'
abstract: To optimize short-range force computations in Molecular Dynamics (MD) simulations,
multi-threading and SIMD optimizations are presented in this paper. With respect
to multi-threading optimization, a Partition-and-Separate-Calculation (PSC) method
is designed to avoid write conflicts caused by using Newton’s third law. Serial
bottlenecks are eliminated with no additional memory usage. The method is implemented
by using the OpenMP model. Furthermore, the PSC method is employed on Intel Xeon
Phi coprocessors in both native and offload models. We also evaluate the performance
of the PSC method under different thread affinities on the MIC architecture. In
the SIMD execution, we explain the performance influence in the PSC method, considering
the “if-clause” of the cutoff radius check. The experiment results show that our
PSC method is relatively more efficient compared to some traditional methods. In
double precision, our 256-bit SIMD implementation is about 3 times faster than the
scalar version.
publication: '*Computer Physics Communications*'
doi: 10.1016/j.cpc.2016.07.010
links:
- name: URL
url: https://linkinghub.elsevier.com/retrieve/pii/S0010465516301928
---

1 comment on commit 1edb8d3

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deploy preview for hpcde-github-io ready!

✅ Preview
https://hpcde-github-m4jrdix28-genshen.vercel.app

Built with commit 1edb8d3.
This pull request is being automatically deployed with vercel-action

Please sign in to comment.