Paddle hangs on an A800 GPU #71664
Suggestion: set the environment variable `export GLOG_v=3` to capture verbose logs.

Log from running `python ./src/script/starccm+/read_all.py`:
```
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0314 08:09:07.824322 129 header_generator.cc:52] Unable to open file : /paddle/paddle/cinn/runtime/cuda/float16.h
grep: warning: GREP_OPTIONS is deprecated; please use an alias or script
I0314 08:09:07.904161 129 dynamic_loader.cc:175] Set paddle lib path : /usr/local/lib/python3.10/dist-packages/paddle/libs
I0314 08:09:08.219502 129 init.cc:101] Before Parse: argc is 2, Init commandline: dummy --tryfromenv=enable_api_kernel_fallback,new_executor_use_local_scope,all_blocks_convert_trt,accuracy_check_atol_fp32,print_sub_graph_dir,nccl_blocking_wait,gpugraph_enable_print_op_debug,gpugraph_slot_feasign_max_num,enable_opt_get_features,print_ir,sort_sum_gradient,enable_tracker_all2all,gpu_allocator_retry_time,multiple_of_cupti_buffer_size,prim_all,enable_dependency_builder_debug_info,init_allocated_mem,cse_max_count,enable_neighbor_list_use_uva,initial_cpu_memory_in_mb,enable_blaslt_global_search,cusparse_dir,auto_growth_chunk_size_in_mb,prim_forward,gpugraph_storage_mode,trt_ibuilder_cache,use_stride_kernel,tracer_profile_fname,sync_after_alloc,check_kernel_launch,enable_fusion_result_check,enable_append_iters_in_fusion,logging_trunc_pir_py_code,initial_gpu_memory_in_mb,embedding_deterministic,enable_async_trace,enable_cinn_accuracy_check,memory_fraction_of_eager_deletion,graph_edges_debug_node_num,enable_unused_var_check,fuse_parameter_memory_size,eager_delete_tensor_gb,gpugraph_load_node_list_into_hbm,use_cuda_managed_memory,fraction_of_cpu_memory_to_use,check_infer_symbolic,tracer_onednn_ops_on,use_cuda_malloc_async_allocator,gpugraph_merge_grads_segment_size,enable_record_memory,eager_communication_connection,new_executor_use_cuda_graph,nvidia_package_dir,save_cf_stack_op,cublaslt_device_best_config,conv2d_disable_cudnn,reallocate_gpu_memory_in_mb,dynamic_static_unified_comm,cudnn_dir,gpugraph_enable_hbm_table_collision_stat,gpugraph_enable_segment_merge_grads,pir_broadcast_tree_limit,use_cinn,cudnn_deterministic,accuracy_check_atol_bf16,log_memory_stats,jit_engine_type,cublaslt_exhaustive_search_times,enable_auto_rdma_trans,cupti_dir,comp_skip_default_ops,dataloader_use_file_descriptor,check_nan_inf,communicator_is_sgd_optimizer,async_trace_count,cublas_dir,benchmark_nccl,prim_backward,cuda_dir,cuda_memory_async_pool_release_threshold,pir_apply_inplace_pass,fast_eager_deletion_mode,new_executor_sequential_run,dist_threadpool_size,inner_op_parallelism,prim_enabled,enable_adjust_op_order,print_allocator_trace_info,trt_min_group_size,enable_gpu_memory_usage_log,tensorrt_dir,gpugraph_debug_gpu_memory,gpu_memory_limit_mb,executor_log_deps_every_microseconds,gpugraph_offload_param_stat,convert_all_blocks,graph_neighbor_size_percent,graph_get_neighbor_id,manually_trans_conv_filter,enable_transpose_iters_in_fusion,enable_pir_api,accuracy_check_rtol_fp16,save_static_runtime_data,rocksdb_path,win_cuda_bin_dir,cache_inference_while_scope,gpugraph_parallel_stream_num,enable_fusion_fallback,cusolver_dir,use_fast_math,graph_embedding_split_infer_mode,graph_load_in_parallel,gpugraph_force_device_batch_num_equal,gpugraph_offload_param_extends,cinn_specify_input_dynamic_dim,cudnn_exhaustive_search,use_pinned_memory,enable_fuse_parallel_matmul_pass,prim_enable_dynamic,host_trace_level,multi_block_attention_min_partition_size,use_shm_cache,set_to_1d,add_dependency_for_communication_op,free_idle_chunk,search_cache_max_number,low_precision_op_list,cinn_compile_thread_num,logging_pir_py_code_dir,cudnn_batchnorm_spatial_persistent,enable_auto_parallel_align_mode,npu_storage_format,pir_interpreter_record_stream_for_gc_cache,cuda_core_int8_gemm,allow_cinn_ops,cusparselt_dir,allocator_strategy,tracer_onednn_ops_off,run_kp_kernel,gpugraph_offload_gather_copy_maxsize,disable_dyshape_in_train,graph_edges_split_mode,alloc_fill_value,prim_check_ops,enable_auto_layout_pass,new_executor_use_inplace,enable_pir_in_executor,fused_multi_transformer_op_use_mbfmha,enable_cublas_tensor_op_math,enable_cinn_compile_cache,nccl_dir,logging_pir_py_code_dump_symbolic_dims,use_system_allocator,accuracy_check_rtol_bf16,mkl_dir,dygraph_debug,conv_workspace_size_limit,enable_cse_in_dy2st,get_host_by_name_time,fraction_of_cuda_pinned_memory_to_use,use_virtual_memory_auto_growth,gpugraph_dedup_pull_push_mode,paddle_num_threads,call_stack_level,rpc_send_thread_num,enable_graph_multi_node_sampling,disable_logging_op_attr_list,benchmark,use_auto_growth_pinned_allocator,enable_sparse_inner_gather,max_inplace_grad_add,static_executor_perfstat_filepath,prim_skip_dynamic,gemm_use_half_precision_compute_type,cudnn_exhaustive_search_times,use_xqa_optim,selected_gpus,cuda_malloc_async_pool_memory_throttle_ratio,gpugraph_sparse_table_storage_mode,multi_node_sample_use_gpu_table,enable_pir_in_executor_trace_run,new_executor_serial_run,graph_metapath_split_opt,fuse_parameter_groups_size,ir_inplace_kernel_blacklist,communicator_send_queue_size,graph_edges_split_debug,fraction_of_gpu_memory_to_use,static_runtime_data_save_path,apply_pass_to_program,enable_reuse_iters_in_fusion,deny_cinn_ops,tensor_operants_mode,free_when_no_cache_hit,curand_dir,query_dest_rank_by_multi_node,eager_delete_scope,use_stream_safe_cuda_allocator,allreduce_record_one_event,local_exe_sub_scope_limit,check_nan_inf_level,gpugraph_enable_gpu_direct_access,new_executor_static_build,enable_all2all_use_fp16,mklml_dir,tcp_max_syn_backlog,auto_free_cudagraph_allocations_on_launch,dump_chunk_info,gpugraph_hbm_table_load_factor,prim_forward_blacklist,enable_pir_with_pt_in_dy2st,communicator_max_merge_var_num,graph_edges_debug_node_id,print_kernel_run_info,enable_dump_main_program,accuracy_check_atol_fp16,logging_pir_py_code_int_tensor_element_limit,pinned_memory_as_cpu_backend,accuracy_check_rtol_fp32,enable_exit_when_partial_worker,enable_gpu_memory_usage_log_mb,lapack_dir,enable_collect_shape,reader_queue_speed_test_mode,gpugraph_parallel_copyer_split_maxsize,einsum_opt,flash_attn_version,cinn_input_dynamic_dim_spec_file,graph_edges_split_only_by_src_id,enable_custom_engine,sync_nccl_allreduce,enable_auto_detect_gpu_topo,force_sync_ops,op_dir,use_auto_growth_v2,use_mkldnn,use_autotune
I0314 08:09:08.219594 129 init.cc:109] After Parse: argc is 2
```
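As a diagnostic sketch only (the GPU index and the specific environment variables below are assumptions for illustration, not taken from this report), one way to reproduce the scenario deterministically is to pin the process to a known-idle card and enable verbose logging before any CUDA library initializes:

```python
import os

# Hypothetical diagnostic setup; all values are placeholders.
# Pin the process to a known-idle GPU *before* importing paddle, so the
# CUDA runtime never enumerates the cards starccm+ is occupying.
# "3" is a placeholder index: pick a card that nvidia-smi shows as idle.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Verbose glog output, matching the GLOG_v=3 suggestion above.
os.environ["GLOG_v"] = "3"

# Only after the environment is set should paddle be imported, e.g.:
# import paddle
```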
Received. We will have the relevant colleagues follow up.
Describe the Bug
1. Bug description:
With starccm+ occupying 3 of the machine's 7 GPUs, a Paddle process assigned to an idle card hangs.
2. Additional supplementary information
Container:
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0rc1-gpu-cuda12.3-cudnn9.0-trt8.6
Paddle version:
paddlepaddle-gpu 3.0.0.dev20250310
Docker version:
Docker version 27.3.1, build ce12230
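To localize where the process hangs, a generic inspection sequence like the following can help (a sketch only: the PID is a placeholder, and `py-spy` is an assumed, separately installed tool, not something the report mentions):

```shell
# Hypothetical hang diagnostics; PID=129 is a placeholder for the hung
# Paddle process. Each tool is guarded so the script runs even where the
# tool is absent.
PID=129

# Which GPUs does starccm+ hold, and which are actually idle?
if command -v nvidia-smi >/dev/null 2>&1; then nvidia-smi; fi

# Python-level stacks of the hung process (requires py-spy).
if command -v py-spy >/dev/null 2>&1; then py-spy dump --pid "$PID"; fi

# Native stacks, e.g. a wait inside CUDA init or NCCL (requires gdb).
if command -v gdb >/dev/null 2>&1; then
  gdb -p "$PID" -batch -ex "thread apply all bt"
fi

echo "diagnostics attempted for PID $PID"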