Skip to content

UnifyFS Client Timeouts at rpc for unifyfs_dispatch_transfer with MOVE and unifyfs_finalize #741

@hariharan-devarajan

Description

@hariharan-devarajan

System information

Lassen Machine link

Describe the problem you're observing

The code hangs and times out on two UnifyFS calls: unifyfs_dispatch_transfer and unifyfs_finalize.

For unifyfs_dispatch_transfer, it hangs with UNIFYFS_TRANSFER_MODE_MOVE but works with UNIFYFS_TRANSFER_MODE_COPY.

For unifyfs_finalize, the function times out at the RPC.

Describe how to reproduce the problem

    /* Reproducer step 1: build four logio.* configuration options and
     * initialize the UnifyFS client handle. */
    unifyfs_handle fshdl;
    /* NOTE(review): options_ct is assigned here without a visible declaration;
     * presumably it is declared in surrounding code not shown -- confirm. */
    options_ct = 4;
    /* Zero-filled array with one entry per option. No matching free() appears
     * anywhere in this excerpt. */
    unifyfs_cfg_option *options = static_cast<unifyfs_cfg_option *>(
        calloc(options_ct, sizeof(unifyfs_cfg_option)));
    /* The option values (logio_spill_dir, logio_spill_size, logio_shmem_size,
     * logio_chunk_size) come from variables defined outside this excerpt. */
    options[0] = {.opt_name = "logio.spill_dir", .opt_value = logio_spill_dir};
    options[1] = {.opt_name = "logio.spill_size",
                  .opt_value = logio_spill_size};
    options[2] = {.opt_name = "logio.shmem_size",
                  .opt_value = logio_shmem_size};
    options[3] = {.opt_name = "logio.chunk_size",
                  .opt_value = logio_chunk_size};
    /* NOTE(review): fshdl is passed by value here, whereas unifyfs_create later
     * in the snippet receives &gfid for its output argument; confirm whether
     * unifyfs_initialize expects &fshdl instead. */
    int rc = unifyfs_initialize(info.unifyfs_path.c_str(), options, options_ct, fshdl);
    /* Require that initialization reported UNIFYFS_SUCCESS before continuing. */
    REQUIRE(rc == UNIFYFS_SUCCESS);
    /* Reproducer step 2: create the file at info.unifyfs_path / filename and
     * obtain its gfid. */
    fs::path unifyfs_filename = info.unifyfs_path / filename;
    unifyfs_gfid gfid = 0;
    /* NOTE(review): rc is already declared on the unifyfs_initialize line above;
     * this second declaration suggests the excerpt was stitched together from
     * separate scopes -- confirm against the original test. */
    int rc = UNIFYFS_SUCCESS;
    int create_flags = 0;
    /* Only the unifyfs_create call sits between resumeTime() and pauseTime()
     * (open_time is presumably a timer defined elsewhere). */
    open_time.resumeTime();
    rc = unifyfs_create(fshdl, create_flags, unifyfs_filename.c_str(), &gfid);
    open_time.pauseTime();
    
    /* NOTE(review): strerror() is applied to a UnifyFS return code; this assumes
     * the code is errno-compatible -- confirm. */
    INFO("unifyfs rc " << strerror(rc));
    /* Creation must succeed and must yield a gfid other than the invalid sentinel. */
    REQUIRE(rc == UNIFYFS_SUCCESS);
    REQUIRE(gfid != UNIFYFS_INVALID_GFID);
    /* Reproducer step 3: issue args.iteration write requests plus one trailing
     * SYNC_META request as a single batch, wait for all of them, then barrier. */
    if (info.rank == 0) INFO("Writing data");
    /* Write data to file */
    /* One buffer of request_size * iteration bytes filled with 'w'; each write
     * request below points at its own request_size-byte slice of it. */
    auto write_data =
        std::vector<char>(args.request_size * args.iteration, 'w');
    /* One request per iteration plus one extra slot for the SYNC_META request. */
    size_t write_req_ct = args.iteration + 1;
    /* NOTE(review): the array length is a runtime value, which relies on a
     * compiler extension in C++. Fields not assigned below are presumably left
     * uninitialized if unifyfs_io_request is a plain struct -- confirm. */
    unifyfs_io_request write_req[write_req_ct];
    for (size_t i = 0; i < args.iteration; ++i) {
      write_req[i].op = UNIFYFS_IOREQ_OP_WRITE;
      write_req[i].gfid = gfid;
      write_req[i].nbytes = args.request_size;
      /* For a shared file each rank writes into its own contiguous region of
       * request_size * iteration bytes; otherwise the base offset stays 0. */
      off_t base_offset = 0;
      if (args.file_sharing == tt::FileSharing::SHARED_FILE) {
        base_offset = (off_t)info.rank * args.request_size * args.iteration;
      }
      /* Offset of the i-th request within this rank's region. */
      off_t relative_offset = i * args.request_size;
      write_req[i].offset = base_offset + relative_offset;
      write_req[i].user_buf = write_data.data() + (i * args.request_size);
    }
    /* The last slot carries the metadata sync; only op and gfid are set on it. */
    write_req[args.iteration].op = UNIFYFS_IOREQ_OP_SYNC_META;
    write_req[args.iteration].gfid = gfid;
    rc = unifyfs_dispatch_io(fshdl, write_req_ct, write_req);
    /* Per-request results are inspected only when both dispatch and wait return
     * UNIFYFS_SUCCESS; a failing rc from either call is not asserted here. */
    if (rc == UNIFYFS_SUCCESS) {
      int waitall = 1;
      rc = unifyfs_wait_io(fshdl, write_req_ct, write_req, waitall);
      if (rc == UNIFYFS_SUCCESS) {
        /* Every write must report no error and a count equal to request_size. */
        for (size_t i = 0; i < args.iteration; i++) {
          REQUIRE(write_req[i].result.error == 0);
          REQUIRE(write_req[i].result.count == args.request_size);
        }
        /* The SYNC_META request is checked for its error field only. */
        REQUIRE(write_req[args.iteration].result.error == 0);
      }
    }
    /* All ranks in MPI_COMM_WORLD synchronize before the transfer step. */
    MPI_Barrier(MPI_COMM_WORLD);
    if (info.rank == 0) PRINT_MSG("Finished Writing", "");
    /* Reproducer step 4: transfer the file from unifyfs_filename to
     * full_filename_path in MOVE mode, then finalize the client. The issue text
     * reports that the transfer hangs in MOVE mode (COPY works) and that
     * unifyfs_finalize times out at the RPC. */
    if (info.rank == 0) INFO("Flushing data");
      /* NOTE(review): the deeper indentation of the following lines suggests an
       * enclosing scope or condition was dropped from the excerpt -- confirm. */
      unifyfs_transfer_request mv_req;
      mv_req.src_path = unifyfs_filename.c_str();
      /* full_filename_path is not defined anywhere in this excerpt. */
      mv_req.dst_path = full_filename_path.c_str();
      mv_req.mode = UNIFYFS_TRANSFER_MODE_MOVE;
      mv_req.use_parallel = 1;
      /* A single transfer request is dispatched. */
      rc = unifyfs_dispatch_transfer(fshdl, 1, &mv_req);
      REQUIRE(rc == UNIFYFS_SUCCESS);
      /* NOTE(review): this condition repeats the REQUIRE directly above, so it
       * is presumably always true when reached -- confirm REQUIRE semantics. */
      if (rc == UNIFYFS_SUCCESS) {
        int waitall = 1;
        rc = unifyfs_wait_transfer(fshdl, 1, &mv_req, waitall);
        /* A failing rc from unifyfs_wait_transfer is not asserted here. */
        if (rc == UNIFYFS_SUCCESS) {
          /* The loop bound is the constant 1, so the body runs exactly once. */
          for (int i = 0; i < (int)1; i++) {
            REQUIRE(mv_req.result.error == 0);
          }
        }
      }
    /* All ranks in MPI_COMM_WORLD synchronize before finalizing. */
    MPI_Barrier(MPI_COMM_WORLD);
    /* The return code of unifyfs_finalize is stored but not checked in this excerpt. */
    rc = unifyfs_finalize(fshdl);

Include any warnings or errors or relevant debugging data

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    Status

    To Consider

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions