Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Can't save checkpoint #7

Open
Heiheicxx opened this issue Jan 17, 2025 · 0 comments
Open

Can't save checkpoint #7

Heiheicxx opened this issue Jan 17, 2025 · 0 comments

Comments

@Heiheicxx
Copy link

W0117 14:48:50.972433 140051385186112 experiment.py:220] Your experiment's self.CHECKPOINT_ATTRS and self.NON_BROADCAST_CHECKPOINT_ATTRS are empty. Your job will not checkpoint any state or parameters.
I0117 14:48:50.972585 140051385186112 utils.py:615] Saved checkpoint latest with id 0.
I0117 14:48:50.973322 140051385186112 utils.py:316] [jaxline] training loop finished.
I0117 14:48:50.974981 140051385186112 utils.py:309] [jaxline] final checkpoint starting...
W0117 14:48:50.975140 140051385186112 experiment.py:220] Your experiment's self.CHECKPOINT_ATTRS and self.NON_BROADCAST_CHECKPOINT_ATTRS are empty. Your job will not checkpoint any state or parameters.
I0117 14:48:50.975250 140051385186112 utils.py:615] Saved checkpoint latest with id 1.
I0117 14:48:50.975420 140051385186112 utils.py:316] [jaxline] final checkpoint finished.
I0117 14:48:50.975657 140051385186112 utils.py:498] Waiting for a periodic action to finish...
I0117 14:49:06.433700 140051385186112 utils.py:309] [jaxline] rendezvous starting...
I0117 14:49:06.486311 140051385186112 utils.py:316] [jaxline] rendezvous finished.

The training config is as follows:
I0117 14:44:59.102810 140051385186112 train.py:76] Training with config:
best_checkpoint_all_hosts: false
best_model_eval_metric: ''
best_model_eval_metric_higher_is_better: true
binary_args:
- !!python/tuple
  - --define cudnn_embed_so
  - 1
- !!python/tuple
  - --define=cuda_compress
  - 1
checkpoint_dir: /workplace/0_code/2_Game/2025/ai-imgbit/training/
checkpoint_interval_type: null
eval_initial_weights: false
eval_modes: !!python/tuple []
eval_specific_checkpoint_dir: /workplace/0_code/2_Game/2025/ai-imgbit/eval/
experiment_kwargs:
  config:
    dataset:
      name: clic2020
      num_examples: 1
      num_frames: null
      root_dir: /workplace/0_code/2_Game/2025/ai-imgbit/clic2020
      skip_examples: 0
      spatial_patch_size: null
      video_idx: null
    eval: {}
    log_gradient_norms: false
    log_per_datum_metrics: true
    loss:
      rd_weight: 0.001
      rd_weight_warmup_steps: 0
    model:
      entropy:
        activation_fn: gelu
        clip_like_cool_chic: true
        conditional_spec:
          interpolation: bilinear
          prev_kernel_shape: &id001 !!python/tuple
          - 3
          - 3
          use_conditioning: true
          use_prev_grid: true
        context_num_rows_cols: *id001
        layers: &id002 !!python/tuple
        - 24
        - 24
        scale_range: !!python/tuple
        - 0.001
        - 150
        shift_log_scale: 8.0
        use_linear_w_init: true
      latents:
        add_gains: true
        downsampling_exponents: !!python/tuple
        - 0
        - 1
        - 2
        - 3
        - 4
        - 5
        - 6
        downsampling_factor: 2.0
        gain_factor: null
        gain_values: null
        learnable_gains: false
        num_grids: 7
        q_step: 0.4
      quant:
        q_steps_bias:
        - 5.0e-05
        - 0.0001
        - 0.0005
        - 0.001
        - 0.003
        - 0.006
        - 0.01
        q_steps_weight:
        - 5.0e-05
        - 0.0001
        - 0.0005
        - 0.001
        - 0.003
        - 0.006
        - 0.01
      synthesis:
        activation_fn: gelu
        add_activation_before_residual: false
        add_layer_norm: false
        b_last_init_input_mean: false
        clip_range: !!python/tuple
        - 0.0
        - 1.0
        kernel_shape: 1
        layers: *id002
        num_residual_layers: 2
        residual_kernel_shape: 3
      upsampling:
        kwargs:
          interpolation_method: bilinear
        type: image_resize
    opt:
      cosine_decay_schedule: true
      cosine_decay_schedule_kwargs:
        alpha: 0.0
        decay_steps: 100
        init_value: 0.01
      grad_norm_clip: 0.1
      max_num_ste_steps: 100
      noise_log_every: 100
      num_noise_steps: 100
      ste_break_at_lr: 1.0e-08
      ste_init_lr: 0.0001
      ste_log_every: 50
      ste_lr_decay_factor: 0.8
      ste_num_steps_not_improved: 20
      ste_reset_params_at_lr_decay: true
      ste_uses_cosine_decay: false
    quant:
      kumaraswamy_decay_steps: 100
      kumaraswamy_end_value: 1.0
      kumaraswamy_init_value: 2.0
      noise_quant_type: soft_round
      soft_round_temp_end: 0.1
      soft_round_temp_start: 0.3
      ste_quant_type: ste_soft_round
      ste_soft_round_temp: 0.0001
      use_kumaraswamy_noise: true
interval_type: steps
legacy_random_seed_behavior: false
log_all_hosts: false
log_all_train_data: false
log_async: true
log_tensors_interval: 1
log_train_data_interval: 1.0
logging_interval_type: null
max_checkpoints_to_keep: 5
one_off_evaluate: false
periodic_action_growth_ratios: null
random_mode_eval: same_host_same_device
random_mode_train: unique_host_unique_device
random_seed: 0
save_checkpoint_interval: 1
save_initial_train_checkpoint: false
train_checkpoint_all_hosts: false
training_steps: 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant