
incorrect looping or insufficient data for the requested number of epochs and evaluations #116

Open
Mehrdad-Hosseini1992 opened this issue Nov 22, 2024 · 7 comments


@Mehrdad-Hosseini1992

Hi, first of all, thanks for your work.
I am using Point Transformer V3 on my custom dataset, and here is my configuration file:

_base_ = ["../_base_/default_runtime.py"]

# misc custom setting

batch_size = 3 # bs: total bs in all gpus
num_worker = 4
mix_prob = 0.8
empty_cache = False
enable_amp = True

# model settings

model = dict(
    type="DefaultSegmentorV2",
    num_classes=12,
    backbone_out_channels=64,
    backbone=dict(
        type="PT-v3m1",
        in_channels=7,
        order=("z", "z-trans", "hilbert", "hilbert-trans"),
        stride=(2, 2, 2, 2),
        enc_depths=(2, 2, 2, 6, 2),
        enc_channels=(32, 64, 128, 256, 512),
        enc_num_head=(2, 4, 8, 16, 32),
        enc_patch_size=(1024, 1024, 1024, 1024, 1024),
        dec_depths=(2, 2, 2, 2),
        dec_channels=(64, 64, 128, 256),
        dec_num_head=(4, 4, 8, 16),
        dec_patch_size=(1024, 1024, 1024, 1024),
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        drop_path=0.3,
        shuffle_orders=True,
        pre_norm=True,
        enable_rpe=False,
        enable_flash=True,
        upcast_attention=False,
        upcast_softmax=False,
        cls_mode=False,
        pdnorm_bn=False,
        pdnorm_ln=False,
        pdnorm_decouple=True,
        pdnorm_adaptive=False,
        pdnorm_affine=True,
        # pdnorm_conditions=("ScanNet", "S3DIS", "Structured3D"),
        pdnorm_conditions=("nuScenes", "SemanticKITTI", "Waymo"),
    ),
    criteria=[
        dict(
            type="CrossEntropyLoss",
            weight=[1.0000, 5.7017, 1.8023, 7.3687,
                    1.0904, 4.4289, 3.1646, 5.8000,
                    2.4000, 1.3571, 7.7758, 9.4042],
            loss_weight=1.0,
            ignore_index=-1,
        ),
        dict(type="LovaszLoss", mode="multiclass", loss_weight=1.0, ignore_index=-1),
    ],
)

# scheduler settings

epoch = 800
optimizer = dict(type="AdamW", lr=0.002, weight_decay=0.005)
scheduler = dict(
    type="OneCycleLR",
    max_lr=0.002,
    pct_start=0.04,
    anneal_strategy="cos",
    div_factor=10.0,
    final_div_factor=1000.0,
)
param_dicts = [dict(keyword="block", lr=0.0002)]

# dataset settings

dataset_type = "S3DISDataset"

data = dict(
    num_classes=12,
    ignore_index=-1,
    names=[
        "other",
        "sidewalk",
        "road and asphalt",
        "curb cut",
        "Vegetation",
        "Tree",
        "post",
        "Ramp",
        "Road-sign",
        "Building",
        "steps",
        "Door",
    ],
    train=dict(
        type=dataset_type,
        split=("ND-Train-1", "ND-Train-2", "ND-Train-3"),
        data_root="/home/mehrdad/Codes/PTV3-2/processed_output/Train",
        transform=[
            dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
            # dict(type="PointClip", point_cloud_range=(-75.2, -75.2, -4, 75.2, 75.2, 2)),
            dict(type="RandomScale", scale=[0.9, 1.1]),
            dict(type="RandomFlip", p=0.5),
            dict(type="RandomJitter", sigma=0.005, clip=0.02),
            dict(
                type="GridSample",
                grid_size=0.05,
                hash_type="fnv",
                mode="train",
                return_grid_coord=True,
            ),
            dict(type="SphereCrop", point_max=250000, mode="random"),
            dict(type="CenterShift", apply_z=False),
            dict(type="ToTensor"),
            dict(
                type="Collect",
                keys=("coord", "grid_coord", "segment"),
                feat_keys=("coord", "color", "strength"),
            ),
        ],
        test_mode=False,
    ),
    val=dict(
        type=dataset_type,
        split="ND-Validation",
        data_root="/home/mehrdad/Codes/PTV3-2/processed_output/Validation",
        transform=[
            dict(type="CenterShift", apply_z=True),
            dict(
                type="Copy",
                keys_dict={"coord": "origin_coord", "segment": "origin_segment"},
            ),
            dict(
                type="GridSample",
                grid_size=0.05,
                hash_type="fnv",
                mode="train",
                return_grid_coord=True,
            ),
            dict(type="CenterShift", apply_z=False),
            dict(type="ToTensor"),
            dict(
                type="Collect",
                keys=(
                    "coord",
                    "grid_coord",
                    "origin_coord",
                    "segment",
                    "origin_segment",
                ),
                offset_keys_dict=dict(offset="coord", origin_offset="origin_coord"),
                feat_keys=("coord", "color", "strength"),
            ),
        ],
        test_mode=False,
    ),
    # test=dict(
    #     type=dataset_type,
    #     split="ND-Validation",
    #     data_root="/home/mehrdad/Codes/PTV3-2/processed_output/Validation",
    #     transform=[
    #         dict(type="CenterShift", apply_z=True),
    #         dict(type="NormalizeColor"),
    #     ],
    #     test_mode=True,
    #     test_cfg=dict(
    #         voxelize=dict(
    #             type="GridSample",
    #             grid_size=0.02,
    #             hash_type="fnv",
    #             mode="test",
    #             keys=("coord", "color", "strength"),
    #             return_grid_coord=True,
    #         ),
    #         crop=None,
    #         post_transform=[
    #             dict(type="CenterShift", apply_z=False),
    #             dict(type="ToTensor"),
    #             dict(
    #                 type="Collect",
    #                 keys=("coord", "grid_coord", "segment", "index"),
    #                 feat_keys=("color", "strength"),
    #             ),
    #         ],
    #         aug_transform=[
    #             [dict(type="RandomScale", scale=[0.9, 0.9])],
    #             [dict(type="RandomScale", scale=[0.95, 0.95])],
    #             [dict(type="RandomScale", scale=[1, 1])],
    #             [dict(type="RandomScale", scale=[1.05, 1.05])],
    #             [dict(type="RandomScale", scale=[1.1, 1.1])],
    #             [
    #                 dict(type="RandomScale", scale=[0.9, 0.9]),
    #                 dict(type="RandomFlip", p=1),
    #             ],
    #             [
    #                 dict(type="RandomScale", scale=[0.95, 0.95]),
    #                 dict(type="RandomFlip", p=1),
    #             ],
    #             [
    #                 dict(type="RandomScale", scale=[1, 1]),
    #                 dict(type="RandomFlip", p=1),
    #             ],
    #             [
    #                 dict(type="RandomScale", scale=[1.05, 1.05]),
    #                 dict(type="RandomFlip", p=1),
    #             ],
    #             [
    #                 dict(type="RandomScale", scale=[1.1, 1.1]),
    #                 dict(type="RandomFlip", p=1),
    #             ],
    #         ],
    #     ),
    # ),
)

When I set epoch = 100 and eval_epoch = 100, the model trains fine. I also understand the looping logic: for example, epoch = 800 with eval_epoch = 100 gives train_epoch = 100 and loop = 8, i.e. each training epoch iterates over the data list 8 times.
However, when I actually set epoch = 800 and eval_epoch = 100, I get this error:

Exception has occurred: SystemExit
1
File "/home/mehrdad/Codes/PTV3-2/pointcept/engines/train.py", line 161, in train
for (
IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/mehrdad/anaconda3/envs/mehrdad_env/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
data = fetcher.fetch(index) # type: ignore[possibly-undefined]
File "/home/mehrdad/anaconda3/envs/mehrdad_env/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/mehrdad/anaconda3/envs/mehrdad_env/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 52, in
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/mehrdad/Codes/PTV3-2/pointcept/datasets/defaults.py", line 208, in getitem
raise IndexError(f"Index {idx} out of range for data list of length {len(self.data_list)}")
IndexError: Index 14 out of range for data list of length 3

During handling of the above exception, another exception occurred:

File "/home/mehrdad/Codes/PTV3-2/pointcept/utils/events.py", line 612, in exit
sys.exit(1) # This prevents double logging the error to the console
File "/home/mehrdad/Codes/PTV3-2/pointcept/engines/train.py", line 174, in train
self.after_train()
File "/home/mehrdad/Codes/PTV3-2/tools/train.py", line 20, in main_worker
trainer.train()
File "/home/mehrdad/Codes/PTV3-2/pointcept/engines/launch.py", line 89, in launch
main_func(*cfg)
File "/home/mehrdad/Codes/PTV3-2/tools/train.py", line 27, in main
launch(
File "/home/mehrdad/Codes/PTV3-2/tools/train.py", line 38, in
main()
SystemExit: 1

I should mention that I reused the S3DIS dataset configuration for my custom dataset, but mine is an outdoor dataset with a massive number of points (about 900 million).

I would be thankful if you could help me.
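
For reference, here is the index arithmetic implied above as a minimal sketch. It assumes the loop mechanism described in this thread, where the train dataset is repeated loop = epoch // eval_epoch times; the variable names are only illustrative.

epoch = 800
eval_epoch = 100
loop = epoch // eval_epoch         # 8: each training epoch iterates the data list 8 times
train_epochs = eval_epoch          # 100 epochs are actually run
num_scenes = 3                     # ND-Train-1, ND-Train-2, ND-Train-3
dataset_len = num_scenes * loop    # 24: the DataLoader samples indices 0..23
print(loop, train_epochs, dataset_len)

With a 3-element data list and no index wrapping, any sampled index above 2 (such as the 14 in the traceback) falls outside the list, which matches the IndexError above.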

@Gofinge
Member

Gofinge commented Dec 2, 2024

Hi, could you attach your code for CustomDataset?

@Mehrdad-Hosseini1992
Author

Thanks for your response and follow-up.
I use the S3DIS code for preprocessing:

import argparse
import os
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat

import numpy as np


def parse_room(room, dataset_root, output_root, align_angle=True):
    print(f"Parsing: {room}")

    source_dir = os.path.join(dataset_root, room)
    save_path = os.path.join(output_root, room)
    os.makedirs(save_path, exist_ok=True)

    # Read the .txt file for the room
    room_file = os.path.join(source_dir, f"{room}.txt")
    if not os.path.isfile(room_file):
        print(f"Room file {room_file} not found. Skipping...")
        return

    # Load the data from the .txt file
    data = np.loadtxt(room_file)
    coords = data[:, :3]  # x, y, z
    colors = data[:, 3:6]  # r, g, b
    intensity = data[:, 6]  # intensity
    semantic_gt = data[:, 7].astype(int)  # labels

    if align_angle:
        # Dummy angle alignment logic (modify as needed)
        angle = np.pi / 4  # Example angle
        rot_cos, rot_sin = np.cos(angle), np.sin(angle)
        rot_t = np.array([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]])
        room_center = (np.max(coords, axis=0) + np.min(coords, axis=0)) / 2
        coords = (coords - room_center) @ np.transpose(rot_t) + room_center

    # Save processed data to .npy files
    np.save(os.path.join(save_path, "coord.npy"), coords.astype(np.float32))
    np.save(os.path.join(save_path, "color.npy"), colors.astype(np.uint8))
    np.save(os.path.join(save_path, "strength.npy"), intensity.astype(np.float32))  # Save intensity as strength
    np.save(os.path.join(save_path, "segment.npy"), semantic_gt.astype(np.int16))  # Save labels

    print(f"Finished parsing: {room}")


def main_process():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--splits",
        required=True,
        nargs="+",
        choices=["ND-Train-1", "ND-Train-2", "ND-Train-3", "validation"],
        help="Splits need to process ([ND-Train-1, ND-Train-2, ND-Train-3, validation]).",
    )
    parser.add_argument(
        "--dataset_root", required=True, help="Path to the dataset root directory"
    )
    parser.add_argument(
        "--output_root", required=True, help="Output path where processed data will be saved"
    )
    parser.add_argument(
        "--align_angle", action="store_true", help="Whether to align room angles"
    )
    parser.add_argument(
        "--num_workers", default=1, type=int, help="Number of workers for preprocessing."
    )
    args = parser.parse_args()

    room_list = args.splits

    print("Processing scenes...")
    pool = ProcessPoolExecutor(max_workers=args.num_workers)
    _ = list(
        pool.map(
            parse_room,
            room_list,
            repeat(args.dataset_root),
            repeat(args.output_root),
            repeat(args.align_angle),
        )
    )
    print("All scenes processed successfully!")


if __name__ == "__main__":
    main_process()
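
As a side note, here is a minimal sketch for sanity-checking the processed output. The file names and the example root path are taken from the script and config above; the check itself is only illustrative.

import os

import numpy as np


def check_split(split_dir):
    # Each processed split directory should contain these four arrays
    coord = np.load(os.path.join(split_dir, "coord.npy"))        # (N, 3) float32
    color = np.load(os.path.join(split_dir, "color.npy"))        # (N, 3) uint8
    strength = np.load(os.path.join(split_dir, "strength.npy"))  # (N,)   float32
    segment = np.load(os.path.join(split_dir, "segment.npy"))    # (N,)   int16
    assert len(coord) == len(color) == len(strength) == len(segment)
    print(split_dir, len(coord), "points, labels:", np.unique(segment))


train_root = "/home/mehrdad/Codes/PTV3-2/processed_output/Train"
for split in ("ND-Train-1", "ND-Train-2", "ND-Train-3"):
    check_split(os.path.join(train_root, split))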

@Gofinge
Member

Gofinge commented Dec 3, 2024

Oh sorry, I meant the config and the PyTorch Dataset class for this processed dataset.

@Mehrdad-Hosseini1992
Author

Thanks for your response.
It's not a PyTorch dataset; I am working with my custom dataset that has this distribution:
Other 12,042,142
Sidewalk 86,440,848
Road and asphalt 160,843,736
Curb cut 6,167,570
Vegetation 8,958,736
Tree 10,952,274
Post 4,101,434
Ramp 217,978
Road-sign 724,858
Building 88,872,444
Step 490,386
Door 1,441,536

Also, I have placed my configuration file above.

@Gofinge
Member

Gofinge commented Dec 3, 2024

ile "/home/mehrdad/Codes/PTV3-2/pointcept/datasets/defaults.py", line 208, in getitem
raise IndexError(f"Index {idx} out of range for data list of length {len(self.data_list)}")
IndexError: Index 14 out of range for data list of length 3

Could you show me your local version of the code around "datasets/defaults.py" L208?
By default, we set the length of the dataset equal to len(self.data_list) * loop, so no such problem exists in the official version of the code. This might be caused by a custom modification.

@Mehrdad-Hosseini1992
Author

You're right. I initially faced some problems, and to solve them, I used this function instead of the default one.

    def get_data_list(self):

        data_list = []
        if isinstance(self.split, str):
            split_dirs = [os.path.join(self.data_root, self.split)]
        elif isinstance(self.split, Sequence):
            split_dirs = [os.path.join(self.data_root, split) for split in self.split]
        else:
            raise NotImplementedError("The provided split value is not supported.")
        for split_dir in split_dirs:
            if os.path.isdir(split_dir):

                # Append the directory itself, not the files inside
                data_list.append(split_dir)
            else:
                print(f"Warning: {split_dir} is not a directory.")
        if len(data_list) == 0:
            raise ValueError(f"No valid data directories found in {self.data_root} with splits: {self.split}")

        return data_list

@Gofinge
Member

Gofinge commented Dec 3, 2024

Check our code here: https://github.com/Pointcept/Pointcept/blob/main/pointcept/datasets/defaults.py#L86, and compare it with your local get_data function (it gets the data path with self.data_list[idx % len(self.data_list)]).
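
To make the pointer above concrete, here is a minimal sketch of the wrapping behaviour the default implementation relies on (illustrative only, not a verbatim copy of defaults.py):

class LoopedDataset:
    """Minimal illustration of the len(self.data_list) * loop indexing scheme."""

    def __init__(self, data_list, loop=1):
        self.data_list = data_list   # one entry per scene directory (3 in this issue)
        self.loop = loop             # epoch // eval_epoch (8 with epoch=800, eval_epoch=100)

    def __len__(self):
        # The sampler draws indices 0 .. len(self) - 1, i.e. 0..23 here
        return len(self.data_list) * self.loop

    def get_data_path(self, idx):
        # The modulo is what keeps index 14 valid for a 3-element data list
        return self.data_list[idx % len(self.data_list)]


ds = LoopedDataset(["ND-Train-1", "ND-Train-2", "ND-Train-3"], loop=8)
print(len(ds))               # 24
print(ds.get_data_path(14))  # ND-Train-3 (14 % 3 == 2)

If get_data (or get_data_list) drops this modulo, every index beyond len(self.data_list) - 1 raises exactly the IndexError reported above.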
