I load the model "ZoeD_M12_N.pt" and run the following code, with only minor modifications, to save the metric depth maps first.
import torch
import os
import datetime
import numpy as np
from PIL import Image
from pathlib import Path
from torchvision import transforms
from zoedepth.models.builder import build_model
from zoedepth.utils.config import get_config
from zoedepth.data.preprocess import get_black_border
# Expose only GPU 0 to CUDA for this process.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})
def main():
    """Run ZoeDepth on the test images and save each prediction as a 16-bit PNG.

    Every ``*.jpg`` below ``nyu_scenes_split/test`` whose path contains
    ``rgb_`` is passed to ``zoe.infer_pil``. The prediction is multiplied by
    1000 (millimetres, assuming ``infer_pil`` returns metres -- TODO confirm),
    rounded, clipped to the uint16 range and written next to a mirrored
    directory tree in which ``test`` -> ``method``, ``jpg`` -> ``png`` and
    ``rgb`` -> ``mde`` are substituted in the path string.
    """
    datas_dir = 'nyu_scenes_split/test'
    method = 'mde_zoedepth_eval'
    crop_black_or_white_border_ = False
    # conf = get_config("zoedepth", "infer")
    conf = get_config("zoedepth", "eval")
    model_zoe_n = build_model(conf)
    zoe = model_zoe_n.to('cuda')
    # NOTE(review): the model is never explicitly switched to eval() here;
    # confirm whether build_model()/infer_pil() already takes care of that.

    uint16_max = np.iinfo(np.uint16).max
    count = 0
    saves_dir = None  # stays None when no image matched, see the final report
    for img_path in Path(datas_dir).rglob('*.jpg'):
        img_dir = str(img_path)
        if 'rgb_' not in img_dir:
            continue
        count += 1

        # convert() returns an independent image, so the file can be closed.
        with Image.open(img_dir) as img_file:
            rgb_pil = img_file.convert('RGB')
        if crop_black_or_white_border_:
            # Crop the detected border, then reflect-pad back to the original size.
            w, h = rgb_pil.size
            crop_params = get_black_border(np.array(rgb_pil, dtype=np.uint8))
            rgb_pil = rgb_pil.crop((crop_params.left, crop_params.top, crop_params.right, crop_params.bottom))
            rgb_array = np.array(rgb_pil)
            rgb_array = np.pad(
                rgb_array,
                ((crop_params.top, h - crop_params.bottom), (crop_params.left, w - crop_params.right), (0, 0)),
                mode='reflect',
            )
            rgb_pil = Image.fromarray(rgb_array)

        with torch.no_grad():
            mde_array = zoe.infer_pil(rgb_pil, output_type="numpy") * 1000
        # Round instead of truncating (truncation biases every pixel downwards)
        # and clip so out-of-range / non-finite values cannot wrap around in uint16.
        mde_array = np.nan_to_num(mde_array, nan=0.0, posinf=uint16_max, neginf=0.0)
        mde_array = np.clip(np.rint(mde_array), 0, uint16_max).astype(np.uint16)
        mde_pil = Image.fromarray(mde_array)
        # Alternative that avoids quantisation entirely: use an 'npy' suffix
        # below and np.save(save_dir, <float prediction>) instead of a PNG.

        save_dir = img_dir.replace('test', method)
        save_dir = save_dir.replace('jpg', 'png')
        save_dir = save_dir.replace('rgb', 'mde')
        # dirname() is separator-agnostic; join(..., '') keeps the trailing separator.
        saves_dir = os.path.join(os.path.dirname(save_dir), '')
        os.makedirs(saves_dir, exist_ok=True)
        mde_pil.save(save_dir)

    if saves_dir is None:
        print(f'{method} found no rgb_*.jpg images under {datas_dir}')
        return
    print(f'{method} has generated {count} mdes, saved to {saves_dir}')
if __name__ == '__main__':
    # Print wall-clock timestamps around the run; the start time is repeated
    # at the end so both values appear next to each other in the log.
    tm_begin = datetime.datetime.now()
    print('tm_begin: ', tm_begin)

    main()

    tm_end = datetime.datetime.now()
    print('tm_begin: ', tm_begin)
    print('tm_end: ', tm_end)
In this way I get the output metric depth in mm. I then evaluate it with the following code and get a poor RMSE of 0.375:
from PIL import Image
from pathlib import Path
import datetime
import numpy as np
import torch
import math
import torch.nn.functional as F
def gaussian(window_size, sigma):
    """Return a 1-D Gaussian kernel of length ``window_size`` that sums to 1.

    The kernel is centred on index ``window_size // 2`` and uses standard
    deviation ``sigma``.
    """
    centre = window_size // 2
    two_sigma_sq = float(2 * sigma ** 2)
    weights = [math.exp(-(idx - centre) ** 2 / two_sigma_sq) for idx in range(window_size)]
    kernel = torch.Tensor(weights)
    return kernel / kernel.sum()
def create_window(window_size, channel=1):
    """Build a ``(channel, 1, window_size, window_size)`` Gaussian window.

    The 2-D kernel is the outer product of the 1-D ``gaussian`` kernel
    (sigma fixed at 1.5) with itself, repeated once per channel.
    """
    column = gaussian(window_size, 1.5).unsqueeze(1)
    kernel_2d = column.mm(column.t()).float()
    kernel_4d = kernel_2d.unsqueeze(0).unsqueeze(0)
    return kernel_4d.expand(channel, 1, window_size, window_size).contiguous()
def compute_ssim(pre_gsmaps, gt_gsmaps, window_size=11, size_average=True, full=False, val_range=None, window=None):
    """Compute the SSIM between two 4-D tensors.

    Args:
        pre_gsmaps: predicted maps; its ``size()`` is unpacked as
            ``(_, channel, height, width)``.
        gt_gsmaps: reference maps, convolved with the same window.
        window_size: requested window side; reduced to ``min(window_size,
            height, width)`` when no ``window`` is supplied.
        size_average: if True return the mean over the whole SSIM map,
            otherwise reduce dimensions 1, 1, 1 in turn (one value per
            leading index).
        full: if True also return the mean contrast-sensitivity term.
        val_range: dynamic range ``L``. When None it is guessed from
            ``pre_gsmaps``: upper bound 255 if its max exceeds 128 else 1,
            lower bound -1 if its min is below -0.5 else 0.
        window: optional pre-built convolution window; built with
            ``create_window`` when None.

    Returns:
        ``ssim`` or, when ``full`` is True, ``(ssim, cs)``.
    """
    if val_range is not None:
        L = val_range
    else:
        max_val = 255 if torch.max(pre_gsmaps) > 128 else 1
        min_val = -1 if torch.min(pre_gsmaps) < -0.5 else 0
        L = max_val - min_val

    padd = 0
    _, channel, height, width = pre_gsmaps.size()
    if window is None:
        real_size = min(window_size, height, width)
        window = create_window(real_size, channel=channel).to(pre_gsmaps.device)

    def _filter(tensor):
        # Depth-wise windowed average of ``tensor``.
        return F.conv2d(tensor, window, padding=padd, groups=channel)

    # Local means.
    mu1 = _filter(pre_gsmaps)
    mu2 = _filter(gt_gsmaps)
    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    # Local variances and covariance.
    sigma1_sq = _filter(pre_gsmaps * pre_gsmaps) - mu1_sq
    sigma2_sq = _filter(gt_gsmaps * gt_gsmaps) - mu2_sq
    sigma12 = _filter(pre_gsmaps * gt_gsmaps) - mu1_mu2

    # Stabilising constants.
    C1 = (0.01 * L) ** 2
    C2 = (0.03 * L) ** 2

    v1 = 2.0 * sigma12 + C2
    v2 = sigma1_sq + sigma2_sq + C2
    cs = torch.mean(v1 / v2)  # contrast sensitivity

    ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)

    # Without the mean, the full SSIM map would be returned.
    ret = ssim_map.mean() if size_average else ssim_map.mean(1).mean(1).mean(1)

    return (ret, cs) if full else ret
def gen_1d_rand_pairs(gt_1d_array, point_num):
    """Draw ``point_num`` pseudo-random index pairs into a 1-D array.

    The generator is seeded with ``int(gt_1d_array[0])`` so the same array
    always yields the same pairs. A private ``RandomState`` is used, which
    produces the same stream as ``np.random.seed`` followed by
    ``np.random.rand`` but leaves the global NumPy RNG untouched.

    Args:
        gt_1d_array: non-empty 1-D array; only its length and first element
            are read.
        point_num: number of index pairs to draw.

    Returns:
        Float array of shape ``(2, point_num)`` holding integer-valued
        indices in ``[0, len(gt_1d_array) - 1]``.

    Raises:
        ValueError: if the array is not 1-D or is empty.
    """
    if len(gt_1d_array.shape) != 1:
        raise ValueError('array must be 1d')
    length = gt_1d_array.shape[0]
    if length == 0:
        raise ValueError('array must not be empty')
    rng = np.random.RandomState(int(gt_1d_array[0]))
    # rand() is in [0, 1), so floor(rand * length) is a valid index.
    return np.floor(rng.rand(2, point_num) * length)
def ranking_1d_eval(pre, gt, positions, t=0.03):
    """Return the fraction of sampled pairs whose ordinal relation is wrong.

    For every pair ``(positions[0][k], positions[1][k])`` the ratio of the
    two values is classified as ``+1`` (ratio > 1 + t), ``-1`` (ratio < 1 - t)
    or ``0`` (otherwise), once for ``pre`` and once for ``gt``. A pair counts
    as wrong when the two labels differ.

    Pairs whose two ground-truth values are both zero are neutralised (all
    four values set to 1) *before* the ratios are formed, so they are labelled
    ``0`` on both sides and never counted as wrong. Previously this masking
    ran after the ratios had been computed and therefore had no effect.

    Args:
        pre: 1-D array of predictions.
        gt: 1-D array of ground-truth values, same length as ``pre``.
        positions: array of shape ``(2, K)`` with index pairs (cast to int64).
        t: relative tolerance around a ratio of 1.

    Returns:
        ``wrong_pairs / K`` as a float.
    """
    idx1 = positions[0].astype(np.int64)
    idx2 = positions[1].astype(np.int64)

    # Indexing with an index array yields copies, so the edits below do not
    # modify the caller's ``pre`` / ``gt``.
    z_1_pre = pre[idx1]
    z_2_pre = pre[idx2]
    z_1_gt = gt[idx1]
    z_2_gt = gt[idx2]

    both_gt_zero = np.logical_and(z_1_gt == 0, z_2_gt == 0)
    z_1_pre[both_gt_zero] = 1
    z_2_pre[both_gt_zero] = 1
    z_1_gt[both_gt_zero] = 1
    z_2_gt[both_gt_zero] = 1

    rela_pre = z_1_pre / z_2_pre
    rela_gt = z_1_gt / z_2_gt

    mask_list_pre = np.zeros_like(rela_pre)
    mask_list_pre[rela_pre > (1 + t)] = 1
    mask_list_pre[rela_pre < (1 - t)] = -1

    mask_list_gt = np.zeros_like(rela_pre)
    mask_list_gt[rela_gt > (1 + t)] = 1
    mask_list_gt[rela_gt < (1 - t)] = -1

    wrong_points = np.count_nonzero(mask_list_gt - mask_list_pre)
    return wrong_points / len(rela_gt)
def compute_errors(gt, pred):
    """Compute the twelve evaluation metrics for one image.

    Args:
        gt: 1-D NumPy array of valid ground-truth values.
        pred: 1-D NumPy array of predictions at the same pixels.

    Returns:
        ``np.array([oe, psnr, ssim, absrel, rmse, mae, iabsrel, irmse, imae,
        d1 * 100, d2 * 100, d3 * 100])``.
    """
    # Ordinal error (percent) on 50000 index pairs seeded from ``gt``.
    rand_pairs = gen_1d_rand_pairs(gt, point_num=50000)
    oe = ranking_1d_eval(pred, gt, rand_pairs) * 100

    # NOTE(review): the vectors are reshaped to (1, 1, 1, N), i.e. height 1,
    # so no 2-D neighbourhood is available to SSIM -- confirm this is intended.
    pre_tensor = torch.from_numpy(pred).unsqueeze(0).unsqueeze(0).unsqueeze(0)
    gt_tensor = torch.from_numpy(gt).unsqueeze(0).unsqueeze(0).unsqueeze(0)
    # Convert the 0-d tensor to a plain float before it goes into np.array().
    ssim = float(compute_ssim(pre_tensor, gt_tensor))

    diff = gt - pred
    rmse = np.sqrt((diff ** 2).mean())
    # PSNR uses a fixed peak value of 255.
    psnr = 20 * np.log10(255 / rmse)
    mae = np.mean(np.abs(diff))
    absrel = np.mean(np.abs(diff) / gt)

    # Inverse-depth errors.
    igt = 1 / gt
    ipred = 1 / pred
    inv_abs_diff = np.abs(igt - ipred)
    iabsrel = np.mean(inv_abs_diff / igt)
    irmse = np.sqrt(np.mean((igt - ipred) ** 2))
    imae = np.mean(inv_abs_diff)

    # Threshold accuracies delta < 1.25^k.
    thresh = np.maximum((gt / pred), (pred / gt))
    d1 = (thresh < 1.25).mean()
    d2 = (thresh < 1.25 ** 2).mean()
    d3 = (thresh < 1.25 ** 3).mean()

    # silog, log10, sqrel and logrmse are not part of the returned vector and
    # are therefore not computed.
    return np.array([oe, psnr, ssim, absrel, rmse, mae, iabsrel, irmse, imae, d1*100, d2*100, d3*100])
def main():
    """Average the metrics of ``compute_errors`` over all test images.

    For every ``*.png`` below ``nyu_scenes_split/test`` the matching
    prediction is located by substituting ``test`` -> ``method`` and
    ``sync_depth`` -> ``mde`` in the path string. Both PNGs are divided by
    1000, the prediction is clamped to ``[1e-3, 10]``, and only pixels with
    ``1e-3 < gt < 10`` inside the window ``[45:471, 41:601]`` are evaluated.
    The per-image metric vectors are averaged and printed.
    """
    # method = 'mde_newcrfs'
    # method = 'mde_zoedepth'
    method = 'mde_zoedepth_eval'
    gts_path = Path('nyu_scenes_split/test')
    measures_sum = np.zeros(12)
    cnt = 0
    for gt_path in gts_path.rglob('*.png'):
        gt_dir = str(gt_path)
        mde_dir = gt_dir.replace('test', method)
        mde_dir = mde_dir.replace('sync_depth', 'mde')
        # For predictions stored as .npy: replace 'png' with 'npy' in mde_dir
        # and read them with np.load(mde_dir) instead.
        with Image.open(mde_dir) as mde_img:
            mde_array = np.array(mde_img, dtype=np.float32) / 1000
        with Image.open(gt_dir) as gt_img:
            gt_array = np.array(gt_img, dtype=np.float32) / 1000
        if mde_array.shape != gt_array.shape:
            # Boolean-mask indexing below needs identical shapes; fail with a
            # message that names the offending files.
            raise ValueError(
                f'shape mismatch: {mde_dir} is {mde_array.shape}, {gt_dir} is {gt_array.shape}'
            )

        # Clamp the prediction; NaN is handled last because it fails both comparisons.
        mde_array[mde_array < 1e-3] = 1e-3
        mde_array[mde_array > 10] = 10
        mde_array[np.isinf(mde_array)] = 10
        mde_array[np.isnan(mde_array)] = 1e-3

        valid_mask = np.logical_and(gt_array > 1e-3, gt_array < 10)
        eval_mask = np.zeros(valid_mask.shape, dtype=bool)
        eval_mask[45:471, 41:601] = True
        valid_mask = np.logical_and(valid_mask, eval_mask)
        if not valid_mask.any():
            # Nothing to evaluate in this image; compute_errors cannot handle empty input.
            print(f'skipping {gt_dir}: no valid ground-truth pixels')
            continue

        measures_sum += compute_errors(gt_array[valid_mask], mde_array[valid_mask])
        cnt += 1

    if cnt == 0:
        # Avoid dividing by zero and printing a row of NaNs.
        print(f'no images were evaluated under {gts_path}')
        return
    measures_sum /= cnt
    print('oe psnr ssim absrel rmse mae iabsrel irmse imae d1 d2 d3')
    print(f'{measures_sum[0]:.3f} {measures_sum[1]:.3f} {measures_sum[2]:.3f} {measures_sum[3]:.3f} {measures_sum[4]:.3f} {measures_sum[5]:.3f} {measures_sum[6]:.3f} {measures_sum[7]:.3f} {measures_sum[8]:.3f} {measures_sum[9]:.1f} {measures_sum[10]:.1f} {measures_sum[11]:.1f}')
if __name__ == '__main__':
    # Print wall-clock timestamps around the evaluation; the start time is
    # repeated at the end so both values appear together in the log.
    tm_begin = datetime.datetime.now()
    print('tm_begin: ', tm_begin)

    main()

    tm_end = datetime.datetime.now()
    print('tm_begin: ', tm_begin)
    print('tm_end: ', tm_end)
I computed the RMSE in the same way for the method "NeWCRFs" and got the expected RMSE of 0.333. However, for ZoeDepth I got an RMSE of 0.375. As you can see from the commented-out lines, I have tried cropping the black border of the RGB image in the same way ZoeDepth does, changing the config mode from 'infer' to 'eval', and saving .npy files to avoid round-off error, but none of these helped. Interestingly, when I run the author's evaluate.py directly, I get the expected RMSE of 0.27.
Calling "mde_array = zoe.infer_pil(rgb_pil, output_type="numpy")" directly is a very convenient way to save depth maps, but what makes the metrics worse?
I load the model "ZoeD_M12_N.pt" and run the following code, with only minor modifications, to save the metric depth maps first.
In this way I get the output metric depth in mm. I then evaluate it with the following code and get a poor RMSE of 0.375:
I computed the RMSE in the same way for the method "NeWCRFs" and got the expected RMSE of 0.333. However, for ZoeDepth I got an RMSE of 0.375. As you can see from the commented-out lines, I have tried cropping the black border of the RGB image in the same way ZoeDepth does, changing the config mode from 'infer' to 'eval', and saving .npy files to avoid round-off error, but none of these helped. Interestingly, when I run the author's evaluate.py directly, I get the expected RMSE of 0.27.
Calling "mde_array = zoe.infer_pil(rgb_pil, output_type="numpy")" directly is a very convenient way to save depth maps, but what makes the metrics worse?