2 changes: 1 addition & 1 deletion droid_slam/data_readers/rgbd_utils.py
@@ -2,7 +2,7 @@
 import os.path as osp
 
 import torch
-from lietorch import SE3
+from thirdparty.lietorch.lietorch import SE3
 
 import geom.projective_ops as pops
 from scipy.spatial.transform import Rotation
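Reviewer note: this PR swaps every `from lietorch import SE3` for the copy vendored under `thirdparty/lietorch`. A minimal sketch of a fallback shim that would keep both layouts working, assuming the repository root is on `sys.path` (the shim itself is not part of this PR):

```python
# Hypothetical compatibility shim -- not part of this PR.
# Prefer the vendored copy under thirdparty/, fall back to an
# installed lietorch package if the submodule is absent.
try:
    from thirdparty.lietorch.lietorch import SE3
except ImportError:
    from lietorch import SE3  # pip/conda-installed package
```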
2 changes: 1 addition & 1 deletion droid_slam/data_readers/tartan.py
@@ -6,7 +6,7 @@
 import os
 import os.path as osp
 
-from lietorch import SE3
+from thirdparty.lietorch.lietorch import SE3
 from .base import RGBDDataset
 from .stream import RGBDStream
 
4 changes: 2 additions & 2 deletions droid_slam/depth_video.py
@@ -1,6 +1,6 @@
 import numpy as np
 import torch
-import lietorch
+from thirdparty.lietorch.lietorch import SE3
 import droid_backends
 
 from torch.multiprocessing import Process, Queue, Lock, Value
@@ -139,7 +139,7 @@ def normalize(self):
     def reproject(self, ii, jj):
         """ project points from ii -> jj """
         ii, jj = DepthVideo.format_indicies(ii, jj)
-        Gs = lietorch.SE3(self.poses[None])
+        Gs = SE3(self.poses[None])
 
         coords, valid_mask = \
             pops.projective_transform(Gs, self.disps[None], self.intrinsics[None], ii, jj)
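For context on the `Gs = SE3(self.poses[None])` change: lietorch stores each SE3 element as a 7-vector (translation plus unit quaternion, xyzw), and wrapping the raw pose tensor yields a batched group object whose operators compose rigid motions. A rough sketch of the pattern, assuming the vendored import path from this PR; the data here is illustrative only:

```python
import torch
from thirdparty.lietorch.lietorch import SE3  # vendored path from this PR

# 12 poses, each stored as (tx, ty, tz, qx, qy, qz, qw)
poses = torch.zeros(12, 7)
poses[:, -1] = 1.0                 # identity quaternions
Gs = SE3(poses[None])              # batched SE3 with batch shape (1, 12)

ii = torch.tensor([0, 1]); jj = torch.tensor([1, 2])
Gij = Gs[:, jj] * Gs[:, ii].inv()  # relative motions frame ii -> jj
```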
10 changes: 5 additions & 5 deletions droid_slam/droid_backend.py
@@ -2,7 +2,7 @@
 import lietorch
 import numpy as np
 
-from lietorch import SE3
+from thirdparty.lietorch.lietorch import SE3
 from factor_graph import FactorGraph
 
 
@@ -20,7 +20,7 @@ def __init__(self, net, video, args):
         self.backend_thresh = args.backend_thresh
         self.backend_radius = args.backend_radius
         self.backend_nms = args.backend_nms
-        
+
     @torch.no_grad()
     def __call__(self, steps=12):
         """ main update """
@@ -31,9 +31,9 @@ def __call__(self, steps=12):
 
         graph = FactorGraph(self.video, self.update_op, corr_impl="alt", max_factors=16*t, upsample=self.upsample)
 
-        graph.add_proximity_factors(rad=self.backend_radius, 
-                                    nms=self.backend_nms, 
-                                    thresh=self.backend_thresh, 
+        graph.add_proximity_factors(rad=self.backend_radius,
+                                    nms=self.backend_nms,
+                                    thresh=self.backend_thresh,
                                     beta=self.beta)
 
         graph.update_lowmem(steps=steps)
2 changes: 1 addition & 1 deletion droid_slam/droid_frontend.py
@@ -2,7 +2,7 @@
 import lietorch
 import numpy as np
 
-from lietorch import SE3
+from thirdparty.lietorch.lietorch import SE3
 from factor_graph import FactorGraph
 
 
18 changes: 17 additions & 1 deletion droid_slam/droid_net.py
@@ -9,7 +9,6 @@
 from modules.gru import ConvGRU
 from modules.clipping import GradientClip
 
-from lietorch import SE3
 from geom.ba import BA
 
 import geom.projective_ops as pops
@@ -77,6 +76,23 @@ def forward(self, net, ii):
 
 class UpdateModule(nn.Module):
     def __init__(self):
+        """
+        Initialize the UpdateModule class.
+        This method sets up the neural network layers and components used in the UpdateModule.
+        It includes the following components:
+        - corr_encoder: a sequential model of two convolutional layers with ReLU activations;
+          it processes the correlation features.
+        - flow_encoder: a sequential model of two convolutional layers with ReLU activations;
+          it processes the optical flow.
+        - weight: a sequential model of two convolutional layers with ReLU activations,
+          a GradientClip layer, and a Sigmoid activation; it computes the confidence weights.
+        - delta: a sequential model of two convolutional layers with ReLU activations
+          and a GradientClip layer; it computes the flow revisions (delta).
+        - gru: a ConvGRU layer that processes the concatenated correlation, flow, and context features.
+        - agg: a GraphAgg layer that performs aggregation over the frame graph.
+        The method also computes the number of correlation planes (cor_planes) from the pyramid levels and lookup radius.
+        """
+
         super(UpdateModule, self).__init__()
         cor_planes = 4 * (2*3 + 1)**2
 
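The docstring's last line can be made concrete: with the 4-level correlation pyramid and lookup radius 3 used here, each level contributes a (2*3 + 1)^2 = 49-sample lookup window, so `cor_planes = 4 * 49 = 196` input channels for `corr_encoder`. A quick check:

```python
num_levels, radius = 4, 3                        # pyramid levels and lookup radius
cor_planes = num_levels * (2 * radius + 1) ** 2  # samples per level, times levels
assert cor_planes == 196                         # channels fed into corr_encoder
```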
4 changes: 2 additions & 2 deletions droid_slam/factor_graph.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 import matplotlib.pyplot as plt
-from lietorch import SE3
+from thirdparty.lietorch.lietorch import SE3
 from modules.corr import CorrBlock, AltCorrBlock
 import geom.projective_ops as pops
 
@@ -119,7 +119,7 @@ def add_factors(self, ii, jj, remove=False):
         inp = self.video.inps[ii].to(self.device).unsqueeze(0)
         self.inp = inp if self.inp is None else torch.cat([self.inp, inp], 1)
 
-        with torch.cuda.amp.autocast(enabled=False):
+        with torch.amp.autocast('cuda', enabled=False):
            target, _ = self.video.reproject(ii, jj)
            weight = torch.zeros_like(target)
 
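Reviewer note on the `torch.amp.autocast('cuda', enabled=False)` change: `torch.cuda.amp.autocast(...)` is deprecated in recent PyTorch releases in favor of the device-generic API, which takes the device type as its first argument. A minimal sketch of the two spellings (PyTorch 2.x assumed):

```python
import torch

# old, deprecated spelling:
#   with torch.cuda.amp.autocast(enabled=False): ...

# new device-generic spelling, as used in this PR:
with torch.amp.autocast('cuda', enabled=False):
    pass  # ops here run in full precision even inside an outer autocast region
```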
1 change: 0 additions & 1 deletion droid_slam/geom/chol.py
@@ -1,6 +1,5 @@
 import torch
 import torch.nn.functional as F
-import geom.projective_ops as pops
 
 class CholeskySolver(torch.autograd.Function):
     @staticmethod
2 changes: 1 addition & 1 deletion droid_slam/geom/losses.py
@@ -1,7 +1,7 @@
 from collections import OrderedDict
 import numpy as np
 import torch
-from lietorch import SO3, SE3, Sim3
+from thirdparty.lietorch.lietorch import SO3, SE3, Sim3
 from .graph_utils import graph_to_edge_list
 from .projective_ops import projective_transform
 
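Since `losses.py` only changes its import source, a brief note for reviewers: the SO3/SE3/Sim3 groups imported here are used to compare predicted and ground-truth relative motions in the tangent space. A rough sketch of that general pattern, not this file's exact loss; the function and argument names are illustrative:

```python
from thirdparty.lietorch.lietorch import SE3  # vendored path from this PR

def pose_geodesic_error(Gs, Gs_gt, ii, jj):
    # relative motion along each frame-graph edge ii -> jj
    dG_hat = Gs[:, jj] * Gs[:, ii].inv()      # predicted
    dG = Gs_gt[:, jj] * Gs_gt[:, ii].inv()    # ground truth
    # residual measured in the Lie-algebra (tangent) space
    return (dG.inv() * dG_hat).log().norm(dim=-1)
```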
15 changes: 13 additions & 2 deletions droid_slam/geom/projective_ops.py
@@ -1,14 +1,25 @@
 import torch
 import torch.nn.functional as F
 
-from lietorch import SE3, Sim3
+from thirdparty.lietorch.lietorch import SE3, Sim3
 
 MIN_DEPTH = 0.2
 
 def extract_intrinsics(intrinsics):
     return intrinsics[...,None,None,:].unbind(dim=-1)
 
 def coords_grid(ht, wd, **kwargs):
+    """
+    Generates a grid of pixel coordinates.
+    Args:
+        ht (int): Height of the grid.
+        wd (int): Width of the grid.
+        **kwargs: Additional arguments passed to the `to` method of the tensors.
+    Returns:
+        torch.Tensor: A tensor of shape (ht, wd, 2) containing the coordinate grid.
+        The first channel contains the x-coordinates and the second the y-coordinates.
+    """
+
     y, x = torch.meshgrid(
         torch.arange(ht).to(**kwargs).float(),
         torch.arange(wd).to(**kwargs).float())
@@ -100,7 +111,7 @@ def projective_transform(poses, depths, intrinsics, ii, jj, jacobian=False, retu
     X0, Jz = iproj(depths[:,ii], intrinsics[:,ii], jacobian=jacobian)
 
     # transform
-    Gij = poses[:,jj] * poses[:,ii].inv()
+    Gij = poses[:,jj] * poses[:,ii].inv()  # TODO: it breaks here!!!
 
     Gij.data[:,ii==jj] = torch.as_tensor([-0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], device="cuda")
     X1, Ja = actp(Gij, X0, jacobian=jacobian)
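A small usage check for the newly documented `coords_grid`, matching the shape contract in the docstring (`**kwargs` are forwarded to `.to`, so device placement works the same way the callers in this PR use it):

```python
import torch
import geom.projective_ops as pops  # repo-local import, as used above

grid = pops.coords_grid(4, 6)       # CPU float grid
print(grid.shape)                   # torch.Size([4, 6, 2])
print(grid[0, 5])                   # tensor([5., 0.]) -> x=5, y=0
# grid = pops.coords_grid(4, 6, device='cuda')  # forwarded via .to(**kwargs)
```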
13 changes: 13 additions & 0 deletions droid_slam/modules/corr.py
@@ -22,6 +22,19 @@ def backward(ctx, grad_output):
 
 class CorrBlock:
     def __init__(self, fmap1, fmap2, num_levels=4, radius=3):
+        """
+        Initializes the CorrBlock class.
+        Args:
+            fmap1 (torch.Tensor): The first feature map tensor.
+            fmap2 (torch.Tensor): The second feature map tensor.
+            num_levels (int, optional): The number of levels in the correlation pyramid. Default is 4.
+            radius (int, optional): The lookup radius for correlation. Default is 3.
+        Attributes:
+            num_levels (int): The number of levels in the correlation pyramid.
+            radius (int): The lookup radius for correlation.
+            corr_pyramid (list): A list storing the levels of the correlation pyramid.
+        """
+
         self.num_levels = num_levels
         self.radius = radius
         self.corr_pyramid = []
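For reviewers new to the correlation pyramid the docstring describes: each level is an all-pairs feature correlation, average-pooled to a coarser resolution, RAFT-style. A self-contained sketch of the idea under simplified (B, C, H, W) shapes — not the repo's exact implementation, which fuses this with custom kernels and an extra views dimension:

```python
import torch
import torch.nn.functional as F

def build_corr_pyramid(fmap1, fmap2, num_levels=4):
    """All-pairs correlation, pooled into a pyramid (RAFT-style sketch)."""
    B, C, H, W = fmap1.shape
    corr = torch.einsum('bchw,bcuv->bhwuv', fmap1, fmap2) / C**0.5
    corr = corr.reshape(B * H * W, 1, H, W)   # one H x W response map per pixel
    pyramid = [corr]
    for _ in range(num_levels - 1):
        corr = F.avg_pool2d(corr, 2, stride=2)
        pyramid.append(corr)
    return pyramid

pyr = build_corr_pyramid(torch.randn(1, 32, 8, 8), torch.randn(1, 32, 8, 8))
print([p.shape for p in pyr])  # levels: 8x8, 4x4, 2x2, 1x1
```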
17 changes: 12 additions & 5 deletions droid_slam/modules/extractor.py
@@ -181,9 +181,16 @@ def _make_layer(self, dim, stride=1):
         return nn.Sequential(*layers)
 
     def forward(self, x):
-        b, n, c1, h1, w1 = x.shape
-        x = x.view(b*n, c1, h1, w1)
-
+        batch_size, num_views, \
+            input_channels, input_height, input_width = x.shape
+        # Flatten batch and views together
+        x = x.view(
+            batch_size*num_views,
+            input_channels,
+            input_height,
+            input_width
+        )
+        # Apply convolutions
         x = self.conv1(x)
         x = self.norm1(x)
         x = self.relu1(x)
@@ -194,5 +201,5 @@ def forward(self, x):
 
         x = self.conv2(x)
 
-        _, c2, h2, w2 = x.shape
-        return x.view(b, n, c2, h2, w2)
+        batchxviews_size, output_channels, output_height, output_width = x.shape
+        return x.view(batch_size, num_views, output_channels, output_height, output_width)
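The renamed `forward` makes the shape bookkeeping explicit: the 2D conv stack only accepts 4-D input, so batch and views are flattened on the way in and restored on the way out. A standalone sketch of the pattern with illustrative shapes; note that `batchxviews_size` in the new code is assigned but never read, so `_` would signal that more clearly:

```python
import torch

x = torch.randn(2, 7, 3, 48, 64)   # (batch, views, channels, H, W)
b, n, c, h, w = x.shape
flat = x.view(b * n, c, h, w)      # convolutions see one flat 4-D batch
# ... conv stack runs on `flat`; channels/resolution may change ...
_, c2, h2, w2 = flat.shape
out = flat.view(b, n, c2, h2, w2)  # restore (batch, views, ...) at the end
print(out.shape)                   # torch.Size([2, 7, 3, 48, 64])
```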
60 changes: 35 additions & 25 deletions droid_slam/motion_filter.py
@@ -1,7 +1,6 @@
 import cv2
 import torch
-import lietorch
-
+from thirdparty.lietorch.lietorch import SE3
 from collections import OrderedDict
 from droid_net import DroidNet
 
@@ -15,7 +14,7 @@ class MotionFilter:
     def __init__(self, net, video, thresh=2.5, device="cuda:0"):
 
         # split net modules
-        self.cnet = net.cnet
+        self.context_encoder = net.cnet
         self.fnet = net.fnet
         self.update = net.update
 
@@ -28,55 +27,66 @@ def __init__(self, net, video, thresh=2.5, device="cuda:0"):
         # mean, std for image normalization
         self.MEAN = torch.as_tensor([0.485, 0.456, 0.406], device=self.device)[:, None, None]
         self.STDV = torch.as_tensor([0.229, 0.224, 0.225], device=self.device)[:, None, None]
-    @torch.cuda.amp.autocast(enabled=True)
+
+    @torch.autocast("cuda", enabled=True)
     def __context_encoder(self, image):
         """ context features """
-        net, inp = self.cnet(image).split([128,128], dim=2)
-        return net.tanh().squeeze(0), inp.relu().squeeze(0)
+        feature_map = self.context_encoder(image)
+        context_features, input_features = feature_map.split([128, 128], dim=2)
+        return context_features.tanh().squeeze(0), input_features.relu().squeeze(0)
 
-    @torch.cuda.amp.autocast(enabled=True)
+    @torch.autocast("cuda", enabled=True)
     def __feature_encoder(self, image):
         """ features for correlation volume """
         return self.fnet(image).squeeze(0)
 
-    @torch.cuda.amp.autocast(enabled=True)
+    @torch.autocast("cuda", enabled=True)
     @torch.no_grad()
     def track(self, tstamp, image, depth=None, intrinsics=None):
         """ main update operation - run on every frame in video """
 
-        Id = lietorch.SE3.Identity(1,).data.squeeze()
-        ht = image.shape[-2] // 8
-        wd = image.shape[-1] // 8
+        IdentityTransformation = SE3.Identity(1,).data.squeeze()
+        height_div_8 = image.shape[-2] // 8
+        width_div_8 = image.shape[-1] // 8
 
         # normalize images
-        inputs = image[None, :, [2,1,0]].to(self.device) / 255.0
-        inputs = inputs.sub_(self.MEAN).div_(self.STDV)
+        normalized_image_input = image[None, :, [2,1,0]].to(self.device) / 255.0
+        normalized_image_input = normalized_image_input.sub_(self.MEAN).div_(self.STDV)
 
         # extract features
-        gmap = self.__feature_encoder(inputs)
+        image_feature_representation = self.__feature_encoder(normalized_image_input)
 
         ### always add first frame to the depth video ###
         if self.video.counter.value == 0:
-            net, inp = self.__context_encoder(inputs[:,[0]])
-            self.net, self.inp, self.fmap = net, inp, gmap
-            self.video.append(tstamp, image[0], Id, 1.0, depth, intrinsics / 8.0, gmap, net[0,0], inp[0,0])
+            context_features, context_input_features = self.__context_encoder(normalized_image_input[:,[0]])
+            self.context_features, self.context_input_features, self.image_feature_map = context_features, context_input_features, image_feature_representation
+            self.video.append(
+                tstamp,
+                image[0],
+                IdentityTransformation,
+                1.0,  # initial disparity (the pose above is the identity)
+                depth,
+                intrinsics / 8.0,
+                image_feature_representation,
+                context_features[0,0],
+                context_input_features[0,0]
+            )
 
         ### only add new frame if there is enough motion ###
-        else:             
+        else:
            # index correlation volume
-            coords0 = pops.coords_grid(ht, wd, device=self.device)[None,None]
-            corr = CorrBlock(self.fmap[None,[0]], gmap[None,[0]])(coords0)
+            grid_coordinates = pops.coords_grid(height_div_8, width_div_8, device=self.device)[None,None]
+            correlation_features = CorrBlock(self.image_feature_map[None,[0]], image_feature_representation[None,[0]])(grid_coordinates)
 
             # approximate flow magnitude using 1 update iteration
-            _, delta, weight = self.update(self.net[None], self.inp[None], corr)
+            _, delta, weight = self.update(self.context_features[None], self.context_input_features[None], correlation_features)
 
             # check motion magnitue / add new frame to video
             if delta.norm(dim=-1).mean().item() > self.thresh:
                 self.count = 0
-                net, inp = self.__context_encoder(inputs[:,[0]])
-                self.net, self.inp, self.fmap = net, inp, gmap
-                self.video.append(tstamp, image[0], None, None, depth, intrinsics / 8.0, gmap, net[0], inp[0])
+                context_features, context_input_features = self.__context_encoder(normalized_image_input[:,[0]])
+                self.context_features, self.context_input_features, self.image_feature_map = context_features, context_input_features, image_feature_representation
+                self.video.append(tstamp, image[0], None, None, depth, intrinsics / 8.0, image_feature_representation, context_features[0], context_input_features[0])
 
             else:
                 self.count += 1
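To summarize the renamed `track` logic for reviewers: one GRU update predicts a flow revision against the last keyframe, and a new keyframe is inserted only when the mean flow magnitude (measured at 1/8 resolution) exceeds `thresh`. A condensed, hypothetical sketch of just the gate:

```python
# Hypothetical condensation of the keyframe gate in MotionFilter.track:
mean_flow = delta.norm(dim=-1).mean().item()  # avg predicted flow, pixels at 1/8 res
if mean_flow > self.thresh:
    self.count = 0      # enough motion: add the frame as a keyframe
    # ... re-encode context, refresh cached features, append to the depth video ...
else:
    self.count += 1     # too little motion: skip the frame
```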
5 changes: 3 additions & 2 deletions droid_slam/trajectory_filler.py
@@ -2,7 +2,8 @@
 import torch
 import lietorch
 
-from lietorch import SE3
+from thirdparty.lietorch.lietorch import SE3
+from thirdparty.lietorch.lietorch.groups import cat
 from collections import OrderedDict
 from factor_graph import FactorGraph
 from droid_net import DroidNet
@@ -100,5 +101,5 @@ def __call__(self, image_stream):
             pose_list += self.__fill(tstamps, images, intrinsics)
 
         # stitch pose segments together
-        return lietorch.cat(pose_list, 0)
+        return cat(pose_list, 0)
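On the `cat` change: lietorch's `cat` concatenates batched group elements along a dimension, much like `torch.cat` for plain tensors; the PR now imports it directly from the vendored `groups` module instead of going through `lietorch.cat`. A small sketch, assuming the vendored layout introduced by this PR:

```python
from thirdparty.lietorch.lietorch import SE3
from thirdparty.lietorch.lietorch.groups import cat  # vendored path from this PR

seg1 = SE3.Identity(3)       # batch of 3 identity poses
seg2 = SE3.Identity(2)
full = cat([seg1, seg2], 0)  # SE3 with batch shape (5,)
```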

2 changes: 1 addition & 1 deletion droid_slam/visualization.py
@@ -7,7 +7,7 @@
 import numpy as np
 import open3d as o3d
 
-from lietorch import SE3
+from thirdparty.lietorch.lietorch import SE3
 import geom.projective_ops as pops
 
 CAM_POINTS = np.array([