Skip to content

Commit

Permalink
perf: read_off remove pandas.read_csv skiprows
Browse files Browse the repository at this point in the history
Passing the already-positioned file handle to pandas avoids the need for pandas to reparse the
first rows of the file to find the starting position.

---

Also, improve robustness of header parsing a bit.

In particular, ModelNet40 has faulty headers:
```bash
$ head -n 1 ModelNet40/chair/train/chair_0856.off
OFF6586 5534 0
```

For reference, the correct format is:
```
OFF
6586 5534 0
```

Nonetheless, it is still valuable to parse the faulty header.
  • Loading branch information
YodaEmbedding committed Dec 24, 2023
1 parent 248fd0f commit fa353ac
Showing 1 changed file with 38 additions and 27 deletions.
65 changes: 38 additions & 27 deletions pyntcloud/io/off.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,73 @@
import pandas as pd
import re

import numpy as np
import pandas as pd


def read_off(filename):
    """Read an OFF (Object File Format) file into point and mesh tables.

    The first line must contain the word ``OFF``; if it also contains the
    letter ``C`` each vertex row is expected to carry ``red green blue``
    values after its coordinates. The first non-comment line holding at
    least two fields gives the vertex and face counts. Faulty headers that
    glue the counts onto the magic word (e.g. ``OFF6586 5534 0``, as found
    in ModelNet40) are accepted as well.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    data : dict
        ``data["points"]`` is a DataFrame with float32 columns ``x``, ``y``,
        ``z`` (plus uint8 ``red``, ``green``, ``blue`` for colored files).
        ``data["mesh"]`` is a DataFrame with columns ``v1``, ``v2``, ``v3``
        taken from fields 1-3 of every face row; it is empty when the header
        declares zero faces.

    Raises
    ------
    ValueError
        If the first line lacks ``OFF``, no header line with the counts is
        found, the counts are not integers, or the file declares no points.
    """
    # Local imports: only this function needs them.
    from io import StringIO
    from itertools import islice

    with open(filename) as f:
        first_line = f.readline()
        if "OFF" not in first_line:
            raise ValueError("The file does not start with the word OFF")
        has_color = "C" in first_line

        # Deal with faulty headers, e.g. "OFF4 4 0": the counts sit on the
        # first line right after the magic word. They are parsed from the
        # already-read line instead of seeking back, because seeking a text
        # file to an offset that did not come from tell() is undefined.
        counts = None
        m = re.match(r"^(?P<prefix>\D+)(?P<counts>[\d\s]+)$", first_line)
        if m:
            counts = _parse_off_counts(m.group("counts"))

        # Regular header: counts are on the first meaningful line after "OFF".
        if counts is None:
            for line in f:
                counts = _parse_off_counts(line)
                if counts is not None:
                    break

        if counts is None:
            raise ValueError("The file does not contain a valid header")

        n_points, n_faces = counts
        if n_points == 0:
            raise ValueError("The file contains no points")

        # Lazily yield the remaining data rows, skipping blank lines and
        # full-line comments so that slicing by row count stays aligned.
        data_lines = (
            line
            for line in f
            if line.strip() and not line.lstrip().startswith("#")
        )

        # Cut each section out explicitly. Handing the open file to two
        # consecutive read_csv calls is unreliable: the C parser reads ahead
        # in large chunks, so the handle is not left at the end of the
        # points section after the first call.
        points_text = "".join(islice(data_lines, n_points))
        faces_text = "".join(islice(data_lines, n_faces))

    point_names = ["x", "y", "z"]
    point_types = {"x": np.float32, "y": np.float32, "z": np.float32}
    if has_color:
        point_names.extend(["red", "green", "blue"])
        color_point_types = {"red": np.uint8, "green": np.uint8, "blue": np.uint8}
        point_types = {**point_types, **color_point_types}

    data = {}
    data["points"] = pd.read_csv(
        StringIO(points_text),
        sep=" ",
        header=None,
        engine="c",
        nrows=n_points,
        names=point_names,
        dtype=point_types,
        index_col=False,
        comment="#",
    )

    face_names = ["v1", "v2", "v3"]
    if n_faces == 0:
        # Point-only file: pandas cannot parse an empty buffer, so build the
        # empty table directly.
        data["mesh"] = pd.DataFrame(columns=face_names, dtype=np.int64)
    else:
        data["mesh"] = pd.read_csv(
            StringIO(faces_text),
            sep=" ",
            header=None,
            engine="c",
            nrows=n_faces,
            # Field 0 of a face row is its vertex count; keep the indices.
            usecols=[1, 2, 3],
            names=face_names,
            comment="#",
        )

    return data


def _parse_off_counts(line):
    """Return ``(n_points, n_faces)`` from a header line, or ``None``.

    ``None`` is returned for comment lines and for lines with fewer than two
    whitespace-separated fields. Non-integer fields raise ``ValueError``.
    """
    if line.startswith("#"):
        return None
    fields = line.split()
    if len(fields) <= 1:
        return None
    return int(fields[0]), int(fields[1])

0 comments on commit fa353ac

Please sign in to comment.