Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Polars to read and write rather than Pandas #56

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
scripts and larger software packages to provide basic STAR file I/O functions.
Data is exposed as simple python dictionaries or
[pandas dataframes](https://pandas.pydata.org/docs/user_guide/dsintro.html#dataframe).
(Data can instead be exposed as [polars dataframes](https://docs.pola.rs/py-polars/html/reference/dataframe/index.html)
by passing `polars=True` to the `read` function.)

This package was designed principally for compatibility with files generated by
[RELION](https://www3.mrc-lmb.cam.ac.uk/relion/index.php/Main_Page).
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ classifiers = [
dependencies = [
"numpy",
"pandas>=2.1.1",
"polars>=0.20",
"pyarrow",
"typing-extensions",
]
Expand Down
33 changes: 19 additions & 14 deletions src/starfile/functions.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Dict, List, Union, Optional
from typing import TYPE_CHECKING, Optional, Union

if TYPE_CHECKING:
import pandas as pd
from os import PathLike


from .parser import StarParser
from .writer import StarWriter
from .typing import DataBlock
from .writer import StarWriter

if TYPE_CHECKING:
import pandas as pd
from os import PathLike


def read(
filename: PathLike,
read_n_blocks: Optional[int] = None,
always_dict: bool = False,
parse_as_string: List[str] = []
) -> Union[DataBlock, Dict[DataBlock]]:
parse_as_string: list[str] = [],
polars: bool = False,
) -> Union[DataBlock, dict[DataBlock]]:
"""Read data from a STAR file.

Basic data blocks are read as dictionaries. Loop blocks are read as pandas
Expand All @@ -40,22 +40,27 @@ def read(
parse_as_string: list[str]
A list of keys or column names which will not be coerced to numeric values.
"""
parser = StarParser(filename, n_blocks_to_read=read_n_blocks, parse_as_string=parse_as_string)
parser = StarParser(
filename,
n_blocks_to_read=read_n_blocks,
parse_as_string=parse_as_string,
polars=polars,
)
if len(parser.data_blocks) == 1 and always_dict is False:
return list(parser.data_blocks.values())[0]
else:
return parser.data_blocks


def write(
data: Union[DataBlock, Dict[str, DataBlock], List[DataBlock]],
data: Union[DataBlock, dict[str, DataBlock], list[DataBlock]],
filename: PathLike,
float_format: str = '%.6f',
sep: str = '\t',
na_rep: str = '<NA>',
float_format: int = 6,
sep: str = "\t",
na_rep: str = "<NA>",
quote_character: str = '"',
quote_all_strings: bool = False,
**kwargs
**kwargs,
):
"""Write data to disk in the STAR format.

Expand All @@ -66,8 +71,8 @@ def write(
If a dictionary of datablocks are passed the keys will be the data block names.
filename: PathLike
Path where the file will be saved.
float_format: str
Float format string which will be passed to pandas.
float_format: int
Number of decimal places to write floats to.
sep: str
Separator between values, will be passed to pandas.
na_rep: str
Expand Down
115 changes: 61 additions & 54 deletions src/starfile/parser.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from __future__ import annotations

import shlex
from collections import deque
from functools import lru_cache
from io import StringIO
from linecache import getline
import shlex
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Union

import numpy as np
import pandas as pd
from pathlib import Path
from typing import TYPE_CHECKING, Union, Optional, Dict, Tuple, List
import polars as pl

from starfile.typing import DataBlock

Expand All @@ -21,14 +23,15 @@ class StarParser:
n_lines_in_file: int
n_blocks_to_read: int
current_line_number: int
data_blocks: Dict[DataBlock]
parse_as_string: List[str]
data_blocks: dict[DataBlock]
parse_as_string: list[str]

def __init__(
self,
filename: PathLike,
n_blocks_to_read: Optional[int] = None,
parse_as_string: List[str] = [],
parse_as_string: list[str] = [],
polars: bool = False,
):
# set filename, with path checking
filename = Path(filename)
Expand All @@ -42,48 +45,51 @@ def __init__(
self.n_blocks_to_read = n_blocks_to_read
self.parse_as_string = parse_as_string

self.polars = polars

# parse file
self.current_line_number = 0
self.parse_file()

@property
def current_line(self) -> str:
return getline(str(self.filename), self.current_line_number).strip()
@lru_cache(maxsize=25)
def _get_line(self, line_number: int) -> str:
return " ".join(getline(str(self.filename), line_number).split())

def parse_file(self):
while self.current_line_number <= self.n_lines_in_file:
if len(self.data_blocks) == self.n_blocks_to_read:
break
elif self.current_line.startswith('data_'):
elif self._get_line(self.current_line_number).startswith("data_"):
block_name, block = self._parse_data_block()
self.data_blocks[block_name] = block
else:
self.current_line_number += 1

def _parse_data_block(self) -> Tuple[str, DataBlock]:
def _parse_data_block(self) -> tuple[str, DataBlock]:
# current line starts with 'data_foo'
block_name = self.current_line[5:] # 'data_foo' -> 'foo'
block_name = self._get_line(self.current_line_number)[5:] # 'data_foo' -> 'foo'
self.current_line_number += 1

# iterate over file,
while self.current_line_number <= self.n_lines_in_file:
self.current_line_number += 1
if self.current_line.startswith('loop_'):
current_line = self._get_line(self.current_line_number)
if current_line.startswith("loop_"):
return block_name, self._parse_loop_block()
elif self.current_line.startswith('_'): # line is simple block
elif current_line.startswith("_"): # line is simple block
return block_name, self._parse_simple_block()

def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
def _parse_simple_block(self) -> dict[str, Union[str, int, float]]:
block = {}
while self.current_line_number <= self.n_lines_in_file:
if self.current_line.startswith('data'):
c = self._get_line(self.current_line_number)
if c.startswith("data"):
break
elif self.current_line.startswith('_'): # '_foo bar'
k, v = shlex.split(self.current_line)
elif c.startswith("_"): # '_foo bar'
k, v = shlex.split(c)
column_name = k[1:]
parse_column_as_string = (
self.parse_as_string is not None
and any(column_name == col for col in self.parse_as_string)
parse_column_as_string = self.parse_as_string is not None and any(
column_name == col for col in self.parse_as_string
)
if parse_column_as_string is True:
block[column_name] = v
Expand All @@ -92,58 +98,66 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
self.current_line_number += 1
return block

def _parse_loop_block(self) -> pd.DataFrame:
def _parse_loop_block(self) -> pd.DataFrame | pl.DataFrame:
# parse loop header
loop_column_names = deque()
self.current_line_number += 1

while self.current_line.startswith('_'):
column_name = self.current_line.split()[0][1:]
while self._get_line(self.current_line_number).startswith("_"):
column_name = self._get_line(self.current_line_number).split()[0][1:]
loop_column_names.append(column_name)
self.current_line_number += 1

# now parse the loop block data
loop_data = deque()
while self.current_line_number <= self.n_lines_in_file:
if self.current_line.startswith('data_'):
current_line = self._get_line(self.current_line_number)
if current_line.startswith("data_"):
break
loop_data.append(self.current_line)
previous_line = self._get_line(self.current_line_number - 1)
if not (current_line.isspace() and previous_line.isspace()) and (
current_line and previous_line
):
loop_data.append(current_line)
self.current_line_number += 1
loop_data = '\n'.join(loop_data)
if loop_data[-2:] != '\n':
loop_data += '\n'
loop_data = "\n".join(loop_data)
if loop_data[-2:] != "\n":
loop_data += "\n"

# put string data into a dataframe
if loop_data == '\n':
if loop_data == "\n":
n_cols = len(loop_column_names)
df = pd.DataFrame(np.zeros(shape=(0, n_cols)))
df = pl.DataFrame(np.zeros(shape=(0, n_cols)))
else:
column_name_to_index = {col: idx for idx, col in enumerate(loop_column_names)}
df = pd.read_csv(
df = pl.read_csv(
StringIO(loop_data.replace("'", '"')),
delimiter=r'\s+',
header=None,
comment='#',
dtype={column_name_to_index[k]: str for k in self.parse_as_string if k in loop_column_names},
keep_default_na=False,
engine='c',
separator=" ",
has_header=False,
comment_prefix="#",
dtypes={
k: pl.String for k in self.parse_as_string if k in loop_column_names
},
truncate_ragged_lines=True,
null_values=["", "<NA>"],
)
df.columns = loop_column_names

# Numericise all columns in temporary copy
df_numeric = df.apply(_apply_numeric)

# Replace columns that are all NaN with the original columns
df_numeric[df_numeric.columns[df_numeric.isna().all()]] = df[df_numeric.columns[df_numeric.isna().all()]]
# If the column type is string then use empty strings rather than null
df = df.with_columns(pl.col(pl.String).fill_null(""))

# Replace columns that should be strings
for col in df.columns:
df[col] = df_numeric[col] if col not in self.parse_as_string else df[col]
return df
if col in self.parse_as_string:
df = df.with_columns(
pl.col(col).cast(pl.String).fill_null("").alias(col)
)
if self.polars:
return df
return df.to_pandas()


def count_lines(file: Path) -> int:
with open(file, 'rb') as f:
with open(file, "rb") as f:
return sum(1 for _ in f)


Expand All @@ -169,10 +183,3 @@ def numericise(value: str) -> Union[str, int, float]:
# If it's not a float either, leave it as a string
value = value
return value


def _apply_numeric(col: pd.Series) -> pd.Series:
try:
return pd.to_numeric(col)
except ValueError:
return col
8 changes: 4 additions & 4 deletions src/starfile/typing.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from __future__ import annotations

from typing import Union, Dict
from typing_extensions import TypeAlias
from typing import Dict, Union

import pandas as pd
import polars as pl
from typing_extensions import TypeAlias

DataBlock: TypeAlias = Union[
pd.DataFrame,
Dict[str, Union[str, int, float]]
Union[pd.DataFrame, pl.DataFrame], Dict[str, Union[str, int, float]]
]
Loading