Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Polars to read and write rather than Pandas #56

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
scripts and larger software packages to provide basic STAR file I/O functions.
Data is exposed as simple python dictionaries or
[pandas dataframes](https://pandas.pydata.org/docs/user_guide/dsintro.html#dataframe).
(Data can instead be exposed as [polars dataframes](https://docs.pola.rs/py-polars/html/reference/dataframe/index.html)
by passing `polars=True` to the `read` function.)

This package was designed principally for compatibility with files generated by
[RELION](https://www3.mrc-lmb.cam.ac.uk/relion/index.php/Main_Page).
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ classifiers = [
dependencies = [
"numpy",
"pandas>=2.1.1",
"polars>=0.20",
"pyarrow",
"typing-extensions",
]
Expand Down
33 changes: 19 additions & 14 deletions src/starfile/functions.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Dict, List, Union, Optional
from typing import TYPE_CHECKING, Optional, Union

if TYPE_CHECKING:
import pandas as pd
from os import PathLike


from .parser import StarParser
from .writer import StarWriter
from .typing import DataBlock
from .writer import StarWriter

if TYPE_CHECKING:
import pandas as pd
from os import PathLike


def read(
filename: PathLike,
read_n_blocks: Optional[int] = None,
always_dict: bool = False,
parse_as_string: List[str] = []
) -> Union[DataBlock, Dict[DataBlock]]:
parse_as_string: list[str] = [],
polars: bool = False,
) -> Union[DataBlock, dict[DataBlock]]:
"""Read data from a STAR file.

Basic data blocks are read as dictionaries. Loop blocks are read as pandas
Expand All @@ -40,22 +40,27 @@ def read(
parse_as_string: list[str]
A list of keys or column names which will not be coerced to numeric values.
"""
parser = StarParser(filename, n_blocks_to_read=read_n_blocks, parse_as_string=parse_as_string)
parser = StarParser(
filename,
n_blocks_to_read=read_n_blocks,
parse_as_string=parse_as_string,
polars=polars,
)
if len(parser.data_blocks) == 1 and always_dict is False:
return list(parser.data_blocks.values())[0]
else:
return parser.data_blocks


def write(
data: Union[DataBlock, Dict[str, DataBlock], List[DataBlock]],
data: Union[DataBlock, dict[str, DataBlock], list[DataBlock]],
filename: PathLike,
float_format: str = '%.6f',
sep: str = '\t',
na_rep: str = '<NA>',
float_format: int = 6,
sep: str = "\t",
na_rep: str = "<NA>",
quote_character: str = '"',
quote_all_strings: bool = False,
**kwargs
**kwargs,
):
"""Write data to disk in the STAR format.

Expand All @@ -66,8 +71,8 @@ def write(
If a dictionary of datablocks are passed the keys will be the data block names.
filename: PathLike
Path where the file will be saved.
float_format: str
Float format string which will be passed to pandas.
float_format: int
Number of decimal places to write floats to.
sep: str
Separator between values, will be passed to pandas.
na_rep: str
Expand Down
115 changes: 61 additions & 54 deletions src/starfile/parser.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from __future__ import annotations

import shlex
from collections import deque
from functools import lru_cache
from io import StringIO
from linecache import getline
import shlex
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Union

import numpy as np
import pandas as pd
from pathlib import Path
from typing import TYPE_CHECKING, Union, Optional, Dict, Tuple, List
import polars as pl

from starfile.typing import DataBlock

Expand All @@ -21,14 +23,15 @@ class StarParser:
n_lines_in_file: int
n_blocks_to_read: int
current_line_number: int
data_blocks: Dict[DataBlock]
parse_as_string: List[str]
data_blocks: dict[DataBlock]
parse_as_string: list[str]

def __init__(
self,
filename: PathLike,
n_blocks_to_read: Optional[int] = None,
parse_as_string: List[str] = [],
parse_as_string: list[str] = [],
polars: bool = False,
):
# set filename, with path checking
filename = Path(filename)
Expand All @@ -42,48 +45,51 @@ def __init__(
self.n_blocks_to_read = n_blocks_to_read
self.parse_as_string = parse_as_string

self.polars = polars

# parse file
self.current_line_number = 0
self.parse_file()

@property
def current_line(self) -> str:
return getline(str(self.filename), self.current_line_number).strip()
@lru_cache(maxsize=25)
def _get_line(self, line_number: int) -> str:
return " ".join(getline(str(self.filename), line_number).split())

def parse_file(self):
while self.current_line_number <= self.n_lines_in_file:
if len(self.data_blocks) == self.n_blocks_to_read:
break
elif self.current_line.startswith('data_'):
elif self._get_line(self.current_line_number).startswith("data_"):
block_name, block = self._parse_data_block()
self.data_blocks[block_name] = block
else:
self.current_line_number += 1

def _parse_data_block(self) -> Tuple[str, DataBlock]:
def _parse_data_block(self) -> tuple[str, DataBlock]:
# current line starts with 'data_foo'
block_name = self.current_line[5:] # 'data_foo' -> 'foo'
block_name = self._get_line(self.current_line_number)[5:] # 'data_foo' -> 'foo'
self.current_line_number += 1

# iterate over file,
while self.current_line_number <= self.n_lines_in_file:
self.current_line_number += 1
if self.current_line.startswith('loop_'):
current_line = self._get_line(self.current_line_number)
if current_line.startswith("loop_"):
return block_name, self._parse_loop_block()
elif self.current_line.startswith('_'): # line is simple block
elif current_line.startswith("_"): # line is simple block
return block_name, self._parse_simple_block()

def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
def _parse_simple_block(self) -> dict[str, Union[str, int, float]]:
block = {}
while self.current_line_number <= self.n_lines_in_file:
if self.current_line.startswith('data'):
c = self._get_line(self.current_line_number)
if c.startswith("data"):
break
elif self.current_line.startswith('_'): # '_foo bar'
k, v = shlex.split(self.current_line)
elif c.startswith("_"): # '_foo bar'
k, v = shlex.split(c)
column_name = k[1:]
parse_column_as_string = (
self.parse_as_string is not None
and any(column_name == col for col in self.parse_as_string)
parse_column_as_string = self.parse_as_string is not None and any(
column_name == col for col in self.parse_as_string
)
if parse_column_as_string is True:
block[column_name] = v
Expand All @@ -92,58 +98,66 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
self.current_line_number += 1
return block

def _parse_loop_block(self) -> pd.DataFrame:
def _parse_loop_block(self) -> pd.DataFrame | pl.DataFrame:
# parse loop header
loop_column_names = deque()
self.current_line_number += 1

while self.current_line.startswith('_'):
column_name = self.current_line.split()[0][1:]
while self._get_line(self.current_line_number).startswith("_"):
column_name = self._get_line(self.current_line_number).split()[0][1:]
loop_column_names.append(column_name)
self.current_line_number += 1

# now parse the loop block data
loop_data = deque()
while self.current_line_number <= self.n_lines_in_file:
if self.current_line.startswith('data_'):
current_line = self._get_line(self.current_line_number)
if current_line.startswith("data_"):
break
loop_data.append(self.current_line)
previous_line = self._get_line(self.current_line_number - 1)
if not (current_line.isspace() and previous_line.isspace()) and (
current_line and previous_line
):
loop_data.append(current_line)
self.current_line_number += 1
loop_data = '\n'.join(loop_data)
if loop_data[-2:] != '\n':
loop_data += '\n'
loop_data = "\n".join(loop_data)
if loop_data[-2:] != "\n":
loop_data += "\n"

# put string data into a dataframe
if loop_data == '\n':
if loop_data == "\n":
n_cols = len(loop_column_names)
df = pd.DataFrame(np.zeros(shape=(0, n_cols)))
df = pl.DataFrame(np.zeros(shape=(0, n_cols)))
else:
column_name_to_index = {col: idx for idx, col in enumerate(loop_column_names)}
df = pd.read_csv(
df = pl.read_csv(
StringIO(loop_data.replace("'", '"')),
delimiter=r'\s+',
header=None,
comment='#',
dtype={column_name_to_index[k]: str for k in self.parse_as_string if k in loop_column_names},
keep_default_na=False,
engine='c',
separator=" ",
has_header=False,
comment_prefix="#",
dtypes={
k: pl.String for k in self.parse_as_string if k in loop_column_names
},
truncate_ragged_lines=True,
null_values=["", "<NA>"],
)
df.columns = loop_column_names

# Numericise all columns in temporary copy
df_numeric = df.apply(_apply_numeric)

# Replace columns that are all NaN with the original columns
df_numeric[df_numeric.columns[df_numeric.isna().all()]] = df[df_numeric.columns[df_numeric.isna().all()]]
# If the column type is string then use empty strings rather than null
df = df.with_columns(pl.col(pl.String).fill_null(""))

# Replace columns that should be strings
for col in df.columns:
df[col] = df_numeric[col] if col not in self.parse_as_string else df[col]
return df
if col in self.parse_as_string:
df = df.with_columns(
pl.col(col).cast(pl.String).fill_null("").alias(col)
)
if self.polars:
return df
return df.to_pandas()


def count_lines(file: Path) -> int:
with open(file, 'rb') as f:
with open(file, "rb") as f:
return sum(1 for _ in f)


Expand All @@ -169,10 +183,3 @@ def numericise(value: str) -> Union[str, int, float]:
# If it's not a float either, leave it as a string
value = value
return value


def _apply_numeric(col: pd.Series) -> pd.Series:
try:
return pd.to_numeric(col)
except ValueError:
return col
8 changes: 4 additions & 4 deletions src/starfile/typing.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from __future__ import annotations

from typing import Union, Dict
from typing_extensions import TypeAlias
from typing import Dict, Union

import pandas as pd
import polars as pl
from typing_extensions import TypeAlias

DataBlock: TypeAlias = Union[
pd.DataFrame,
Dict[str, Union[str, int, float]]
Union[pd.DataFrame, pl.DataFrame], Dict[str, Union[str, int, float]]
]
Loading