Skip to content

Commit

Permalink
Read specific columns as str
Browse files Browse the repository at this point in the history
  • Loading branch information
EuanPyle authored and alisterburt committed Feb 22, 2024
1 parent 6b50943 commit d4607d4
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 12 deletions.
4 changes: 2 additions & 2 deletions src/starfile/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@
from os import PathLike


def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = False):
def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = False, parse_as_string: List[str] = None):
"""
Read a star file into a pandas dataframe or dict of pandas dataframes
default behaviour in the case of only one data block being present in the STAR file is to
return only a dataframe, this can be changed by setting 'always_dict=True'
"""

parser = StarParser(filename, n_blocks_to_read=read_n_blocks)
parser = StarParser(filename, n_blocks_to_read=read_n_blocks, parse_as_string=parse_as_string)
if len(parser.data_blocks) == 1 and always_dict is False:
return list(parser.data_blocks.values())[0]
else:
Expand Down
44 changes: 34 additions & 10 deletions src/starfile/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,41 @@
import numpy as np
import pandas as pd
from pathlib import Path
from typing import TYPE_CHECKING, Union, Optional, Dict, Tuple
from typing import TYPE_CHECKING, Union, Optional, Dict, Tuple, List

from starfile.typing import DataBlock

if TYPE_CHECKING:
from os import PathLike

def _apply_numeric(col: pd.Series) -> pd.Series:
try:
return pd.to_numeric(col)
except ValueError:
return col

class StarParser:
filename: Path
n_lines_in_file: int
n_blocks_to_read: int
current_line_number: int
data_blocks: Dict[DataBlock]

def __init__(self, filename: PathLike, n_blocks_to_read: Optional[int] = None):
parse_as_string: List[str]

def __init__(
self,
filename: PathLike,
n_blocks_to_read: Optional[int] = None,
parse_as_string: Optional[Union[str, List[str]]] = None
):
# set filename, with path checking
filename = Path(filename)
if not filename.exists():
raise FileNotFoundError(filename)
if isinstance(parse_as_string, str):
parse_as_string = [parse_as_string]
self.filename = filename

# setup for parsing
self.data_blocks = {}
self.n_lines_in_file = count_lines(self.filename)
self.n_blocks_to_read = n_blocks_to_read
self.parse_as_string = parse_as_string

# parse file
self.current_line_number = 0
Expand Down Expand Up @@ -77,8 +81,16 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
if self.current_line.startswith('data'):
break
elif self.current_line.startswith('_'): # '_foo bar'
k, v = shlex.split(self.current_line)
block[k[1:]] = numericise(v)
k, v = self.current_line.split()
column_name = k[1:]
parse_column_as_string = (
self.parse_as_string is not None
and any(column_name == col for col in self.parse_as_string)
)
if parse_column_as_string is True:
block[column_name] = v
else:
block[column_name] = numericise(v)
self.current_line_number += 1
return block

Expand Down Expand Up @@ -112,12 +124,17 @@ def _parse_loop_block(self) -> pd.DataFrame:
StringIO(loop_data.replace("'", '"')),
delimiter=r'\s+',
header=None,
dtype={k: 'str' for k in self.parse_as_string}
if self.parse_as_string is not None else None,
comment='#',
keep_default_na=False
)
df_numeric = df.apply(_apply_numeric)
# Replace columns that are all NaN with the original string columns
df_numeric[df_numeric.columns[df_numeric.isna().all()]] = df[df_numeric.columns[df_numeric.isna().all()]]

# Replace columns that should be strings
# todo:
df = df_numeric
df.columns = loop_column_names
return df
Expand Down Expand Up @@ -150,3 +167,10 @@ def numericise(value: str) -> Union[str, int, float]:
# If it's not a float either, leave it as a string
value = value
return value


def _apply_numeric(col: pd.Series) -> pd.Series:
try:
return pd.to_numeric(col)
except ValueError:
return col

0 comments on commit d4607d4

Please sign in to comment.