Skip to content

Commit

Permalink
merge with current main and add test
Browse files Browse the repository at this point in the history
  • Loading branch information
alisterburt committed Feb 22, 2024
1 parent d4607d4 commit 4128727
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 26 deletions.
24 changes: 13 additions & 11 deletions src/starfile/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,12 @@ def __init__(
self,
filename: PathLike,
n_blocks_to_read: Optional[int] = None,
parse_as_string: Optional[Union[str, List[str]]] = None
parse_as_string: List[str] = [],
):
# set filename, with path checking
filename = Path(filename)
if not filename.exists():
raise FileNotFoundError(filename)
if isinstance(parse_as_string, str):
parse_as_string = [parse_as_string]
self.filename = filename

# setup for parsing
Expand Down Expand Up @@ -81,7 +79,7 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
if self.current_line.startswith('data'):
break
elif self.current_line.startswith('_'): # '_foo bar'
k, v = self.current_line.split()
k, v = shlex.split(self.current_line)
column_name = k[1:]
parse_column_as_string = (
self.parse_as_string is not None
Expand Down Expand Up @@ -120,23 +118,27 @@ def _parse_loop_block(self) -> pd.DataFrame:
n_cols = len(loop_column_names)
df = pd.DataFrame(np.zeros(shape=(0, n_cols)))
else:
column_name_to_index = {col: idx for idx, col in enumerate(loop_column_names)}
df = pd.read_csv(
StringIO(loop_data.replace("'", '"')),
delimiter=r'\s+',
header=None,
dtype={k: 'str' for k in self.parse_as_string}
if self.parse_as_string is not None else None,
comment='#',
keep_default_na=False
dtype={column_name_to_index[k]: str for k in self.parse_as_string if k in loop_column_names},
keep_default_na=False,
engine='c',
)
df.columns = loop_column_names

# Numericise all columns in temporary copy
df_numeric = df.apply(_apply_numeric)
# Replace columns that are all NaN with the original string columns

# Replace columns that are all NaN with the original columns
df_numeric[df_numeric.columns[df_numeric.isna().all()]] = df[df_numeric.columns[df_numeric.isna().all()]]

# Replace columns that should be strings
# todo:
df = df_numeric
df.columns = loop_column_names
for col in df.columns:
df[col] = df_numeric[col] if col not in self.parse_as_string else df[col]
return df


Expand Down
43 changes: 28 additions & 15 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,34 +243,47 @@ def test_empty_loop_block():
assert len(parser.data_blocks) == 1



@pytest.mark.parametrize("quote_character, filename", [("'",basic_single_quote),
('"',basic_double_quote),
])
def test_quote_basic(quote_character,filename):
@pytest.mark.parametrize("quote_character, filename", [("'", basic_single_quote),
('"', basic_double_quote),
])
def test_quote_basic(quote_character, filename):
parser = StarParser(filename)
assert len(parser.data_blocks) == 1
assert parser.data_blocks['']['no_quote_string'] == "noquote"
assert parser.data_blocks['']['quote_string'] == "quote string"
assert parser.data_blocks['']['whitespace_string'] == " "
assert parser.data_blocks['']['empty_string'] == ""

@pytest.mark.parametrize("quote_character, filename", [("'",loop_single_quote),
('"',loop_double_quote),
])
def test_quote_loop(quote_character,filename):

@pytest.mark.parametrize("quote_character, filename", [("'", loop_single_quote),
('"', loop_double_quote),
])
def test_quote_loop(quote_character, filename):
import math
parser = StarParser(filename)
assert len(parser.data_blocks) == 1
assert parser.data_blocks[''].loc[0,'no_quote_string'] == "noquote"
assert parser.data_blocks[''].loc[0,'quote_string'] == "quote string"
assert parser.data_blocks[''].loc[0,'whitespace_string'] == " "
assert parser.data_blocks[''].loc[0,'empty_string'] == ""
assert parser.data_blocks[''].loc[0, 'no_quote_string'] == "noquote"
assert parser.data_blocks[''].loc[0, 'quote_string'] == "quote string"
assert parser.data_blocks[''].loc[0, 'whitespace_string'] == " "
assert parser.data_blocks[''].loc[0, 'empty_string'] == ""

assert parser.data_blocks[''].dtypes['number_and_string'] == object
assert parser.data_blocks[''].dtypes['number_and_empty'] == 'float64'
assert parser.data_blocks[''].dtypes['number'] == 'float64'
assert parser.data_blocks[''].dtypes['empty_string_and_normal_string'] == object

assert math.isnan(parser.data_blocks[''].loc[1,'number_and_empty'])
assert parser.data_blocks[''].loc[0,'empty_string_and_normal_string'] == ''
assert math.isnan(parser.data_blocks[''].loc[1, 'number_and_empty'])
assert parser.data_blocks[''].loc[0, 'empty_string_and_normal_string'] == ''


def test_parse_as_string():
    """Check that columns listed in ``parse_as_string`` are kept as strings.

    The same ``parse_as_string`` list is applied to both kinds of data
    block: the simple (key/value) block ``general`` and the loop (tabular)
    block ``fsc``.
    """
    parser = StarParser(postprocess, parse_as_string=['rlnFinalResolution', 'rlnResolution'])

    # check 'rlnFinalResolution' is parsed as string in general (basic) block
    block = parser.data_blocks['general']
    # isinstance rather than an exact type comparison: any str (sub)class
    # counts as "parsed as string".
    assert isinstance(block['rlnFinalResolution'], str)

    # check 'rlnResolution' is parsed as string in fsc (loop) block
    df = parser.data_blocks['fsc']
    assert df['rlnResolution'].dtype == 'object'

0 comments on commit 4128727

Please sign in to comment.