Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved string quoting #41

Merged
merged 10 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion starfile/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = Fals
default behaviour in the case of only one data block being present in the STAR file is to
return only a dataframe, this can be changed by setting 'always_dict=True'
"""

parser = StarParser(filename, n_blocks_to_read=read_n_blocks)
if len(parser.data_blocks) == 1 and always_dict is False:
return list(parser.data_blocks.values())[0]
Expand All @@ -35,6 +36,8 @@ def write(
float_format: str = '%.6f',
sep: str = '\t',
na_rep: str = '<NA>',
quotechar: str = '"',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
quotechar: str = '"',
quote_character: str = '"',

would prefer something more explicit and snake case, could you update in all relevant places?

quote_always: bool = False,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
quote_always: bool = False,
quote_all_strings: bool = False,

simple name preference, could you refactor to this too?

**kwargs,
):
"""Write data blocks as STAR files."""
Expand All @@ -43,5 +46,7 @@ def write(
filename=filename,
float_format=float_format,
na_rep=na_rep,
separator=sep
separator=sep,
quotechar=quotechar,
quote_always=quote_always,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to be updated wrt previous comments

)
5 changes: 3 additions & 2 deletions starfile/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import deque
from io import StringIO
from linecache import getline
import shlex

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -71,7 +72,7 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
if self.current_line.startswith('data'):
break
elif self.current_line.startswith('_'): # '_foo bar'
k, v = self.current_line.split()
k, v = shlex.split(self.current_line)
block[k[1:]] = numericise(v)
self.current_line_number += 1
return block
Expand Down Expand Up @@ -103,7 +104,7 @@ def _parse_loop_block(self) -> pd.DataFrame:
df = pd.DataFrame(np.zeros(shape=(0, n_cols)))
else:
df = pd.read_csv(
StringIO(loop_data),
StringIO(loop_data.replace("'",'"')),
delim_whitespace=True,
header=None,
comment='#'
Expand Down
33 changes: 29 additions & 4 deletions starfile/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, Union, Dict, List
from importlib.metadata import version
import csv

import pandas as pd

Expand All @@ -24,6 +25,8 @@ def __init__(
float_format: str = '%.6f',
separator: str = '\t',
na_rep: str = '<NA>',
quotechar: str = '"',
quote_always: bool = False,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same again

):
# coerce data
self.data_blocks = self.coerce_data_blocks(data_blocks)
Expand All @@ -33,6 +36,8 @@ def __init__(
self.float_format = float_format
self.sep = separator
self.na_rep = na_rep
self.quotechar = quotechar
self.quote_always = quote_always
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same again

self.buffer = TextBuffer()
self.backup_if_file_exists()
self.write()
Expand Down Expand Up @@ -67,7 +72,9 @@ def write_data_blocks(self):
write_simple_block(
file=self.filename,
block_name=block_name,
data=block
data=block,
quotechar=self.quotechar,
quote_always=self.quote_always
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same again

)
elif isinstance(block, pd.DataFrame):
write_loop_block(
Expand All @@ -77,6 +84,8 @@ def write_data_blocks(self):
float_format=self.float_format,
separator=self.sep,
na_rep=self.na_rep,
quotechar=self.quotechar,
quote_always=self.quote_always
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same again

)

def backup_if_file_exists(self):
Expand Down Expand Up @@ -123,13 +132,22 @@ def write_package_info(file: Path):
def write_simple_block(
file: Path,
block_name: str,
data: Dict[str, Union[str, int, float]]
):
data: Dict[str, Union[str, int, float]],
quotechar: str = '"',
quote_always: bool = False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same again

):
quoted_data = {
k: f"{quotechar}{v}{quotechar}"
if isinstance(v, str) and (quote_always or " " in v or v == "")
else v
for k, v
in data.items()
}
formatted_lines = '\n'.join(
[
f'_{k}\t\t\t{v}'
for k, v
in data.items()
in quoted_data.items()
]
)
with open(file, mode='a') as f:
Expand All @@ -145,6 +163,8 @@ def write_loop_block(
float_format: str = '%.6f',
separator: str = '\t',
na_rep: str = '<NA>',
quotechar: str = '"',
quote_always: bool = False
):
# write header
header_lines = [
Expand All @@ -158,6 +178,10 @@ def write_loop_block(
f.write('\n'.join(header_lines))
f.write('\n')

df = df.applymap(lambda x: f'{quotechar}{x}{quotechar}'
if isinstance(x, str) and (quote_always or " " in x or x == "")
else x)

# write data
df.to_csv(
path_or_buf=file,
Expand All @@ -167,5 +191,6 @@ def write_loop_block(
index=False,
float_format=float_format,
na_rep=na_rep,
quoting=csv.QUOTE_NONE
)
write_blank_lines(file, n=2)
4 changes: 4 additions & 0 deletions tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
two_single_line_loop_blocks = test_data_directory / 'two_single_line_loop_blocks.star'
two_basic_blocks = test_data_directory / 'two_basic_blocks.star'
empty_loop = test_data_directory / 'empty_loop.star'
basic_single_quote = test_data_directory / 'basic_single_quote.star'
basic_double_quote = test_data_directory / 'basic_double_quote.star'
loop_single_quote = test_data_directory / 'loop_single_quote.star'
loop_double_quote = test_data_directory / 'loop_double_quote.star'

# Example DataFrame for testing
cars = {'Brand': ['Honda_Civic', 'Toyota_Corolla', 'Ford_Focus', 'Audi_A4'],
Expand Down
7 changes: 7 additions & 0 deletions tests/data/basic_double_quote.star
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
data_

_no_quote_string noquote
_quote_string "quote string"
_whitespace_string " "
_empty_string ""

7 changes: 7 additions & 0 deletions tests/data/basic_single_quote.star
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
data_

_no_quote_string noquote
_quote_string 'quote string'
_whitespace_string ' '
_empty_string ''

8 changes: 8 additions & 0 deletions tests/data/loop_double_quote.star
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
data_

loop_
_no_quote_string #1
_quote_string #2
_whitespace_string #3
_empty_string #4
noquote "quote string" " " ""
8 changes: 8 additions & 0 deletions tests/data/loop_single_quote.star
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
data_

loop_
_no_quote_string #1
_quote_string #2
_whitespace_string #3
_empty_string #4
noquote 'quote string' ' ' ''
31 changes: 31 additions & 0 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
two_single_line_loop_blocks,
two_basic_blocks,
empty_loop,
basic_single_quote,
basic_double_quote,
loop_single_quote,
loop_double_quote,
)
from .utils import generate_large_star_file, remove_large_star_file, million_row_file

Expand Down Expand Up @@ -237,3 +241,30 @@ def test_empty_loop_block():
"""Parsing an empty loop block should return an empty dataframe."""
parser = StarParser(empty_loop)
assert len(parser.data_blocks) == 1



@pytest.mark.parametrize("quotechar, filename", [("'",basic_single_quote),
('"',basic_double_quote),
])
def test_quote_basic(quotechar,filename):
import math
parser = StarParser(filename)
assert len(parser.data_blocks) == 1
assert parser.data_blocks['']['no_quote_string'] == "noquote"
assert parser.data_blocks['']['quote_string'] == "quote string"
assert parser.data_blocks['']['whitespace_string'] == " "
assert parser.data_blocks['']['empty_string'] == ""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is sick 🙂


@pytest.mark.parametrize("quotechar, filename", [("'",loop_single_quote),
('"',loop_double_quote),
])
def test_quote_loop(quotechar,filename):
import math
parser = StarParser(filename)
assert len(parser.data_blocks) == 1
assert parser.data_blocks[''].loc[0,'no_quote_string'] == "noquote"
assert parser.data_blocks[''].loc[0,'quote_string'] == "quote string"
assert parser.data_blocks[''].loc[0,'whitespace_string'] == " "
# Not optimal, but the way to_numeric behaves
assert math.isnan(parser.data_blocks[''].loc[0,'empty_string'])
alisterburt marked this conversation as resolved.
Show resolved Hide resolved
66 changes: 65 additions & 1 deletion tests/test_writing.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from os.path import join as join_path
from tempfile import TemporaryDirectory
import time
import math

import pandas as pd
import pytest

from starfile.parser import StarParser
from starfile.writer import StarWriter

from .constants import loop_simple, postprocess, test_data_directory, test_df

from .utils import generate_large_star_file, remove_large_star_file

def test_write_simple_block():
s = StarParser(postprocess)
Expand Down Expand Up @@ -68,3 +71,64 @@ def test_can_write_non_zero_indexed_one_row_dataframe():
"1\t2\t3"
)
assert (expected in output)


@pytest.mark.parametrize("quotechar, quote_always, num_quotes",
[('"', False, 6),
('"', True, 8),
("'", False, 6),
("'", True, 8)
])
def test_string_quoting_loop_datablock(quotechar, quote_always, num_quotes, tmp_path):
df = pd.DataFrame([[1,"nospace", "String with space", " ", ""]],
columns=["a_number","string_without_space", "string_space", "just_space", "empty_string"])

filename = tmp_path / "test.star"
StarWriter(df, filename, quotechar=quotechar, quote_always=quote_always)

# Test for the appropriate number of quotes
with open(filename) as f:
star_content = f.read()
assert star_content.count(quotechar) == num_quotes

s = StarParser(filename)
assert df.loc[0, "a_number"] == s.data_blocks[""].loc[0, "a_number"]
assert df.loc[0, "string_without_space"] == s.data_blocks[""].loc[0, "string_without_space"]
assert df.loc[0, "string_space"] == s.data_blocks[""].loc[0, "string_space"]
assert df.loc[0, "just_space"] == s.data_blocks[""].loc[0, "just_space"]
assert math.isnan(s.data_blocks[""].loc[0, "empty_string"])

def test_writing_speed():
start = time.time()
generate_large_star_file()
end = time.time()
remove_large_star_file()

# Check that execution takes less than a second
assert end - start < 1

@pytest.mark.parametrize("quotechar, quote_always, num_quotes",
[('"', False, 6),
('"', True, 8),
("'", False, 6),
("'", True, 8)
])
def test_string_quoting_simple_datablock(quotechar, quote_always,num_quotes, tmp_path):
o = {
"a_number": 1,
"string_without_space": "nospace",
"string_space": "String with space",
"just_space": " ",
"empty_string": ""
}

filename = tmp_path / "test.star"
StarWriter(o, filename, quotechar=quotechar, quote_always=quote_always)

# Test for the appropriate number of quotes
with open(filename) as f:
star_content = f.read()
assert star_content.count(quotechar) == num_quotes

s = StarParser(filename)
assert o == s.data_blocks[""]
Loading