-
Notifications
You must be signed in to change notification settings - Fork 18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improved string quoting #41
Changes from 9 commits
56e6c0f
6f85fa4
a7d7407
2ab7a6f
722099a
a3834c1
7fba398
30bebdc
ce192b1
c96dbf1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -22,6 +22,7 @@ def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = Fals | |||||
default behaviour in the case of only one data block being present in the STAR file is to | ||||||
return only a dataframe, this can be changed by setting 'always_dict=True' | ||||||
""" | ||||||
|
||||||
parser = StarParser(filename, n_blocks_to_read=read_n_blocks) | ||||||
if len(parser.data_blocks) == 1 and always_dict is False: | ||||||
return list(parser.data_blocks.values())[0] | ||||||
|
@@ -35,6 +36,8 @@ def write( | |||||
float_format: str = '%.6f', | ||||||
sep: str = '\t', | ||||||
na_rep: str = '<NA>', | ||||||
quotechar: str = '"', | ||||||
quote_always: bool = False, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Simple naming preference: could you refactor to this name too? |
||||||
**kwargs, | ||||||
): | ||||||
"""Write data blocks as STAR files.""" | ||||||
|
@@ -43,5 +46,7 @@ def write( | |||||
filename=filename, | ||||||
float_format=float_format, | ||||||
na_rep=na_rep, | ||||||
separator=sep | ||||||
separator=sep, | ||||||
quotechar=quotechar, | ||||||
quote_always=quote_always, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To be updated in line with the previous comments. |
||||||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
from pathlib import Path | ||
from typing import TYPE_CHECKING, Union, Dict, List | ||
from importlib.metadata import version | ||
import csv | ||
|
||
import pandas as pd | ||
|
||
|
@@ -24,6 +25,8 @@ def __init__( | |
float_format: str = '%.6f', | ||
separator: str = '\t', | ||
na_rep: str = '<NA>', | ||
quotechar: str = '"', | ||
quote_always: bool = False, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same again |
||
): | ||
# coerce data | ||
self.data_blocks = self.coerce_data_blocks(data_blocks) | ||
|
@@ -33,6 +36,8 @@ def __init__( | |
self.float_format = float_format | ||
self.sep = separator | ||
self.na_rep = na_rep | ||
self.quotechar = quotechar | ||
self.quote_always = quote_always | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same again |
||
self.buffer = TextBuffer() | ||
self.backup_if_file_exists() | ||
self.write() | ||
|
@@ -67,7 +72,9 @@ def write_data_blocks(self): | |
write_simple_block( | ||
file=self.filename, | ||
block_name=block_name, | ||
data=block | ||
data=block, | ||
quotechar=self.quotechar, | ||
quote_always=self.quote_always | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same again |
||
) | ||
elif isinstance(block, pd.DataFrame): | ||
write_loop_block( | ||
|
@@ -77,6 +84,8 @@ def write_data_blocks(self): | |
float_format=self.float_format, | ||
separator=self.sep, | ||
na_rep=self.na_rep, | ||
quotechar=self.quotechar, | ||
quote_always=self.quote_always | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same again |
||
) | ||
|
||
def backup_if_file_exists(self): | ||
|
@@ -123,13 +132,22 @@ def write_package_info(file: Path): | |
def write_simple_block( | ||
file: Path, | ||
block_name: str, | ||
data: Dict[str, Union[str, int, float]] | ||
): | ||
data: Dict[str, Union[str, int, float]], | ||
quotechar: str = '"', | ||
quote_always: bool = False | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same again |
||
): | ||
quoted_data = { | ||
k: f"{quotechar}{v}{quotechar}" | ||
if isinstance(v, str) and (quote_always or " " in v or v == "") | ||
else v | ||
for k, v | ||
in data.items() | ||
} | ||
formatted_lines = '\n'.join( | ||
[ | ||
f'_{k}\t\t\t{v}' | ||
for k, v | ||
in data.items() | ||
in quoted_data.items() | ||
] | ||
) | ||
with open(file, mode='a') as f: | ||
|
@@ -145,6 +163,8 @@ def write_loop_block( | |
float_format: str = '%.6f', | ||
separator: str = '\t', | ||
na_rep: str = '<NA>', | ||
quotechar: str = '"', | ||
quote_always: bool = False | ||
): | ||
# write header | ||
header_lines = [ | ||
|
@@ -158,6 +178,10 @@ def write_loop_block( | |
f.write('\n'.join(header_lines)) | ||
f.write('\n') | ||
|
||
df = df.applymap(lambda x: f'{quotechar}{x}{quotechar}' | ||
if isinstance(x, str) and (quote_always or " " in x or x == "") | ||
else x) | ||
|
||
# write data | ||
df.to_csv( | ||
path_or_buf=file, | ||
|
@@ -167,5 +191,6 @@ def write_loop_block( | |
index=False, | ||
float_format=float_format, | ||
na_rep=na_rep, | ||
quoting=csv.QUOTE_NONE | ||
) | ||
write_blank_lines(file, n=2) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
data_ | ||
|
||
_no_quote_string noquote | ||
_quote_string "quote string" | ||
_whitespace_string " " | ||
_empty_string "" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
data_ | ||
|
||
_no_quote_string noquote | ||
_quote_string 'quote string' | ||
_whitespace_string ' ' | ||
_empty_string '' | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
data_ | ||
|
||
loop_ | ||
_no_quote_string #1 | ||
_quote_string #2 | ||
_whitespace_string #3 | ||
_empty_string #4 | ||
noquote "quote string" " " "" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
data_ | ||
|
||
loop_ | ||
_no_quote_string #1 | ||
_quote_string #2 | ||
_whitespace_string #3 | ||
_empty_string #4 | ||
noquote 'quote string' ' ' '' |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,10 @@ | |
two_single_line_loop_blocks, | ||
two_basic_blocks, | ||
empty_loop, | ||
basic_single_quote, | ||
basic_double_quote, | ||
loop_single_quote, | ||
loop_double_quote, | ||
) | ||
from .utils import generate_large_star_file, remove_large_star_file, million_row_file | ||
|
||
|
@@ -237,3 +241,30 @@ def test_empty_loop_block(): | |
"""Parsing an empty loop block should return an empty dataframe.""" | ||
parser = StarParser(empty_loop) | ||
assert len(parser.data_blocks) == 1 | ||
|
||
|
||
|
||
@pytest.mark.parametrize(
    "quotechar, filename",
    [
        ("'", basic_single_quote),
        ('"', basic_double_quote),
    ],
)
def test_quote_basic(quotechar, filename):
    """Quoted values in a simple block are parsed with the quotes removed.

    ``quotechar`` is not read by the test body; it only records which quote
    character the paired fixture file uses.
    """
    parser = StarParser(filename)
    assert len(parser.data_blocks) == 1

    # The fixture's single block is unnamed, so it is keyed by ''.
    block = parser.data_blocks['']
    assert block['no_quote_string'] == "noquote"
    assert block['quote_string'] == "quote string"
    assert block['whitespace_string'] == " "
    assert block['empty_string'] == ""
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is sick 🙂 |
||
|
||
@pytest.mark.parametrize(
    "quotechar, filename",
    [
        ("'", loop_single_quote),
        ('"', loop_double_quote),
    ],
)
def test_quote_loop(quotechar, filename):
    """Quoted values in a loop block are parsed with the quotes removed.

    ``quotechar`` is not read by the test body; it only records which quote
    character the paired fixture file uses.
    """
    import math

    blocks = StarParser(filename).data_blocks
    assert len(blocks) == 1

    # The fixture's single block is unnamed, so it is keyed by ''.
    df = blocks['']
    expected_strings = {
        'no_quote_string': "noquote",
        'quote_string': "quote string",
        'whitespace_string': " ",
    }
    for column, value in expected_strings.items():
        assert df.loc[0, column] == value

    # An empty quoted string comes back as NaN rather than ''.
    # Not optimal, but it is the way to_numeric behaves.
    assert math.isnan(df.loc[0, 'empty_string'])
alisterburt marked this conversation as resolved.
Show resolved
Hide resolved
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would prefer something more explicit and in snake_case; could you update it in all relevant places?