teamtomo · alisterburt · Dec 5, 2023 · Jul 9, 2023 · Jul 21, 2023 · Dec 1, 2023
diff --git a/starfile/functions.py b/starfile/functions.py
@@ -22,6 +22,7 @@ def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = Fals
     default behaviour in the case of only one data block being present in the STAR file is to
     return only a dataframe, this can be changed by setting 'always_dict=True'
     """
+
     parser = StarParser(filename, n_blocks_to_read=read_n_blocks)
     if len(parser.data_blocks) == 1 and always_dict is False:
         return list(parser.data_blocks.values())[0]
@@ -35,6 +36,8 @@ def write(
     float_format: str = '%.6f',
     sep: str = '\t',
     na_rep: str = '<NA>',
+    quotechar: str = '"',
-    quotechar: str = '"',
+    quote_character: str = '"',
-    quotechar: str = '"',
+    quote_character: str = '"',
+    quote_always: bool = False,
-    quote_always: bool = False,
+    quote_all_strings: bool = False,
-    quote_always: bool = False,
+    quote_all_strings: bool = False,
     **kwargs,
 ):
     """Write data blocks as STAR files."""
@@ -43,5 +46,7 @@ def write(
         filename=filename,
         float_format=float_format,
         na_rep=na_rep,
-        separator=sep
+        separator=sep,
+        quotechar=quotechar,
+        quote_always=quote_always,
     )
diff --git a/starfile/parser.py b/starfile/parser.py
@@ -3,6 +3,7 @@
 from collections import deque
 from io import StringIO
 from linecache import getline
+import shlex
 
 import numpy as np
 import pandas as pd
@@ -71,7 +72,7 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
             if self.current_line.startswith('data'):
                 break
             elif self.current_line.startswith('_'):  # '_foo bar'
-                k, v = self.current_line.split()
+                k, v = shlex.split(self.current_line)
                 block[k[1:]] = numericise(v)
             self.current_line_number += 1
         return block
@@ -103,7 +104,7 @@ def _parse_loop_block(self) -> pd.DataFrame:
             df = pd.DataFrame(np.zeros(shape=(0, n_cols)))
         else:
             df = pd.read_csv(
-                StringIO(loop_data),
+                StringIO(loop_data.replace("'",'"')),
                 delim_whitespace=True,
                 header=None,
                 comment='#'

diff --git a/starfile/writer.py b/starfile/writer.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Union, Dict, List
 from importlib.metadata import version
+import csv
 
 import pandas as pd
 
@@ -24,6 +25,8 @@ def __init__(
         float_format: str = '%.6f',
         separator: str = '\t',
         na_rep: str = '<NA>',
+        quotechar: str = '"',
+        quote_always: bool = False,
     ):
         # coerce data
         self.data_blocks = self.coerce_data_blocks(data_blocks)
@@ -33,6 +36,8 @@ def __init__(
         self.float_format = float_format
         self.sep = separator
         self.na_rep = na_rep
+        self.quotechar = quotechar
+        self.quote_always = quote_always
         self.buffer = TextBuffer()
         self.backup_if_file_exists()
         self.write()
@@ -67,7 +72,9 @@ def write_data_blocks(self):
                 write_simple_block(
                     file=self.filename,
                     block_name=block_name,
-                    data=block
+                    data=block,
+                    quotechar=self.quotechar,
+                    quote_always=self.quote_always
                 )
             elif isinstance(block, pd.DataFrame):
                 write_loop_block(
@@ -77,6 +84,8 @@ def write_data_blocks(self):
                     float_format=self.float_format,
                     separator=self.sep,
                     na_rep=self.na_rep,
+                    quotechar=self.quotechar,
+                    quote_always=self.quote_always
                 )
 
     def backup_if_file_exists(self):
@@ -123,13 +132,22 @@ def write_package_info(file: Path):
 def write_simple_block(
     file: Path,
     block_name: str,
-    data: Dict[str, Union[str, int, float]]
-):
+    data: Dict[str, Union[str, int, float]],
+    quotechar: str = '"',
+    quote_always: bool = False
+):  
+    quoted_data = {
+        k: f"{quotechar}{v}{quotechar}" 
+        if isinstance(v, str) and (quote_always or " " in v or v == "") 
+        else v
+        for k, v
+        in data.items()    
+    }
     formatted_lines = '\n'.join(
         [
             f'_{k}\t\t\t{v}'
             for k, v
-            in data.items()
+            in quoted_data.items()
         ]
     )
     with open(file, mode='a') as f:
@@ -145,6 +163,8 @@ def write_loop_block(
     float_format: str = '%.6f',
     separator: str = '\t',
     na_rep: str = '<NA>',
+    quotechar: str = '"',
+    quote_always: bool = False
 ):
     # write header
     header_lines = [
@@ -158,6 +178,10 @@ def write_loop_block(
         f.write('\n'.join(header_lines))
         f.write('\n')
 
+    df = df.applymap(lambda x: f'{quotechar}{x}{quotechar}' 
+                     if isinstance(x, str) and (quote_always or " " in x or x == "") 
+                     else x)
+
     # write data
     df.to_csv(
         path_or_buf=file,
@@ -167,5 +191,6 @@ def write_loop_block(
         index=False,
         float_format=float_format,
         na_rep=na_rep,
+        quoting=csv.QUOTE_NONE
     )
     write_blank_lines(file, n=2)
diff --git a/tests/constants.py b/tests/constants.py
@@ -21,6 +21,10 @@
 two_single_line_loop_blocks = test_data_directory / 'two_single_line_loop_blocks.star'
 two_basic_blocks = test_data_directory / 'two_basic_blocks.star'
 empty_loop = test_data_directory / 'empty_loop.star'
+basic_single_quote = test_data_directory / 'basic_single_quote.star'
+basic_double_quote = test_data_directory / 'basic_double_quote.star'
+loop_single_quote = test_data_directory / 'loop_single_quote.star'
+loop_double_quote = test_data_directory / 'loop_double_quote.star'
 
 # Example DataFrame for testing
 cars = {'Brand': ['Honda_Civic', 'Toyota_Corolla', 'Ford_Focus', 'Audi_A4'],

diff --git a/tests/data/basic_double_quote.star b/tests/data/basic_double_quote.star
@@ -0,0 +1,7 @@
+data_
+
+_no_quote_string           noquote
+_quote_string              "quote string"
+_whitespace_string         " "
+_empty_string              ""
+
diff --git a/tests/data/basic_single_quote.star b/tests/data/basic_single_quote.star
@@ -0,0 +1,7 @@
+data_
+
+_no_quote_string           noquote
+_quote_string              'quote string'
+_whitespace_string         ' '
+_empty_string              ''
+
diff --git a/tests/data/loop_double_quote.star b/tests/data/loop_double_quote.star
@@ -0,0 +1,8 @@
+data_
+
+loop_
+_no_quote_string #1
+_quote_string #2
+_whitespace_string #3
+_empty_string #4
+noquote "quote string" " " ""
diff --git a/tests/data/loop_single_quote.star b/tests/data/loop_single_quote.star
@@ -0,0 +1,8 @@
+data_
+
+loop_
+_no_quote_string #1
+_quote_string #2
+_whitespace_string #3
+_empty_string #4
+noquote 'quote string' ' ' ''
diff --git a/tests/test_parsing.py b/tests/test_parsing.py
@@ -20,6 +20,10 @@
     two_single_line_loop_blocks,
     two_basic_blocks,
     empty_loop,
+    basic_single_quote,
+    basic_double_quote,
+    loop_single_quote,
+    loop_double_quote,
 )
 from .utils import generate_large_star_file, remove_large_star_file, million_row_file
 
@@ -237,3 +241,30 @@ def test_empty_loop_block():
     """Parsing an empty loop block should return an empty dataframe."""
     parser = StarParser(empty_loop)
     assert len(parser.data_blocks) == 1
+
+
+
+@pytest.mark.parametrize("quotechar, filename", [("'",basic_single_quote), 
+                                                 ('"',basic_double_quote), 
+                                                 ])
+def test_quote_basic(quotechar,filename):
+    import math
+    parser = StarParser(filename)
+    assert len(parser.data_blocks) == 1
+    assert parser.data_blocks['']['no_quote_string'] == "noquote"
+    assert parser.data_blocks['']['quote_string'] == "quote string"
+    assert parser.data_blocks['']['whitespace_string'] == " "
+    assert parser.data_blocks['']['empty_string'] == ""
+
+@pytest.mark.parametrize("quotechar, filename", [("'",loop_single_quote), 
+                                                 ('"',loop_double_quote), 
+                                                 ])
+def test_quote_loop(quotechar,filename):
+    import math
+    parser = StarParser(filename)
+    assert len(parser.data_blocks) == 1
+    assert parser.data_blocks[''].loc[0,'no_quote_string'] == "noquote"
+    assert parser.data_blocks[''].loc[0,'quote_string'] == "quote string"
+    assert parser.data_blocks[''].loc[0,'whitespace_string'] == " "
+    # Not optimal, but the way to_numeric behaves
+    assert math.isnan(parser.data_blocks[''].loc[0,'empty_string'])
diff --git a/tests/test_writing.py b/tests/test_writing.py
@@ -1,13 +1,16 @@
 from os.path import join as join_path
 from tempfile import TemporaryDirectory
+import time
+import math 
 
 import pandas as pd
+import pytest
 
 from starfile.parser import StarParser
 from starfile.writer import StarWriter
 
 from .constants import loop_simple, postprocess, test_data_directory, test_df
-
+from .utils import generate_large_star_file, remove_large_star_file
 
 def test_write_simple_block():
     s = StarParser(postprocess)
@@ -68,3 +71,64 @@ def test_can_write_non_zero_indexed_one_row_dataframe():
         "1\t2\t3"
     )
     assert (expected in output)
+
+
+@pytest.mark.parametrize("quotechar, quote_always, num_quotes", 
+                         [('"', False, 6),
+                          ('"', True, 8),
+                          ("'", False, 6),
+                          ("'", True, 8)
+                         ])
+def test_string_quoting_loop_datablock(quotechar, quote_always, num_quotes, tmp_path):
+    df = pd.DataFrame([[1,"nospace", "String with space", " ", ""]],
+                       columns=["a_number","string_without_space", "string_space", "just_space", "empty_string"])
+
+    filename = tmp_path / "test.star"
+    StarWriter(df, filename, quotechar=quotechar, quote_always=quote_always)
+
+    # Test for the appropriate number of quotes
+    with open(filename) as f:
+        star_content = f.read()
+        assert star_content.count(quotechar) == num_quotes
+
+    s = StarParser(filename)
+    assert df.loc[0, "a_number"] == s.data_blocks[""].loc[0, "a_number"]
+    assert df.loc[0, "string_without_space"] == s.data_blocks[""].loc[0, "string_without_space"]
+    assert df.loc[0, "string_space"] == s.data_blocks[""].loc[0, "string_space"]
+    assert df.loc[0, "just_space"] == s.data_blocks[""].loc[0, "just_space"]
+    assert math.isnan(s.data_blocks[""].loc[0, "empty_string"])
+
+def test_writing_speed():
+    start = time.time()
+    generate_large_star_file()
+    end = time.time()
+    remove_large_star_file()
+
+    # Check that execution takes less than a second
+    assert end - start < 1
+
+@pytest.mark.parametrize("quotechar, quote_always, num_quotes", 
+                         [('"', False, 6),
+                          ('"', True, 8),
+                          ("'", False, 6),
+                          ("'", True, 8)
+                         ])
+def test_string_quoting_simple_datablock(quotechar, quote_always,num_quotes, tmp_path):
+    o = {
+        "a_number": 1,
+        "string_without_space": "nospace",
+        "string_space": "String with space",
+        "just_space": " ",
+        "empty_string": ""
+    }
+
+    filename = tmp_path / "test.star"
+    StarWriter(o, filename, quotechar=quotechar, quote_always=quote_always)
+
+    # Test for the appropriate number of quotes
+    with open(filename) as f:
+        star_content = f.read()
+        assert star_content.count(quotechar) == num_quotes
+
+    s = StarParser(filename)
+    assert o == s.data_blocks[""]