From e9c8aa15b304a5032faf673bb898d359d465fd8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Osvaldo?= Date: Sun, 9 Feb 2025 17:55:34 -0300 Subject: [PATCH 1/6] feat: Added BOM capability for output files (1267) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added the '--add-bom' parameter for almost all utilities Signed-off-by: Álvaro Osvaldo --- csvkit/cli.py | 5 ++++ csvkit/features/AddBom.py | 54 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 csvkit/features/AddBom.py diff --git a/csvkit/cli.py b/csvkit/cli.py index 4d789d56..09497d9d 100644 --- a/csvkit/cli.py +++ b/csvkit/cli.py @@ -19,6 +19,7 @@ from agate.data_types.base import DEFAULT_NULL_VALUES from csvkit.exceptions import ColumnIdentifierError, RequiredHeaderError +from csvkit.features.AddBom import AddBOM try: import zstandard @@ -134,6 +135,8 @@ def run(self): if 'f' not in self.override_flags: self.input_file = self._open_input_file(self.args.input_path) + AddBOM.run(self.output_file, self.args) + try: with warnings.catch_warnings(): if getattr(self.args, 'no_header_row', None): @@ -245,6 +248,8 @@ def _init_common_parser(self): help='Insert a column of line numbers at the front of the output. 
Useful when piping to grep or as a ' 'simple primary key.') + AddBOM.argument(self.argparser,self) + # Input/Output if 'zero' not in self.override_flags: self.argparser.add_argument( diff --git a/csvkit/features/AddBom.py b/csvkit/features/AddBom.py new file mode 100644 index 00000000..c18a33f8 --- /dev/null +++ b/csvkit/features/AddBom.py @@ -0,0 +1,54 @@ +from argparse import ArgumentParser, Namespace +from io import TextIOWrapper, BytesIO +from typing import Union + + +class AddBOM: + + @staticmethod + def _get_BOM() -> bytes: + + from codecs import BOM_UTF8 + + return BOM_UTF8 + + @staticmethod + def enabled(arguments: Union[Namespace,list, None] = None) -> bool: + + if isinstance(arguments, Namespace) or isinstance(arguments, list): + return "add_bom" in arguments and arguments.add_bom + + return False + + @staticmethod + def argument(arguments: ArgumentParser, utility: object): + + # These string usage to validate the class is an architecture + # fail as is not possible to check the class type before + # is initialized + + if "SQL2CSV" in str(utility.__class__): + return + + if "CSVPy" in str(utility.__class__): + return + + arguments.add_argument( + "--add-bom", + dest="add_bom", + action="store_true", + default=False, + help="Add Byte Order Mark (BOM) to the output", + ) + + @staticmethod + def run( + output: TextIOWrapper, + arguments: Union[Namespace, None] = None, + ): + + if not AddBOM.enabled(arguments): + return + + BOM = AddBOM._get_BOM() + output.buffer.write(BOM) From aee71d63e386bf74185f3744fea22f4049c326db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Osvaldo?= Date: Sun, 9 Feb 2025 18:16:23 -0300 Subject: [PATCH 2/6] chore: Fixed type in method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Álvaro Osvaldo --- csvkit/features/AddBom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csvkit/features/AddBom.py b/csvkit/features/AddBom.py index c18a33f8..39b86729 
100644 --- a/csvkit/features/AddBom.py +++ b/csvkit/features/AddBom.py @@ -44,7 +44,7 @@ def argument(arguments: ArgumentParser, utility: object): @staticmethod def run( output: TextIOWrapper, - arguments: Union[Namespace, None] = None, + arguments: Union[Namespace,list, None] = None, ): if not AddBOM.enabled(arguments): From d911296922e029bb85048ecf85134a0afa9c2fa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Osvaldo?= Date: Fri, 14 Feb 2025 15:13:39 -0300 Subject: [PATCH 3/6] refactor: Removed 'Feature Pattern' implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Code inlined to 'cli.py' script. - Configured 'csvpy' and 'sql2csv' to ignore 'add-bom' Signed-off-by: Álvaro Osvaldo --- csvkit/cli.py | 17 +++++++++--- csvkit/features/AddBom.py | 54 ------------------------------------- csvkit/utilities/csvpy.py | 2 +- csvkit/utilities/sql2csv.py | 2 +- 4 files changed, 16 insertions(+), 59 deletions(-) delete mode 100644 csvkit/features/AddBom.py diff --git a/csvkit/cli.py b/csvkit/cli.py index 09497d9d..a286cc75 100644 --- a/csvkit/cli.py +++ b/csvkit/cli.py @@ -14,12 +14,13 @@ import warnings from glob import glob from os.path import splitext +from codecs import BOM_UTF8 +from argparse import Namespace import agate from agate.data_types.base import DEFAULT_NULL_VALUES from csvkit.exceptions import ColumnIdentifierError, RequiredHeaderError -from csvkit.features.AddBom import AddBOM try: import zstandard @@ -135,7 +136,10 @@ def run(self): if 'f' not in self.override_flags: self.input_file = self._open_input_file(self.args.input_path) - AddBOM.run(self.output_file, self.args) + if isinstance(self.args, Namespace): + if "add_bom" in self.args and self.args.add_bom: + BOM = AddBOM._get_BOM() + self.output.buffer.write(BOM_UTF8) try: with warnings.catch_warnings(): @@ -248,7 +252,14 @@ def _init_common_parser(self): help='Insert a column of line numbers at the front of the output. 
Useful when piping to grep or as a ' 'simple primary key.') - AddBOM.argument(self.argparser,self) + if 'add-bom' not in self.override_flags: + self.argparser.add_argument( + "--add-bom", + dest="add_bom", + action="store_true", + default=False, + help="Add Byte Order Mark (BOM) to the output", + ) # Input/Output if 'zero' not in self.override_flags: diff --git a/csvkit/features/AddBom.py b/csvkit/features/AddBom.py deleted file mode 100644 index 39b86729..00000000 --- a/csvkit/features/AddBom.py +++ /dev/null @@ -1,54 +0,0 @@ -from argparse import ArgumentParser, Namespace -from io import TextIOWrapper, BytesIO -from typing import Union - - -class AddBOM: - - @staticmethod - def _get_BOM() -> bytes: - - from codecs import BOM_UTF8 - - return BOM_UTF8 - - @staticmethod - def enabled(arguments: Union[Namespace,list, None] = None) -> bool: - - if isinstance(arguments, Namespace) or isinstance(arguments, list): - return "add_bom" in arguments and arguments.add_bom - - return False - - @staticmethod - def argument(arguments: ArgumentParser, utility: object): - - # These string usage to validate the class is an architecture - # fail as is not possible to check the class type before - # is initialized - - if "SQL2CSV" in str(utility.__class__): - return - - if "CSVPy" in str(utility.__class__): - return - - arguments.add_argument( - "--add-bom", - dest="add_bom", - action="store_true", - default=False, - help="Add Byte Order Mark (BOM) to the output", - ) - - @staticmethod - def run( - output: TextIOWrapper, - arguments: Union[Namespace,list, None] = None, - ): - - if not AddBOM.enabled(arguments): - return - - BOM = AddBOM._get_BOM() - output.buffer.write(BOM) diff --git a/csvkit/utilities/csvpy.py b/csvkit/utilities/csvpy.py index f0605998..c9a153b6 100644 --- a/csvkit/utilities/csvpy.py +++ b/csvkit/utilities/csvpy.py @@ -10,7 +10,7 @@ class CSVPy(CSVKitUtility): description = 'Load a CSV file into a CSV reader and then drop into a Python shell.' 
- override_flags = ['l', 'zero'] + override_flags = ['l', 'zero','add-bom'] def add_arguments(self): self.argparser.add_argument( diff --git a/csvkit/utilities/sql2csv.py b/csvkit/utilities/sql2csv.py index e40a5d1d..da5ca92b 100644 --- a/csvkit/utilities/sql2csv.py +++ b/csvkit/utilities/sql2csv.py @@ -8,7 +8,7 @@ class SQL2CSV(CSVKitUtility): description = 'Execute a SQL query on a database and output the result to a CSV file.' # Overrides all flags except --linenumbers, --verbose, --version. - override_flags = ['f', 'b', 'd', 'e', 'H', 'I', 'K', 'L', 'p', 'q', 'S', 't', 'u', 'z', 'zero'] + override_flags = ['f', 'b', 'd', 'e', 'H', 'I', 'K', 'L', 'p', 'q', 'S', 't', 'u', 'z', 'zero','add-bom'] def add_arguments(self): self.argparser.add_argument( From a58baee2106ab358a0777d1f922bc8cf77c58847 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 15 Feb 2025 01:58:40 -0500 Subject: [PATCH 4/6] chore: Fix lints --- csvkit/cli.py | 16 ++++------------ csvkit/utilities/csvpy.py | 2 +- csvkit/utilities/sql2csv.py | 2 +- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/csvkit/cli.py b/csvkit/cli.py index fcebb81b..cd01cbfd 100644 --- a/csvkit/cli.py +++ b/csvkit/cli.py @@ -15,7 +15,6 @@ from glob import glob from os.path import splitext from codecs import BOM_UTF8 -from argparse import Namespace import agate from agate.data_types.base import DEFAULT_NULL_VALUES @@ -136,10 +135,8 @@ def run(self): if 'f' not in self.override_flags: self.input_file = self._open_input_file(self.args.input_path) - if isinstance(self.args, Namespace): - if "add_bom" in self.args and self.args.add_bom: - BOM = AddBOM._get_BOM() - self.output.buffer.write(BOM_UTF8) + if getattr(self.args, 'add_bom', False): + self.output.buffer.write(BOM_UTF8) try: with warnings.catch_warnings(): @@ -251,15 +248,10 @@ def _init_common_parser(self): '-l', '--linenumbers', dest='line_numbers', action='store_true', help='Insert a column of line numbers 
at the front of the output. Useful when piping to grep or as a ' 'simple primary key.') - if 'add-bom' not in self.override_flags: self.argparser.add_argument( - "--add-bom", - dest="add_bom", - action="store_true", - default=False, - help="Add Byte Order Mark (BOM) to the output", - ) + '--add-bom', dest='add_bom', action='store_true', + help='Add the UTF-8 byte-order mark (BOM) to the output, for Excel compatibility') # Input/Output if 'zero' not in self.override_flags: diff --git a/csvkit/utilities/csvpy.py b/csvkit/utilities/csvpy.py index c9a153b6..4689553e 100644 --- a/csvkit/utilities/csvpy.py +++ b/csvkit/utilities/csvpy.py @@ -10,7 +10,7 @@ class CSVPy(CSVKitUtility): description = 'Load a CSV file into a CSV reader and then drop into a Python shell.' - override_flags = ['l', 'zero','add-bom'] + override_flags = ['l', 'zero', 'add-bom'] def add_arguments(self): self.argparser.add_argument( diff --git a/csvkit/utilities/sql2csv.py b/csvkit/utilities/sql2csv.py index da5ca92b..1918aa9a 100644 --- a/csvkit/utilities/sql2csv.py +++ b/csvkit/utilities/sql2csv.py @@ -8,7 +8,7 @@ class SQL2CSV(CSVKitUtility): description = 'Execute a SQL query on a database and output the result to a CSV file.' # Overrides all flags except --linenumbers, --verbose, --version. 
- override_flags = ['f', 'b', 'd', 'e', 'H', 'I', 'K', 'L', 'p', 'q', 'S', 't', 'u', 'z', 'zero','add-bom'] + override_flags = ['f', 'b', 'd', 'e', 'H', 'I', 'K', 'L', 'p', 'q', 'S', 't', 'u', 'z', 'zero', 'add-bom'] def add_arguments(self): self.argparser.add_argument( From 03bde261573acc3cf325b32b74e1101efb80e696 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 15 Feb 2025 02:25:19 -0500 Subject: [PATCH 5/6] test: Add --add-bom test --- csvkit/cli.py | 2 +- examples/test_utf8_bom.csv | 2 +- tests/test_utilities/test_in2csv.py | 4 ++++ tests/utils.py | 4 ++-- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/csvkit/cli.py b/csvkit/cli.py index cd01cbfd..0af61511 100644 --- a/csvkit/cli.py +++ b/csvkit/cli.py @@ -136,7 +136,7 @@ def run(self): self.input_file = self._open_input_file(self.args.input_path) if getattr(self.args, 'add_bom', False): - self.output.buffer.write(BOM_UTF8) + self.output_file.buffer.write(BOM_UTF8) try: with warnings.catch_warnings(): diff --git a/examples/test_utf8_bom.csv b/examples/test_utf8_bom.csv index a3b29f13..4f593da0 100644 --- a/examples/test_utf8_bom.csv +++ b/examples/test_utf8_bom.csv @@ -1,3 +1,3 @@ foo,bar,baz 1,2,3 -4,5,ʤ \ No newline at end of file +4,5,ʤ diff --git a/tests/test_utilities/test_in2csv.py b/tests/test_utilities/test_in2csv.py index ccfe4c4b..d21563e1 100644 --- a/tests/test_utilities/test_in2csv.py +++ b/tests/test_utilities/test_in2csv.py @@ -58,6 +58,10 @@ def test_locale(self): self.assertConverted('csv', 'examples/test_locale.csv', 'examples/test_locale_converted.csv', ['--locale', 'de_DE']) + def test_add_bom(self): + self.assertConverted('csv', 'examples/test_utf8.csv', + 'examples/test_utf8_bom.csv', ['--add-bom']) + def test_no_blanks(self): self.assertConverted('csv', 'examples/blanks.csv', 'examples/blanks_converted.csv') diff --git a/tests/utils.py b/tests/utils.py index aa4794e7..dab594cb 100644 --- a/tests/utils.py +++ 
b/tests/utils.py @@ -49,12 +49,12 @@ class CSVKitTestCase(unittest.TestCase): warnings.filterwarnings(action='ignore', module='agate') def get_output(self, args): - output_file = io.StringIO() + output_file = io.TextIOWrapper(io.BytesIO(), encoding='utf-8', newline='', write_through=True) utility = self.Utility(args, output_file) utility.run() - output = output_file.getvalue() + output = output_file.buffer.getvalue().decode('utf-8') output_file.close() return output From 906e9a9caccf538918f69e804c4dcde16eab30b6 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 15 Feb 2025 02:27:12 -0500 Subject: [PATCH 6/6] chore: Run isort --- csvkit/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csvkit/cli.py b/csvkit/cli.py index 0af61511..101407c9 100644 --- a/csvkit/cli.py +++ b/csvkit/cli.py @@ -12,9 +12,9 @@ import re import sys import warnings +from codecs import BOM_UTF8 from glob import glob from os.path import splitext -from codecs import BOM_UTF8 import agate from agate.data_types.base import DEFAULT_NULL_VALUES