Skip to content

Commit f48ab1a

Browse files
authored
standardize : allow users to specify output encoding (#118)
* add target encoding * docstring and change arg name * fix error when writing to_file, factorize 'target_encoding or encoding' * target_encoding -> target-encoding * test target_encoding * test target_encoding raise UnicodeEncodeError * revert unnecessary changes * test target_encoding2 * add detected encoding assertion * fix formatting errors with black * add open encoding
1 parent 7201c3c commit f48ab1a

File tree

2 files changed

+108
-5
lines changed

2 files changed

+108
-5
lines changed

clevercsv/console/commands/standardize.py

+24-5
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,16 @@ def register(self) -> None:
6969
),
7070
default=[],
7171
)
72+
self.add_argument(
73+
"-E",
74+
"--target-encoding",
75+
help="Set the encoding of the output file(s)",
76+
description=(
77+
"If omitted, the output file encoding will be the same "
78+
"as that of the original file."
79+
),
80+
type=str,
81+
)
7282
self.add_argument(
7383
"-i",
7484
"--in-place",
@@ -115,6 +125,7 @@ def handle(self) -> int:
115125
encodings = self.args.encoding
116126
num_chars = parse_int(self.args.num_chars, "num-chars")
117127
in_place = self.args.in_place
128+
target_encoding = self.args.target_encoding
118129

119130
if in_place and outputs:
120131
print(
@@ -154,6 +165,7 @@ def handle(self) -> int:
154165
encoding=encoding,
155166
verbose=verbose,
156167
num_chars=num_chars,
168+
target_encoding=target_encoding,
157169
)
158170
if retval > 0 and global_retval == 0:
159171
global_retval = retval
@@ -168,8 +180,10 @@ def handle_path(
168180
encoding: Optional[str] = None,
169181
num_chars: Optional[int] = None,
170182
verbose: bool = False,
183+
target_encoding: Optional[str] = None,
171184
) -> int:
172185
encoding = encoding or get_encoding(path)
186+
target_encoding = target_encoding or encoding
173187
dialect = detect_dialect(
174188
path, num_chars=num_chars, encoding=encoding, verbose=verbose
175189
)
@@ -178,10 +192,10 @@ def handle_path(
178192
return 1
179193

180194
if self.args.in_place:
181-
return self._in_place(path, dialect, encoding)
195+
return self._in_place(path, dialect, encoding, target_encoding)
182196
elif output is None:
183197
return self._to_stdout(path, dialect, encoding)
184-
return self._to_file(path, output, dialect, encoding)
198+
return self._to_file(path, output, dialect, encoding, target_encoding)
185199

186200
def _write_transposed(
187201
self,
@@ -224,7 +238,11 @@ def _write_to_stream(
224238
self._write_direct(path, stream, dialect, encoding)
225239

226240
def _in_place(
227-
self, path: StrPath, dialect: SimpleDialect, encoding: Optional[str]
241+
self,
242+
path: StrPath,
243+
dialect: SimpleDialect,
244+
encoding: Optional[str],
245+
target_encoding: Optional[str],
228246
) -> int:
229247
"""In-place mode overwrites the input file, if necessary
230248
@@ -235,7 +253,7 @@ def _in_place(
235253
236254
"""
237255
tmpfd, tmpfname = tempfile.mkstemp(prefix="clevercsv_", suffix=".csv")
238-
tmpid = os.fdopen(tmpfd, "w", newline="", encoding=encoding)
256+
tmpid = os.fdopen(tmpfd, "w", newline="", encoding=target_encoding)
239257
self._write_to_stream(path, tmpid, dialect, encoding)
240258
tmpid.close()
241259

@@ -263,7 +281,8 @@ def _to_file(
263281
output: StrPath,
264282
dialect: SimpleDialect,
265283
encoding: Optional[str],
284+
target_encoding: Optional[str],
266285
) -> int:
267-
with open(output, "w", newline="", encoding=encoding) as fp:
286+
with open(output, "w", newline="", encoding=target_encoding) as fp:
268287
self._write_to_stream(path, fp, dialect, encoding)
269288
return 0

tests/test_unit/test_console.py

+84
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from clevercsv._types import _DialectLike
2222
from clevercsv.console import build_application
2323
from clevercsv.dialect import SimpleDialect
24+
from clevercsv.encoding import get_encoding
2425
from clevercsv.write import writer
2526

2627
TableType = List[List[Any]]
@@ -640,3 +641,86 @@ def test_standardize_in_place_multi_noop(self) -> None:
640641
self.assertEqual(contents, exp)
641642
finally:
642643
any(map(os.unlink, tmpfnames))
644+
645+
def test_standardize_target_encoding(self) -> None:
646+
table: TableType = [["Å", "B", "C"], ["é", "ü", "中"], [4, 5, 6]]
647+
dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
648+
encoding = "utf-8"
649+
tmpfname = self._build_file(table, dialect, encoding=encoding)
650+
651+
tmpfd, tmpoutname = tempfile.mkstemp(prefix="ccsv_", suffix=".csv")
652+
os.close(tmpfd)
653+
654+
application = build_application()
655+
tester = Tester(application)
656+
tester.test_command(
657+
"standardize", ["-o", tmpoutname, "-E", "utf-8", tmpfname]
658+
)
659+
660+
# Excel format (i.e. RFC4180) *requires* CRLF
661+
crlf = "\r\n"
662+
exp = crlf.join(["Å,B,C", "é,ü,中", "4,5,6", ""])
663+
with open(tmpoutname, "r", newline="", encoding="utf-8") as fp:
664+
output = fp.read()
665+
666+
try:
667+
self.assertEqual(exp, output)
668+
finally:
669+
os.unlink(tmpfname)
670+
os.unlink(tmpoutname)
671+
672+
def test_standardize_target_encoding2(self) -> None:
673+
table: TableType = [["A", "B", "C"], ["é", "è", "à"], [4, 5, 6]]
674+
dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
675+
encoding = "latin-1"
676+
tmpfname = self._build_file(table, dialect, encoding=encoding)
677+
self.assertEqual(
678+
"ISO-8859-1", get_encoding(tmpfname, try_cchardet=False)
679+
)
680+
tmpfd, tmpoutname = tempfile.mkstemp(prefix="ccsv_", suffix=".csv")
681+
os.close(tmpfd)
682+
683+
application = build_application()
684+
tester = Tester(application)
685+
tester.test_command(
686+
"standardize",
687+
["-o", tmpoutname, "-e", "latin-1", "-E", "utf-8", tmpfname],
688+
)
689+
690+
# Excel format (i.e. RFC4180) *requires* CRLF
691+
crlf = "\r\n"
692+
exp = crlf.join(["A,B,C", "é,è,à", "4,5,6", ""])
693+
694+
self.assertEqual("utf-8", get_encoding(tmpoutname, try_cchardet=False))
695+
with open(tmpoutname, "r", newline="", encoding="utf-8") as fp:
696+
output = fp.read()
697+
698+
try:
699+
self.assertEqual(exp, output)
700+
701+
finally:
702+
os.unlink(tmpfname)
703+
os.unlink(tmpoutname)
704+
705+
def test_standardize_target_encoding_raise_UnicodeEncodeError(
706+
self,
707+
) -> None:
708+
table: TableType = [["Å", "B", "C"], ["é", "ü", "中"], [4, 5, 6]]
709+
dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
710+
encoding = "utf-8"
711+
tmpfname = self._build_file(table, dialect, encoding=encoding)
712+
713+
tmpfd, tmpoutname = tempfile.mkstemp(prefix="ccsv_", suffix=".csv")
714+
os.close(tmpfd)
715+
716+
application = build_application()
717+
tester = Tester(application)
718+
try:
719+
with self.assertRaises(UnicodeEncodeError):
720+
tester.test_command(
721+
"standardize",
722+
["-o", tmpoutname, "-E", "latin-1", tmpfname],
723+
)
724+
finally:
725+
os.unlink(tmpfname)
726+
os.unlink(tmpoutname)

0 commit comments

Comments
 (0)