|
21 | 21 | from clevercsv._types import _DialectLike
|
22 | 22 | from clevercsv.console import build_application
|
23 | 23 | from clevercsv.dialect import SimpleDialect
|
| 24 | +from clevercsv.encoding import get_encoding |
24 | 25 | from clevercsv.write import writer
|
25 | 26 |
|
26 | 27 | TableType = List[List[Any]]
|
@@ -640,3 +641,86 @@ def test_standardize_in_place_multi_noop(self) -> None:
|
640 | 641 | self.assertEqual(contents, exp)
|
641 | 642 | finally:
|
642 | 643 | any(map(os.unlink, tmpfnames))
|
| 644 | + |
def test_standardize_target_encoding(self) -> None:
    """Run ``standardize`` with ``-E utf-8`` on a UTF-8 encoded input file.

    The input table uses ``;`` as delimiter and contains non-ASCII cells.
    The file written via ``-o`` is read back as UTF-8 and must equal the
    comma-separated, CRLF-terminated rendering of the same table.
    """
    table: TableType = [["Å", "B", "C"], ["é", "ü", "中"], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    encoding = "utf-8"
    # NOTE(review): _build_file presumably writes the table to a temporary
    # file and returns its path -- confirm against the helper.
    tmpfname = self._build_file(table, dialect, encoding=encoding)

    # Stays None until mkstemp succeeds so that the cleanup below never
    # tries to remove a file that was not created.
    tmpoutname = None
    try:
        tmpfd, tmpoutname = tempfile.mkstemp(prefix="ccsv_", suffix=".csv")
        # Only the path is needed; the command opens the file itself.
        os.close(tmpfd)

        application = build_application()
        tester = Tester(application)
        tester.test_command(
            "standardize", ["-o", tmpoutname, "-E", "utf-8", tmpfname]
        )

        # Excel format (i.e. RFC4180) *requires* CRLF
        crlf = "\r\n"
        exp = crlf.join(["Å,B,C", "é,ü,中", "4,5,6", ""])
        # newline="" disables newline translation so CRLF stays visible.
        with open(tmpoutname, "r", newline="", encoding="utf-8") as fp:
            output = fp.read()

        self.assertEqual(exp, output)
    finally:
        # Runs even when the command or the read-back fails, so neither
        # temporary file is left behind.
        os.unlink(tmpfname)
        if tmpoutname is not None:
            os.unlink(tmpoutname)
| 671 | + |
def test_standardize_target_encoding2(self) -> None:
    """Run ``standardize`` with ``-e latin-1 -E utf-8`` on a latin-1 file.

    Checks that the input file is detected as ISO-8859-1, that the file
    written via ``-o`` is detected as utf-8, and that its contents equal
    the comma-separated, CRLF-terminated rendering of the table.
    """
    table: TableType = [["A", "B", "C"], ["é", "è", "à"], [4, 5, 6]]
    dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
    encoding = "latin-1"
    # NOTE(review): _build_file presumably writes the table to a temporary
    # file and returns its path -- confirm against the helper.
    tmpfname = self._build_file(table, dialect, encoding=encoding)

    # Stays None until mkstemp succeeds so that the cleanup below never
    # tries to remove a file that was not created.
    tmpoutname = None
    try:
        # Sanity check on the fixture: the input must be seen as latin-1.
        self.assertEqual(
            "ISO-8859-1", get_encoding(tmpfname, try_cchardet=False)
        )
        tmpfd, tmpoutname = tempfile.mkstemp(prefix="ccsv_", suffix=".csv")
        # Only the path is needed; the command opens the file itself.
        os.close(tmpfd)

        application = build_application()
        tester = Tester(application)
        tester.test_command(
            "standardize",
            ["-o", tmpoutname, "-e", "latin-1", "-E", "utf-8", tmpfname],
        )

        # Excel format (i.e. RFC4180) *requires* CRLF
        crlf = "\r\n"
        exp = crlf.join(["A,B,C", "é,è,à", "4,5,6", ""])

        self.assertEqual(
            "utf-8", get_encoding(tmpoutname, try_cchardet=False)
        )
        # newline="" disables newline translation so CRLF stays visible.
        with open(tmpoutname, "r", newline="", encoding="utf-8") as fp:
            output = fp.read()

        self.assertEqual(exp, output)
    finally:
        # Runs even when an encoding check, the command or the read-back
        # fails, so neither temporary file is left behind.
        os.unlink(tmpfname)
        if tmpoutname is not None:
            os.unlink(tmpoutname)
| 704 | + |
def test_standardize_target_encoding_raise_UnicodeEncodeError(
    self,
) -> None:
    """Expect ``UnicodeEncodeError`` from ``standardize -E latin-1``.

    The UTF-8 input table contains "中", a character that latin-1 cannot
    encode, so running the command with ``-E latin-1`` is expected to
    raise.
    """
    rows: TableType = [["Å", "B", "C"], ["é", "ü", "中"], [4, 5, 6]]
    semicolon_dialect = SimpleDialect(
        delimiter=";", quotechar="", escapechar=""
    )
    tmpfname = self._build_file(rows, semicolon_dialect, encoding="utf-8")

    out_fd, tmpoutname = tempfile.mkstemp(prefix="ccsv_", suffix=".csv")
    # Only the path is needed; the descriptor can be released right away.
    os.close(out_fd)

    tester = Tester(build_application())
    command_args = ["-o", tmpoutname, "-E", "latin-1", tmpfname]
    try:
        self.assertRaises(
            UnicodeEncodeError,
            tester.test_command,
            "standardize",
            command_args,
        )
    finally:
        # Remove both temporary files whether or not the error was raised.
        os.unlink(tmpfname)
        os.unlink(tmpoutname)
0 commit comments