|
13 | 13 | #include <limits.h>
|
14 | 14 | #include <stdarg.h>
|
15 | 15 | #include <stdbool.h>
|
| 16 | +#include <stdint.h> |
16 | 17 | #include <stdio.h>
|
17 | 18 | #include <stdlib.h>
|
18 | 19 | #include <string.h>
|
@@ -744,3 +745,93 @@ arc_lowercase(char *str)
|
744 | 745 | }
|
745 | 746 | }
|
746 | 747 | }
|
| 748 | + |
/**
 *  Check whether a string is valid UTF-8
 *
 *  Beyond structural well-formedness (correct lead and continuation
 *  bytes), the string is rejected if it contains an overlong encoding,
 *  a code point above U+10FFFF, a UTF-16 surrogate, or a Unicode
 *  non-character.
 *
 *  Parameters:
 *      str: NUL-terminated string to check; must not be NULL
 *
 *  Returns:
 *      Whether the string passed the checks.
 */

bool
arc_check_utf8(const char *str)
{
    size_t   charlen;
    uint32_t u;
    uint8_t  mask;

    for (const unsigned char *p = (const unsigned char *) str; *p != '\0'; p++)
    {
        /* Plain ASCII needs no further inspection. */
        if (*p < 0x80)
        {
            continue;
        }

        /* The lead byte determines the sequence length and payload bits. */
        if ((*p & 0xe0) == 0xc0)
        {
            charlen = 2;
            mask = 0x1f;
        }
        else if ((*p & 0xf0) == 0xe0)
        {
            charlen = 3;
            mask = 0x0f;
        }
        else if ((*p & 0xf8) == 0xf0)
        {
            charlen = 4;
            mask = 0x07;
        }
        else
        {
            /* Anything else that has the high bit set is invalid. */
            return false;
        }

        u = *p & mask;
        for (size_t i = 1; i < charlen; i++)
        {
            p++;

            /*
             * The terminating NUL is not a continuation byte, so a
             * truncated sequence fails here before reading past the end.
             */
            if ((*p & 0xc0) != 0x80)
            {
                return false;
            }
            u = (u << 6) | (*p & 0x3f);
        }

        /* Check that the codepoint used the shortest representation. */
        if ((u < 0x80) || ((u < 0x800) && (charlen > 2)) ||
            ((u < 0x10000) && (charlen > 3)))
        {
            return false;
        }

        /* Check for invalid codepoints. */

        /*
         * Four-byte sequences can encode values up to 0x1fffff, but
         * nothing above U+10FFFF is a Unicode code point.
         */
        if (u > 0x10ffff)
        {
            return false;
        }

        /* surrogates */
        if (u >= 0xd800 && u <= 0xdfff)
        {
            return false;
        }

        /*
         * non-characters: U+FDD0..U+FDEF and the last two code points
         * of every plane (U+xxFFFE and U+xxFFFF)
         */
        if ((u >= 0xfdd0 && u <= 0xfdef) || ((u & 0xfffe) == 0xfffe))
        {
            return false;
        }
    }

    return true;
}
0 commit comments