Skip to content

Commit 59249b8

Browse files
committed
Validate UTF-8
1 parent 0e24211 commit 59249b8

File tree

3 files changed

+103
-1
lines changed

3 files changed

+103
-1
lines changed

libopenarc/arc.c

+11-1
Original file line numberDiff line numberDiff line change
@@ -1479,11 +1479,17 @@ arc_process_set(ARC_MESSAGE *msg,
14791479
set->set_data = hcopy;
14801480
set->set_bad = false;
14811481

1482+
if (!arc_check_utf8(hcopy))
1483+
{
1484+
arc_error(msg, "invalid UTF-8 in %s data", settype);
1485+
set->set_bad = true;
1486+
return ARC_STAT_SYNTAX;
1487+
}
1488+
14821489
for (p = hcopy; *p != '\0' && !stop; p++)
14831490
{
14841491
if (isascii(*p) && !isprint(*p) && !isspace(*p))
14851492
{
1486-
/* FIXME: should this do more validation of UTF-8? */
14871493
arc_error(
14881494
msg, "invalid character (ASCII 0x%02x at offset %d) in %s data",
14891495
*p, p - hcopy, settype);
@@ -2480,6 +2486,10 @@ arc_parse_header_field(ARC_MESSAGE *msg,
24802486
assert(hlen != 0);
24812487

24822488
/* enforce RFC 5322, Section 2.2 as extended by RFC 6532, Section 3.2 */
2489+
if (!arc_check_utf8(hdr))
2490+
{
2491+
return ARC_STAT_SYNTAX;
2492+
}
24832493
colon = NULL;
24842494
for (c = 0; c < hlen; c++)
24852495
{

util/arc-dstring.c

+91
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <limits.h>
1414
#include <stdarg.h>
1515
#include <stdbool.h>
16+
#include <stdint.h>
1617
#include <stdio.h>
1718
#include <stdlib.h>
1819
#include <string.h>
@@ -744,3 +745,93 @@ arc_lowercase(char *str)
744745
}
745746
}
746747
}
748+
749+
/**
 *  Check whether a string is valid UTF-8
 *
 *  Walks the NUL-terminated string one encoded character at a time and
 *  rejects it as soon as any of the following is found:
 *    - a byte with the high bit set that is not a valid 2-, 3- or 4-byte
 *      lead byte (including a stray continuation byte),
 *    - a multi-byte sequence that is truncated or whose trailing bytes
 *      are not continuation bytes (10xxxxxx),
 *    - an overlong (non-shortest) encoding,
 *    - a code point above U+10FFFF,
 *    - a UTF-16 surrogate (U+D800..U+DFFF),
 *    - a Unicode non-character (U+FDD0..U+FDEF, or U+nFFFE / U+nFFFF in
 *      any plane).
 *
 *  Parameters:
 *      str: NUL-terminated string to check; it is dereferenced
 *           unconditionally, so it must not be NULL
 *
 *  Returns:
 *      Whether the string passed the checks.  An empty string passes.
 */

bool
arc_check_utf8(const char *str)
{
    size_t   charlen;
    uint32_t u;
    uint8_t  mask;

    for (const unsigned char *p = (const unsigned char *) str; *p != '\0'; p++)
    {
        /* Plain ASCII needs no further checking. */
        if (*p < 0x80)
        {
            continue;
        }

        /* The lead byte determines the sequence length and payload bits. */
        if ((*p & 0xe0) == 0xc0)
        {
            charlen = 2;
            mask = 0x1f;
        }
        else if ((*p & 0xf0) == 0xe0)
        {
            charlen = 3;
            mask = 0x0f;
        }
        else if ((*p & 0xf8) == 0xf0)
        {
            charlen = 4;
            mask = 0x07;
        }
        else
        {
            /* Anything else that has the high bit set is invalid. */
            return false;
        }

        u = *p & mask;
        for (size_t i = 1; i < charlen; i++)
        {
            p++;

            /*
             *  A NUL terminator is not a continuation byte, so a truncated
             *  sequence fails here and we never read past the end of the
             *  string.
             */
            if ((*p & 0xc0) != 0x80)
            {
                return false;
            }
            u = (u << 6) | (uint32_t) (*p & 0x3f);
        }

        /* Check that the codepoint used the shortest representation. */
        if ((u < 0x80) || ((u < 0x800) && (charlen > 2)) ||
            ((u < 0x10000) && (charlen > 3)))
        {
            return false;
        }

        /*
         *  A 4-byte sequence can carry 21 bits (up to 0x1FFFFF), but
         *  Unicode ends at U+10FFFF.
         */
        if (u > 0x10ffff)
        {
            return false;
        }

        /* Check for invalid codepoints. */

        /* surrogates */
        if (u >= 0xd800 && u <= 0xdfff)
        {
            return false;
        }

        /*
         *  non-characters: U+FDD0..U+FDEF plus the last two code points of
         *  every plane (low 16 bits equal to 0xFFFE or 0xFFFF)
         */
        if ((u >= 0xfdd0 && u <= 0xfdef) || (u & 0xfffe) == 0xfffe)
        {
            return false;
        }
    }

    return true;
}

util/arc-dstring.h

+1
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,6 @@ extern void arc_clobber_array(char **);
4545
extern void arc_collapse(char *);
4646
extern char **arc_copy_array(char **);
4747
extern void arc_lowercase(char *);
48+
extern bool arc_check_utf8(const char *);
4849

4950
#endif /* ARC_DSTRING_H_ */

0 commit comments

Comments
 (0)