From 3e6f8047a6275d486139136153a08d51b18d3abd Mon Sep 17 00:00:00 2001 From: Leah Neukirchen Date: Thu, 23 Nov 2017 16:22:06 +0100 Subject: [PATCH] u8decode: detect invalid encodings --- u8decode.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/u8decode.h b/u8decode.h index 7599377..76ae43d 100644 --- a/u8decode.h +++ b/u8decode.h @@ -2,7 +2,7 @@ // Decode one UTF-8 codepoint into cp, return number of bytes to next one. // On invalid UTF-8, return -1, and do not change cp. -// Overlong sequences, surrogates and invalid codepoints are not checked. +// Invalid codepoints are not checked. // // This code is meant to be inlined, if cp is unused it can be optimized away. static int @@ -12,10 +12,18 @@ u8decode(const char *cs, uint32_t *cp) if (*s == 0) { *cp = 0; return 0; } if (*s < 0x80) { *cp = *s; return 1; } - if (*s < 0xc0) { return -1; } + if (*s < 0xc2) { return -1; } //cont+overlong if (*s < 0xe0) { *cp = *s & 0x1f; goto u2; } - if (*s < 0xf0) { *cp = *s & 0x0f; goto u3; } - if (*s < 0xf8) { *cp = *s & 0x07; goto u4; } + if (*s < 0xf0) { + if (*s == 0xe0 && (s[1] & 0xe0) == 0x80) return -1; //overlong + if (*s == 0xed && (s[1] & 0xe0) == 0xa0) return -1; //surrogate + *cp = *s & 0x0f; goto u3; + } + if (*s < 0xf5) { + if (*s == 0xf0 && (s[1] & 0xf0) == 0x80) return -1; //overlong + if (*s == 0xf4 && (s[1] > 0x8f)) return -1; //too high + *cp = *s & 0x07; goto u4; + } return -1; u4: if ((*++s & 0xc0) != 0x80) return -1; *cp = (*cp << 6) | (*s & 0x3f);