diff --git a/mscan.c b/mscan.c index def7fd6..4260af8 100644 --- a/mscan.c +++ b/mscan.c @@ -18,6 +18,7 @@ #include #include "blaze822.h" +#include "u8decode.h" static int cols; static wchar_t replacement = L'?'; @@ -41,25 +42,24 @@ u8putstr(FILE *out, char *s, ssize_t l, int pad) while (*s && l > 0) { if (*s == '\t') *s = ' '; - if (*s >= 32 && *s < 127) { - putc(*s, out); - s++; - l--; - } else if ((unsigned)*s < 32 || *s == 127) { // C0 + + if ((unsigned)*s < 32 || *s == 127) { // C0 fprintf(out, "%lc", (wint_t)(*s == 127 ? 0x2421 : 0x2400+*s)); s++; l--; } else { - wchar_t wc; - int r = mbtowc(&wc, s, 4); + uint32_t c; + int r = u8decode(s, &c); if (r < 0) { r = 1; - wc = replacement; + fprintf(out, "%lc", (wint_t)replacement); + s++; + } else { + l -= wcwidth((wchar_t)c); + if (l >= 0) + fwrite(s, 1, r, out); + s += r; } - s += r; - l -= wcwidth(wc); - if (l >= 0) - fprintf(out, "%lc", (wint_t)wc); } } if (pad) diff --git a/safe_u8putstr.c b/safe_u8putstr.c index e198b54..31cdb0b 100644 --- a/safe_u8putstr.c +++ b/safe_u8putstr.c @@ -1,76 +1,57 @@ #include #include +#include "u8decode.h" + void safe_u8putstr(char *s0, size_t l, FILE *stream) { // tty-safe output of s, with relaxed utf-8 semantics: // - C0 and C1 are displayed as escape sequences - // - valid utf8 is printed as is - // - rest is assumed to be latin1, and translated into utf8 + // - valid utf-8 is printed as is + // - rest is assumed to be latin-1, and translated into utf-8 // - translate CRLF to CR unsigned char *s = (unsigned char *)s0; unsigned char *e = s + l; + uint32_t c; while (s < e) { - if ((*s & 0x80) == 0) { - if (*s < 32 && - *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') { - // C0 + int l = u8decode((char *)s, &c); + if (l == -1) { + l = 1; + if (*s <= 0x9fu) { + // C1 fputc(0xe2, stream); fputc(0x90, stream); - fputc(0x80+*s, stream); - } else if (*s == 127) { - // DEL + fputc(0x80+0x1b, stream); + fputc(0xe2, stream); fputc(0x90, stream); - fputc(0xa1, stream); - } else if 
(*s == '\r') { - if (e - s > 1 && s[1] == '\n') - s++; fputc(*s, stream); } else { - // safe ASCII - fputc(*s, stream); + /* invalid utf-8, assume it was latin-1 */ + fputc(0xc0 | (*s >> 6), stream); + fputc(0x80 | (*s & 0x3f), stream); } - } else if ((*s & 0xc0) == 0x80) { - if (*s >= 0xa0) - goto latin1; - - // C1 + } else if (c < 32 && + *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') { + // C0 fputc(0xe2, stream); fputc(0x90, stream); - fputc(0x80+0x1b, stream); - + fputc(0x80+*s, stream); + } else if (c == 127) { + // DEL fputc(0xe2, stream); fputc(0x90, stream); + fputc(0xa1, stream); + } else if (c == '\r') { + if (e - s > 1 && s[1] == '\n') + s++; fputc(*s, stream); } else { - uint32_t f = 0; - if (e - s >= 4) - f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8) | s[3]; - else if (e - s == 3) - f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8); - else if (e - s == 2) - f = (s[0]<<24) | (s[1]<<16); - else if (e - s == 1) - f = (s[0]<<24); - - if ((f & 0xe0c00000) == 0xc0800000) goto u2; - else if ((f & 0xf0c0c000) == 0xe0808000) goto u3; - else if ((f & 0xf8c0c0c0) == 0xf0808080) { - fputc(*s++, stream); -u3: fputc(*s++, stream); -u2: fputc(*s++, stream); - fputc(*s, stream); - } else { -latin1: - /* invalid utf8, assume it was latin1 */ - fputc(0xc0 | (*s >> 6), stream); - fputc(0x80 | (*s & 0x3f), stream); - } + fwrite(s, 1, l, stream); } - s++; + s += l; } } diff --git a/u8decode.h b/u8decode.h new file mode 100644 index 0000000..7599377 --- /dev/null +++ b/u8decode.h @@ -0,0 +1,25 @@ +#include + +// Decode one UTF-8 codepoint into cp, return number of bytes to next one. +// On invalid UTF-8, return -1, and do not change cp. +// Overlong sequences, surrogates and invalid codepoints are not checked. +// +// This code is meant to be inlined, if cp is unused it can be optimized away. 
// Decode the UTF-8 sequence starting at cs.
//
// cs: points at the first byte of a sequence.  Continuation bytes are read
//     one at a time and decoding stops at the first byte that is not of the
//     form 10xxxxxx, so a NUL terminator always ends the scan.
// cp: receives the decoded codepoint on success only.
//
// Returns:
//    0     cs points at a NUL byte (*cp is set to 0)
//    1..4  number of bytes consumed (*cp holds the codepoint)
//   -1     stray continuation byte (0x80..0xbf), lead byte >= 0xf8, or a
//          missing/invalid continuation byte; *cp is left untouched
//
// Overlong sequences, surrogates and codepoints above U+10FFFF are
// deliberately not rejected.
//
// This code is meant to be inlined, if cp is unused it can be optimized away.
static int
u8decode(const char *cs, uint32_t *cp)
{
	const uint8_t *s = (const uint8_t *)cs;
	uint32_t c;	// accumulated locally so a failure never clobbers *cp
	int n;		// total sequence length implied by the lead byte

	if (*s == 0) {
		*cp = 0;
		return 0;
	}
	if (*s < 0x80) {
		*cp = *s;
		return 1;
	}

	if (*s < 0xc0) {
		return -1;	// continuation byte in lead position
	} else if (*s < 0xe0) {
		c = (uint32_t)(*s & 0x1f);
		n = 2;
	} else if (*s < 0xf0) {
		c = (uint32_t)(*s & 0x0f);
		n = 3;
	} else if (*s < 0xf8) {
		c = (uint32_t)(*s & 0x07);
		n = 4;
	} else {
		return -1;	// 0xf8..0xff never start a sequence
	}

	for (int i = 1; i < n; i++) {
		if ((s[i] & 0xc0) != 0x80)
			return -1;
		c = (c << 6) | (uint32_t)(s[i] & 0x3f);
	}

	*cp = c;
	return n;
}