revamp utf-8 handling code

pull/79/head
Leah Neukirchen 7 years ago
parent 8b50a794b0
commit 73641367db

@ -18,6 +18,7 @@
#include <wchar.h>
#include "blaze822.h"
#include "u8decode.h"
static int cols;
static wchar_t replacement = L'?';
@ -41,25 +42,24 @@ u8putstr(FILE *out, char *s, ssize_t l, int pad)
while (*s && l > 0) {
if (*s == '\t')
*s = ' ';
if (*s >= 32 && *s < 127) {
putc(*s, out);
s++;
l--;
} else if ((unsigned)*s < 32 || *s == 127) { // C0
if ((unsigned)*s < 32 || *s == 127) { // C0
fprintf(out, "%lc", (wint_t)(*s == 127 ? 0x2421 : 0x2400+*s));
s++;
l--;
} else {
wchar_t wc;
int r = mbtowc(&wc, s, 4);
uint32_t c;
int r = u8decode(s, &c);
if (r < 0) {
r = 1;
wc = replacement;
fprintf(out, "%lc", (wint_t)replacement);
s++;
} else {
l -= wcwidth((wchar_t)c);
if (l >= 0)
fwrite(s, 1, r, out);
s += r;
}
s += r;
l -= wcwidth(wc);
if (l >= 0)
fprintf(out, "%lc", (wint_t)wc);
}
}
if (pad)

@ -1,76 +1,57 @@
#include <stdint.h>
#include <stdio.h>
#include "u8decode.h"
void
safe_u8putstr(char *s0, size_t l, FILE *stream)
{
// tty-safe output of s, with relaxed utf-8 semantics:
// - C0 and C1 are displayed as escape sequences
// - valid utf8 is printed as is
// - rest is assumed to be latin1, and translated into utf8
// - valid utf-8 is printed as is
// - rest is assumed to be latin-1, and translated into utf-8
// - translate CRLF to LF
unsigned char *s = (unsigned char *)s0;
unsigned char *e = s + l;
uint32_t c;
while (s < e) {
if ((*s & 0x80) == 0) {
if (*s < 32 &&
*s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') {
// C0
int l = u8decode((char *)s, &c);
if (l == -1) {
l = 1;
if (*s <= 0x9fu) {
// C1
fputc(0xe2, stream);
fputc(0x90, stream);
fputc(0x80+*s, stream);
} else if (*s == 127) {
// DEL
fputc(0x80+0x1b, stream);
fputc(0xe2, stream);
fputc(0x90, stream);
fputc(0xa1, stream);
} else if (*s == '\r') {
if (e - s > 1 && s[1] == '\n')
s++;
fputc(*s, stream);
} else {
// safe ASCII
fputc(*s, stream);
/* invalid utf-8, assume it was latin-1 */
fputc(0xc0 | (*s >> 6), stream);
fputc(0x80 | (*s & 0x3f), stream);
}
} else if ((*s & 0xc0) == 0x80) {
if (*s >= 0xa0)
goto latin1;
// C1
} else if (c < 32 &&
*s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') {
// C0
fputc(0xe2, stream);
fputc(0x90, stream);
fputc(0x80+0x1b, stream);
fputc(0x80+*s, stream);
} else if (c == 127) {
// DEL
fputc(0xe2, stream);
fputc(0x90, stream);
fputc(0xa1, stream);
} else if (c == '\r') {
if (e - s > 1 && s[1] == '\n')
s++;
fputc(*s, stream);
} else {
uint32_t f = 0;
if (e - s >= 4)
f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8) | s[3];
else if (e - s == 3)
f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8);
else if (e - s == 2)
f = (s[0]<<24) | (s[1]<<16);
else if (e - s == 1)
f = (s[0]<<24);
if ((f & 0xe0c00000) == 0xc0800000) goto u2;
else if ((f & 0xf0c0c000) == 0xe0808000) goto u3;
else if ((f & 0xf8c0c0c0) == 0xf0808080) {
fputc(*s++, stream);
u3: fputc(*s++, stream);
u2: fputc(*s++, stream);
fputc(*s, stream);
} else {
latin1:
/* invalid utf8, assume it was latin1 */
fputc(0xc0 | (*s >> 6), stream);
fputc(0x80 | (*s & 0x3f), stream);
}
fwrite(s, 1, l, stream);
}
s++;
s += l;
}
}

@ -0,0 +1,25 @@
#include <stdint.h>
// Decode one UTF-8 codepoint starting at cs into *cp.
//
// Returns the number of bytes to the next codepoint (1 to 4).
// If cs points at a NUL byte, *cp is set to 0 and 0 is returned, so
// callers that loop on the return value must handle 0 themselves.
// On invalid UTF-8 (stray continuation byte, invalid lead byte, or a
// missing continuation byte), return -1, and do not change *cp.
// Overlong sequences, surrogates and invalid codepoints are not checked.
//
// Decoding stops at the first byte that is not a continuation byte, so a
// terminating NUL is never read past.
//
// This code is meant to be inlined, if cp is unused it can be optimized away.
static int
u8decode(const char *cs, uint32_t *cp)
{
	const uint8_t *s = (const uint8_t *)cs;
	uint32_t c;     // accumulated in a local so *cp survives a failed decode
	int n;          // number of continuation bytes the lead byte announces

	if (*s == 0)   { *cp = 0; return 0; }
	if (*s < 0x80) { *cp = *s; return 1; }

	if (*s < 0xc0)      { return -1; }  // continuation byte without lead
	else if (*s < 0xe0) { c = *s & 0x1f; n = 1; }
	else if (*s < 0xf0) { c = *s & 0x0f; n = 2; }
	else if (*s < 0xf8) { c = *s & 0x07; n = 3; }
	else                { return -1; }  // 0xf8..0xff never start a sequence

	for (int i = 1; i <= n; i++) {
		if ((s[i] & 0xc0) != 0x80)
			return -1;
		c = (c << 6) | (s[i] & 0x3f);
	}

	// Commit the result only once the whole sequence has been validated.
	*cp = c;
	return n + 1;
}
Loading…
Cancel
Save