revamp utf-8 handling code

7 years ago · 73641367db
parent 8b50a794b0
commit 73641367db
3 changed files with 64 additions and 58 deletions
--- a/mscan.c
+++ b/mscan.c
@ -18,6 +18,7 @@
 #include <wchar.h>

 #include "blaze822.h"
+#include "u8decode.h"

 static int cols;
 static wchar_t replacement = L'?';
@ -41,25 +42,24 @@ u8putstr(FILE *out, char *s, ssize_t l, int pad)
 	while (*s && l > 0) {
 		if (*s == '\t')
 			*s = ' ';
-		if (*s >= 32 && *s < 127) {
-			putc(*s, out);
-			s++;
-			l--;
-		} else if ((unsigned)*s < 32 || *s == 127) {  // C0
+
+		if ((unsigned)*s < 32 || *s == 127) {  // C0
 			fprintf(out, "%lc", (wint_t)(*s == 127 ? 0x2421 : 0x2400+*s));
 			s++;
 			l--;
 		} else {
-			wchar_t wc;
-			int r = mbtowc(&wc, s, 4);
+			uint32_t c;
+			int r = u8decode(s, &c);
 			if (r < 0) {
 				r = 1;
-				wc = replacement;
+				fprintf(out, "%lc", (wint_t)replacement);
+				s++;
+			} else {
+				l -= wcwidth((wchar_t)c);
+				if (l >= 0)
+					fwrite(s, 1, r, out);
+				s += r;
 			}
-			s += r;
-			l -= wcwidth(wc);
-			if (l >= 0)
-				fprintf(out, "%lc", (wint_t)wc);
 		}
 	}
 	if (pad)
--- a/safe_u8putstr.c
+++ b/safe_u8putstr.c
@ -1,76 +1,57 @@
 #include <stdint.h>
 #include <stdio.h>

+#include "u8decode.h"
+
 void
 safe_u8putstr(char *s0, size_t l, FILE *stream)
 {
 	// tty-safe output of s, with relaxed utf-8 semantics:
 	// - C0 and C1 are displayed as escape sequences
-	// - valid utf8 is printed as is
-	// - rest is assumed to be latin1, and translated into utf8
+	// - valid utf-8 is printed as is
+	// - rest is assumed to be latin-1, and translated into utf-8
 	// - translate CRLF to LF

 	unsigned char *s = (unsigned char *)s0;
 	unsigned char *e = s + l;
+	uint32_t c;

 	while (s < e) {
-		if ((*s & 0x80) == 0) {
-			if (*s < 32 &&
-			    *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') {
-				// C0
+		int l = u8decode((char *)s, &c);
+		if (l == -1) {
+			l = 1;
+			if (*s <= 0x9fu) {
+				// C1
 				fputc(0xe2, stream);
 				fputc(0x90, stream);
-				fputc(0x80+*s, stream);
-			} else if (*s == 127) {
-				// DEL
+				fputc(0x80+0x1b, stream);
+
 				fputc(0xe2, stream);
 				fputc(0x90, stream);
-				fputc(0xa1, stream);
-			} else if (*s == '\r') {
-				if (e - s > 1 && s[1] == '\n')
-					s++;
 				fputc(*s, stream);
 			} else {
-				// safe ASCII
-				fputc(*s, stream);
+				/* invalid utf-8, assume it was latin-1 */
+				fputc(0xc0 | (*s >> 6), stream);
+				fputc(0x80 | (*s & 0x3f), stream);
 			}
-		} else if ((*s & 0xc0) == 0x80) {
-			if (*s >= 0xa0)
-				goto latin1;
-
-			// C1
+		} else if (c < 32 && 
+		    *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') {
+			// C0
 			fputc(0xe2, stream);
 			fputc(0x90, stream);
-			fputc(0x80+0x1b, stream);
-
+			fputc(0x80+*s, stream);
+		} else if (c == 127) {
+			// DEL
 			fputc(0xe2, stream);
 			fputc(0x90, stream);
+			fputc(0xa1, stream);
+		} else if (c == '\r') {
+			if (e - s > 1 && s[1] == '\n')
+				s++;
 			fputc(*s, stream);
 		} else {
-			uint32_t f = 0;
-			if (e - s >= 4)
-				f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8) | s[3];
-			else if (e - s == 3)
-				f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8);
-			else if (e - s == 2)
-				f = (s[0]<<24) | (s[1]<<16);
-			else if (e - s == 1)
-				f = (s[0]<<24);
-
-			if      ((f & 0xe0c00000) == 0xc0800000) goto u2;
-			else if ((f & 0xf0c0c000) == 0xe0808000) goto u3;
-			else if ((f & 0xf8c0c0c0) == 0xf0808080) {
-				fputc(*s++, stream);
-u3:                             fputc(*s++, stream);
-u2:                             fputc(*s++, stream);
-				fputc(*s, stream);
-			} else {
-latin1:
-				/* invalid utf8, assume it was latin1 */
-				fputc(0xc0 | (*s >> 6), stream);
-				fputc(0x80 | (*s & 0x3f), stream);
-			}
+			fwrite(s, 1, l, stream);
 		}
-		s++;
+		s += l;
 	}
 }
--- a/u8decode.h
+++ b/u8decode.h
@ -0,0 +1,25 @@
+#include <stdint.h>
+
+// Decode one UTF-8 sequence starting at cs into *cp; return its byte length.
+// A NUL byte yields *cp = 0 and length 0.  Invalid input returns -1; *cp is
+// untouched for a bad lead byte but partially written for a bad continuation.
+// Overlong sequences, surrogates and invalid codepoints are not checked.
+// This code is meant to be inlined, if cp is unused it can be optimized away.
+static int
+u8decode(const char *cs, uint32_t *cp)
+{
+	const uint8_t *s = (uint8_t *)cs;
+
+	if (*s == 0)   { *cp = 0; return 0; }  // NUL: reports length 0, i.e. no progress
+	if (*s < 0x80) { *cp = *s; return 1; }  // plain ASCII
+	if (*s < 0xc0) { return -1; }  // 0x80..0xbf: stray continuation byte
+	if (*s < 0xe0) { *cp = *s & 0x1f; goto u2; }  // 2-byte lead, keep low 5 bits
+	if (*s < 0xf0) { *cp = *s & 0x0f; goto u3; }  // 3-byte lead, keep low 4 bits
+	if (*s < 0xf8) { *cp = *s & 0x07; goto u4; }  // 4-byte lead, keep low 3 bits
+	return -1;  // 0xf8..0xff is rejected as a lead byte
+
+u4:	if ((*++s & 0xc0) != 0x80) return -1;  *cp = (*cp << 6) | (*s & 0x3f);
+u3:	if ((*++s & 0xc0) != 0x80) return -1;  *cp = (*cp << 6) | (*s & 0x3f);
+u2:	if ((*++s & 0xc0) != 0x80) return -1;  *cp = (*cp << 6) | (*s & 0x3f);
+	return s - (uint8_t *)cs + 1;  // s sits on the last byte consumed
+}