overhaul utf8decode()

this changes the utf8decode function to:

* report when an error occurs
* report how many bytes to advance on error

these will be useful in the next commit to render invalid utf8
sequences.

the new implementation is also shorter and more direct.
This commit is contained in:
NRK 2024-07-04 21:25:37 +00:00 committed by Hiltjo Posthuma
parent 7be720cc88
commit 51e32d49b5

76
drw.c
View File

@ -9,54 +9,40 @@
#include "util.h" #include "util.h"
#define UTF_INVALID 0xFFFD #define UTF_INVALID 0xFFFD
#define UTF_SIZ 4
static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0}; static int
static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8}; utf8decode(const char *s_in, long *u, int *err)
static const long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000};
static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
static long
utf8decodebyte(const char c, size_t *i)
{ {
for (*i = 0; *i < (UTF_SIZ + 1); ++(*i)) static const unsigned char lens[] = {
if (((unsigned char)c & utfmask[*i]) == utfbyte[*i]) /* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
return (unsigned char)c & ~utfmask[*i]; /* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0, /* invalid */
return 0; /* 110XX */ 2, 2, 2, 2,
} /* 1110X */ 3, 3,
/* 11110 */ 4,
static size_t /* 11111 */ 0, /* invalid */
utf8validate(long *u, size_t i) };
{ static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF)) static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
*u = UTF_INVALID;
for (i = 1; *u > utfmax[i]; ++i)
;
return i;
}
static size_t
utf8decode(const char *c, long *u, size_t clen)
{
size_t i, j, len, type;
long udecoded;
const unsigned char *s = (const unsigned char *)s_in;
int len = lens[*s >> 3];
*u = UTF_INVALID; *u = UTF_INVALID;
if (!clen) *err = 1;
return 0; if (len == 0)
udecoded = utf8decodebyte(c[0], &len);
if (!BETWEEN(len, 1, UTF_SIZ))
return 1; return 1;
for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
if (type)
return j;
}
if (j < len)
return 0;
*u = udecoded;
utf8validate(u, len);
long cp = s[0] & leading_mask[len - 1];
for (int i = 1; i < len; ++i) {
if (s[i] == '\0' || (s[i] & 0xC0) != 0x80)
return i;
cp = (cp << 6) | (s[i] & 0x3F);
}
/* out of range, surrogate, overlong encoding */
if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1])
return len;
*err = 0;
*u = cp;
return len; return len;
} }
@ -242,7 +228,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1; unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1;
XftDraw *d = NULL; XftDraw *d = NULL;
Fnt *usedfont, *curfont, *nextfont; Fnt *usedfont, *curfont, *nextfont;
int utf8strlen, utf8charlen, render = x || y || w || h; int utf8strlen, utf8charlen, utf8err, render = x || y || w || h;
long utf8codepoint = 0; long utf8codepoint = 0;
const char *utf8str; const char *utf8str;
FcCharSet *fccharset; FcCharSet *fccharset;
@ -272,11 +258,11 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
if (!ellipsis_width && render) if (!ellipsis_width && render)
ellipsis_width = drw_fontset_getwidth(drw, "..."); ellipsis_width = drw_fontset_getwidth(drw, "...");
while (1) { while (1) {
ew = ellipsis_len = utf8strlen = 0; ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
utf8str = text; utf8str = text;
nextfont = NULL; nextfont = NULL;
while (*text) { while (*text) {
utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ); utf8charlen = utf8decode(text, &utf8codepoint, &utf8err);
for (curfont = drw->fonts; curfont; curfont = curfont->next) { for (curfont = drw->fonts; curfont; curfont = curfont->next) {
charexists = charexists || XftCharExists(drw->dpy, curfont->xfont, utf8codepoint); charexists = charexists || XftCharExists(drw->dpy, curfont->xfont, utf8codepoint);
if (charexists) { if (charexists) {