diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c
--- a/sys/kern/tty_ttydisc.c
+++ b/sys/kern/tty_ttydisc.c
@@ -43,6 +43,9 @@
 #include <sys/uio.h>
 #include <sys/vnode.h>
 
+#include <teken/teken.h>
+#include <teken/teken_wcwidth.h>
+
 /*
  * Standard TTYDISC `termios' line discipline.
  */
@@ -78,8 +81,13 @@
 /* Character is alphanumeric. */
 #define CTL_ALNUM(c)	(((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+/* Byte is part of a multi-byte UTF-8 sequence (high bit set). */
+#define CTL_UTF8(c) (!!((c) & 0x80))
+/* Byte is a UTF-8 continuation byte (10xxxxxx). */
+#define CTL_UTF8_CONT(c) (((c) & 0xc0) == 0x80)
 
 #define	TTY_STACKBUF	256
+#define	UTF8_STACKBUF	4
 
 void
 ttydisc_open(struct tty *tp)
@@ -800,6 +808,86 @@
 			ttyoutq_write_nofrag(&tp->t_outq,
 			    "\b\b\b\b\b\b\b\b", tablen);
 			return (0);
+		} else if ((tp->t_termios.c_iflag & IUTF8) != 0 &&
+		    CTL_UTF8(c)) {
+			uint8_t bytes[UTF8_STACKBUF] = { 0 };
+			int curidx = UTF8_STACKBUF - 1, cwidth = 1,
+			    nb = 0;
+			teken_char_t codepoint;
+
+			/* Save current byte. */
+			bytes[curidx] = c;
+			curidx--;
+			nb++;
+			/*
+			 * Loop back through inq until we hit the
+			 * leading byte.
+			 */
+			while (CTL_UTF8_CONT(c) && nb < UTF8_STACKBUF) {
+				/*
+				 * Stop once no further byte can be
+				 * peeked; there is nothing to unput.
+				 */
+				if (ttyinq_peekchar(&tp->t_inq, &c,
+				    &quote) != 0)
+					break;
+				ttyinq_unputchar(&tp->t_inq);
+				bytes[curidx] = c;
+				curidx--;
+				nb++;
+			}
+			/*
+			 * Shift array so that the leading
+			 * byte ends up at idx 0.
+			 */
+			if (nb < UTF8_STACKBUF)
+				memmove(&bytes[0], &bytes[curidx + 1],
+				    nb * sizeof(uint8_t));
+			/* Check for malformed UTF8 characters. */
+			if (nb == UTF8_STACKBUF &&
+			    CTL_UTF8_CONT(bytes[0])) {
+				/*
+				 * Place all bytes back into the inq and
+				 * delete the last byte only.
+				 */
+				ttyinq_write(&tp->t_inq, bytes,
+				    UTF8_STACKBUF, 0);
+				ttyinq_unputchar(&tp->t_inq);
+			} else {
+				/* Find codepoint and width. */
+				codepoint =
+				    teken_utf8_bytes_to_codepoint(bytes,
+				    nb);
+				if (codepoint !=
+				    TEKEN_UTF8_INVALID_CODEPOINT)
+					cwidth = teken_wcwidth(
+					    codepoint);
+				else
+					cwidth = -1;
+				if (cwidth < 0) {
+					/*
+					 * Undecodable or non-printable:
+					 * place all bytes back into the
+					 * inq, delete the last byte only
+					 * and erase a single column.
+					 */
+					cwidth = 1;
+					ttyinq_write(&tp->t_inq, bytes,
+					    nb, 0);
+					ttyinq_unputchar(&tp->t_inq);
+				}
+			}
+			tp->t_column -= cwidth;
+			/*
+			 * Delete character by punching
+			 * 'cwidth' spaces over it.
+			 */
+			if (cwidth == 1)
+				ttyoutq_write_nofrag(&tp->t_outq,
+				    "\b \b", 3);
+			else if (cwidth == 2)
+				ttyoutq_write_nofrag(&tp->t_outq,
+				    "\b\b  \b\b", 6);
 		} else {
 			/*
 			 * Remove a regular character by
diff --git a/sys/teken/teken_wcwidth.h b/sys/teken/teken_wcwidth.h
--- a/sys/teken/teken_wcwidth.h
+++ b/sys/teken/teken_wcwidth.h
@@ -8,6 +8,8 @@
  * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
  */
 
+#define	TEKEN_UTF8_INVALID_CODEPOINT	((teken_char_t)-1)
+
 struct interval {
 	teken_char_t first;
 	teken_char_t last;
@@ -116,3 +118,64 @@
 	    (ucs >= 0x20000 && ucs <= 0x2fffd) ||
 	    (ucs >= 0x30000 && ucs <= 0x3fffd)));
 }
+
+/*
+ * Converts a UTF-8 byte sequence to a codepoint as specified in
+ * https://datatracker.ietf.org/doc/html/rfc3629#section-3 . The function
+ * expects the 'bytes' array to start with the leading character and
+ * 'nbytes' to be the number of bytes in the sequence (1 to 4).
+ *
+ * Returns TEKEN_UTF8_INVALID_CODEPOINT if the leading byte does not
+ * announce exactly 'nbytes' bytes or if a trailing byte is not a
+ * continuation byte.
+ */
+static teken_char_t
+teken_utf8_bytes_to_codepoint(const uint8_t bytes[4], int nbytes)
+{
+	int i, len;
+
+	/*
+	 * The leading byte encodes the length of the sequence:
+	 *
+	 * 0xxxxxxx -> 1 byte
+	 * 110xxxxx -> 2 bytes
+	 * 1110xxxx -> 3 bytes
+	 * 11110xxx -> 4 bytes
+	 *
+	 * Any other pattern (10xxxxxx, 11111xxx) cannot start a sequence.
+	 */
+	if ((bytes[0] & 0x80) == 0x00)
+		len = 1;
+	else if ((bytes[0] & 0xe0) == 0xc0)
+		len = 2;
+	else if ((bytes[0] & 0xf0) == 0xe0)
+		len = 3;
+	else if ((bytes[0] & 0xf8) == 0xf0)
+		len = 4;
+	else
+		return (TEKEN_UTF8_INVALID_CODEPOINT);
+	if (len != nbytes)
+		return (TEKEN_UTF8_INVALID_CODEPOINT);
+
+	/* Every trailing byte must be a continuation byte (10xxxxxx). */
+	for (i = 1; i < nbytes; i++) {
+		if ((bytes[i] & 0xc0) != 0x80)
+			return (TEKEN_UTF8_INVALID_CODEPOINT);
+	}
+
+	switch (nbytes) {
+	case 1:
+		return (bytes[0] & 0x7f);
+	case 2:
+		/* A two-byte leader carries five payload bits. */
+		return ((bytes[0] & 0x1f) << 6 | (bytes[1] & 0x3f));
+	case 3:
+		return ((bytes[0] & 0xf) << 12 | (bytes[1] & 0x3f) << 6 |
+		    (bytes[2] & 0x3f));
+	case 4:
+		return ((bytes[0] & 0x7) << 18 | (bytes[1] & 0x3f) << 12 |
+		    (bytes[2] & 0x3f) << 6 | (bytes[3] & 0x3f));
+	default:
+		return (TEKEN_UTF8_INVALID_CODEPOINT);
+	}
+}