Changeset View
Changeset View
Standalone View
Standalone View
head/contrib/less/charset.c
Show First 20 Lines • Show All 124 Lines • ▼ Show 20 Lines | |||||
* repetition of the letter. | * repetition of the letter. | ||||
* | * | ||||
* Each letter is one of: | * Each letter is one of: | ||||
* . normal character | * . normal character | ||||
* b binary character | * b binary character | ||||
* c control character | * c control character | ||||
*/ | */ | ||||
static void | static void | ||||
ichardef(s) | ichardef(char *s) | ||||
char *s; | |||||
{ | { | ||||
register char *cp; | char *cp; | ||||
register int n; | int n; | ||||
register char v; | char v; | ||||
n = 0; | n = 0; | ||||
v = 0; | v = 0; | ||||
cp = chardef; | cp = chardef; | ||||
while (*s != '\0') | while (*s != '\0') | ||||
{ | { | ||||
switch (*s++) | switch (*s++) | ||||
{ | { | ||||
Show All 35 Lines | while (cp < chardef + sizeof(chardef)) | ||||
*cp++ = v; | *cp++ = v; | ||||
} | } | ||||
/* | /* | ||||
* Define a charset, given a charset name. | * Define a charset, given a charset name. | ||||
* The valid charset names are listed in the "charsets" array. | * The valid charset names are listed in the "charsets" array. | ||||
*/ | */ | ||||
static int | static int | ||||
icharset(name, no_error) | icharset(char *name, int no_error) | ||||
register char *name; | |||||
int no_error; | |||||
{ | { | ||||
register struct charset *p; | struct charset *p; | ||||
register struct cs_alias *a; | struct cs_alias *a; | ||||
if (name == NULL || *name == '\0') | if (name == NULL || *name == '\0') | ||||
return (0); | return (0); | ||||
/* First see if the name is an alias. */ | /* First see if the name is an alias. */ | ||||
for (a = cs_aliases; a->name != NULL; a++) | for (a = cs_aliases; a->name != NULL; a++) | ||||
{ | { | ||||
if (strcmp(name, a->name) == 0) | if (strcmp(name, a->name) == 0) | ||||
Show All 21 Lines | icharset(char *name, int no_error) | ||||
return (0); | return (0); | ||||
} | } | ||||
#if HAVE_LOCALE
/*
 * Define a charset, given a locale name.
 *
 * Every slot of chardef[] is rewritten from the <ctype.h> classification
 * of its index:
 *   isprint(c)  -> 0 (normal character)
 *   iscntrl(c)  -> IS_CONTROL_CHAR
 *   otherwise   -> IS_BINARY_CHAR|IS_CONTROL_CHAR
 */
static void
ilocale(void)
{
	int c;

	/* sizeof yields an unsigned type; cast so the comparison stays signed. */
	for (c = 0; c < (int) sizeof(chardef); c++)
	{
		if (isprint(c))
			chardef[c] = 0;
		else if (iscntrl(c))
			chardef[c] = IS_CONTROL_CHAR;
		else
			chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
	}
}
#endif
/* | /* | ||||
* Define the printing format for control (or binary utf) chars. | * Define the printing format for control (or binary utf) chars. | ||||
*/ | */ | ||||
static void | static void | ||||
setbinfmt(s, fmtvarptr, default_fmt) | setbinfmt(char *s, char **fmtvarptr, char *default_fmt) | ||||
char *s; | |||||
char **fmtvarptr; | |||||
char *default_fmt; | |||||
{ | { | ||||
if (s && utf_mode) | if (s && utf_mode) | ||||
{ | { | ||||
/* It would be too hard to account for width otherwise. */ | /* It would be too hard to account for width otherwise. */ | ||||
char *t = s; | char *t = s; | ||||
while (*t) | while (*t) | ||||
{ | { | ||||
if (*t < ' ' || *t > '~') | if (*t < ' ' || *t > '~') | ||||
Show All 29 Lines | attr: | ||||
} | } | ||||
*fmtvarptr = s; | *fmtvarptr = s; | ||||
} | } | ||||
/* | /* | ||||
* | * | ||||
*/ | */ | ||||
static void | static void | ||||
set_charset() | set_charset(void) | ||||
{ | { | ||||
char *s; | char *s; | ||||
/* | /* | ||||
* See if environment variable LESSCHARSET is defined. | * See if environment variable LESSCHARSET is defined. | ||||
*/ | */ | ||||
s = lgetenv("LESSCHARSET"); | s = lgetenv("LESSCHARSET"); | ||||
if (icharset(s, 0)) | if (icharset(s, 0)) | ||||
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | |||||
#endif | #endif | ||||
#endif | #endif | ||||
} | } | ||||
/* | /* | ||||
* Initialize charset data structures. | * Initialize charset data structures. | ||||
*/ | */ | ||||
public void | public void | ||||
init_charset() | init_charset(void) | ||||
{ | { | ||||
char *s; | char *s; | ||||
#if HAVE_LOCALE | #if HAVE_LOCALE | ||||
setlocale(LC_ALL, ""); | setlocale(LC_ALL, ""); | ||||
#endif | #endif | ||||
set_charset(); | set_charset(); | ||||
s = lgetenv("LESSBINFMT"); | s = lgetenv("LESSBINFMT"); | ||||
setbinfmt(s, &binfmt, "*s<%02X>"); | setbinfmt(s, &binfmt, "*s<%02X>"); | ||||
s = lgetenv("LESSUTFBINFMT"); | s = lgetenv("LESSUTFBINFMT"); | ||||
setbinfmt(s, &utfbinfmt, "<U+%04lX>"); | setbinfmt(s, &utfbinfmt, "<U+%04lX>"); | ||||
} | } | ||||
/* | /* | ||||
* Is a given character a "binary" character? | * Is a given character a "binary" character? | ||||
*/ | */ | ||||
public int | public int | ||||
binary_char(c) | binary_char(LWCHAR c) | ||||
LWCHAR c; | |||||
{ | { | ||||
if (utf_mode) | if (utf_mode) | ||||
return (is_ubin_char(c)); | return (is_ubin_char(c)); | ||||
c &= 0377; | c &= 0377; | ||||
return (chardef[c] & IS_BINARY_CHAR); | return (chardef[c] & IS_BINARY_CHAR); | ||||
} | } | ||||
/* | /* | ||||
* Is a given character a "control" character? | * Is a given character a "control" character? | ||||
*/ | */ | ||||
public int | public int | ||||
control_char(c) | control_char(LWCHAR c) | ||||
LWCHAR c; | |||||
{ | { | ||||
c &= 0377; | c &= 0377; | ||||
return (chardef[c] & IS_CONTROL_CHAR); | return (chardef[c] & IS_CONTROL_CHAR); | ||||
} | } | ||||
/* | /* | ||||
* Return the printable form of a character. | * Return the printable form of a character. | ||||
* For example, in the "ascii" charset '\3' is printed as "^C". | * For example, in the "ascii" charset '\3' is printed as "^C". | ||||
*/ | */ | ||||
public char * | public char * | ||||
prchar(c) | prchar(LWCHAR c) | ||||
LWCHAR c; | |||||
{ | { | ||||
/* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */ | /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */ | ||||
static char buf[32]; | static char buf[32]; | ||||
c &= 0377; | c &= 0377; | ||||
if ((c < 128 || !utf_mode) && !control_char(c)) | if ((c < 128 || !utf_mode) && !control_char(c)) | ||||
SNPRINTF1(buf, sizeof(buf), "%c", (int) c); | SNPRINTF1(buf, sizeof(buf), "%c", (int) c); | ||||
else if (c == ESC) | else if (c == ESC) | ||||
Show All 18 Lines | else | ||||
SNPRINTF1(buf, sizeof(buf), binfmt, c); | SNPRINTF1(buf, sizeof(buf), binfmt, c); | ||||
return (buf); | return (buf); | ||||
} | } | ||||
/* | /* | ||||
* Return the printable form of a UTF-8 character. | * Return the printable form of a UTF-8 character. | ||||
*/ | */ | ||||
public char * | public char * | ||||
prutfchar(ch) | prutfchar(LWCHAR ch) | ||||
LWCHAR ch; | |||||
{ | { | ||||
static char buf[32]; | static char buf[32]; | ||||
if (ch == ESC) | if (ch == ESC) | ||||
strcpy(buf, "ESC"); | strcpy(buf, "ESC"); | ||||
else if (ch < 128 && control_char(ch)) | else if (ch < 128 && control_char(ch)) | ||||
{ | { | ||||
if (!control_char(ch ^ 0100)) | if (!control_char(ch ^ 0100)) | ||||
Show All 13 Lines | prutfchar(LWCHAR ch) | ||||
} | } | ||||
return (buf); | return (buf); | ||||
} | } | ||||
/*
 * Get the length of a UTF-8 character in bytes.
 *
 * ch is the lead byte of the sequence.  The count of leading 1 bits
 * determines the length:
 *   0xxxxxxx -> 1    110xxxxx -> 2    1110xxxx -> 3
 *   11110xxx -> 4    111110xx -> 5    1111110x -> 6
 * Any other byte (a trail byte 10xxxxxx, or 0xFE/0xFF) is not a valid
 * lead byte; 1 is returned for it.
 *
 * The masks keep only bits within the low byte, so the result is the same
 * whether plain char is signed or unsigned.
 */
public int
utf_len(char ch)
{
	if ((ch & 0x80) == 0)
		return 1;
	if ((ch & 0xE0) == 0xC0)
		return 2;
	if ((ch & 0xF0) == 0xE0)
		return 3;
	if ((ch & 0xF8) == 0xF0)
		return 4;
	if ((ch & 0xFC) == 0xF8)
		return 5;
	if ((ch & 0xFE) == 0xFC)
		return 6;
	/* Invalid UTF-8 encoding. */
	return 1;
}
/* | /* | ||||
* Does the parameter point to the lead byte of a well-formed UTF-8 character? | * Does the parameter point to the lead byte of a well-formed UTF-8 character? | ||||
*/ | */ | ||||
public int | public int | ||||
is_utf8_well_formed(s, slen) | is_utf8_well_formed(unsigned char *s, int slen) | ||||
unsigned char *s; | |||||
int slen; | |||||
{ | { | ||||
int i; | int i; | ||||
int len; | int len; | ||||
if (IS_UTF8_INVALID(s[0])) | if (IS_UTF8_INVALID(s[0])) | ||||
return (0); | return (0); | ||||
len = utf_len((char) s[0]); | len = utf_len((char) s[0]); | ||||
Show All 18 Lines | if (!IS_UTF8_TRAIL(s[i])) | ||||
return (0); | return (0); | ||||
return (1); | return (1); | ||||
} | } | ||||
/* | /* | ||||
* Return number of invalid UTF-8 sequences found in a buffer. | * Return number of invalid UTF-8 sequences found in a buffer. | ||||
*/ | */ | ||||
public int | public int | ||||
utf_bin_count(data, len) | utf_bin_count(unsigned char *data, int len) | ||||
unsigned char *data; | |||||
int len; | |||||
{ | { | ||||
int bin_count = 0; | int bin_count = 0; | ||||
while (len > 0) | while (len > 0) | ||||
{ | { | ||||
if (is_utf8_well_formed(data, len)) | if (is_utf8_well_formed(data, len)) | ||||
{ | { | ||||
int clen = utf_len(*data); | int clen = utf_len(*data); | ||||
data += clen; | data += clen; | ||||
Show All 10 Lines | utf_bin_count(unsigned char *data, int len) | ||||
} | } | ||||
return (bin_count); | return (bin_count); | ||||
} | } | ||||
/* | /* | ||||
* Get the value of a UTF-8 character. | * Get the value of a UTF-8 character. | ||||
*/ | */ | ||||
public LWCHAR | public LWCHAR | ||||
get_wchar(p) | get_wchar(constant char *p) | ||||
char *p; | |||||
{ | { | ||||
switch (utf_len(p[0])) | switch (utf_len(p[0])) | ||||
{ | { | ||||
case 1: | case 1: | ||||
default: | default: | ||||
/* 0xxxxxxx */ | /* 0xxxxxxx */ | ||||
return (LWCHAR) | return (LWCHAR) | ||||
(p[0] & 0xFF); | (p[0] & 0xFF); | ||||
Show All 34 Lines | return (LWCHAR) ( | ||||
(p[5] & 0x3F)); | (p[5] & 0x3F)); | ||||
} | } | ||||
} | } | ||||
/* | /* | ||||
* Store a character into a UTF-8 string. | * Store a character into a UTF-8 string. | ||||
*/ | */ | ||||
public void | public void | ||||
put_wchar(pp, ch) | put_wchar(char **pp, LWCHAR ch) | ||||
char **pp; | |||||
LWCHAR ch; | |||||
{ | { | ||||
if (!utf_mode || ch < 0x80) | if (!utf_mode || ch < 0x80) | ||||
{ | { | ||||
/* 0xxxxxxx */ | /* 0xxxxxxx */ | ||||
*(*pp)++ = (char) ch; | *(*pp)++ = (char) ch; | ||||
} else if (ch < 0x800) | } else if (ch < 0x800) | ||||
{ | { | ||||
/* 110xxxxx 10xxxxxx */ | /* 110xxxxx 10xxxxxx */ | ||||
Show All 31 Lines | if (!utf_mode || ch < 0x80) | ||||
*(*pp)++ = (char) (0x80 | (ch & 0x3F)); | *(*pp)++ = (char) (0x80 | (ch & 0x3F)); | ||||
} | } | ||||
} | } | ||||
/* | /* | ||||
* Step forward or backward one character in a string. | * Step forward or backward one character in a string. | ||||
*/ | */ | ||||
public LWCHAR | public LWCHAR | ||||
step_char(pp, dir, limit) | step_char(constant char **pp, signed int dir, constant char *limit) | ||||
char **pp; | |||||
signed int dir; | |||||
char *limit; | |||||
{ | { | ||||
LWCHAR ch; | LWCHAR ch; | ||||
int len; | int len; | ||||
char *p = *pp; | constant char *p = *pp; | ||||
if (!utf_mode) | if (!utf_mode) | ||||
{ | { | ||||
/* It's easy if chars are one byte. */ | /* It's easy if chars are one byte. */ | ||||
if (dir > 0) | if (dir > 0) | ||||
ch = (LWCHAR) ((p < limit) ? *p++ : 0); | ch = (LWCHAR) ((p < limit) ? *p++ : 0); | ||||
else | else | ||||
ch = (LWCHAR) ((p > limit) ? *--p : 0); | ch = (LWCHAR) ((p > limit) ? *--p : 0); | ||||
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines | |||||
/* comb_table is special pairs, not ranges. */ | /* comb_table is special pairs, not ranges. */ | ||||
static struct wchar_range comb_table[] = { | static struct wchar_range comb_table[] = { | ||||
{0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, | {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, | ||||
}; | }; | ||||
static int | static int | ||||
is_in_table(ch, table) | is_in_table(LWCHAR ch, struct wchar_range_table *table) | ||||
LWCHAR ch; | |||||
struct wchar_range_table *table; | |||||
{ | { | ||||
int hi; | int hi; | ||||
int lo; | int lo; | ||||
/* Binary search in the table. */ | /* Binary search in the table. */ | ||||
if (ch < table->table[0].first) | if (ch < table->table[0].first) | ||||
return 0; | return 0; | ||||
lo = 0; | lo = 0; | ||||
Show All 11 Lines | is_in_table(LWCHAR ch, struct wchar_range_table *table) | ||||
return 0; | return 0; | ||||
} | } | ||||
/* | /* | ||||
* Is a character a UTF-8 composing character? | * Is a character a UTF-8 composing character? | ||||
* If a composing character follows any char, the two combine into one glyph. | * If a composing character follows any char, the two combine into one glyph. | ||||
*/ | */ | ||||
public int | public int | ||||
is_composing_char(ch) | is_composing_char(LWCHAR ch) | ||||
LWCHAR ch; | |||||
{ | { | ||||
return is_in_table(ch, &compose_table); | return is_in_table(ch, &compose_table); | ||||
} | } | ||||
/* | /* | ||||
* Should this UTF-8 character be treated as binary? | * Should this UTF-8 character be treated as binary? | ||||
*/ | */ | ||||
public int | public int | ||||
is_ubin_char(ch) | is_ubin_char(LWCHAR ch) | ||||
LWCHAR ch; | |||||
{ | { | ||||
return is_in_table(ch, &ubin_table); | return is_in_table(ch, &ubin_table); | ||||
} | } | ||||
/* | /* | ||||
* Is this a double width UTF-8 character? | * Is this a double width UTF-8 character? | ||||
*/ | */ | ||||
public int | public int | ||||
is_wide_char(ch) | is_wide_char(LWCHAR ch) | ||||
LWCHAR ch; | |||||
{ | { | ||||
return is_in_table(ch, &wide_table); | return is_in_table(ch, &wide_table); | ||||
} | } | ||||
/* | /* | ||||
* Is a character a UTF-8 combining character? | * Is a character a UTF-8 combining character? | ||||
* A combining char acts like an ordinary char, but if it follows | * A combining char acts like an ordinary char, but if it follows | ||||
* a specific char (not any char), the two combine into one glyph. | * a specific char (not any char), the two combine into one glyph. | ||||
*/ | */ | ||||
public int | public int | ||||
is_combining_char(ch1, ch2) | is_combining_char(LWCHAR ch1, LWCHAR ch2) | ||||
LWCHAR ch1; | |||||
LWCHAR ch2; | |||||
{ | { | ||||
/* The table is small; use linear search. */ | /* The table is small; use linear search. */ | ||||
int i; | int i; | ||||
for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++) | for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++) | ||||
{ | { | ||||
if (ch1 == comb_table[i].first && | if (ch1 == comb_table[i].first && | ||||
ch2 == comb_table[i].last) | ch2 == comb_table[i].last) | ||||
return 1; | return 1; | ||||
} | } | ||||
return 0; | return 0; | ||||
} | } | ||||