1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Convert multibyte character to wide character.
4 Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2008.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
25 #if GNULIB_defined_mbstate_t
26 /* Implement mbrtowc() on top of mbtowc(). */
31 # include "localcharset.h"
/* The conversion state is handled as raw bytes (mbrtowc casts its
   mbstate_t pointer to char *), so check at compile time that an
   mbstate_t provides at least 4 bytes, the size of the fallback buffer
   declared below.  */
36 verify (sizeof (mbstate_t) >= 4);
/* File-scope state buffer that mbrtowc can use in place of a
   caller-supplied mbstate_t.  Having static storage duration, it starts
   out zero-filled.  */
38 static char internal_state[4];
/* Convert the multibyte character starting at S (at most N bytes) to a
   wide character stored through PWC, keeping the conversion state in *PS.
   This variant is built on top of mbtowc().
   Because mbtowc() cannot tell an invalid sequence from an incomplete
   one, the encoding-specific tests further down examine the first one to
   three bytes to decide whether they can still be the beginning of a
   valid character.
   NOTE(review): only parts of the body are visible in this excerpt;
   remarks worded with "presumably" or "assuming" depend on lines that are
   not shown and should be confirmed against the full source.  */
41 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
/* View the state object as an array of bytes.  */
43 char *pstate = (char *)ps;
/* Switch to the file-static buffer as the state (presumably only when the
   caller passed a null PS; the guarding condition is not visible here).  */
46 pstate = internal_state;
/* The first state byte is read as a count; presumably the number of input
   bytes buffered by earlier calls -- TODO confirm.  */
60 size_t nstate = pstate[0];
97 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
/* Calling mbtowc with a null string pointer puts its internal shift state
   back into the initial state.  */
98 mbtowc (NULL, NULL, 0);
/* Attempt the conversion of the m bytes at p; res is mbtowc's result.  */
101 int res = mbtowc (pwc, p, m);
/* Consistency check on mbtowc: a null wide character must have been
   stored exactly when the return value is 0.  This condition is true when
   the two disagree.  */
105 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
/* Compare the count taken from the state with the number of bytes mbtowc
   consumed, where a result that is not positive is counted as one byte.  */
107 if (nstate >= (res > 0 ? res : 1))
114 /* mbtowc does not distinguish between invalid and incomplete multibyte
115 sequences. But mbrtowc needs to make this distinction.
116 There are two possible approaches:
117 - Use iconv() and its return value.
118 - Use built-in knowledge about the possible encodings.
119 Given the low quality of implementation of iconv() on the systems that
120 lack mbrtowc(), we use the second approach.
121 The possible encodings are:
123 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
125 Use specialized code for each. */
/* With at least 4 bytes, or at least MB_CUR_MAX bytes, at hand the
   prefix tests below are skipped; they only run for 0 < m < 4.
   (Presumably this branch reports an invalid sequence; its body is not
   visible here.)  */
126 if (m >= 4 || m >= MB_CUR_MAX)
128 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
/* Name of the character encoding in use, as reported by
   locale_charset (); it selects the encoding-specific test below.  */
130 const char *encoding = locale_charset ();
132 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
134 /* Cf. unistr/u8-mblen.c. */
/* Bytes are read as unsigned char throughout, so that values >= 0x80
   compare as expected even where plain char is signed.  */
135 unsigned char c = (unsigned char) p[0];
150 unsigned char c2 = (unsigned char) p[1];
/* (c2 ^ 0x80) < 0x40 is true exactly for c2 in 0x80..0xBF, i.e. a UTF-8
   continuation byte.  Assuming this branch handles three-byte lead bytes
   0xE0..0xEF (the test on c is not visible here), the second test fails
   for overlong forms (0xE0 followed by a byte below 0xA0) and the third
   fails for the surrogate range (0xED followed by 0xA0 or above).  */
152 if ((c2 ^ 0x80) < 0x40
153 && (c >= 0xe1 || c2 >= 0xa0)
154 && (c != 0xed || c2 < 0xa0))
162 else /* m == 2 || m == 3 */
164 unsigned char c2 = (unsigned char) p[1];
/* c2 must again be a continuation byte.  Assuming a four-byte lead byte
   (0xF0 and up; the test on c is not visible here), the second test
   fails for overlong forms (0xF0 followed by a byte below 0x90) and the
   third fails for code points above U+10FFFF (lead byte above 0xF4, or
   0xF4 followed by 0x90 or above).  */
166 if ((c2 ^ 0x80) < 0x40
167 && (c >= 0xf1 || c2 >= 0x90)
168 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
174 unsigned char c3 = (unsigned char) p[2];
/* The third byte has to be a continuation byte (0x80..0xBF) as well.  */
176 if ((c3 ^ 0x80) < 0x40)
186 /* As a reference for this code, you can use the GNU libiconv
187 implementation. Look for uses of the RET_TOOFEW macro. */
189 if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
193 unsigned char c = (unsigned char) p[0];
/* First byte test for EUC-JP: 0xA1..0xFE, or one of 0x8E and 0x8F.  */
195 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
200 unsigned char c = (unsigned char) p[0];
204 unsigned char c2 = (unsigned char) p[1];
/* Second byte test: 0xA1..0xFE.  */
206 if (c2 >= 0xa1 && c2 < 0xff)
/* EUC-KR, GB2312 and BIG5 share a single test.  */
212 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
213 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
214 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
218 unsigned char c = (unsigned char) p[0];
/* First byte test: 0xA1..0xFE.  */
220 if (c >= 0xa1 && c < 0xff)
225 if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
229 unsigned char c = (unsigned char) p[0];
/* First byte test for EUC-TW: 0xA1..0xFE, or 0x8E.  */
231 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
234 else /* m == 2 || m == 3 */
236 unsigned char c = (unsigned char) p[0];
243 if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
247 unsigned char c = (unsigned char) p[0];
/* First byte test for GB18030: 0x90..0xE3 or 0xF8..0xFE.  */
249 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
252 else /* m == 2 || m == 3 */
254 unsigned char c = (unsigned char) p[0];
/* With more than one byte available, only first bytes 0x90..0xE3 are
   examined further.  */
256 if (c >= 0x90 && c <= 0xe3)
258 unsigned char c2 = (unsigned char) p[1];
/* Second byte test: 0x30..0x39 (the ASCII digits).  */
260 if (c2 >= 0x30 && c2 <= 0x39)
266 unsigned char c3 = (unsigned char) p[2];
/* Third byte test: 0x81..0xFE.  */
268 if (c3 >= 0x81 && c3 <= 0xfe)
276 if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
280 unsigned char c = (unsigned char) p[0];
/* First byte test for SJIS: 0x81..0x9F, 0xE0..0xEA or 0xF0..0xF9.  */
282 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
283 || (c >= 0xf0 && c <= 0xf9))
289 /* An unknown multibyte encoding. */
296 /* Here 0 <= k < m < 4. */
312 /* The conversion state is undefined, says POSIX. */
319 /* Override the system's mbrtowc() function. */
/* Replacement for mbrtowc() with the same parameters.  It calls
   mbrtowc() itself and adds workarounds that are compiled in depending
   on the MBRTOWC_NULL_ARG_BUG, MBRTOWC_RETVAL_BUG and
   MBRTOWC_NUL_RETVAL_BUG macros.
   NOTE(review): only parts of the body are visible in this excerpt;
   remarks worded with "presumably" depend on lines that are not shown
   and should be confirmed against the full source.  */
324 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
/* Section compiled when either of these two workarounds is enabled.  */
326 # if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
335 # if MBRTOWC_RETVAL_BUG
/* State object owned by this wrapper; being static it is zero-initialized
   and keeps its contents from one call to the next.  */
337 static mbstate_t internal_state;
339 /* Override mbrtowc's internal state. We can not call mbsinit() on the
340 hidden internal state, but we can call it on our variable. */
342 ps = &internal_state;
346 /* Parse the rest of the multibyte character byte for byte. */
/* Hand the input to mbrtowc one byte per call until the bytes run out or
   one of the branches below ends the loop.  */
348 for (; n > 0; s++, n--)
/* wc receives the converted character; its declaration is not visible
   in this excerpt.  */
351 size_t ret = mbrtowc (&wc, s, 1, ps);
/* (size_t)(-1) is mbrtowc's result for an invalid sequence.  */
353 if (ret == (size_t)(-1))
/* (size_t)(-2) means the character is still incomplete; any other result
   means a character has been produced.  */
356 if (ret != (size_t)(-2))
358 /* The multibyte character has been completed. */
/* A null wide character yields 0, anything else yields count (presumably
   the number of bytes consumed; its declaration and updates are not
   visible here).  */
361 return (wc == 0 ? 0 : count);
/* Judging by the macro name, a workaround for a wrong return value when
   the converted character is NUL.  */
369 # if MBRTOWC_NUL_RETVAL_BUG
372 size_t ret = mbrtowc (&wc, s, n, ps);
/* True when a character was converted, i.e. the result is neither the
   error value nor the "incomplete" value.  */
374 if (ret != (size_t)(-1) && ret != (size_t)(-2))
/* Plain delegation to mbrtowc with the arguments as they stand at this
   point.  */
384 return mbrtowc (pwc, s, n, ps);