1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Convert multibyte character to wide character.
4 Copyright (C) 1999-2002, 2005-2013 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2008.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
25 #if GNULIB_defined_mbstate_t
26 /* Implement mbrtowc() on top of mbtowc(). */
31 # include "localcharset.h"
/* Compile-time check: mbstate_t must provide at least the 4 bytes that the
   state buffer below has.  */
36 verify (sizeof (mbstate_t) >= 4);
/* File-static 4-byte state buffer; mbrtowc() points pstate at it in place of
   the caller's object (presumably when ps is NULL -- the guarding condition
   is not visible in this excerpt, TODO confirm).  */
38 static char internal_state[4];
/* mbrtowc() implemented on top of mbtowc().
   pwc, s and n are the destination wide character, the input bytes and the
   input length; ps is the conversion state, accessed below as raw chars.
   NOTE(review): this excerpt omits many source lines (braces, returns and
   most branch bodies), so the comments added below describe only the
   visible conditions; what each branch does must be confirmed against the
   full file.  */
41 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
/* View the caller's state object as an array of chars.  */
43 char *pstate = (char *)ps;
/* Switch to the file-static buffer instead (presumably only when ps is
   NULL -- the guard is not visible here, TODO confirm).  */
58 pstate = internal_state;
/* Byte 0 of the state is read as a count; presumably the number of bytes
   carried over from an earlier call -- TODO confirm.  */
61 size_t nstate = pstate[0];
97 # if __GLIBC__ || defined __UCLIBC__
98 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
99 mbtowc (NULL, NULL, 0);
/* Attempt the conversion of m bytes starting at p (p and m are set up in
   lines not shown in this excerpt).  */
102 int res = mbtowc (pwc, p, m);
/* True when the stored wide character and mbtowc's result disagree about
   whether a null character was converted.  */
106 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
/* Compare the count taken from the state with the number of bytes consumed,
   treating a result <= 0 as one byte.  */
108 if (nstate >= (res > 0 ? res : 1))
115 /* mbtowc does not distinguish between invalid and incomplete multibyte
116 sequences. But mbrtowc needs to make this distinction.
117 There are two possible approaches:
118 - Use iconv() and its return value.
119 - Use built-in knowledge about the possible encodings.
120 Given the low quality of implementation of iconv() on the systems that
121 lack mbrtowc(), we use the second approach.
122 The possible encodings are:
124 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
126 Use specialized code for each. */
/* With 4 or more bytes, or at least MB_CUR_MAX bytes, already available the
   per-encoding analysis below is bypassed (presumably the input is treated
   as invalid -- the branch body is not visible here).  */
127 if (m >= 4 || m >= MB_CUR_MAX)
129 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
/* Name of the locale's character encoding, used to pick the byte-pattern
   checks that follow.  */
131 const char *encoding = locale_charset ();
133 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
135 /* Cf. unistr/u8-mblen.c. */
136 unsigned char c = (unsigned char) p[0];
/* Second byte: (c2 ^ 0x80) < 0x40 holds exactly for 0x80..0xBF.  In
   addition c2 must be >= 0xA0 when c is below 0xE1, and < 0xA0 when c is
   0xED (these look like UTF-8's overlong and surrogate exclusions).  */
151 unsigned char c2 = (unsigned char) p[1];
153 if ((c2 ^ 0x80) < 0x40
154 && (c >= 0xe1 || c2 >= 0xa0)
155 && (c != 0xed || c2 < 0xa0))
163 else /* m == 2 || m == 3 */
/* Second byte: must be 0x80..0xBF, must be >= 0x90 when c is below 0xF1,
   and c may not exceed 0xF4 (with c2 < 0x90 when c is exactly 0xF4).  */
165 unsigned char c2 = (unsigned char) p[1];
167 if ((c2 ^ 0x80) < 0x40
168 && (c >= 0xf1 || c2 >= 0x90)
169 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
/* Third byte: must be 0x80..0xBF.  */
175 unsigned char c3 = (unsigned char) p[2];
177 if ((c3 ^ 0x80) < 0x40)
187 /* As a reference for this code, you can use the GNU libiconv
188 implementation. Look for uses of the RET_TOOFEW macro. */
190 if (STREQ_OPT (encoding,
191 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
/* First byte accepted when it is 0xA1..0xFE, 0x8E or 0x8F.  */
195 unsigned char c = (unsigned char) p[0];
197 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
202 unsigned char c = (unsigned char) p[0];
/* Second byte accepted when it is 0xA1..0xFE (the test applied to c in
   this branch is not visible here).  */
206 unsigned char c2 = (unsigned char) p[1];
208 if (c2 >= 0xa1 && c2 < 0xff)
214 if (STREQ_OPT (encoding,
215 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
216 || STREQ_OPT (encoding,
217 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
218 || STREQ_OPT (encoding,
219 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
/* The three encodings above share one check: first byte 0xA1..0xFE.  */
223 unsigned char c = (unsigned char) p[0];
225 if (c >= 0xa1 && c < 0xff)
230 if (STREQ_OPT (encoding,
231 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
/* First byte accepted when it is 0xA1..0xFE or 0x8E.  */
235 unsigned char c = (unsigned char) p[0];
237 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
240 else /* m == 2 || m == 3 */
/* The test applied to c in this branch is not visible in this excerpt.  */
242 unsigned char c = (unsigned char) p[0];
249 if (STREQ_OPT (encoding,
250 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
/* First byte accepted when it is 0x90..0xE3 or 0xF8..0xFE.  */
254 unsigned char c = (unsigned char) p[0];
256 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
259 else /* m == 2 || m == 3 */
/* With more bytes available only first bytes 0x90..0xE3 are examined
   further: second byte 0x30..0x39, then third byte 0x81..0xFE.  */
261 unsigned char c = (unsigned char) p[0];
263 if (c >= 0x90 && c <= 0xe3)
265 unsigned char c2 = (unsigned char) p[1];
267 if (c2 >= 0x30 && c2 <= 0x39)
273 unsigned char c3 = (unsigned char) p[2];
275 if (c3 >= 0x81 && c3 <= 0xfe)
283 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
/* First byte accepted when it is 0x81..0x9F, 0xE0..0xEA or 0xF0..0xF9.  */
287 unsigned char c = (unsigned char) p[0];
289 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
290 || (c >= 0xf0 && c <= 0xf9))
296 /* An unknown multibyte encoding. */
303 /* Here 0 <= k < m < 4. */
319 /* The conversion state is undefined, says POSIX. */
326 /* Override the system's mbrtowc() function. */
/* Wrapper around mbrtowc() whose workarounds are selected at compile time
   by the MBRTOWC_* macros tested below; the last visible statement simply
   forwards all four arguments to mbrtowc().
   NOTE(review): this excerpt omits many source lines (braces, #endif lines,
   guards and parts of the loop body), so the comments added below describe
   only what the visible lines show.  */
331 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
333 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG
342 # if MBRTOWC_RETVAL_BUG
/* Function-local static state object that replaces ps below (presumably
   only when the caller passed NULL -- the guard is not visible here, TODO
   confirm).  */
344 static mbstate_t internal_state;
346 /* Override mbrtowc's internal state. We cannot call mbsinit() on the
347 hidden internal state, but we can call it on our variable. */
349 ps = &internal_state;
353 /* Parse the rest of the multibyte character byte for byte. */
355 for (; n > 0; s++, n--)
/* Feed exactly one byte per call; wc is declared in a line not shown.  */
358 size_t ret = mbrtowc (&wc, s, 1, ps);
/* (size_t)(-1) is mbrtowc's encoding-error return value.  */
360 if (ret == (size_t)(-1))
/* (size_t)(-2) means the sequence is still incomplete; any other value
   ends the character.  */
363 if (ret != (size_t)(-2))
365 /* The multibyte character has been completed. */
/* A null wide character yields 0, otherwise count is returned (count is
   maintained in lines not shown in this excerpt).  */
368 return (wc == 0 ? 0 : count);
376 # if MBRTOWC_NUL_RETVAL_BUG
379 size_t ret = mbrtowc (&wc, s, n, ps);
/* Taken only for a successful conversion, i.e. neither the error nor the
   incomplete-sequence return value; the body is not visible here.  */
381 if (ret != (size_t)(-1) && ret != (size_t)(-2))
392 # if MBRTOWC_NULL_ARG1_BUG
/* Plain forwarding call with the caller's arguments.  */
399 return mbrtowc (pwc, s, n, ps);