git.gag.com Git - debian/tar/blob - gnu/mbrtowc.c

   1 /* -*- buffer-read-only: t -*- vi: set ro: */
   2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   3 /* Convert multibyte character to wide character.
   4    Copyright (C) 1999-2002, 2005-2011 Free Software Foundation, Inc.
   5    Written by Bruno Haible <bruno@clisp.org>, 2008.
   6
   7    This program is free software: you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 #include <config.h>
  21
  22 /* Specification.  */
  23 #include <wchar.h>
  24
  25 #if GNULIB_defined_mbstate_t
  26 /* Implement mbrtowc() on top of mbtowc().  */
  27
  28 # include <errno.h>
  29 # include <stdlib.h>
  30
  31 # include "localcharset.h"
  32 # include "streq.h"
  33 # include "verify.h"
  34
  35
  36 verify (sizeof (mbstate_t) >= 4);
  37
  38 static char internal_state[4];
  39
  40 size_t
  41 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
  42 {
  43   char *pstate = (char *)ps;
  44
  45   if (s == NULL)
  46     {
  47       pwc = NULL;
  48       s = "";
  49       n = 1;
  50     }
  51
  52   if (n == 0)
  53     return (size_t)(-2);
  54
  55   /* Here n > 0.  */
  56
  57   if (pstate == NULL)
  58     pstate = internal_state;
  59
  60   {
  61     size_t nstate = pstate[0];
  62     char buf[4];
  63     const char *p;
  64     size_t m;
  65
  66     switch (nstate)
  67       {
  68       case 0:
  69         p = s;
  70         m = n;
  71         break;
  72       case 3:
  73         buf[2] = pstate[3];
  74         /*FALLTHROUGH*/
  75       case 2:
  76         buf[1] = pstate[2];
  77         /*FALLTHROUGH*/
  78       case 1:
  79         buf[0] = pstate[1];
  80         p = buf;
  81         m = nstate;
  82         buf[m++] = s[0];
  83         if (n >= 2 && m < 4)
  84           {
  85             buf[m++] = s[1];
  86             if (n >= 3 && m < 4)
  87               buf[m++] = s[2];
  88           }
  89         break;
  90       default:
  91         errno = EINVAL;
  92         return (size_t)(-1);
  93       }
  94
  95     /* Here m > 0.  */
  96
  97 # if __GLIBC__ || defined __UCLIBC__
  98     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
  99     mbtowc (NULL, NULL, 0);
 100 # endif
 101     {
 102       int res = mbtowc (pwc, p, m);
 103
 104       if (res >= 0)
 105         {
 106           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
 107             abort ();
 108           if (nstate >= (res > 0 ? res : 1))
 109             abort ();
 110           res -= nstate;
 111           pstate[0] = 0;
 112           return res;
 113         }
 114
 115       /* mbtowc does not distinguish between invalid and incomplete multibyte
 116          sequences.  But mbrtowc needs to make this distinction.
 117          There are two possible approaches:
 118            - Use iconv() and its return value.
 119            - Use built-in knowledge about the possible encodings.
 120          Given the low quality of implementation of iconv() on the systems that
 121          lack mbrtowc(), we use the second approach.
 122          The possible encodings are:
 123            - 8-bit encodings,
 124            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
 125            - UTF-8.
 126          Use specialized code for each.  */
 127       if (m >= 4 || m >= MB_CUR_MAX)
 128         goto invalid;
 129       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
 130       {
 131         const char *encoding = locale_charset ();
 132
 133         if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
 134           {
 135             /* Cf. unistr/u8-mblen.c.  */
 136             unsigned char c = (unsigned char) p[0];
 137
 138             if (c >= 0xc2)
 139               {
 140                 if (c < 0xe0)
 141                   {
 142                     if (m == 1)
 143                       goto incomplete;
 144                   }
 145                 else if (c < 0xf0)
 146                   {
 147                     if (m == 1)
 148                       goto incomplete;
 149                     if (m == 2)
 150                       {
 151                         unsigned char c2 = (unsigned char) p[1];
 152
 153                         if ((c2 ^ 0x80) < 0x40
 154                             && (c >= 0xe1 || c2 >= 0xa0)
 155                             && (c != 0xed || c2 < 0xa0))
 156                           goto incomplete;
 157                       }
 158                   }
 159                 else if (c <= 0xf4)
 160                   {
 161                     if (m == 1)
 162                       goto incomplete;
 163                     else /* m == 2 || m == 3 */
 164                       {
 165                         unsigned char c2 = (unsigned char) p[1];
 166
 167                         if ((c2 ^ 0x80) < 0x40
 168                             && (c >= 0xf1 || c2 >= 0x90)
 169                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
 170                           {
 171                             if (m == 2)
 172                               goto incomplete;
 173                             else /* m == 3 */
 174                               {
 175                                 unsigned char c3 = (unsigned char) p[2];
 176
 177                                 if ((c3 ^ 0x80) < 0x40)
 178                                   goto incomplete;
 179                               }
 180                           }
 181                       }
 182                   }
 183               }
 184             goto invalid;
 185           }
 186
 187         /* As a reference for this code, you can use the GNU libiconv
 188            implementation.  Look for uses of the RET_TOOFEW macro.  */
 189
 190         if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
 191           {
 192             if (m == 1)
 193               {
 194                 unsigned char c = (unsigned char) p[0];
 195
 196                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
 197                   goto incomplete;
 198               }
 199             if (m == 2)
 200               {
 201                 unsigned char c = (unsigned char) p[0];
 202
 203                 if (c == 0x8f)
 204                   {
 205                     unsigned char c2 = (unsigned char) p[1];
 206
 207                     if (c2 >= 0xa1 && c2 < 0xff)
 208                       goto incomplete;
 209                   }
 210               }
 211             goto invalid;
 212           }
 213         if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
 214             || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
 215             || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
 216           {
 217             if (m == 1)
 218               {
 219                 unsigned char c = (unsigned char) p[0];
 220
 221                 if (c >= 0xa1 && c < 0xff)
 222                   goto incomplete;
 223               }
 224             goto invalid;
 225           }
 226         if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
 227           {
 228             if (m == 1)
 229               {
 230                 unsigned char c = (unsigned char) p[0];
 231
 232                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
 233                   goto incomplete;
 234               }
 235             else /* m == 2 || m == 3 */
 236               {
 237                 unsigned char c = (unsigned char) p[0];
 238
 239                 if (c == 0x8e)
 240                   goto incomplete;
 241               }
 242             goto invalid;
 243           }
 244         if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
 245           {
 246             if (m == 1)
 247               {
 248                 unsigned char c = (unsigned char) p[0];
 249
 250                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
 251                   goto incomplete;
 252               }
 253             else /* m == 2 || m == 3 */
 254               {
 255                 unsigned char c = (unsigned char) p[0];
 256
 257                 if (c >= 0x90 && c <= 0xe3)
 258                   {
 259                     unsigned char c2 = (unsigned char) p[1];
 260
 261                     if (c2 >= 0x30 && c2 <= 0x39)
 262                       {
 263                         if (m == 2)
 264                           goto incomplete;
 265                         else /* m == 3 */
 266                           {
 267                             unsigned char c3 = (unsigned char) p[2];
 268
 269                             if (c3 >= 0x81 && c3 <= 0xfe)
 270                               goto incomplete;
 271                           }
 272                       }
 273                   }
 274               }
 275             goto invalid;
 276           }
 277         if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
 278           {
 279             if (m == 1)
 280               {
 281                 unsigned char c = (unsigned char) p[0];
 282
 283                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
 284                     || (c >= 0xf0 && c <= 0xf9))
 285                   goto incomplete;
 286               }
 287             goto invalid;
 288           }
 289
 290         /* An unknown multibyte encoding.  */
 291         goto incomplete;
 292       }
 293
 294      incomplete:
 295       {
 296         size_t k = nstate;
 297         /* Here 0 <= k < m < 4.  */
 298         pstate[++k] = s[0];
 299         if (k < m)
 300           {
 301             pstate[++k] = s[1];
 302             if (k < m)
 303               pstate[++k] = s[2];
 304           }
 305         if (k != m)
 306           abort ();
 307       }
 308       pstate[0] = m;
 309       return (size_t)(-2);
 310
 311      invalid:
 312       errno = EILSEQ;
 313       /* The conversion state is undefined, says POSIX.  */
 314       return (size_t)(-1);
 315     }
 316   }
 317 }
 318
 319 #else
 320 /* Override the system's mbrtowc() function.  */
 321
 322 # undef mbrtowc
 323
 324 size_t
 325 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 326 {
 327 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG
 328   if (s == NULL)
 329     {
 330       pwc = NULL;
 331       s = "";
 332       n = 1;
 333     }
 334 # endif
 335
 336 # if MBRTOWC_RETVAL_BUG
 337   {
 338     static mbstate_t internal_state;
 339
 340     /* Override mbrtowc's internal state.  We can not call mbsinit() on the
 341        hidden internal state, but we can call it on our variable.  */
 342     if (ps == NULL)
 343       ps = &internal_state;
 344
 345     if (!mbsinit (ps))
 346       {
 347         /* Parse the rest of the multibyte character byte for byte.  */
 348         size_t count = 0;
 349         for (; n > 0; s++, n--)
 350           {
 351             wchar_t wc;
 352             size_t ret = mbrtowc (&wc, s, 1, ps);
 353
 354             if (ret == (size_t)(-1))
 355               return (size_t)(-1);
 356             count++;
 357             if (ret != (size_t)(-2))
 358               {
 359                 /* The multibyte character has been completed.  */
 360                 if (pwc != NULL)
 361                   *pwc = wc;
 362                 return (wc == 0 ? 0 : count);
 363               }
 364           }
 365         return (size_t)(-2);
 366       }
 367   }
 368 # endif
 369
 370 # if MBRTOWC_NUL_RETVAL_BUG
 371   {
 372     wchar_t wc;
 373     size_t ret = mbrtowc (&wc, s, n, ps);
 374
 375     if (ret != (size_t)(-1) && ret != (size_t)(-2))
 376       {
 377         if (pwc != NULL)
 378           *pwc = wc;
 379         if (wc == 0)
 380           ret = 0;
 381       }
 382     return ret;
 383   }
 384 # else
 385   {
 386 #   if MBRTOWC_NULL_ARG1_BUG
 387     wchar_t dummy;
 388
 389     if (pwc == NULL)
 390       pwc = &dummy;
 391 #   endif
 392
 393     return mbrtowc (pwc, s, n, ps);
 394   }
 395 # endif
 396 }
 397
 398 #endif