1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Determine a canonical name for the current locale's character encoding.
5 Copyright (C) 2000-2006, 2008-2013 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, see <http://www.gnu.org/licenses/>. */
20 /* Written by Bruno Haible <bruno@clisp.org>. */
25 #include "localcharset.h"
33 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
34 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
37 #if defined _WIN32 || defined __WIN32__
38 # define WINDOWS_NATIVE
42 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
48 #if !defined WINDOWS_NATIVE
50 # if HAVE_LANGINFO_CODESET
51 # include <langinfo.h>
53 # if 0 /* see comment below */
58 # define WIN32_LEAN_AND_MEAN
61 #elif defined WINDOWS_NATIVE
62 # define WIN32_LEAN_AND_MEAN
70 #if ENABLE_RELOCATABLE
71 # include "relocatable.h"
73 # define relocate(pathname) (pathname)
78 # include "configmake.h"
81 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
86 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
87 /* Native Windows, Cygwin, OS/2, DOS */
88 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
91 #ifndef DIRECTORY_SEPARATOR
92 # define DIRECTORY_SEPARATOR '/'
96 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
99 #if HAVE_DECL_GETC_UNLOCKED
101 # define getc getc_unlocked
104 /* The following static variable is declared 'volatile' to avoid a
105 possible multithread problem in the function get_charset_aliases. If we
106 are running in a threaded environment, and if two threads initialize
107 'charset_aliases' simultaneously, both will produce the same value,
108 and everything will be ok if the two assignments to 'charset_aliases'
109 are atomic. But I don't know what will happen if the two assignments mix. */
111 # define volatile /* empty */
113 /* Pointer to the contents of the charset.alias file, if it has already been
114 read, else NULL. Its format is:
115 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
116 static const char * volatile charset_aliases;
118 /* Return a pointer to the contents of the charset.alias file. */
120 get_charset_aliases (void)
124 cp = charset_aliases;
127 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
129 const char *base = "charset.alias";
132 /* Make it possible to override the charset.alias location. This is
133 necessary for running the testsuite before "make install". */
134 dir = getenv ("CHARSETALIASDIR");
135 if (dir == NULL || dir[0] == '\0')
136 dir = relocate (LIBDIR);
138 /* Concatenate dir and base into freshly allocated file_name. */
140 size_t dir_len = strlen (dir);
141 size_t base_len = strlen (base);
142 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
143 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
144 if (file_name != NULL)
146 memcpy (file_name, dir, dir_len);
148 file_name[dir_len] = DIRECTORY_SEPARATOR;
149 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
153 if (file_name == NULL)
154 /* Out of memory. Treat the file as empty. */
160 /* Open the file. Reject symbolic links on platforms that support
161 O_NOFOLLOW. This is a security feature. Without it, an attacker
162 could retrieve parts of the contents (namely, the tail of the
163 first line that starts with "* ") of an arbitrary file by placing
164 a symbolic link to that file under the name "charset.alias" in
165 some writable directory and defining the environment variable
166 CHARSETALIASDIR to point to that directory. */
167 fd = open (file_name,
168 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
170 /* File not found. Treat it as empty. */
176 fp = fdopen (fd, "r");
179 /* Out of memory. Treat the file as empty. */
185 /* Parse the file's contents. */
186 char *res_ptr = NULL;
200 if (c == '\n' || c == ' ' || c == '\t')
204 /* Skip comment, to end of line. */
207 while (!(c == EOF || c == '\n'));
213 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
217 old_res_ptr = res_ptr;
220 res_size = l1 + 1 + l2 + 1;
221 res_ptr = (char *) malloc (res_size + 1);
225 res_size += l1 + 1 + l2 + 1;
226 res_ptr = (char *) realloc (res_ptr, res_size + 1);
235 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
236 strcpy (res_ptr + res_size - (l2 + 1), buf2);
243 *(res_ptr + res_size) = '\0';
255 /* To avoid the trouble of installing a file that is shared by many
256 GNU packages -- many packaging systems have problems with this --,
257 simply inline the aliases here. */
258 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
259 "ISO8859-2" "\0" "ISO-8859-2" "\0"
260 "ISO8859-4" "\0" "ISO-8859-4" "\0"
261 "ISO8859-5" "\0" "ISO-8859-5" "\0"
262 "ISO8859-7" "\0" "ISO-8859-7" "\0"
263 "ISO8859-9" "\0" "ISO-8859-9" "\0"
264 "ISO8859-13" "\0" "ISO-8859-13" "\0"
265 "ISO8859-15" "\0" "ISO-8859-15" "\0"
266 "KOI8-R" "\0" "KOI8-R" "\0"
267 "KOI8-U" "\0" "KOI8-U" "\0"
268 "CP866" "\0" "CP866" "\0"
269 "CP949" "\0" "CP949" "\0"
270 "CP1131" "\0" "CP1131" "\0"
271 "CP1251" "\0" "CP1251" "\0"
272 "eucCN" "\0" "GB2312" "\0"
273 "GB2312" "\0" "GB2312" "\0"
274 "eucJP" "\0" "EUC-JP" "\0"
275 "eucKR" "\0" "EUC-KR" "\0"
276 "Big5" "\0" "BIG5" "\0"
277 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
278 "GBK" "\0" "GBK" "\0"
279 "GB18030" "\0" "GB18030" "\0"
280 "SJIS" "\0" "SHIFT_JIS" "\0"
281 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
282 "PT154" "\0" "PT154" "\0"
283 /*"ISCII-DEV" "\0" "?" "\0"*/
284 "*" "\0" "UTF-8" "\0";
288 /* To avoid the troubles of an extra file charset.alias_vms in the
289 sources of many GNU packages, simply inline the aliases here. */
290 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
291 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
292 section 10.7 "Handling Different Character Sets". */
293 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
294 "ISO8859-2" "\0" "ISO-8859-2" "\0"
295 "ISO8859-5" "\0" "ISO-8859-5" "\0"
296 "ISO8859-7" "\0" "ISO-8859-7" "\0"
297 "ISO8859-8" "\0" "ISO-8859-8" "\0"
298 "ISO8859-9" "\0" "ISO-8859-9" "\0"
300 "eucJP" "\0" "EUC-JP" "\0"
301 "SJIS" "\0" "SHIFT_JIS" "\0"
302 "DECKANJI" "\0" "DEC-KANJI" "\0"
303 "SDECKANJI" "\0" "EUC-JP" "\0"
305 "eucTW" "\0" "EUC-TW" "\0"
306 "DECHANYU" "\0" "DEC-HANYU" "\0"
307 "DECHANZI" "\0" "GB2312" "\0"
309 "DECKOREAN" "\0" "EUC-KR" "\0";
312 # if defined WINDOWS_NATIVE || defined __CYGWIN__
313 /* To avoid the troubles of installing a separate file in the same
314 directory as the DLL and of retrieving the DLL's directory at
315 runtime, simply inline the aliases here. */
317 cp = "CP936" "\0" "GBK" "\0"
318 "CP1361" "\0" "JOHAB" "\0"
319 "CP20127" "\0" "ASCII" "\0"
320 "CP20866" "\0" "KOI8-R" "\0"
321 "CP20936" "\0" "GB2312" "\0"
322 "CP21866" "\0" "KOI8-RU" "\0"
323 "CP28591" "\0" "ISO-8859-1" "\0"
324 "CP28592" "\0" "ISO-8859-2" "\0"
325 "CP28593" "\0" "ISO-8859-3" "\0"
326 "CP28594" "\0" "ISO-8859-4" "\0"
327 "CP28595" "\0" "ISO-8859-5" "\0"
328 "CP28596" "\0" "ISO-8859-6" "\0"
329 "CP28597" "\0" "ISO-8859-7" "\0"
330 "CP28598" "\0" "ISO-8859-8" "\0"
331 "CP28599" "\0" "ISO-8859-9" "\0"
332 "CP28605" "\0" "ISO-8859-15" "\0"
333 "CP38598" "\0" "ISO-8859-8" "\0"
334 "CP51932" "\0" "EUC-JP" "\0"
335 "CP51936" "\0" "GB2312" "\0"
336 "CP51949" "\0" "EUC-KR" "\0"
337 "CP51950" "\0" "EUC-TW" "\0"
338 "CP54936" "\0" "GB18030" "\0"
339 "CP65001" "\0" "UTF-8" "\0";
343 charset_aliases = cp;
349 /* Determine the current locale's character encoding, and canonicalize it
350 into one of the canonical names listed in config.charset.
351 The result must not be freed; it is statically allocated.
352 If the canonical name cannot be determined, the result is a non-canonical name. */
359 locale_charset (void)
364 #if !(defined WINDOWS_NATIVE || defined OS2)
366 # if HAVE_LANGINFO_CODESET
368 /* Most systems support nl_langinfo (CODESET) nowadays. */
369 codeset = nl_langinfo (CODESET);
372 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
373 returns "US-ASCII". Return the suffix of the locale name from the
374 environment variables (if present) or the codepage as a number. */
375 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
378 static char buf[2 + 10 + 1];
380 locale = getenv ("LC_ALL");
381 if (locale == NULL || locale[0] == '\0')
383 locale = getenv ("LC_CTYPE");
384 if (locale == NULL || locale[0] == '\0')
385 locale = getenv ("LANG");
387 if (locale != NULL && locale[0] != '\0')
389 /* If the locale name contains an encoding after the dot, return it. */
391 const char *dot = strchr (locale, '.');
395 const char *modifier;
398 /* Look for the possible @... trailer and remove it, if any. */
399 modifier = strchr (dot, '@');
400 if (modifier == NULL)
402 if (modifier - dot < sizeof (buf))
404 memcpy (buf, dot, modifier - dot);
405 buf [modifier - dot] = '\0';
411 /* The Windows API has a function returning the locale's codepage as a
412 number: GetACP(). This encoding is used by Cygwin, unless the user
413 has set the environment variable CYGWIN=codepage:oem (which very few people do).
415 Output directed to console windows needs to be converted (to
416 GetOEMCP() if the console is using a raster font, or to
417 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
418 this conversion transparently (see winsup/cygwin/fhandler_console.cc),
419 converting to GetConsoleOutputCP(). This leads to correct results,
420 except when SetConsoleOutputCP has been called and a raster font is in use. */
422 sprintf (buf, "CP%u", GetACP ());
429 /* On old systems which lack it, use setlocale or getenv. */
430 const char *locale = NULL;
432 /* But most old systems don't have a complete set of locales. Some
433 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
434 use setlocale here; it would return "C" when it doesn't support the
435 locale name the user has set. */
437 locale = setlocale (LC_CTYPE, NULL);
439 if (locale == NULL || locale[0] == '\0')
441 locale = getenv ("LC_ALL");
442 if (locale == NULL || locale[0] == '\0')
444 locale = getenv ("LC_CTYPE");
445 if (locale == NULL || locale[0] == '\0')
446 locale = getenv ("LANG");
450 /* On some old systems, one used to set locale = "iso8859_1". On others,
451 you set it to "language_COUNTRY.charset". In any case, we resolve it
452 through the charset.alias file. */
457 #elif defined WINDOWS_NATIVE
459 static char buf[2 + 10 + 1];
461 /* The Windows API has a function returning the locale's codepage as a number: GetACP().
463 When the output goes to a console window, it needs to be provided in
464 GetOEMCP() encoding if the console is using a raster font, or in
465 GetConsoleOutputCP() encoding if it is using a TrueType font.
466 But in GUI programs and for output sent to files and pipes, GetACP()
467 encoding is the best bet. */
468 sprintf (buf, "CP%u", GetACP ());
474 static char buf[2 + 10 + 1];
478 /* Allow user to override the codeset, as set in the operating system,
479 with standard language environment variables. */
480 locale = getenv ("LC_ALL");
481 if (locale == NULL || locale[0] == '\0')
483 locale = getenv ("LC_CTYPE");
484 if (locale == NULL || locale[0] == '\0')
485 locale = getenv ("LANG");
487 if (locale != NULL && locale[0] != '\0')
489 /* If the locale name contains an encoding after the dot, return it. */
490 const char *dot = strchr (locale, '.');
494 const char *modifier;
497 /* Look for the possible @... trailer and remove it, if any. */
498 modifier = strchr (dot, '@');
499 if (modifier == NULL)
501 if (modifier - dot < sizeof (buf))
503 memcpy (buf, dot, modifier - dot);
504 buf [modifier - dot] = '\0';
509 /* Resolve through the charset.alias file. */
514 /* OS/2 has a function returning the locale's codepage as a number. */
515 if (DosQueryCp (sizeof (cp), cp, &cplen))
519 sprintf (buf, "CP%u", cp[0]);
527 /* The canonical name cannot be determined. */
531 for (aliases = get_charset_aliases ();
533 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
534 if (strcmp (codeset, aliases) == 0
535 || (aliases[0] == '*' && aliases[1] == '\0'))
537 codeset = aliases + strlen (aliases) + 1;
541 /* Don't return an empty string. GNU libc and GNU libiconv interpret
542 the empty string as denoting "the locale's character encoding",
543 thus GNU libiconv would call this function a second time. */
544 if (codeset[0] == '\0')
548 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
549 (the default codeset) does not work when MB_CUR_MAX is 1. */
550 if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX <= 1)