1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Determine a canonical name for the current locale's character encoding.
5 Copyright (C) 2000-2006, 2008-2014 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, see <http://www.gnu.org/licenses/>. */
20 /* Written by Bruno Haible <bruno@clisp.org>. */
25 #include "localcharset.h"
/* Platform and configuration macros used by the rest of this file.
   NOTE(review): this listing omits a number of lines (e.g. the matching
   #else / #endif directives), so the comments below describe only the
   directives that are visible. */
/* DARWIN7 is defined on Apple/Mach systems, and only when
   HAVE_LANGINFO_CODESET is true. */
33 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
34 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
/* WINDOWS_NATIVE is defined when either _WIN32 or __WIN32__ is defined. */
37 #if defined _WIN32 || defined __WIN32__
38 # define WINDOWS_NATIVE
42 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
/* Header selection: <langinfo.h> is pulled in only on non-Windows-native
   builds that have HAVE_LANGINFO_CODESET. */
48 #if !defined WINDOWS_NATIVE
50 # if HAVE_LANGINFO_CODESET
51 # include <langinfo.h>
53 # if 0 /* see comment below */
58 # define WIN32_LEAN_AND_MEAN
61 #elif defined WINDOWS_NATIVE
62 # define WIN32_LEAN_AND_MEAN
70 /* For MB_CUR_MAX_L */
/* With ENABLE_RELOCATABLE, relocate() comes from "relocatable.h".  The
   identity definition below is presumably the #else branch (the directive
   itself is not visible here): it returns the pathname unchanged. */
75 #if ENABLE_RELOCATABLE
76 # include "relocatable.h"
78 # define relocate(pathname) (pathname)
83 # include "configmake.h"
86 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
/* ISSLASH(C) tests whether C is a directory separator.  On the platforms
   listed in the next condition both '/' and '\\' are accepted. */
91 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
92 /* Native Windows, Cygwin, OS/2, DOS */
93 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
/* Default separator, used when DIRECTORY_SEPARATOR is not already defined. */
96 #ifndef DIRECTORY_SEPARATOR
97 # define DIRECTORY_SEPARATOR '/'
/* Alternative ISSLASH: only DIRECTORY_SEPARATOR counts as a separator.
   NOTE(review): the condition selecting this definition is not visible. */
101 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
/* When getc_unlocked is declared, every later use of getc in this file is
   redirected to it. */
104 #if HAVE_DECL_GETC_UNLOCKED
106 # define getc getc_unlocked
109 /* The following static variable is declared 'volatile' to avoid a
110 possible multithread problem in the function get_charset_aliases. If we
111 are running in a threaded environment, and if two threads initialize
112 'charset_aliases' simultaneously, both will produce the same value,
113 and everything will be ok if the two assignments to 'charset_aliases'
114 are atomic. But I don't know what will happen if the two assignments mix. */
/* NOTE(review): the conditional guarding the next define is not visible in
   this listing.  Where it is active, 'volatile' expands to nothing, so the
   qualifier on 'charset_aliases' below is dropped. */
116 # define volatile /* empty */
118 /* Pointer to the contents of the charset.alias file, if it has already been
119 read, else NULL. Its format is:
120 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
/* Assigned by get_charset_aliases once it has built or selected a table;
   locale_charset walks the table two strings at a time. */
121 static const char * volatile charset_aliases;
123 /* Return a pointer to the contents of the charset.alias file. */
/* The result is a table in the ALIAS '\0' CANONICAL '\0' ... '\0' format.
   Depending on the platform it is either read from a "charset.alias" file
   or taken from one of the string tables inlined below, and it is stored in
   'charset_aliases' before the function finishes.
   NOTE(review): several lines of this function are not visible in this
   listing (return type, braces, the check of the cached value, error
   branches).  The comments below describe only the statements shown. */
125 get_charset_aliases (void)
/* Start from the cached value; presumably the work below is done only when
   it is still NULL -- the test itself is not visible here. */
129 cp = charset_aliases;
/* The file-based lookup is compiled only when none of DARWIN7, VMS,
   WINDOWS_NATIVE or __CYGWIN__ is defined. */
132 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
134 const char *base = "charset.alias";
137 /* Make it possible to override the charset.alias location. This is
138 necessary for running the testsuite before "make install". */
139 dir = getenv ("CHARSETALIASDIR");
/* An unset or empty CHARSETALIASDIR falls back to the (possibly relocated)
   LIBDIR. */
140 if (dir == NULL || dir[0] == '\0')
141 dir = relocate (LIBDIR);
143 /* Concatenate dir and base into freshly allocated file_name. */
145 size_t dir_len = strlen (dir);
146 size_t base_len = strlen (base);
/* A separator is inserted only if dir is non-empty and does not already end
   in one. */
147 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
/* Room for dir, the optional separator, base, and the terminating NUL. */
148 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
149 if (file_name != NULL)
151 memcpy (file_name, dir, dir_len);
153 file_name[dir_len] = DIRECTORY_SEPARATOR;
/* base_len + 1 also copies base's terminating NUL. */
154 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
158 if (file_name == NULL)
159 /* Out of memory. Treat the file as empty. */
165 /* Open the file. Reject symbolic links on platforms that support
166 O_NOFOLLOW. This is a security feature. Without it, an attacker
167 could retrieve parts of the contents (namely, the tail of the
168 first line that starts with "* ") of an arbitrary file by placing
169 a symbolic link to that file under the name "charset.alias" in
170 some writable directory and defining the environment variable
171 CHARSETALIASDIR to point to that directory. */
172 fd = open (file_name,
173 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
175 /* File not found. Treat it as empty. */
/* Wrap the descriptor in a stdio stream for the character/fscanf based
   parsing below. */
181 fp = fdopen (fd, "r");
184 /* Out of memory. Treat the file as empty. */
190 /* Parse the file's contents. */
191 char *res_ptr = NULL;
/* Newlines, spaces and tabs between entries are tested for here; what is
   done with them is on lines not visible in this listing. */
205 if (c == '\n' || c == ' ' || c == '\t')
209 /* Skip comment, to end of line. */
212 while (!(c == EOF || c == '\n'));
/* Read one "alias canonical" pair; each field is limited to 50 characters.
   The handling of a short read (fewer than two fields) is not visible
   here. */
218 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
/* Remember the current buffer before it is (re)allocated below. */
222 old_res_ptr = res_ptr;
/* Each pair occupies l1 + 1 + l2 + 1 bytes; l1 and l2 are presumably the
   lengths of buf1 and buf2 (their assignments are not visible).  One extra
   byte is always requested for the final table terminator.  The malloc
   form sets res_size, the realloc form adds to it; the condition choosing
   between them is not visible. */
225 res_size = l1 + 1 + l2 + 1;
226 res_ptr = (char *) malloc (res_size + 1);
230 res_size += l1 + 1 + l2 + 1;
231 res_ptr = (char *) realloc (res_ptr, res_size + 1);
/* Store the new pair in the last l1 + 1 + l2 + 1 bytes of the buffer. */
240 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
241 strcpy (res_ptr + res_size - (l2 + 1), buf2);
/* The extra byte becomes the empty string that terminates the table. */
248 *(res_ptr + res_size) = '\0';
260 /* To avoid the trouble of installing a file that is shared by many
261 GNU packages -- many packaging systems have problems with this --,
262 simply inline the aliases here. */
/* The final "*" entry is treated by locale_charset as matching any codeset
   name, so names not listed before it resolve to "UTF-8".
   NOTE(review): the platform condition selecting this table is not visible
   in this listing. */
263 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
264 "ISO8859-2" "\0" "ISO-8859-2" "\0"
265 "ISO8859-4" "\0" "ISO-8859-4" "\0"
266 "ISO8859-5" "\0" "ISO-8859-5" "\0"
267 "ISO8859-7" "\0" "ISO-8859-7" "\0"
268 "ISO8859-9" "\0" "ISO-8859-9" "\0"
269 "ISO8859-13" "\0" "ISO-8859-13" "\0"
270 "ISO8859-15" "\0" "ISO-8859-15" "\0"
271 "KOI8-R" "\0" "KOI8-R" "\0"
272 "KOI8-U" "\0" "KOI8-U" "\0"
273 "CP866" "\0" "CP866" "\0"
274 "CP949" "\0" "CP949" "\0"
275 "CP1131" "\0" "CP1131" "\0"
276 "CP1251" "\0" "CP1251" "\0"
277 "eucCN" "\0" "GB2312" "\0"
278 "GB2312" "\0" "GB2312" "\0"
279 "eucJP" "\0" "EUC-JP" "\0"
280 "eucKR" "\0" "EUC-KR" "\0"
281 "Big5" "\0" "BIG5" "\0"
282 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
283 "GBK" "\0" "GBK" "\0"
284 "GB18030" "\0" "GB18030" "\0"
285 "SJIS" "\0" "SHIFT_JIS" "\0"
286 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
287 "PT154" "\0" "PT154" "\0"
288 /*"ISCII-DEV" "\0" "?" "\0"*/
289 "*" "\0" "UTF-8" "\0";
293 /* To avoid the troubles of an extra file charset.alias_vms in the
294 sources of many GNU packages, simply inline the aliases here. */
295 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
296 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
297 section 10.7 "Handling Different Character Sets". */
298 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
299 "ISO8859-2" "\0" "ISO-8859-2" "\0"
300 "ISO8859-5" "\0" "ISO-8859-5" "\0"
301 "ISO8859-7" "\0" "ISO-8859-7" "\0"
302 "ISO8859-8" "\0" "ISO-8859-8" "\0"
303 "ISO8859-9" "\0" "ISO-8859-9" "\0"
305 "eucJP" "\0" "EUC-JP" "\0"
306 "SJIS" "\0" "SHIFT_JIS" "\0"
307 "DECKANJI" "\0" "DEC-KANJI" "\0"
308 "SDECKANJI" "\0" "EUC-JP" "\0"
310 "eucTW" "\0" "EUC-TW" "\0"
311 "DECHANYU" "\0" "DEC-HANYU" "\0"
312 "DECHANZI" "\0" "GB2312" "\0"
314 "DECKOREAN" "\0" "EUC-KR" "\0";
/* Table used on native Windows and Cygwin; its keys have the "CP<number>"
   form. */
317 # if defined WINDOWS_NATIVE || defined __CYGWIN__
318 /* To avoid the troubles of installing a separate file in the same
319 directory as the DLL and of retrieving the DLL's directory at
320 runtime, simply inline the aliases here. */
322 cp = "CP936" "\0" "GBK" "\0"
323 "CP1361" "\0" "JOHAB" "\0"
324 "CP20127" "\0" "ASCII" "\0"
325 "CP20866" "\0" "KOI8-R" "\0"
326 "CP20936" "\0" "GB2312" "\0"
327 "CP21866" "\0" "KOI8-RU" "\0"
328 "CP28591" "\0" "ISO-8859-1" "\0"
329 "CP28592" "\0" "ISO-8859-2" "\0"
330 "CP28593" "\0" "ISO-8859-3" "\0"
331 "CP28594" "\0" "ISO-8859-4" "\0"
332 "CP28595" "\0" "ISO-8859-5" "\0"
333 "CP28596" "\0" "ISO-8859-6" "\0"
334 "CP28597" "\0" "ISO-8859-7" "\0"
335 "CP28598" "\0" "ISO-8859-8" "\0"
336 "CP28599" "\0" "ISO-8859-9" "\0"
337 "CP28605" "\0" "ISO-8859-15" "\0"
338 "CP38598" "\0" "ISO-8859-8" "\0"
339 "CP51932" "\0" "EUC-JP" "\0"
340 "CP51936" "\0" "GB2312" "\0"
341 "CP51949" "\0" "EUC-KR" "\0"
342 "CP51950" "\0" "EUC-TW" "\0"
343 "CP54936" "\0" "GB18030" "\0"
344 "CP65001" "\0" "UTF-8" "\0";
/* Publish the table in the file-scope cache. */
348 charset_aliases = cp;
354 /* Determine the current locale's character encoding, and canonicalize it
355 into one of the canonical names listed in config.charset.
356 The result must not be freed; it is statically allocated.
357 If the canonical name cannot be determined, the result is a non-canonical name. */
364 locale_charset (void)
369 #if !(defined WINDOWS_NATIVE || defined OS2)
371 # if HAVE_LANGINFO_CODESET
373 /* Most systems support nl_langinfo (CODESET) nowadays. */
374 codeset = nl_langinfo (CODESET);
377 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
378 returns "US-ASCII". Return the suffix of the locale name from the
379 environment variables (if present) or the codepage as a number. */
380 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
383 static char buf[2 + 10 + 1];
385 locale = getenv ("LC_ALL");
386 if (locale == NULL || locale[0] == '\0')
388 locale = getenv ("LC_CTYPE");
389 if (locale == NULL || locale[0] == '\0')
390 locale = getenv ("LANG");
392 if (locale != NULL && locale[0] != '\0')
394 /* If the locale name contains an encoding after the dot, return it. */
396 const char *dot = strchr (locale, '.');
400 const char *modifier;
403 /* Look for the possible @... trailer and remove it, if any. */
404 modifier = strchr (dot, '@');
405 if (modifier == NULL)
407 if (modifier - dot < sizeof (buf))
409 memcpy (buf, dot, modifier - dot);
410 buf [modifier - dot] = '\0';
416 /* The Windows API has a function returning the locale's codepage as a
417 number: GetACP(). This encoding is used by Cygwin, unless the user
418 has set the environment variable CYGWIN=codepage:oem (which very few people do).
420 Output directed to console windows needs to be converted (to
421 GetOEMCP() if the console is using a raster font, or to
422 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
423 this conversion transparently (see winsup/cygwin/fhandler_console.cc),
424 converting to GetConsoleOutputCP(). This leads to correct results,
425 except when SetConsoleOutputCP has been called and a raster font is in use. */
427 sprintf (buf, "CP%u", GetACP ());
434 /* On old systems which lack it, use setlocale or getenv. */
435 const char *locale = NULL;
437 /* But most old systems don't have a complete set of locales. Some
438 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
439 use setlocale here; it would return "C" when it doesn't support the
440 locale name the user has set. */
442 locale = setlocale (LC_CTYPE, NULL);
444 if (locale == NULL || locale[0] == '\0')
446 locale = getenv ("LC_ALL");
447 if (locale == NULL || locale[0] == '\0')
449 locale = getenv ("LC_CTYPE");
450 if (locale == NULL || locale[0] == '\0')
451 locale = getenv ("LANG");
455 /* On some old systems, one used to set locale = "iso8859_1". On others,
456 you set it to "language_COUNTRY.charset". In any case, we resolve it
457 through the charset.alias file. */
462 #elif defined WINDOWS_NATIVE
464 static char buf[2 + 10 + 1];
466 /* The Windows API has a function returning the locale's codepage as a number: GetACP().
468 When the output goes to a console window, it needs to be provided in
469 GetOEMCP() encoding if the console is using a raster font, or in
470 GetConsoleOutputCP() encoding if it is using a TrueType font.
471 But in GUI programs and for output sent to files and pipes, GetACP()
472 encoding is the best bet. */
473 sprintf (buf, "CP%u", GetACP ());
479 static char buf[2 + 10 + 1];
483 /* Allow user to override the codeset, as set in the operating system,
484 with standard language environment variables. */
485 locale = getenv ("LC_ALL");
486 if (locale == NULL || locale[0] == '\0')
488 locale = getenv ("LC_CTYPE");
489 if (locale == NULL || locale[0] == '\0')
490 locale = getenv ("LANG");
492 if (locale != NULL && locale[0] != '\0')
494 /* If the locale name contains an encoding after the dot, return it. */
495 const char *dot = strchr (locale, '.');
499 const char *modifier;
502 /* Look for the possible @... trailer and remove it, if any. */
503 modifier = strchr (dot, '@');
504 if (modifier == NULL)
506 if (modifier - dot < sizeof (buf))
508 memcpy (buf, dot, modifier - dot);
509 buf [modifier - dot] = '\0';
514 /* Resolve through the charset.alias file. */
519 /* OS/2 has a function returning the locale's codepage as a number. */
520 if (DosQueryCp (sizeof (cp), cp, &cplen))
524 sprintf (buf, "CP%u", cp[0]);
532 /* The canonical name cannot be determined. */
536 for (aliases = get_charset_aliases ();
538 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
539 if (strcmp (codeset, aliases) == 0
540 || (aliases[0] == '*' && aliases[1] == '\0'))
542 codeset = aliases + strlen (aliases) + 1;
546 /* Don't return an empty string. GNU libc and GNU libiconv interpret
547 the empty string as denoting "the locale's character encoding",
548 thus GNU libiconv would call this function a second time. */
549 if (codeset[0] == '\0')
553 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
554 (the default codeset) does not work when MB_CUR_MAX is 1. */
555 if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)