1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Determine a canonical name for the current locale's character encoding.
5 Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 /* Written by Bruno Haible <bruno@clisp.org>. */
26 #include "localcharset.h"
33 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
34 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
37 #if defined _WIN32 || defined __WIN32__
42 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
48 #if !defined WIN32_NATIVE
49 # if HAVE_LANGINFO_CODESET
50 # include <langinfo.h>
52 # if 0 /* see comment below */
57 # define WIN32_LEAN_AND_MEAN
60 #elif defined WIN32_NATIVE
61 # define WIN32_LEAN_AND_MEAN
69 #if ENABLE_RELOCATABLE
70 # include "relocatable.h"
72 # define relocate(pathname) (pathname)
77 # include "configmake.h"
80 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
81 /* Win32, Cygwin, OS/2, DOS */
82 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
85 #ifndef DIRECTORY_SEPARATOR
86 # define DIRECTORY_SEPARATOR '/'
90 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
93 #if HAVE_DECL_GETC_UNLOCKED
95 # define getc getc_unlocked
98 /* The following static variable is declared 'volatile' to avoid a
99 possible multithread problem in the function get_charset_aliases. If we
100 are running in a threaded environment, and if two threads initialize
101 'charset_aliases' simultaneously, both will produce the same value,
102 and everything will be ok if the two assignments to 'charset_aliases'
103 are atomic. But I don't know what will happen if the two assignments mix. */
105 # define volatile /* empty */
107 /* Pointer to the contents of the charset.alias file, if it has already been
108 read, else NULL. Its format is:
109 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
/* get_charset_aliases reads this variable on entry and stores the table it
   ends up with back into it.  Note that 'volatile' may have been defined
   away as an empty macro earlier in this file, in which case the qualifier
   below has no effect. */
110 static const char * volatile charset_aliases;
112 /* Return a pointer to the contents of the charset.alias file.
   The table uses the layout documented at the charset_aliases variable:
   ALIAS '\0' CANONICAL '\0' pairs, followed by one extra '\0'.
   The function starts from the value cached in charset_aliases and stores
   the table it ends up with back into that variable.
   NOTE(review): the test that skips the rebuild when the cache is already
   set, and the final return statement, are not visible in this excerpt. */
114 get_charset_aliases (void)
/* Start from the value cached by an earlier call, if any. */
118 cp = charset_aliases;
/* Platforms not named in this condition read the table from a file at run
   time.  The literal tables further down appear to serve the platforms named
   here (VMS and Win32/Cygwin are explicit in the comments and the #if
   below). */
121 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
124 const char *base = "charset.alias";
127 /* Make it possible to override the charset.alias location. This is
128 necessary for running the testsuite before "make install". */
129 dir = getenv ("CHARSETALIASDIR");
130 if (dir == NULL || dir[0] == '\0')
131 dir = relocate (LIBDIR);
133 /* Concatenate dir and base into freshly allocated file_name. */
135 size_t dir_len = strlen (dir);
136 size_t base_len = strlen (base);
/* A separator is inserted only when dir is non-empty and does not already
   end in one. */
137 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
/* Room for dir, the optional separator, base, and the terminating NUL. */
138 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
139 if (file_name != NULL)
141 memcpy (file_name, dir, dir_len);
143 file_name[dir_len] = DIRECTORY_SEPARATOR;
/* base_len + 1 also copies the NUL that terminates base. */
144 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
148 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
149 /* Out of memory or file not found, treat it as empty. */
153 /* Parse the file's contents. */
154 char *res_ptr = NULL;
/* Newlines, spaces and tabs between entries are tested for here; presumably
   they are skipped (the branch body is not visible in this excerpt). */
168 if (c == '\n' || c == ' ' || c == '\t')
172 /* Skip comment, to end of line. */
175 while (!(c == EOF || c == '\n'));
/* Each %50s conversion stores at most 50 characters plus a NUL; a result
   below 2 means no complete alias/canonical pair was read.
   NOTE(review): buf1 and buf2 must therefore be at least 51 bytes each;
   their declarations are not visible here -- confirm. */
181 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
/* Remember the current buffer before it is resized; it is tested against
   NULL after the allocation below. */
185 old_res_ptr = res_ptr;
/* l1 and l2 are presumably the lengths of buf1 and buf2 (their assignment
   is not visible); each string is stored together with its NUL.  The first
   form sets res_size and calls malloc, the second adds to res_size and calls
   realloc; the condition choosing between them is not visible here.  The
   extra byte in both allocations leaves room for the '\0' written at
   res_ptr + res_size below. */
188 res_size = l1 + 1 + l2 + 1;
189 res_ptr = (char *) malloc (res_size + 1);
193 res_size += l1 + 1 + l2 + 1;
194 res_ptr = (char *) realloc (res_ptr, res_size + 1);
200 if (old_res_ptr != NULL)
/* The new pair occupies the last (l1 + 1) + (l2 + 1) bytes of the buffer. */
204 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
205 strcpy (res_ptr + res_size - (l2 + 1), buf2);
/* Write the extra '\0' that ends the table. */
212 *(res_ptr + res_size) = '\0';
217 if (file_name != NULL)
/* NOTE(review): the #if guarding the following table is not visible in this
   excerpt; presumably it is the DARWIN7 case -- confirm. */
223 /* To avoid the trouble of installing a file that is shared by many
224 GNU packages -- many packaging systems have problems with this --,
225 simply inline the aliases here. */
226 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
227 "ISO8859-2" "\0" "ISO-8859-2" "\0"
228 "ISO8859-4" "\0" "ISO-8859-4" "\0"
229 "ISO8859-5" "\0" "ISO-8859-5" "\0"
230 "ISO8859-7" "\0" "ISO-8859-7" "\0"
231 "ISO8859-9" "\0" "ISO-8859-9" "\0"
232 "ISO8859-13" "\0" "ISO-8859-13" "\0"
233 "ISO8859-15" "\0" "ISO-8859-15" "\0"
234 "KOI8-R" "\0" "KOI8-R" "\0"
235 "KOI8-U" "\0" "KOI8-U" "\0"
236 "CP866" "\0" "CP866" "\0"
237 "CP949" "\0" "CP949" "\0"
238 "CP1131" "\0" "CP1131" "\0"
239 "CP1251" "\0" "CP1251" "\0"
240 "eucCN" "\0" "GB2312" "\0"
241 "GB2312" "\0" "GB2312" "\0"
242 "eucJP" "\0" "EUC-JP" "\0"
243 "eucKR" "\0" "EUC-KR" "\0"
244 "Big5" "\0" "BIG5" "\0"
245 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
246 "GBK" "\0" "GBK" "\0"
247 "GB18030" "\0" "GB18030" "\0"
248 "SJIS" "\0" "SHIFT_JIS" "\0"
249 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
250 "PT154" "\0" "PT154" "\0"
251 /*"ISCII-DEV" "\0" "?" "\0"*/
/* The single-asterisk alias below is a catch-all: the lookup loop in
   locale_charset accepts it for any codeset name. */
252 "*" "\0" "UTF-8" "\0";
256 /* To avoid the troubles of an extra file charset.alias_vms in the
257 sources of many GNU packages, simply inline the aliases here. */
258 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
259 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
260 section 10.7 "Handling Different Character Sets". */
261 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
262 "ISO8859-2" "\0" "ISO-8859-2" "\0"
263 "ISO8859-5" "\0" "ISO-8859-5" "\0"
264 "ISO8859-7" "\0" "ISO-8859-7" "\0"
265 "ISO8859-8" "\0" "ISO-8859-8" "\0"
266 "ISO8859-9" "\0" "ISO-8859-9" "\0"
268 "eucJP" "\0" "EUC-JP" "\0"
269 "SJIS" "\0" "SHIFT_JIS" "\0"
270 "DECKANJI" "\0" "DEC-KANJI" "\0"
271 "SDECKANJI" "\0" "EUC-JP" "\0"
273 "eucTW" "\0" "EUC-TW" "\0"
274 "DECHANYU" "\0" "DEC-HANYU" "\0"
275 "DECHANZI" "\0" "GB2312" "\0"
277 "DECKOREAN" "\0" "EUC-KR" "\0";
280 # if defined WIN32_NATIVE || defined __CYGWIN__
281 /* To avoid the troubles of installing a separate file in the same
282 directory as the DLL and of retrieving the DLL's directory at
283 runtime, simply inline the aliases here. */
285 cp = "CP936" "\0" "GBK" "\0"
286 "CP1361" "\0" "JOHAB" "\0"
287 "CP20127" "\0" "ASCII" "\0"
288 "CP20866" "\0" "KOI8-R" "\0"
289 "CP20936" "\0" "GB2312" "\0"
290 "CP21866" "\0" "KOI8-RU" "\0"
291 "CP28591" "\0" "ISO-8859-1" "\0"
292 "CP28592" "\0" "ISO-8859-2" "\0"
293 "CP28593" "\0" "ISO-8859-3" "\0"
294 "CP28594" "\0" "ISO-8859-4" "\0"
295 "CP28595" "\0" "ISO-8859-5" "\0"
296 "CP28596" "\0" "ISO-8859-6" "\0"
297 "CP28597" "\0" "ISO-8859-7" "\0"
298 "CP28598" "\0" "ISO-8859-8" "\0"
299 "CP28599" "\0" "ISO-8859-9" "\0"
300 "CP28605" "\0" "ISO-8859-15" "\0"
301 "CP38598" "\0" "ISO-8859-8" "\0"
302 "CP51932" "\0" "EUC-JP" "\0"
303 "CP51936" "\0" "GB2312" "\0"
304 "CP51949" "\0" "EUC-KR" "\0"
305 "CP51950" "\0" "EUC-TW" "\0"
306 "CP54936" "\0" "GB18030" "\0"
307 "CP65001" "\0" "UTF-8" "\0";
/* Publish the resulting table in the static cache. */
311 charset_aliases = cp;
317 /* Determine the current locale's character encoding, and canonicalize it
318 into one of the canonical names listed in config.charset.
319 The result must not be freed; it is statically allocated.
320 If the canonical name cannot be determined, the result is a non-canonical name. */
327 locale_charset (void)
332 #if !(defined WIN32_NATIVE || defined OS2)
334 # if HAVE_LANGINFO_CODESET
336 /* Most systems support nl_langinfo (CODESET) nowadays. */
337 codeset = nl_langinfo (CODESET);
340 /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always
341 returns "US-ASCII". As long as this is not fixed, return the suffix
342 of the locale name from the environment variables (if present) or
343 the codepage as a number. */
344 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
347 static char buf[2 + 10 + 1];
349 locale = getenv ("LC_ALL");
350 if (locale == NULL || locale[0] == '\0')
352 locale = getenv ("LC_CTYPE");
353 if (locale == NULL || locale[0] == '\0')
354 locale = getenv ("LANG");
356 if (locale != NULL && locale[0] != '\0')
358 /* If the locale name contains an encoding after the dot, return it. */
360 const char *dot = strchr (locale, '.');
364 const char *modifier;
367 /* Look for the possible @... trailer and remove it, if any. */
368 modifier = strchr (dot, '@');
369 if (modifier == NULL)
371 if (modifier - dot < sizeof (buf))
373 memcpy (buf, dot, modifier - dot);
374 buf [modifier - dot] = '\0';
380 /* Woe32 has a function returning the locale's codepage as a number. */
381 sprintf (buf, "CP%u", GetACP ());
388 /* On old systems which lack it, use setlocale or getenv. */
389 const char *locale = NULL;
391 /* But most old systems don't have a complete set of locales. Some
392 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
393 use setlocale here; it would return "C" when it doesn't support the
394 locale name the user has set. */
396 locale = setlocale (LC_CTYPE, NULL);
398 if (locale == NULL || locale[0] == '\0')
400 locale = getenv ("LC_ALL");
401 if (locale == NULL || locale[0] == '\0')
403 locale = getenv ("LC_CTYPE");
404 if (locale == NULL || locale[0] == '\0')
405 locale = getenv ("LANG");
409 /* On some old systems, one used to set locale = "iso8859_1". On others,
410 you set it to "language_COUNTRY.charset". In any case, we resolve it
411 through the charset.alias file. */
416 #elif defined WIN32_NATIVE
418 static char buf[2 + 10 + 1];
420 /* Woe32 has a function returning the locale's codepage as a number. */
421 sprintf (buf, "CP%u", GetACP ());
427 static char buf[2 + 10 + 1];
431 /* Allow user to override the codeset, as set in the operating system,
432 with standard language environment variables. */
433 locale = getenv ("LC_ALL");
434 if (locale == NULL || locale[0] == '\0')
436 locale = getenv ("LC_CTYPE");
437 if (locale == NULL || locale[0] == '\0')
438 locale = getenv ("LANG");
440 if (locale != NULL && locale[0] != '\0')
442 /* If the locale name contains an encoding after the dot, return it. */
443 const char *dot = strchr (locale, '.');
447 const char *modifier;
450 /* Look for the possible @... trailer and remove it, if any. */
451 modifier = strchr (dot, '@');
452 if (modifier == NULL)
454 if (modifier - dot < sizeof (buf))
456 memcpy (buf, dot, modifier - dot);
457 buf [modifier - dot] = '\0';
462 /* Resolve through the charset.alias file. */
467 /* OS/2 has a function returning the locale's codepage as a number. */
468 if (DosQueryCp (sizeof (cp), cp, &cplen))
472 sprintf (buf, "CP%u", cp[0]);
480 /* The canonical name cannot be determined. */
484 for (aliases = get_charset_aliases ();
486 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
487 if (strcmp (codeset, aliases) == 0
488 || (aliases[0] == '*' && aliases[1] == '\0'))
490 codeset = aliases + strlen (aliases) + 1;
494 /* Don't return an empty string. GNU libc and GNU libiconv interpret
495 the empty string as denoting "the locale's character encoding",
496 thus GNU libiconv would call this function a second time. */
497 if (codeset[0] == '\0')