]>
Dogcows Code - chaz/tar/blob - src/utf8.c
1 /* Charset handling for GNU tar.
3 Copyright (C) 2004 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify it
6 under the terms of the GNU General Public License as published by the
7 Free Software Foundation; either version 2, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
13 Public License for more details.
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, write to the Free Software Foundation, Inc.,
17 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
34 char const *lang
; /* Language code */
35 char const *terr
; /* Territory code */
36 char const *charset
; /* Corresponding charset */
39 /* The list of language codes defined in ISO 639 with the corresponding
40 default character sets.
44 1) The list must be ordered by:
45 a) lang field in ascending order
46 b) terr field in descending order.
47 NULL fields are considered less than non-null ones.
48 2) Many entries have NULL charset fields. Please help fill them!
49 3) The "default" character set for a given language is a matter
50 of preference. Possibly the table should contain a *list* of
51 possible character sets.
52 4) LC_ALL "modifier" field is not taken into account */
54 static struct langtab langtab
[] = {
55 { "C", NULL
, "ASCII"},
56 { "POSIX", NULL
, "ASCII" },
57 { "aa", NULL
, NULL
}, /* Afar */
58 { "ab", NULL
, NULL
}, /* Abkhazian */
59 { "ae", NULL
, NULL
}, /* Avestan */
60 { "af", NULL
, "iso-8859-1"}, /* Afrikaans */
61 { "am", NULL
, "UTF-8"}, /* Amharic */
62 { "ar", NULL
, "iso-8859-6"}, /* Arabic */
63 { "as", NULL
, NULL
}, /* Assamese */
64 { "ay", NULL
, "iso-8859-1"}, /* Aymara */
65 { "az", NULL
, NULL
}, /* Azerbaijani */
66 { "ba", NULL
, NULL
}, /* Bashkir */
67 { "be", NULL
, "UTF-8"}, /* Byelorussian; Belarusian */
68 { "bg", NULL
, "iso-8859-5"}, /* Bulgarian */
69 { "bh", NULL
, NULL
}, /* Bihari */
70 { "bi", NULL
, NULL
}, /* Bislama */
71 { "bn", NULL
, NULL
}, /* Bengali; Bangla */
72 { "bo", NULL
, NULL
}, /* Tibetan */
73 { "br", NULL
, "iso-8859-1"}, /* Breton: 1,5,8,9 */
74 { "bs", NULL
, NULL
}, /* Bosnian */
75 { "ca", NULL
, "iso-8859-1"}, /* Catalan: 1,5,8,9 */
76 { "ce", NULL
, NULL
}, /* Chechen */
77 { "ch", NULL
, NULL
}, /* Chamorro */
78 { "co", NULL
, "iso-8859-1"}, /* Corsican */
79 { "cs", NULL
, "iso-8859-2"}, /* Czech */
80 { "cu", NULL
, NULL
}, /* Church Slavic */
81 { "cv", NULL
, NULL
}, /* Chuvash */
82 { "cy", NULL
, "iso-8859-1"}, /* Welsh */
83 { "da", NULL
, "iso-8859-1"}, /* Danish: 4-9 */
84 { "de", NULL
, "iso-8859-1"}, /* German */
85 { "dz", NULL
, NULL
}, /* Dzongkha; Bhutani */
86 { "el", NULL
, "iso-8859-7"}, /* Greek */
87 { "en", NULL
, "iso-8859-1"}, /* English */
88 { "eo", NULL
, "iso-8859-3"}, /* Esperanto */
89 { "es", NULL
, "iso-8859-1"}, /* Spanish */
90 { "et", NULL
, "iso-8859-15"}, /* Estonian: 6,7,9 */
91 { "eu", NULL
, "iso-8859-1"}, /* Basque: 5,8,9 */
92 { "fa", NULL
, "UTF-8"}, /* Persian */
93 { "fi", NULL
, "iso-8859-15"}, /* Finnish */
94 { "fj", NULL
, NULL
}, /* Fijian; Fiji */
95 { "fo", NULL
, "iso-8859-1"}, /* Faroese: 6,9 */
96 { "fr", NULL
, "iso-8859-1"}, /* French */
97 { "fy", NULL
, "iso-8859-1"}, /* Frisian */
98 { "ga", NULL
, "iso-8859-14"}, /* Irish */
99 { "gd", NULL
, "iso-8859-14" }, /* Scots; Gaelic */
100 { "gl", NULL
, NULL
}, /* Gallegan; Galician */
101 { "gn", NULL
, NULL
}, /* Guarani */
102 { "gu", NULL
, NULL
}, /* Gujarati */
103 { "gv", NULL
, "iso-8859-14"}, /* Manx */
104 { "ha", NULL
, NULL
}, /* Hausa (?) */
105 { "he", NULL
, "iso-8859-8" }, /* Hebrew */
106 { "hi", NULL
, NULL
}, /* Hindi */
107 { "ho", NULL
, NULL
}, /* Hiri Motu */
108 { "hr", NULL
, "iso-8859-2"}, /* Croatian: 10 */
109 { "hu", NULL
, "iso-8859-2"}, /* Hungarian */
110 { "hy", NULL
, NULL
}, /* Armenian */
111 { "hz", NULL
, NULL
}, /* Herero */
112 { "id", NULL
, "iso-8859-1"}, /* Indonesian (formerly in) */
113 { "ia", NULL
, NULL
}, /* Interlingua */
114 { "ie", NULL
, NULL
}, /* Interlingue */
115 { "ik", NULL
, NULL
}, /* Inupiak */
116 { "io", NULL
, NULL
}, /* Ido */
117 { "is", NULL
, "iso-8859-1"}, /* Icelandic */
118 { "it", NULL
, "iso-8859-1"}, /* Italian */
119 { "iu", NULL
, NULL
}, /* Inuktitut */
120 { "ja", NULL
, "EUC-JP"}, /* Japanese */
121 { "jv", NULL
, NULL
}, /* Javanese */
122 { "ka", NULL
, NULL
}, /* Georgian */
123 { "ki", NULL
, NULL
}, /* Kikuyu */
124 { "kj", NULL
, NULL
}, /* Kuanyama */
125 { "kk", NULL
, NULL
}, /* Kazakh */
126 { "kl", NULL
, "iso-8859-1"}, /* Kalaallisut; Greenlandic */
127 { "km", NULL
, NULL
}, /* Khmer; Cambodian */
128 { "kn", NULL
, NULL
}, /* Kannada */
129 { "ko", NULL
, "EUC-KR"}, /* Korean */
130 { "ks", NULL
, NULL
}, /* Kashmiri */
131 { "ku", NULL
, NULL
}, /* Kurdish */
132 { "kv", NULL
, NULL
}, /* Komi */
133 { "kw", NULL
, "iso-8859-14"}, /* Cornish: 1,5,8 */
134 { "ky", NULL
, NULL
}, /* Kirghiz */
135 { "la", NULL
, "iso-8859-1"}, /* Latin */
136 { "lb", NULL
, "iso-8859-1"}, /* Letzeburgesch */
137 { "ln", NULL
, NULL
}, /* Lingala */
138 { "lo", NULL
, NULL
}, /* Lao; Laotian */
139 { "lt", NULL
, "iso-8859-4"}, /* Lithuanian */
140 { "lv", NULL
, "iso-8859-4"}, /* Latvian; Lettish */
141 { "mg", NULL
, NULL
}, /* Malagasy */
142 { "mh", NULL
, NULL
}, /* Marshall */
143 { "mi", NULL
, NULL
}, /* Maori */
144 { "mk", NULL
, NULL
}, /* Macedonian */
145 { "ml", NULL
, NULL
}, /* Malayalam */
146 { "mn", NULL
, NULL
}, /* Mongolian */
147 { "mo", NULL
, "iso-8859-2"}, /* Moldavian */
148 { "mr", NULL
, NULL
}, /* Marathi */
149 { "ms", NULL
, NULL
}, /* Malay */
150 { "mt", NULL
, "iso-8859-3"}, /* Maltese */
151 { "my", NULL
, NULL
}, /* Burmese */
152 { "na", NULL
, NULL
}, /* Nauru */
153 { "nb", NULL
, "iso-8859-1"}, /* Norwegian Bokmål; Bokm@aa{}l */
154 { "nd", NULL
, NULL
}, /* Ndebele, North */
155 { "ne", NULL
, NULL
}, /* Nepali */
156 { "ng", NULL
, NULL
}, /* Ndonga */
157 { "nl", NULL
, "iso-8859-1"}, /* Dutch: 5,9 */
158 { "nn", NULL
, "iso-8859-1"}, /* Norwegian Nynorsk */
159 { "no", NULL
, "iso-8859-1"}, /* Norwegian */
160 { "nr", NULL
, NULL
}, /* Ndebele, South */
161 { "nv", NULL
, NULL
}, /* Navajo */
162 { "ny", NULL
, NULL
}, /* Chichewa; Nyanja */
163 { "oc", NULL
, NULL
}, /* Occitan; Provençal; Proven@,{c}al */
164 { "om", NULL
, NULL
}, /* (Afan) Oromo */
165 { "or", NULL
, NULL
}, /* Oriya */
166 { "os", NULL
, NULL
}, /* Ossetian; Ossetic */
167 { "pa", NULL
, NULL
}, /* Panjabi; Punjabi */
168 { "pi", NULL
, NULL
}, /* Pali */
169 { "pl", NULL
, "iso-8859-2"}, /* Polish */
170 { "ps", NULL
, NULL
}, /* Pashto, Pushto */
171 { "pt", NULL
, "iso-8859-1"}, /* Portuguese */
172 { "qu", NULL
, "iso-8859-1"}, /* Quechua */
173 { "rm", NULL
, "iso-8859-1"}, /* Rhaeto-Romance */
174 { "rn", NULL
, NULL
}, /* Rundi; Kirundi */
175 { "ro", NULL
, "iso-8859-2"}, /* Romanian */
176 { "ru", NULL
, "koi8-r"}, /* Russian */
177 { "rw", NULL
, NULL
}, /* Kinyarwanda */
178 { "sa", NULL
, NULL
}, /* Sanskrit */
179 { "sc", NULL
, "iso-8859-1"}, /* Sardinian */
180 { "sd", NULL
, NULL
}, /* Sindhi */
181 { "se", NULL
, "iso-8859-10"}, /* Northern Sami */
182 { "sg", NULL
, NULL
}, /* Sango; Sangro */
183 { "si", NULL
, NULL
}, /* Sinhalese */
184 { "sk", NULL
, "iso-8859-2"}, /* Slovak */
185 { "sl", NULL
, "iso-8859-1"}, /* Slovenian */
186 { "sm", NULL
, NULL
}, /* Samoan */
187 { "sn", NULL
, NULL
}, /* Shona */
188 { "so", NULL
, NULL
}, /* Somali */
189 { "sq", NULL
, "iso-8859-1"}, /* Albanian: 2,5,8,9,10 */
190 { "sr", NULL
, "iso-8859-2"}, /* Serbian */
191 { "ss", NULL
, NULL
}, /* Swati; Siswati */
192 { "st", NULL
, NULL
}, /* Sesotho; Sotho, Southern */
193 { "su", NULL
, NULL
}, /* Sundanese */
194 { "sv", NULL
, "iso-8859-1"}, /* Swedish */
195 { "sw", NULL
, NULL
}, /* Swahili */
196 { "ta", NULL
, NULL
}, /* Tamil */
197 { "te", NULL
, NULL
}, /* Telugu */
198 { "tg", NULL
, NULL
}, /* Tajik */
199 { "th", NULL
, "iso-8859-11"}, /* Thai */
200 { "ti", NULL
, NULL
}, /* Tigrinya */
201 { "tk", NULL
, NULL
}, /* Turkmen */
202 { "tl", NULL
, "iso-8859-1"}, /* Tagalog */
203 { "tn", NULL
, NULL
}, /* Tswana; Setswana */
204 { "to", NULL
, NULL
}, /* Tonga (?) */
205 { "tr", NULL
, "iso-8859-9"}, /* Turkish */
206 { "ts", NULL
, NULL
}, /* Tsonga */
207 { "tt", NULL
, NULL
}, /* Tatar */
208 { "tw", NULL
, NULL
}, /* Twi */
209 { "ty", NULL
, NULL
}, /* Tahitian */
210 { "ug", NULL
, NULL
}, /* Uighur */
211 { "uk", NULL
, "koi8-u"}, /* Ukrainian */
212 { "ur", NULL
, NULL
}, /* Urdu */
213 { "uz", NULL
, NULL
}, /* Uzbek */
214 { "vi", NULL
, NULL
}, /* Vietnamese */
215 { "vo", NULL
, NULL
}, /* Volapük; Volap@"{u}k; Volapuk */
216 { "wa", NULL
, "iso-8859-1"}, /* Walloon */
217 { "wo", NULL
, NULL
}, /* Wolof */
218 { "xh", NULL
, NULL
}, /* Xhosa */
219 { "yi", NULL
, "iso-8859-8"}, /* Yiddish (formerly ji) */
220 { "yo", NULL
, NULL
}, /* Yoruba */
221 { "za", NULL
, NULL
}, /* Zhuang */
222 { "zh", "TW", "big5"}, /* Chinese */
223 { "zh", NULL
, "gb2312"}, /* Chinese */
224 { "zu", NULL
, NULL
}, /* Zulu */
228 /* Given the language and (optionally) territory code, return the
229 default character set for that language. See notes above. */
232 charset_lookup (char const *lang
, char const *terr
)
234 struct langtab
const *p
;
238 for (p
= langtab
; p
->lang
; p
++)
239 if (strcasecmp (p
->lang
, lang
) == 0
242 || !strcasecmp (p
->terr
, terr
) == 0))
248 get_input_charset (void)
250 const char *charset
= NULL
;
253 /* Try to deduce the charset from LC_ALL or LANG variables */
255 tmp
= getenv ("LC_ALL");
257 tmp
= getenv ("LANG");
264 lang
= strtok (tmp
, "_");
265 terr
= strtok (NULL
, ".");
266 charset
= strtok (NULL
, "@");
269 charset
= charset_lookup (lang
, terr
);
273 charset
= "iso-8859-1";
277 #else /* !defined HAVE_LIBICONV */
280 # define iconv_open(tocode, fromcode) ((iconv_t) -1)
283 # define iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft) ((size_t) 0)
286 # define iconv_close(cd) 0
288 #endif /* !defined HAVE_LIBICONV */
293 static iconv_t conv_desc
[2] = { (iconv_t
) -1, (iconv_t
) -1 };
296 utf8_init (bool to_utf
)
298 if (conv_desc
[(int) to_utf
] == (iconv_t
) -1)
301 conv_desc
[(int) to_utf
] = iconv_open ("UTF-8", get_input_charset ());
303 conv_desc
[(int) to_utf
] = iconv_open (get_input_charset (), "UTF-8");
305 return conv_desc
[(int) to_utf
];
309 utf8_convert (bool to_utf
, char const *input
, char **output
)
311 char ICONV_CONST
*ib
;
316 iconv_t cd
= utf8_init (to_utf
);
320 *output
= xstrdup (input
);
323 else if (cd
== (iconv_t
)-1)
326 inlen
= strlen (input
) + 1;
327 outlen
= inlen
* MB_LEN_MAX
+ 1;
328 ob
= *output
= xmalloc (outlen
);
329 ib
= (char ICONV_CONST
*) input
;
330 rc
= iconv (cd
, &ib
, &inlen
, &ob
, &outlen
);
337 string_ascii_p (const char *str
)
339 const unsigned char *p
= (const unsigned char *)str
;
This page took 0.061758 seconds and 4 git commands to generate.