1uconv_u16tou32(3C) Standard C Library Functions uconv_u16tou32(3C)
2
3
4
6 uconv_u16tou32, uconv_u16tou8, uconv_u32tou16, uconv_u32tou8,
7 uconv_u8tou16, uconv_u8tou32 - Unicode encoding conversion functions
8
10 #include <sys/types.h>
11 #include <sys/errno.h>
12 #include <sys/u8_textprep.h>
13
14 int uconv_u16tou32(const uint16_t *utf16str, size_t *utf16len,
15 uint32_t *utf32str, size_t *utf32len, int flag);
16
17
18 int uconv_u16tou8(const uint16_t *utf16str, size_t *utf16len,
19 uchar_t *utf8str, size_t *utf8len, int flag);
20
21
22 int uconv_u32tou16(const uint32_t *utf32str, size_t *utf32len,
23 uint16_t *utf16str, size_t *utf16len, int flag);
24
25
26 int uconv_u32tou8(const uint32_t *utf32str, size_t *utf32len,
27 uchar_t *utf8str, size_t *utf8len, int flag);
28
29
30 int uconv_u8tou16(const uchar_t *utf8str, size_t *utf8len,
31 uint16_t *utf16str, size_t *utf16len, int flag);
32
33
34 int uconv_u8tou32(const uchar_t *utf8str, size_t *utf8len,
35 uint32_t *utf32str, size_t *utf32len, int flag);
36
37
39 utf16str A pointer to a UTF-16 character string.
40
41
42 utf16len As an input parameter, the number of 16-bit unsigned inte‐
43 gers in utf16str as UTF-16 characters to be converted or
44 saved.
45
46 As an output parameter, the number of 16-bit unsigned inte‐
47 gers in utf16str consumed or saved during conversion.
48
49
50 utf32str A pointer to a UTF-32 character string.
51
52
53 utf32len As an input parameter, the number of 32-bit unsigned inte‐
54 gers in utf32str as UTF-32 characters to be converted or
55 saved.
56
57 As an output parameter, the number of 32-bit unsigned inte‐
58 gers in utf32str consumed or saved during conversion.
59
60
61 utf8str A pointer to a UTF-8 character string.
62
63
64 utf8len As an input parameter, the number of bytes in utf8str as
65 UTF-8 characters to be converted or saved.
66
67 As an output parameter, the number of bytes in utf8str con‐
68 sumed or saved during conversion.
69
70
71 flag The possible conversion options that are constructed by a
72 bitwise-inclusive-OR of the following values:
73
74 UCONV_IN_BIG_ENDIAN
75
76 The input parameter is in big endian byte ordering.
77
78
79 UCONV_OUT_BIG_ENDIAN
80
81 The output parameter should be in big endian byte
82 ordering.
83
84
85 UCONV_IN_SYSTEM_ENDIAN
86
87 The input parameter is in the default byte ordering of
88 the current system.
89
90
91 UCONV_OUT_SYSTEM_ENDIAN
92
93 The output parameter should be in the default byte
94 ordering of the current system.
95
96
97 UCONV_IN_LITTLE_ENDIAN
98
99 The input parameter is in little endian byte ordering.
100
101
102 UCONV_OUT_LITTLE_ENDIAN
103
104 The output parameter should be in little endian byte
105 ordering.
106
107
108 UCONV_IGNORE_NULL
109
110 The null or U+0000 character should not stop the con‐
111 version.
112
113
114 UCONV_IN_ACCEPT_BOM
115
116 If the Byte Order Mark (BOM, U+FEFF) character exists
117 as the first character of the input parameter, inter‐
118 pret it as the BOM character.
119
120
121 UCONV_OUT_EMIT_BOM
122
123 Start the output parameter with Byte Order Mark (BOM,
124 U+FEFF) character to indicate the byte ordering if the
125 output parameter is in UTF-16 or UTF-32.
126
127
128
130 The uconv_u16tou32() function reads the given utf16str in UTF-16 until
131 U+0000 (zero) in utf16str is encountered as a character or until the
132 number of 16-bit unsigned integers specified in utf16len is read. The
133 UTF-16 characters that are read are converted into UTF-32 and the
134 result is saved at utf32str. After the successful conversion, utf32len
135 contains the number of 32-bit unsigned integers saved at utf32str as
136 UTF-32 characters.
137
138
139 The uconv_u16tou8() function reads the given utf16str in UTF-16 until
140 U+0000 (zero) in utf16str is encountered as a character or until the
141 number of 16-bit unsigned integers specified in utf16len is read. The
142 UTF-16 characters that are read are converted into UTF-8 and the result
143 is saved at utf8str. After the successful conversion, utf8len contains
144 the number of bytes saved at utf8str as UTF-8 characters.
145
146
147 The uconv_u32tou16() function reads the given utf32str in UTF-32 until
148 U+0000 (zero) in utf32str is encountered as a character or until the
149 number of 32-bit unsigned integers specified in utf32len is read. The
150 UTF-32 characters that are read are converted into UTF-16 and the
151 result is saved at utf16str. After the successful conversion, utf16len
152 contains the number of 16-bit unsigned integers saved at utf16str as
153 UTF-16 characters.
154
155
156 The uconv_u32tou8() function reads the given utf32str in UTF-32 until
157 U+0000 (zero) in utf32str is encountered as a character or until the
158 number of 32-bit unsigned integers specified in utf32len is read. The
159 UTF-32 characters that are read are converted into UTF-8 and the result
160 is saved at utf8str. After the successful conversion, utf8len contains
161 the number of bytes saved at utf8str as UTF-8 characters.
162
163
164 The uconv_u8tou16() function reads the given utf8str in UTF-8 until the
165 null ('\0') byte in utf8str is encountered or until the number of bytes
166 specified in utf8len is read. The UTF-8 characters that are read are
167 converted into UTF-16 and the result is saved at utf16str. After the
168 successful conversion, utf16len contains the number of 16-bit unsigned
169 integers saved at utf16str as UTF-16 characters.
170
171
172 The uconv_u8tou32() function reads the given utf8str in UTF-8 until the
173 null ('\0') byte in utf8str is encountered or until the number of bytes
174 specified in utf8len is read. The UTF-8 characters that are read are
175 converted into UTF-32 and the result is saved at utf32str. After the
176 successful conversion, utf32len contains the number of 32-bit unsigned
177 integers saved at utf32str as UTF-32 characters.
178
179
180 During the conversion, the input and the output parameters are treated
181 with byte orderings specified in the flag parameter. When not speci‐
182 fied, the default byte ordering of the system is used. The byte order‐
183 ing flag value that is specified for UTF-8 is ignored.
184
185
186 When UCONV_IN_ACCEPT_BOM is specified as the flag and the first charac‐
187 ter of the string pointed to by the input parameter is the BOM charac‐
188 ter, the value of the BOM character dictates the byte ordering of the
189 subsequent characters in the string pointed to by the input parameter,
190 regardless of the supplied input parameter byte ordering option flag
191 values. If the UCONV_IN_ACCEPT_BOM is not specified, the BOM as the
192 first character is treated as a regular Unicode character: Zero Width
193 No Break Space (ZWNBSP) character.
194
195
196 When UCONV_IGNORE_NULL is specified, regardless of whether the input
197 parameter contains U+0000 or null byte, the conversion continues until
198 the specified number of input parameter elements at utf16len, utf32len,
199 or utf8len are entirely consumed during the conversion.
200
201
202 As output parameters, utf16len, utf32len, and utf8len are not changed
203 if conversion fails for any reason.
204
206 Upon successful conversion, the functions return 0. Upon failure, the
207 functions return one of the following errno values:
208
209 EILSEQ The conversion detected an illegal or out of bound character
210 value in the input parameter.
211
212
213 E2BIG The conversion cannot finish because the size specified in
214 the output parameter is too small.
215
216
217 EINVAL The conversion stops due to an incomplete character at the
218 end of the input string.
219
220
221 EBADF Conflicting byte-ordering option flag values are detected.
222
223
225 Example 1 Convert a UTF-16 string in little-endian byte ordering into
226 a UTF-8 string.
227
228 #include <sys/types.h>
229 #include <sys/errno.h>
230 #include <sys/u8_textprep.h>
231 .
232 .
233 .
234 uint16_t u16s[MAXNAMELEN + 1];
235 uchar_t u8s[MAXNAMELEN + 1];
236 size_t u16len, u8len;
237 int ret;
238 .
239 .
240 .
241 u16len = u8len = MAXNAMELEN;
242 ret = uconv_u16tou8(u16s, &u16len, u8s, &u8len,
243 UCONV_IN_LITTLE_ENDIAN);
244 if (ret != 0) {
245 /* Conversion error occurred. */
246 return (ret);
247 }
248 .
249 .
250 .
251
252
253 Example 2 Convert a UTF-32 string in big endian byte ordering into lit‐
254 tle endian UTF-16.
255
256 #include <sys/types.h>
257 #include <sys/errno.h>
258 #include <sys/u8_textprep.h>
259 .
260 .
261 .
262 /*
263 * A UTF-32 character can be mapped to a UTF-16 character with
264 * two 16-bit integer entities as a "surrogate pair."
265 */
266 uint32_t u32s[101];
267 uint16_t u16s[101];
268 int ret;
269 size_t u32len, u16len;
270 .
271 .
272 .
273 u32len = u16len = 100;
274 ret = uconv_u32tou16(u32s, &u32len, u16s, &u16len,
275 UCONV_IN_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN);
276 if (ret == 0) {
277 return (0);
278 } else if (ret == E2BIG) {
279 /* Use bigger output parameter and try just one more time. */
280 uint16_t u16s2[201];
281
282 u16len = 200;
283 ret = uconv_u32tou16(u32s, &u32len, u16s2, &u16len,
284 UCONV_IN_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN);
285 if (ret == 0)
286 return (0);
287 }
288
289 /* Otherwise, return -1 to indicate an error condition. */
290 return (-1);
291
292
293 Example 3 Convert a UTF-8 string into UTF-16 in little-endian byte
294 ordering.
295
296
297 Convert a UTF-8 string into UTF-16 in little-endian byte ordering with
298 a Byte Order Mark (BOM) character at the beginning of the output param‐
299 eter.
300
301
302 #include <sys/types.h>
303 #include <sys/errno.h>
304 #include <sys/u8_textprep.h>
305 .
306 .
307 .
308 uchar_t u8s[MAXNAMELEN + 1];
309 uint16_t u16s[MAXNAMELEN + 1];
310 size_t u8len, u16len;
311 int ret;
312 .
313 .
314 .
315 u8len = u16len = MAXNAMELEN;
316 ret = uconv_u8tou16(u8s, &u8len, u16s, &u16len,
317 UCONV_OUT_LITTLE_ENDIAN | UCONV_OUT_EMIT_BOM);
318 if (ret != 0) {
319 /* Conversion error occurred. */
320 return (ret);
321 }
322 .
323 .
324 .
325
326
328 See attributes(5) for descriptions of the following attributes:
329
330
331
332
333 ┌─────────────────────────────┬─────────────────────────────┐
334 │ ATTRIBUTE TYPE │ ATTRIBUTE VALUE │
335 ├─────────────────────────────┼─────────────────────────────┤
336 │Interface Stability │Committed │
337 ├─────────────────────────────┼─────────────────────────────┤
338 │MT-Level │MT-Safe │
339 └─────────────────────────────┴─────────────────────────────┘
340
342 attributes(5), uconv_u16tou32(9F)
343
344
345 The Unicode Standard (http://www.unicode.org)
346
348 Each UTF-16 or UTF-32 character maps to a UTF-8 character that might
349 need from one to a maximum of four bytes.
350
351
352 One UTF-32 or UTF-8 character can yield two 16-bit unsigned integers as
353 a UTF-16 character, which is a surrogate pair if the Unicode scalar
354 value is bigger than U+FFFF.
355
356
357 Ill-formed UTF-16 surrogate pairs are seen as illegal characters during
358 the conversion.
359
360
361
362SunOS 5.11 18 Sep 2007 uconv_u16tou32(3C)