1unicode(3)                 Erlang Module Definition                 unicode(3)
2
3
4

NAME

6       unicode - Functions for converting Unicode characters.
7

DESCRIPTION

9       This module contains functions for converting between different charac‐
10       ter representations. It converts between  ISO  Latin-1  characters  and
11       Unicode  characters,  but it can also convert between different Unicode
12       encodings (like UTF-8, UTF-16, and UTF-32).
13
14       The default Unicode encoding in Erlang binaries is UTF-8, which is also
15       the  format  in which built-in functions and libraries in OTP expect to
16       find binary Unicode data. In lists, Unicode data is  encoded  as  inte‐
17       gers, each integer representing one character and encoded simply as the
18       Unicode code point for the character.
19
20       Unicode encodings other than integers representing code points or UTF-8
21       in binaries are referred to as "external encodings". In  both  binaries
22       and lists, the ISO Latin-1 encoding is referred to as latin1-encoding.
23
24       It is recommended to only use external encodings for communication with
25       external  entities  where this is required. When working inside the Er‐
26       lang/OTP environment, it is recommended to keep binaries in UTF-8  when
27       representing Unicode characters. ISO Latin-1 encoding is supported both
28       for backward compatibility and for communication with external entities
29       not supporting Unicode character sets.
30
31       Programs should always operate on a normalized form and compare canoni‐
32       cal-equivalent Unicode characters as equal. All characters should  thus
33       be  normalized  to one form once on the system borders. One of the fol‐
34       lowing functions can convert characters to their normalized forms: char‐
35       acters_to_nfc_list/1,        characters_to_nfc_binary/1,        charac‐
36       ters_to_nfd_list/1  or  characters_to_nfd_binary/1.  For  general  text
37       characters_to_nfc_list/1  or  characters_to_nfc_binary/1  is preferred,
38       and for identifiers one of the compatibility  normalization  functions,
39       such  as  characters_to_nfkc_list/1, is preferred for security reasons.
40       The normalization functions were introduced in OTP 20. Additional  in‐
41       formation on normalization can be found in the Unicode FAQ.
42

DATA TYPES

44       encoding() =
45           latin1 | unicode | utf8 | utf16 |
46           {utf16, endian()} |
47           utf32 |
48           {utf32, endian()}
49
50       endian() = big | little
51
52       unicode_binary() = binary()
53
54              A binary() with characters encoded in the UTF-8 coding standard.
55
56       chardata() = charlist() | unicode_binary()
57
58       charlist() =
59           maybe_improper_list(char() | unicode_binary() | charlist(),
60                               unicode_binary() | [])
61
62       external_unicode_binary() = binary()
63
64              A binary() with characters coded in a user-specified Unicode en‐
65              coding other than UTF-8 (that is, UTF-16 or UTF-32).
66
67       external_chardata() =
68           external_charlist() | external_unicode_binary()
69
70       external_charlist() =
71           maybe_improper_list(char() |
72                               external_unicode_binary() |
73                               external_charlist(),
74                               external_unicode_binary() | [])
75
76       latin1_binary() = binary()
77
78              A binary() with characters coded in ISO Latin-1.
79
80       latin1_char() = byte()
81
82              An integer() representing a valid ISO Latin-1 character (0-255).
83
84       latin1_chardata() = latin1_charlist() | latin1_binary()
85
86              Same as iodata().
87
88       latin1_charlist() =
89           maybe_improper_list(latin1_char() |
90                               latin1_binary() |
91                               latin1_charlist(),
92                               latin1_binary() | [])
93
94              Same as iolist().
95

EXPORTS

97       bom_to_encoding(Bin) -> {Encoding, Length}
98
99              Types:
100
101                 Bin = binary()
102                    A binary() such that byte_size(Bin) >= 4.
103                 Encoding =
104                     latin1 | utf8 | {utf16, endian()} | {utf32, endian()}
105                 Length = integer() >= 0
106                 endian() = big | little
107
108              Checks for a UTF Byte Order Mark (BOM) in the beginning of a bi‐
109              nary. If the supplied binary Bin begins with a valid BOM for ei‐
110              ther UTF-8, UTF-16, or UTF-32, the function returns the encoding
111              identified along with the BOM length in bytes.
112
113              If no BOM is found, the function returns {latin1,0}.
114
115       characters_to_binary(Data) -> Result
116
117              Types:
118
119                 Data = latin1_chardata() | chardata() | external_chardata()
120                 Result =
121                     binary() |
122                     {error, binary(), RestData} |
123                     {incomplete, binary(), binary()}
124                 RestData  =  latin1_chardata()  | chardata() | external_char‐
125                 data()
126
127              Same as characters_to_binary(Data, unicode, unicode).
128
129       characters_to_binary(Data, InEncoding) -> Result
130
131              Types:
132
133                 Data = latin1_chardata() | chardata() | external_chardata()
134                 InEncoding = encoding()
135                 Result =
136                     binary() |
137                     {error, binary(), RestData} |
138                     {incomplete, binary(), binary()}
139                 RestData = latin1_chardata() |  chardata()  |  external_char‐
140                 data()
141
142              Same as characters_to_binary(Data, InEncoding, unicode).
143
144       characters_to_binary(Data, InEncoding, OutEncoding) -> Result
145
146              Types:
147
148                 Data = latin1_chardata() | chardata() | external_chardata()
149                 InEncoding = OutEncoding = encoding()
150                 Result =
151                     binary() |
152                     {error, binary(), RestData} |
153                     {incomplete, binary(), binary()}
154                 RestData  =  latin1_chardata()  | chardata() | external_char‐
155                 data()
156
157              Behaves as characters_to_list/2, but produces a  binary  instead
158              of a Unicode list.
159
160              InEncoding  defines  how  input is to be interpreted if binaries
161              are present in Data.
162
163              OutEncoding defines in what format output is to be generated.
164
165              Options:
166
167                unicode:
168                  An alias for utf8, as this is  the  preferred  encoding  for
169                  Unicode characters in binaries.
170
171                utf16:
172                  An alias for {utf16,big}.
173
174                utf32:
175                  An alias for {utf32,big}.
176
177              The atoms big and little denote big- or little-endian encoding.
178
179              Errors  and exceptions occur as in characters_to_list/2, but the
180              second element in tuple error or incomplete is  a  binary()  and
181              not a list().
182
183       characters_to_list(Data) -> Result
184
185              Types:
186
187                 Data = latin1_chardata() | chardata() | external_chardata()
188                 Result =
189                     string() |
190                     {error, string(), RestData} |
191                     {incomplete, string(), binary()}
192                 RestData  =  latin1_chardata()  | chardata() | external_char‐
193                 data()
194
195              Same as characters_to_list(Data, unicode).
196
197       characters_to_list(Data, InEncoding) -> Result
198
199              Types:
200
201                 Data = latin1_chardata() | chardata() | external_chardata()
202                 InEncoding = encoding()
203                 Result =
204                     string() |
205                     {error, string(), RestData} |
206                     {incomplete, string(), binary()}
207                 RestData = latin1_chardata() |  chardata()  |  external_char‐
208                 data()
209
210              Converts  a  possibly  deep list of integers and binaries into a
211              list of integers representing Unicode characters.  The  binaries
212              in  the  input can have characters encoded as one of the follow‐
213              ing:
214
215                * ISO Latin-1 (0-255, one character per byte). In  this  case,
216                  parameter InEncoding is to be specified as latin1.
217
218                * One  of  the  UTF-encodings, which is specified as parameter
219                  InEncoding.
220
221              Note that integers in the list always represent code points  re‐
222              gardless  of  InEncoding passed. If InEncoding latin1 is passed,
223              only code points < 256 are allowed; otherwise, all valid Unicode
224              code points are allowed.
225
226              If  InEncoding  is latin1, parameter Data corresponds to the io‐
227              data() type, but for unicode, parameter Data can  contain  inte‐
228              gers  >  255  (Unicode characters beyond the ISO Latin-1 range),
229              which makes it invalid as iodata().
230
231              The purpose of the function is mainly to convert combinations of
232              Unicode  characters into a pure Unicode string in list represen‐
233              tation for further processing. For writing the data to an exter‐
234              nal entity, the reverse function characters_to_binary/3 comes in
235              handy.
236
237              Option unicode is an alias for utf8, as this  is  the  preferred
238              encoding  for  Unicode characters in binaries. utf16 is an alias
239              for {utf16,big} and utf32 is an alias for {utf32,big}. The atoms
240              big and little denote big- or little-endian encoding.
241
242              If  the data cannot be converted, either because of illegal Uni‐
243              code/ISO Latin-1 characters in the list, or because  of  invalid
244              UTF  encoding  in  any binaries, an error tuple is returned. The
245              error tuple contains the tag  error,  a  list  representing  the
246              characters that could be converted before the error occurred and
247              a representation of the characters including and after  the  of‐
248              fending integer/bytes. The last part is mostly for debugging, as
249              it still constitutes a possibly deep or mixed list, or both, not
250              necessarily  of  the  same depth as the original data. The error
251              occurs when traversing the list and whatever is left  to  decode
252              is returned "as is".
253
254              However,  if  the input Data is a pure binary, the third part of
255              the error tuple is guaranteed to be a binary as well.
256
257              Errors occur for the following reasons:
258
259                * Integers out of range.
260
261                  If InEncoding is latin1, an error occurs whenever an integer
262                  > 255 is found in the lists.
263
264                  If InEncoding is of a Unicode type, an error occurs whenever
265                  either of the following is found:
266
267                  * An integer > 16#10FFFF (the maximum Unicode character)
268
269                  * An integer in the range 16#D800 to 16#DFFF (invalid  range
270                    reserved for UTF-16 surrogate pairs)
271
272                * Incorrect UTF encoding.
273
274                  If  InEncoding is one of the UTF types, the bytes in any bi‐
275                  naries must be valid in that encoding.
276
277                  Errors can occur for various reasons, including the  follow‐
278                  ing:
279
280                  * "Pure"  decoding  errors (like the upper bits of the bytes
281                    being wrong).
282
283                  * The bytes are decoded to a too large number.
284
285                  * The bytes are decoded to a code point in the invalid  Uni‐
286                    code range.
287
288                  * Encoding  is "overlong", meaning that a number should have
289                    been encoded in fewer bytes.
290
291                  The case of a truncated UTF is handled  specially,  see  the
292                  paragraph about incomplete binaries below.
293
294                  If  InEncoding  is latin1, binaries are always valid as long
295                  as they contain whole bytes, as each  byte  falls  into  the
296                  valid ISO Latin-1 range.
297
298              A  special  type  of error is when no actual invalid integers or
299              bytes are found, but a trailing binary()  consists  of  too  few
300              bytes  to  decode  the  last  character. This error can occur if
301              bytes are read from a file in chunks or  if  binaries  in  other
302              ways  are  split  on non-UTF character boundaries. An incomplete
303              tuple is then returned instead of the error tuple.  It  consists
304              of  the same parts as the error tuple, but the tag is incomplete
305              instead of error and the last element is always guaranteed to be
306              a  binary  consisting  of the first part of a (so far) valid UTF
307              character.
308
309              If one UTF character is split over two consecutive  binaries  in
310              the  Data,  the conversion succeeds. This means that a character
311              can be decoded from a range of binaries as  long  as  the  whole
312              range is specified as input without errors occurring.
313
314              Example:
315
316              decode_data(Data) ->
317                 case unicode:characters_to_list(Data,unicode) of
318                    {incomplete,Encoded, Rest} ->
319                          More = get_some_more_data(),
320                          Encoded ++ decode_data([Rest, More]);
321                    {error,Encoded,Rest} ->
322                          handle_error(Encoded,Rest);
323                    List ->
324                          List
325                 end.
326
327              However,  bit  strings that are not whole bytes are not allowed,
328              so a UTF character must be split along 8-bit boundaries to  ever
329              be decoded.
330
331              A badarg exception is raised for the following cases:
332
333                * Any parameters are of the wrong type.
334
335                * The list structure is invalid (a number as tail).
336
337                * The binaries do not contain whole bytes (bit strings).
338
339       characters_to_nfc_list(CD :: chardata()) ->
340                                 [char()] | {error, [char()], chardata()}
341
342              Converts  a possibly deep list of characters and binaries into a
343              Normalized Form of canonical equivalent Composed characters  ac‐
344              cording to the Unicode standard.
345
346              Any binaries in the input must be encoded with utf8 encoding.
347
348              The result is a list of characters.
349
350              3> unicode:characters_to_nfc_list([<<"abc..a">>,[778],$a,[776],$o,[776]]).
351              "abc..åäö"
352
353
354       characters_to_nfc_binary(CD :: chardata()) ->
355                                   unicode_binary() |
356                                   {error, unicode_binary(), chardata()}
357
358              Converts  a possibly deep list of characters and binaries into a
359              Normalized Form of canonical equivalent Composed characters  ac‐
360              cording to the Unicode standard.
361
362              Any binaries in the input must be encoded with utf8 encoding.
363
364              The result is a utf8-encoded binary.
365
366              4> unicode:characters_to_nfc_binary([<<"abc..a">>,[778],$a,[776],$o,[776]]).
367              <<"abc..åäö"/utf8>>
368
369
370       characters_to_nfd_list(CD :: chardata()) ->
371                                 [char()] | {error, [char()], chardata()}
372
373              Converts  a possibly deep list of characters and binaries into a
374              Normalized Form of canonical  equivalent  Decomposed  characters
375              according to the Unicode standard.
376
377              Any binaries in the input must be encoded with utf8 encoding.
378
379              The result is a list of characters.
380
381              1> unicode:characters_to_nfd_list("abc..åäö").
382              [97,98,99,46,46,97,778,97,776,111,776]
383
384
385       characters_to_nfd_binary(CD :: chardata()) ->
386                                   unicode_binary() |
387                                   {error, unicode_binary(), chardata()}
388
389              Converts  a possibly deep list of characters and binaries into a
390              Normalized Form of canonical  equivalent  Decomposed  characters
391              according to the Unicode standard.
392
393              Any binaries in the input must be encoded with utf8 encoding.
394
395              The result is a utf8-encoded binary.
396
397              2> unicode:characters_to_nfd_binary("abc..åäö").
398              <<97,98,99,46,46,97,204,138,97,204,136,111,204,136>>
399
400
401       characters_to_nfkc_list(CD :: chardata()) ->
402                                  [char()] |
403                                  {error, [char()], chardata()}
404
405              Converts  a possibly deep list of characters and binaries into a
406              Normalized Form of compatibly equivalent Composed characters ac‐
407              cording to the Unicode standard.
408
409              Any binaries in the input must be encoded with utf8 encoding.
410
411              The result is a list of characters.
412
413              3> unicode:characters_to_nfkc_list([<<"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]).
414              "abc..åäö32"
415
416
417       characters_to_nfkc_binary(CD :: chardata()) ->
418                                    unicode_binary() |
419                                    {error, unicode_binary(), chardata()}
420
421              Converts  a possibly deep list of characters and binaries into a
422              Normalized Form of compatibly equivalent Composed characters ac‐
423              cording to the Unicode standard.
424
425              Any binaries in the input must be encoded with utf8 encoding.
426
427              The result is a utf8-encoded binary.
428
429              4> unicode:characters_to_nfkc_binary([<<"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]).
430              <<"abc..åäö32"/utf8>>
431
432
433       characters_to_nfkd_list(CD :: chardata()) ->
434                                  [char()] |
435                                  {error, [char()], chardata()}
436
437              Converts  a possibly deep list of characters and binaries into a
438              Normalized Form of compatibly equivalent  Decomposed  characters
439              according to the Unicode standard.
440
441              Any binaries in the input must be encoded with utf8 encoding.
442
443              The result is a list of characters.
444
445              1> unicode:characters_to_nfkd_list(["abc..åäö",[65299,65298]]).
446              [97,98,99,46,46,97,778,97,776,111,776,51,50]
447
448
449       characters_to_nfkd_binary(CD :: chardata()) ->
450                                    unicode_binary() |
451                                    {error, unicode_binary(), chardata()}
452
453              Converts  a possibly deep list of characters and binaries into a
454              Normalized Form of compatibly equivalent  Decomposed  characters
455              according to the Unicode standard.
456
457              Any binaries in the input must be encoded with utf8 encoding.
458
459              The result is a utf8-encoded binary.
460
461              2> unicode:characters_to_nfkd_binary(["abc..åäö",[65299,65298]]).
462              <<97,98,99,46,46,97,204,138,97,204,136,111,204,136,51,50>>
463
464
465       encoding_to_bom(InEncoding) -> Bin
466
467              Types:
468
469                 Bin = binary()
470                    A binary() such that byte_size(Bin) =< 4.
471                 InEncoding = encoding()
472
473              Creates  a  UTF  Byte Order Mark (BOM) as a binary from the sup‐
474              plied InEncoding. The BOM is, if supported at all,  expected  to
475              be placed first in UTF encoded files or messages.
476
477              The  function  returns  <<>> for latin1 encoding, as there is no
478              BOM for ISO Latin-1.
479
480              Notice that the BOM for UTF-8 is seldom used, and it  is  really
481              not  a byte order mark. There are obviously no byte order issues
482              with UTF-8, so the BOM is only there to differentiate UTF-8  en‐
483              coding from other UTF formats.
484
485
486
487Ericsson AB                      stdlib 5.1.1                       unicode(3)
Impressum