unicode(3erl)

1unicode(3)                 Erlang Module Definition                 unicode(3)
2
3
4

NAME

6       unicode - Functions for converting Unicode characters.
7

DESCRIPTION

9       This module contains functions for converting between different charac‐
10       ter representations. It converts between  ISO  Latin-1  characters  and
11       Unicode  characters,  but it can also convert between different Unicode
12       encodings (like UTF-8, UTF-16, and UTF-32).
13
14       The default Unicode encoding in Erlang binaries is  UTF-8,  which  is
15       also the format in which built-in functions and libraries in OTP expect
16       to find binary Unicode data. In lists, Unicode data is encoded as inte‐
17       gers, each integer representing one character and encoded simply as the
18       Unicode code point for the character.
19
20       Other Unicode encodings than integers representing code points or UTF-8
21       in  binaries  are  referred to as "external encodings". The ISO Latin-1
22       encoding is in binaries and lists referred to as latin1-encoding.
23
24       It is recommended to only use external encodings for communication with
25       external  entities  where  this  is  required.  When working inside the
26       Erlang/OTP environment, it is recommended to  keep  binaries  in  UTF-8
27       when representing Unicode characters. ISO Latin-1 encoding is supported
28       both for backward compatibility and  for  communication  with  external
29       entities not supporting Unicode character sets.
30
31       Programs should always operate on a normalized form and compare canoni‐
32       cal-equivalent Unicode characters as equal. All characters should  thus
33       be  normalized  to one form once on the system borders. One of the fol‐
34       lowing functions can convert characters to their normalized forms: char‐
35       acters_to_nfc_list/1,        characters_to_nfc_binary/1,        charac‐
36       ters_to_nfd_list/1  or  characters_to_nfd_binary/1.  For  general text,
37       characters_to_nfc_list/1  or  characters_to_nfc_binary/1  is preferred,
38       and for identifiers one of the compatibility  normalization  functions,
39       such  as  characters_to_nfkc_list/1, is preferred for security reasons.
40       The normalization functions were  introduced  in  OTP  20.  Additional
41       information on normalization can be found in the Unicode FAQ.
42

DATA TYPES

44       encoding() =
45           latin1 |
46           unicode |
47           utf8 |
48           utf16 |
49           {utf16, endian()} |
50           utf32 |
51           {utf32, endian()}
52
53       endian() = big | little
54
55       unicode_binary() = binary()
56
57              A binary() with characters encoded in the UTF-8 coding standard.
58
59       chardata() = charlist() | unicode_binary()
60
61       charlist() =
62           maybe_improper_list(char() | unicode_binary() | charlist(),
63                               unicode_binary() | [])
64
65       external_unicode_binary() = binary()
66
67              A  binary()  with  characters  coded in a user-specified Unicode
68              encoding other than UTF-8 (that is, UTF-16 or UTF-32).
69
70       external_chardata() =
71           external_charlist() | external_unicode_binary()
72
73       external_charlist() =
74           maybe_improper_list(char() |
75                               external_unicode_binary() |
76                               external_charlist(),
77                               external_unicode_binary() | [])
78
79       latin1_binary() = binary()
80
81              A binary() with characters coded in ISO Latin-1.
82
83       latin1_char() = byte()
84
85              An integer() representing a valid ISO Latin-1 character (0-255).
86
87       latin1_chardata() = latin1_charlist() | latin1_binary()
88
89              Same as iodata().
90
91       latin1_charlist() =
92           maybe_improper_list(latin1_char() |
93                               latin1_binary() |
94                               latin1_charlist(),
95                               latin1_binary() | [])
96
97              Same as iolist().
98

EXPORTS

100       bom_to_encoding(Bin) -> {Encoding, Length}
101
102              Types:
103
104                 Bin = binary()
105                    A binary() such that byte_size(Bin) >= 4.
106                 Encoding =
107                     latin1 | utf8 | {utf16, endian()} | {utf32, endian()}
108                 Length = integer() >= 0
109                 endian() = big | little
110
111              Checks for a UTF Byte Order Mark (BOM) in  the  beginning  of  a
112              binary.  If  the supplied binary Bin begins with a valid BOM for
113              either UTF-8, UTF-16, or UTF-32, the function returns the encod‐
114              ing identified along with the BOM length in bytes.
115
116              If no BOM is found, the function returns {latin1,0}.
117
118       characters_to_binary(Data) -> Result
119
120              Types:
121
122                 Data = latin1_chardata() | chardata() | external_chardata()
123                 Result =
124                     binary() |
125                     {error, binary(), RestData} |
126                     {incomplete, binary(), binary()}
127                 RestData  =  latin1_chardata()  | chardata() | external_char‐
128                 data()
129
130              Same as characters_to_binary(Data, unicode, unicode).
131
132       characters_to_binary(Data, InEncoding) -> Result
133
134              Types:
135
136                 Data = latin1_chardata() | chardata() | external_chardata()
137                 InEncoding = encoding()
138                 Result =
139                     binary() |
140                     {error, binary(), RestData} |
141                     {incomplete, binary(), binary()}
142                 RestData = latin1_chardata() |  chardata()  |  external_char‐
143                 data()
144
145              Same as characters_to_binary(Data, InEncoding, unicode).
146
147       characters_to_binary(Data, InEncoding, OutEncoding) -> Result
148
149              Types:
150
151                 Data = latin1_chardata() | chardata() | external_chardata()
152                 InEncoding = OutEncoding = encoding()
153                 Result =
154                     binary() |
155                     {error, binary(), RestData} |
156                     {incomplete, binary(), binary()}
157                 RestData  =  latin1_chardata()  | chardata() | external_char‐
158                 data()
159
160              Behaves as characters_to_list/2, but produces a  binary  instead
161              of a Unicode list.
162
163              InEncoding  defines  how  input is to be interpreted if binaries
164              are present in Data.
165
166              OutEncoding defines in what format output is to be generated.
167
168              Options:
169
170                unicode:
171                  An alias for utf8, as this is  the  preferred  encoding  for
172                  Unicode characters in binaries.
173
174                utf16:
175                  An alias for {utf16,big}.
176
177                utf32:
178                  An alias for {utf32,big}.
179
180              The atoms big and little denote big- or little-endian encoding.
181
182              Errors  and exceptions occur as in characters_to_list/2, but the
183              second element in tuple error or incomplete is  a  binary()  and
184              not a list().
185
186       characters_to_list(Data) -> Result
187
188              Types:
189
190                 Data = latin1_chardata() | chardata() | external_chardata()
191                 Result =
192                     list() |
193                     {error, list(), RestData} |
194                     {incomplete, list(), binary()}
195                 RestData  =  latin1_chardata()  | chardata() | external_char‐
196                 data()
197
198              Same as characters_to_list(Data, unicode).
199
200       characters_to_list(Data, InEncoding) -> Result
201
202              Types:
203
204                 Data = latin1_chardata() | chardata() | external_chardata()
205                 InEncoding = encoding()
206                 Result =
207                     list() |
208                     {error, list(), RestData} |
209                     {incomplete, list(), binary()}
210                 RestData = latin1_chardata() |  chardata()  |  external_char‐
211                 data()
212
213              Converts  a  possibly  deep list of integers and binaries into a
214              list of integers representing Unicode characters.  The  binaries
215              in  the  input can have characters encoded as one of the follow‐
216              ing:
217
218                * ISO Latin-1 (0-255, one character per byte). In  this  case,
219                  parameter InEncoding is to be specified as latin1.
220
221                * One  of  the  UTF-encodings, which is specified as parameter
222                  InEncoding.
223
224              Note that integers in the  list  always  represent  code  points
225              regardless of InEncoding passed. If InEncoding latin1 is passed,
226              only code points < 256 are allowed; otherwise, all valid Unicode
227              code points are allowed.
228
229              If  InEncoding  is  latin1,  parameter  Data  corresponds to the
230              iodata() type, but for unicode, parameter Data can contain inte‐
231              gers  >  255  (Unicode characters beyond the ISO Latin-1 range),
232              which makes it invalid as iodata().
233
234              The purpose of the function is mainly to convert combinations of
235              Unicode  characters into a pure Unicode string in list represen‐
236              tation for further processing. For writing the data to an exter‐
237              nal entity, the reverse function characters_to_binary/3 comes in
238              handy.
239
240              Option unicode is an alias for utf8, as this  is  the  preferred
241              encoding  for  Unicode characters in binaries. utf16 is an alias
242              for {utf16,big} and utf32 is an alias for {utf32,big}. The atoms
243              big and little denote big- or little-endian encoding.
244
245              If  the data cannot be converted, either because of illegal Uni‐
246              code/ISO Latin-1 characters in the list, or because  of  invalid
247              UTF  encoding  in  any binaries, an error tuple is returned. The
248              error tuple contains the tag  error,  a  list  representing  the
249              characters that could be converted before the error occurred and
250              a representation of  the  characters  including  and  after  the
251              offending  integer/bytes. The last part is mostly for debugging,
252              as it still constitutes a possibly deep or mixed list, or  both,
253              not  necessarily  of  the  same  depth as the original data. The
254              error occurs when traversing the list and whatever  is  left  to
255              decode is returned "as is".
256
257              However,  if  the input Data is a pure binary, the third part of
258              the error tuple is guaranteed to be a binary as well.
259
260              Errors occur for the following reasons:
261
262                * Integers out of range.
263
264                  If InEncoding is latin1, an error occurs whenever an integer
265                  > 255 is found in the lists.
266
267                  If InEncoding is of a Unicode type, an error occurs whenever
268                  either of the following is found:
269
270                  * An integer > 16#10FFFF (the maximum Unicode character)
271
272                  * An integer in the range 16#D800 to 16#DFFF (invalid  range
273                    reserved for UTF-16 surrogate pairs)
274
275                * Incorrect UTF encoding.
276
277                  If  InEncoding  is  one  of  the UTF types, the bytes in any
278                  binaries must be valid in that encoding.
279
280                  Errors can occur for various reasons, including the  follow‐
281                  ing:
282
283                  * "Pure"  decoding  errors (like the upper bits of the bytes
284                    being wrong).
285
286                  * The bytes are decoded to a too large number.
287
288                  * The bytes are decoded to a code point in the invalid  Uni‐
289                    code range.
290
291                  * Encoding  is "overlong", meaning that a number should have
292                    been encoded in fewer bytes.
293
294                  The case of a truncated UTF is handled  specially,  see  the
295                  paragraph about incomplete binaries below.
296
297                  If  InEncoding  is latin1, binaries are always valid as long
298                  as they contain whole bytes, as each  byte  falls  into  the
299                  valid ISO Latin-1 range.
300
301              A  special  type  of error is when no actual invalid integers or
302              bytes are found, but a trailing binary()  consists  of  too  few
303              bytes  to  decode  the  last  character. This error can occur if
304              bytes are read from a file in chunks or  if  binaries  in  other
305              ways  are  split  on non-UTF character boundaries. An incomplete
306              tuple is then returned instead of the error tuple.  It  consists
307              of  the same parts as the error tuple, but the tag is incomplete
308              instead of error and the last element is always guaranteed to be
309              a  binary  consisting  of the first part of a (so far) valid UTF
310              character.
311
312              If one UTF character is split over two consecutive  binaries  in
313              the  Data,  the conversion succeeds. This means that a character
314              can be decoded from a range of binaries as  long  as  the  whole
315              range is specified as input without errors occurring.
316
317              Example:
318
319              decode_data(Data) ->
320                 case unicode:characters_to_list(Data,unicode) of
321                    {incomplete,Encoded, Rest} ->
322                          More = get_some_more_data(),
323                          Encoded ++ decode_data([Rest, More]);
324                    {error,Encoded,Rest} ->
325                          handle_error(Encoded,Rest);
326                    List ->
327                          List
328                 end.
329
330              However,  bit  strings that are not whole bytes are not allowed,
331              so a UTF character must be split along 8-bit boundaries to  ever
332              be decoded.
333
334              A badarg exception is thrown for the following cases:
335
336                * Any parameters are of the wrong type.
337
338                * The list structure is invalid (a number as tail).
339
340                * The binaries do not contain whole bytes (bit strings).
341
342       characters_to_nfc_list(CD :: chardata()) ->
343                                 [char()] | {error, [char()], chardata()}
344
345              Converts  a possibly deep list of characters and binaries into a
346              Normalized Form  of  canonical  equivalent  Composed  characters
347              according to the Unicode standard.
348
349              Any binaries in the input must be encoded with utf8 encoding.
350
351              The result is a list of characters.
352
353              3> unicode:characters_to_nfc_list([<<"abc..a">>,[778],$a,[776],$o,[776]]).
354              "abc..åäö"
355
356
357       characters_to_nfc_binary(CD :: chardata()) ->
358                                   unicode_binary() |
359                                   {error, unicode_binary(), chardata()}
360
361              Converts  a possibly deep list of characters and binaries into a
362              Normalized Form  of  canonical  equivalent  Composed  characters
363              according to the Unicode standard.
364
365              Any binaries in the input must be encoded with utf8 encoding.
366
367              The result is a utf8 encoded binary.
368
369              4> unicode:characters_to_nfc_binary([<<"abc..a">>,[778],$a,[776],$o,[776]]).
370              <<"abc..åäö"/utf8>>
371
372
373       characters_to_nfd_list(CD :: chardata()) ->
374                                 [char()] | {error, [char()], chardata()}
375
376              Converts  a possibly deep list of characters and binaries into a
377              Normalized Form of canonical  equivalent  Decomposed  characters
378              according to the Unicode standard.
379
380              Any binaries in the input must be encoded with utf8 encoding.
381
382              The result is a list of characters.
383
384              1> unicode:characters_to_nfd_list("abc..åäö").
385              [97,98,99,46,46,97,778,97,776,111,776]
386
387
388       characters_to_nfd_binary(CD :: chardata()) ->
389                                   unicode_binary() |
390                                   {error, unicode_binary(), chardata()}
391
392              Converts  a possibly deep list of characters and binaries into a
393              Normalized Form of canonical  equivalent  Decomposed  characters
394              according to the Unicode standard.
395
396              Any binaries in the input must be encoded with utf8 encoding.
397
398              The result is a utf8 encoded binary.
399
400              2> unicode:characters_to_nfd_binary("abc..åäö").
401              <<97,98,99,46,46,97,204,138,97,204,136,111,204,136>>
402
403
404       characters_to_nfkc_list(CD :: chardata()) ->
405                                  [char()] |
406                                  {error, [char()], chardata()}
407
408              Converts  a possibly deep list of characters and binaries into a
409              Normalized Form of  compatibly  equivalent  Composed  characters
410              according to the Unicode standard.
411
412              Any binaries in the input must be encoded with utf8 encoding.
413
414              The result is a list of characters.
415
416              3> unicode:characters_to_nfkc_list([<<"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]).
417              "abc..åäö32"
418
419
420       characters_to_nfkc_binary(CD :: chardata()) ->
421                                    unicode_binary() |
422                                    {error, unicode_binary(), chardata()}
423
424              Converts  a possibly deep list of characters and binaries into a
425              Normalized Form of  compatibly  equivalent  Composed  characters
426              according to the Unicode standard.
427
428              Any binaries in the input must be encoded with utf8 encoding.
429
430              The result is a utf8 encoded binary.
431
432              4> unicode:characters_to_nfkc_binary([<<"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]).
433              <<"abc..åäö32"/utf8>>
434
435
436       characters_to_nfkd_list(CD :: chardata()) ->
437                                  [char()] |
438                                  {error, [char()], chardata()}
439
440              Converts  a possibly deep list of characters and binaries into a
441              Normalized Form of compatibly equivalent  Decomposed  characters
442              according to the Unicode standard.
443
444              Any binaries in the input must be encoded with utf8 encoding.
445
446              The result is a list of characters.
447
448              1> unicode:characters_to_nfkd_list(["abc..åäö",[65299,65298]]).
449              [97,98,99,46,46,97,778,97,776,111,776,51,50]
450
451
452       characters_to_nfkd_binary(CD :: chardata()) ->
453                                    unicode_binary() |
454                                    {error, unicode_binary(), chardata()}
455
456              Converts  a possibly deep list of characters and binaries into a
457              Normalized Form of compatibly equivalent  Decomposed  characters
458              according to the Unicode standard.
459
460              Any binaries in the input must be encoded with utf8 encoding.
461
462              The result is a utf8 encoded binary.
463
464              2> unicode:characters_to_nfkd_binary(["abc..åäö",[65299,65298]]).
465              <<97,98,99,46,46,97,204,138,97,204,136,111,204,136,51,50>>
466
467
468       encoding_to_bom(InEncoding) -> Bin
469
470              Types:
471
472                 Bin = binary()
473                    A binary() such that byte_size(Bin) >= 0.
474                 InEncoding = encoding()
475
476              Creates  a  UTF  Byte Order Mark (BOM) as a binary from the sup‐
477              plied InEncoding. The BOM is, if supported at all,  expected  to
478              be placed first in UTF encoded files or messages.
479
480              The  function  returns  <<>> for latin1 encoding, as there is no
481              BOM for ISO Latin-1.
482
483              Notice that the BOM for UTF-8 is seldom used, and it  is  really
484              not  a byte order mark. There are obviously no byte order issues
485              with UTF-8, so the BOM is  only  there  to  differentiate  UTF-8
486              encoding from other UTF formats.
487
488
489
490Ericsson AB                     stdlib 3.8.2.1                      unicode(3)