1Tcl_GetEncoding(3)          Tcl Library Procedures          Tcl_GetEncoding(3)
2
3
4
5______________________________________________________________________________
6

NAME

8       Tcl_GetEncoding,       Tcl_FreeEncoding,      Tcl_ExternalToUtfDString,
9       Tcl_ExternalToUtf,     Tcl_UtfToExternalDString,     Tcl_UtfToExternal,
10       Tcl_WinTCharToUtf,  Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSys‐
11       temEncoding, Tcl_GetEncodingNames, Tcl_CreateEncoding,  Tcl_GetDefault‐
12       EncodingDir,  Tcl_SetDefaultEncodingDir  -  procedures for creating and
13       using encodings.
14

SYNOPSIS

16       #include <tcl.h>
17
18       Tcl_Encoding
19       Tcl_GetEncoding(interp, name)
20
21       void
22       Tcl_FreeEncoding(encoding)
23
24       char *
25       Tcl_ExternalToUtfDString(encoding, src, srcLen, dstPtr)
26
27       int
28       Tcl_ExternalToUtf(interp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
29            dstCharsPtr)
30
31       char *
32       Tcl_UtfToExternalDString(encoding, src, srcLen, dstPtr)
33
34       int
35       Tcl_UtfToExternal(interp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
36            dstCharsPtr)
37
38       char *
39       Tcl_WinTCharToUtf(tsrc, srcLen, dstPtr)
40
41       TCHAR *
42       Tcl_WinUtfToTChar(src, srcLen, dstPtr)
43
44       CONST char *
45       Tcl_GetEncodingName(encoding)
46
47       int
48       Tcl_SetSystemEncoding(interp, name)
49
50       void
51       Tcl_GetEncodingNames(interp)
52
53       Tcl_Encoding
54       Tcl_CreateEncoding(typePtr)
55
56       CONST char *
57       Tcl_GetDefaultEncodingDir(void)
58
59       void
60       Tcl_SetDefaultEncodingDir(path)
61
62
63

ARGUMENTS

65       Tcl_Interp          *interp        (in)      Interpreter  to  use   for
66                                                    error  reporting,  or NULL
67                                                    if no error  reporting  is
68                                                    desired.
69
70       CONST char          *name          (in)      Name of encoding to load.
71
72       Tcl_Encoding        encoding       (in)      The   encoding  to  query,
73                                                    free, or use for  convert‐
74                                                    ing  text.  If encoding is
75                                                    NULL, the  current  system
76                                                    encoding is used.
77
78       CONST char          *src           (in)      For  the Tcl_ExternalToUtf
79                                                    functions,  an  array   of
80                                                    bytes   in  the  specified
81                                                    encoding that  are  to  be
82                                                    converted  to  UTF-8.  For
83                                                    the Tcl_UtfToExternal  and
84                                                    Tcl_WinUtfToTChar    func‐
85                                                    tions, an array  of  UTF-8
86                                                    characters to be converted
87                                                    to the specified encoding.
88
89       CONST TCHAR         *tsrc          (in)      An array of Windows  TCHAR
90                                                    characters  to  convert to
91                                                    UTF-8.
92
93       int                 srcLen         (in)      Length of src or  tsrc  in
94                                                    bytes.   If  the length is
95                                                    negative,  the   encoding-
96                                                    specific   length  of  the
97                                                    string is used.
98
99       Tcl_DString         *dstPtr        (out)     Pointer to  an  uninitial‐
100                                                    ized  or  free Tcl_DString
101                                                    in  which  the   converted
102                                                    result will be stored.
103
104       int                 flags          (in)      Various  flag  bits  OR-ed
105                                                    together.       TCL_ENCOD‐
106                                                    ING_START  signifies  that
107                                                    the source buffer  is  the
108                                                    first  block  in a (poten‐
109                                                    tially multi-block)  input
110                                                    stream,  telling  the con‐
111                                                    version routine  to  reset
112                                                    to  an  initial  state and
113                                                    perform any initialization
114                                                    that needs to occur before
115                                                    the  first  byte  is  con‐
116                                                    verted.   TCL_ENCODING_END
117                                                    signifies that the  source
118                                                    buffer  is  the last block
119                                                    in a  (potentially  multi-
120                                                    block)    input    stream,
121                                                    telling   the   conversion
122                                                    routine   to  perform  any
123                                                    finalization that needs to
124                                                    occur  after the last byte
125                                                    is converted and  then  to
126                                                    reset to an initial state.
127                                                    TCL_ENCODING_STOPONERROR
128                                                    signifies that the conver‐
129                                                    sion routine should return
130                                                    immediately upon reading a
131                                                    source   character    that
132                                                    doesn't  exist in the tar‐
133                                                    get encoding; otherwise  a
134                                                    default fallback character
135                                                    will automatically be sub‐
136                                                    stituted.
137
138       Tcl_EncodingState   *statePtr      (in/out)  Used   when  converting  a
139                                                    (generally long or indefi‐
140                                                    nite  length)  byte stream
141                                                    in a piece by piece  fash‐
142                                                    ion.   The conversion rou‐
143                                                    tine  stores  its  current
144                                                    state  in  *statePtr after
145                                                    src (the buffer containing
146                                                    the   current  piece)  has
147                                                    been converted; that state
148                                                    information must be passed
149                                                    back when  converting  the
150                                                    next  piece  of the stream
151                                                    so the conversion  routine
152                                                    knows what state it was in
153                                                    when it left  off  at  the
154                                                    end  of  the  last  piece.
155                                                    May be NULL, in which case
156                                                    the  value  specified  for
157                                                    flags is ignored  and  the
158                                                    source  buffer  is assumed
159                                                    to  contain  the  complete
160                                                    string to convert.
161
162       char                *dst           (out)     Buffer  in  which the con‐
163                                                    verted  result   will   be
164                                                    stored.    No   more  than
165                                                    dstLen   bytes   will   be
166                                                    stored in dst.
167
168       int                 dstLen         (in)      The  maximum length of the
169                                                    output   buffer   dst   in
170                                                    bytes.
171
172       int                 *srcReadPtr    (out)     Filled  with the number of
173                                                    bytes from src  that  were
174                                                    actually  converted.  This
175                                                    may be less than the orig‐
176                                                    inal   source   length  if
177                                                    there was a  problem  con‐
178                                                    verting  some source char‐
179                                                    acters.  May be NULL.
180
181       int                 *dstWrotePtr   (out)     Filled with the number  of
182                                                    bytes  that  were actually
183                                                    stored in the output  buf‐
184                                                    fer  as  a  result  of the
185                                                    conversion.  May be NULL.
186
187       int                 *dstCharsPtr   (out)     Filled with the number  of
188                                                    characters that correspond
189                                                    to  the  number  of  bytes
190                                                    stored  in the output buf‐
191                                                    fer.  May be NULL.
192
193       Tcl_EncodingType    *typePtr       (in)      Structure that  defines  a
194                                                    new type of encoding.
195
196       CONST char          *path          (in)      A  path to the location of
197                                                    the encoding file.
198_________________________________________________________________
199

INTRODUCTION

201       These routines convert between Tcl's internal character representation,
202       UTF-8,  and character representations used by various operating systems
203       or file systems, such as Unicode, ASCII, or Shift-JIS.  When  operating
204       on  strings,  such  as  obtaining the names of files or displaying
205       characters using international fonts, the strings  must  be  translated
206       into one or possibly multiple formats that the various system calls can
207       use.  For instance, on a Japanese Unix workstation, a user might obtain
208       a  filename  represented in the EUC-JP file encoding and then translate
209       the characters to the jisx0208 font encoding in order  to  display  the
210       filename  in  a  Tk  widget.  The purpose of the encoding package is to
211       help bridge the translation gap.  UTF-8 provides an intermediate  stag‐
212       ing  ground  for all the various encodings.  In the example above, text
213       would be translated into UTF-8 from whatever file encoding the  operat‐
214       ing system is using.  Then it would be translated from UTF-8 into what‐
215       ever font encoding the display routines require.
216
217       Some basic encodings are compiled into Tcl.  Others can be  defined  by
218       the  user or dynamically loaded from encoding files in a platform-inde‐
219       pendent manner.
220

DESCRIPTION

222       Tcl_GetEncoding finds an encoding given its name.  The name  may  refer
223       to  a builtin Tcl encoding, a user-defined encoding registered by call‐
224       ing Tcl_CreateEncoding, or a dynamically-loadable encoding  file.   The
225       return value is a token that represents the encoding and can be used in
226       subsequent calls to procedures such as Tcl_GetEncodingName, Tcl_FreeEn‐
227       coding,  and Tcl_UtfToExternal.  If the name did not refer to any known
228       or loadable encoding, NULL is returned and an error message  is  left
229       in interp.
230
231       The encoding package maintains a database of all encodings currently in
232       use.  The first time name is seen, Tcl_GetEncoding returns an  encoding
233       with  a  reference  count  of 1.  If the same name is requested further
234       times, then the reference count for that encoding is incremented  with‐
235       out  the  overhead  of allocating a new encoding and all its associated
236       data structures.
237
238       When an encoding is no longer needed, Tcl_FreeEncoding should be called
239       to release it.  When an encoding is no longer in use anywhere (i.e., it
240       has been freed as many times as it has  been  gotten)  Tcl_FreeEncoding
241       will  release all storage the encoding was using and delete it from the
242       database.
243
244       Tcl_ExternalToUtfDString converts a source buffer src from  the  speci‐
245       fied  encoding  into  UTF-8.  The converted bytes are stored in dstPtr,
246       which is then  null-terminated.   The  caller  should  eventually  call
247       Tcl_DStringFree  to  free  any information stored in dstPtr.  When con‐
248       verting, if any of the characters in the source buffer cannot be repre‐
249       sented  in  the  target  encoding, a default fallback character will be
250       used.  The return value is  a  pointer  to  the  value  stored  in  the
251       DString.
252
253       Tcl_ExternalToUtf  converts  a  source  buffer  src  from the specified
254       encoding into UTF-8.  Up to srcLen bytes are converted from the  source
255       buffer  and  up  to  dstLen  converted bytes are stored in dst.  In all
256       cases, *srcReadPtr is filled with the number of bytes  that  were  suc‐
257       cessfully converted from src and *dstWrotePtr is filled with the corre‐
258       sponding number of bytes that were stored in dst.  The return value  is
259       one of the following:
260
261              TCL_OK                       All bytes of src were converted.
262
263              TCL_CONVERT_NOSPACE          The   destination  buffer  was  not
264                                           large enough for all  of  the  con‐
265                                           verted  data; as many characters as
266                                           could fit were converted though.
267
268              TCL_CONVERT_MULTIBYTE        The last few  bytes in  the  source
269                                           buffer  were  the  beginning  of  a
270                                           multibyte sequence, but more  bytes
271                                           were   needed   to   complete  this
272                                           sequence.  A subsequent call to the
273                                           conversion  routine  should  pass a
274                                           buffer containing  the  unconverted
275                                           bytes  that  remained  in  src plus
276                                           some further bytes from the  source
277                                           stream to properly convert the for‐
278                                           merly split-up multibyte sequence.
279
280              TCL_CONVERT_SYNTAX           The  source  buffer  contained   an
281                                           invalid  character  sequence.  This
282                                           may occur if the input  stream  has
283                                           been damaged or if the input encod‐
284                                           ing method was misidentified.
285
286              TCL_CONVERT_UNKNOWN          The source buffer contained a char‐
287                                           acter that could not be represented
288                                           in   the   target   encoding    and
289                                           TCL_ENCODING_STOPONERROR was speci‐
290                                           fied.
291
292       Tcl_UtfToExternalDString converts a source buffer src from  UTF-8  into
293       the  specified  encoding.   The  converted  bytes are stored in dstPtr,
294       which is then terminated with the appropriate  encoding-specific  null.
295       The  caller should eventually call Tcl_DStringFree to free any informa‐
296       tion stored in dstPtr.  When converting, if any of  the  characters  in
297       the  source  buffer  cannot  be  represented  in the target encoding, a
298       default fallback character will be used.  The return value is a pointer
299       to the value stored in the DString.
300
301       Tcl_UtfToExternal  converts  a  source  buffer  src from UTF-8 into the
302       specified encoding.  Up to srcLen bytes are converted from  the  source
303       buffer  and  up  to  dstLen  converted bytes are stored in dst.  In all
304       cases, *srcReadPtr is filled with the number of bytes  that  were  suc‐
305       cessfully converted from src and *dstWrotePtr is filled with the corre‐
306       sponding number of bytes that were stored in dst.   The  return  values
307       are the same as the return values for Tcl_ExternalToUtf.
308
309       Tcl_WinUtfToTChar  and  Tcl_WinTCharToUtf  are Windows-only convenience
310       functions for converting between UTF-8 and Windows strings.  On Windows
311       95  (as  with  the  Macintosh  and Unix operating systems), all strings
312       exchanged between Tcl and the operating system are  "char"  based.   On
313       Windows NT, some strings exchanged between Tcl and the operating system
314       are "char" oriented while others are in  Unicode.   By  convention,  in
315       Windows  a TCHAR is a character in the ANSI code page on Windows 95 and
316       a Unicode character on Windows NT.
317
318       If you planned to use the same "char" based interfaces on both  Windows
319       95    and   Windows   NT,   you   could   use   Tcl_UtfToExternal   and
320       Tcl_ExternalToUtf (or their Tcl_DString equivalents) with  an  encoding
321       of  NULL  (the  current  system  encoding).   On the other hand, if you
322       planned to use the Unicode interface when running on Windows NT and the
323       "char" interfaces when running on Windows 95, you would have to perform
324       the following type of test over and over in  your  program  (as  repre‐
325       sented in pseudo-code):
326              if (running NT) {
327                  encoding <- Tcl_GetEncoding("unicode");
328                  nativeBuffer <- Tcl_UtfToExternal(encoding, utfBuffer);
329                  Tcl_FreeEncoding(encoding);
330              } else {
331                  nativeBuffer <- Tcl_UtfToExternal(NULL, utfBuffer);
                 }
332       Tcl_WinUtfToTChar  and Tcl_WinTCharToUtf automatically handle this test
333       and use the proper encoding based  on  the  current  operating  system.
334       Tcl_WinUtfToTChar   returns   a   pointer   to   a  TCHAR  string,  and
335       Tcl_WinTCharToUtf expects a TCHAR string pointer  as  the  src  string.
336       Otherwise,  these  functions  behave identically to Tcl_UtfToExternalD‐
337       String and Tcl_ExternalToUtfDString.
338
339       Tcl_GetEncodingName is roughly the inverse of  Tcl_GetEncoding.   Given
340       an  encoding,  the  return  value is the name argument that was used to
341       create the encoding.  The string  returned  by  Tcl_GetEncodingName  is
342       only  guaranteed  to persist until the encoding is deleted.  The caller
343       must not modify this string.
344
345       Tcl_SetSystemEncoding sets the default encoding  that  should  be  used
346       whenever  the user passes a NULL value for the encoding argument to any
347       of the other encoding functions.  If name is NULL, the system  encoding
348       is  reset  to the default system encoding, binary.  If the name did not
349       refer to any known or loadable encoding, TCL_ERROR is returned  and  an
350       error  message is left in interp.  Otherwise, this procedure increments
351       the reference count of the new system encoding, decrements  the  refer‐
352       ence count of the old system encoding, and returns TCL_OK.
353
354       Tcl_GetEncodingNames sets the interp result to a list consisting of the
355       names of all the encodings that are currently defined or can be dynami‐
356       cally  loaded, searching the encoding path specified by Tcl_SetDefault‐
357       EncodingDir.  This procedure does not ensure that the dynamically-load‐
358       able encoding files contain valid data, but merely that they exist.
359
360       Tcl_CreateEncoding  defines  a  new encoding and registers the C proce‐
361       dures that are called back to convert between the encoding  and  UTF-8.
362       Encodings  created  by Tcl_CreateEncoding are thereafter visible in the
363       database used by Tcl_GetEncoding.  Just  as  with  the  Tcl_GetEncoding
364       procedure, the return value is a token that represents the encoding and
365       can be used in subsequent calls to other encoding functions.   Tcl_Cre‐
366       ateEncoding  returns  an  encoding  with  a reference count of 1. If an
367       encoding with the specified name already exists, then its entry in  the
368       database  is  replaced  with  the  new  encoding; the token for the old
369       encoding will remain valid and continue to behave as before, but  users
370       of the new token will now call the new encoding procedures.
371
372       The  typePtr  argument to Tcl_CreateEncoding contains information about
373       the name of the encoding and the procedures that will be called to con‐
374       vert between this encoding and UTF-8.  It is defined as follows:
375
376              typedef struct Tcl_EncodingType {
377                CONST char *encodingName;
378                Tcl_EncodingConvertProc *toUtfProc;
379                Tcl_EncodingConvertProc *fromUtfProc;
380                Tcl_EncodingFreeProc *freeProc;
381                ClientData clientData;
382                int nullSize;
383              } Tcl_EncodingType;
384
385       The  encodingName  provides a string name for the encoding, by which it
386       can be referred to in other  procedures  such as Tcl_GetEncoding.   The
387       toUtfProc refers to a callback procedure to invoke to convert text from
388       this encoding into UTF-8.  The fromUtfProc refers to a callback  proce‐
389       dure  to  invoke  to  convert  text from UTF-8 into this encoding.  The
390       freeProc refers to a callback procedure to invoke when this encoding is
391       deleted.   The  freeProc field may be NULL.  The clientData contains an
392       arbitrary one-word value passed to toUtfProc, fromUtfProc, and freeProc
393       whenever  they  are  called.   Typically,  this  is a pointer to a data
394       structure containing encoding-specific information that can be used  by
395       the callback procedures.  For instance, two very similar encodings such
396       as ascii and macRoman may use the same callback procedure, but use dif‐
397       ferent  values  of  clientData  to  control its behavior.  The nullSize
398       specifies the number of zero bytes that signify end-of-string  in  this
399       encoding.   It  must be 1 (for single-byte or multi-byte encodings like
400       ASCII or Shift-JIS) or 2  (for  double-byte  encodings  like  Unicode).
401       Constant-sized  encodings  with  3 or more bytes per character (such as
402       CNS11643) are not accepted.
403
404       The callback procedures toUtfProc and fromUtfProc should match the type
405       Tcl_EncodingConvertProc:
406
407              typedef int Tcl_EncodingConvertProc(
408                ClientData clientData,
409                CONST char *src,
410                int srcLen,
411                int flags,
412                Tcl_EncodingState *statePtr,
413                char *dst,
414                int dstLen,
415                int *srcReadPtr,
416                int *dstWrotePtr,
417                int *dstCharsPtr);
418
419       The   toUtfProc   and   fromUtfProc   procedures   are  called  by  the
420       Tcl_ExternalToUtf or Tcl_UtfToExternal family of functions  to  perform
421       the actual conversion.  The clientData parameter to these procedures is
422       the same as the clientData field specified to  Tcl_CreateEncoding  when
423       the encoding was created.  The remaining arguments to the callback pro‐
424       cedures are the same as  the  arguments,  documented  at  the  top,  to
425       Tcl_ExternalToUtf  or Tcl_UtfToExternal, with the following exceptions.
426       If the srcLen argument to one of those high-level  functions  is  nega‐
427       tive,  the value passed to the callback procedure will be the appropri‐
428       ate encoding-specific string length of src.  If any of the  srcReadPtr,
429       dstWrotePtr,  or  dstCharsPtr  arguments to one of the high-level func‐
430       tions is NULL, the corresponding value passed to the callback procedure
431       will be a non-NULL location.
432
433       The  callback  procedure  freeProc,  if non-NULL, should match the type
434       Tcl_EncodingFreeProc:
435              typedef void Tcl_EncodingFreeProc(
436                ClientData clientData);
437
438       This freeProc function is called when the  encoding  is  deleted.   The
439       clientData  parameter  is the same as the clientData field specified to
440       Tcl_CreateEncoding when the encoding was created.
441
442       Tcl_GetDefaultEncodingDir and Tcl_SetDefaultEncodingDir access and  set
443       the directory to use when locating the default encoding files.  If this
444       value is not NULL, the TclpInitLibraryPath  routine  adds the path  to
445       the  head  of the search path, and uses this path as the first place to
446       look into when trying to locate the encoding file.
447
448

ENCODING FILES

450       Space would prohibit precompiling  into  Tcl  every  possible  encoding
451       algorithm, so many encodings are stored on disk as dynamically-loadable
452       encoding files.  This behavior also allows the  user  to  create  addi‐
453       tional  encoding  files  that  can  be loaded using the same mechanism.
454       These encoding files contain information about the tables and/or escape
455       sequences  used  to  map between an external encoding and Unicode.  The
456       external encoding may consist of single-byte,  multi-byte,  or  double-
457       byte characters.
458
459       Each  dynamically-loadable encoding is represented as a text file.  The
460       initial line of the file, beginning with a ``#'' symbol, is  a  comment
461       that  provides a human-readable description of the file.  The next line
462       identifies the type of encoding file.  It can be one of  the  following
463       letters:
464
465       [1]   S
466              A  single-byte  encoding, where one character is always one byte
467              long in the encoding.  An example is  iso8859-1,  used  by  many
468              European languages.
469
470       [2]   D
471              A  double-byte encoding, where one character is always two bytes
472              long in the encoding.  An example  is  big5,  used  for  Chinese
473              text.
474
475       [3]   M
476              A  multi-byte encoding, where one character may be either one or
477              two bytes long.  Certain bytes are  lead  bytes, indicating that
478              another  byte must follow and that together the two bytes repre‐
479              sent one character.  Other bytes are not lead bytes  and  repre‐
480              sent  themselves.  An example is shiftjis, used by many Japanese
481              computers.
482
483       [4]   E
484              An escape-sequence encoding, specifying that  certain  sequences
485              of bytes do not represent characters, but commands that describe
486              how following bytes should be interpreted.
487
488       The rest of the lines in the file depend on the type.
489
490       Cases [1], [2], and [3] are collectively  referred  to  as  table-based
491       encoding  files.   The  lines in a table-based encoding file are in the
492       same format as this example taken from the shiftjis encoding  (this  is
493       not the complete file):
494              # Encoding file: shiftjis, multi-byte
495              M
496              003F 0 40
497              00
498              0000000100020003000400050006000700080009000A000B000C000D000E000F
499              0010001100120013001400150016001700180019001A001B001C001D001E001F
500              0020002100220023002400250026002700280029002A002B002C002D002E002F
501              0030003100320033003400350036003700380039003A003B003C003D003E003F
502              0040004100420043004400450046004700480049004A004B004C004D004E004F
503              0050005100520053005400550056005700580059005A005B005C005D005E005F
504              0060006100620063006400650066006700680069006A006B006C006D006E006F
505              0070007100720073007400750076007700780079007A007B007C007D203E007F
506              0080000000000000000000000000000000000000000000000000000000000000
507              0000000000000000000000000000000000000000000000000000000000000000
508              0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F
509              FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F
510              FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F
511              FF90FF91FF92FF93FF94FF95FF96FF97FF98FF99FF9AFF9BFF9CFF9DFF9EFF9F
512              0000000000000000000000000000000000000000000000000000000000000000
513              0000000000000000000000000000000000000000000000000000000000000000
514              81
515              0000000000000000000000000000000000000000000000000000000000000000
516              0000000000000000000000000000000000000000000000000000000000000000
517              0000000000000000000000000000000000000000000000000000000000000000
518              0000000000000000000000000000000000000000000000000000000000000000
519              300030013002FF0CFF0E30FBFF1AFF1BFF1FFF01309B309C00B4FF4000A8FF3E
520              FFE3FF3F30FD30FE309D309E30034EDD30053006300730FC20152010FF0F005C
521              301C2016FF5C2026202520182019201C201DFF08FF0930143015FF3BFF3DFF5B
522              FF5D30083009300A300B300C300D300E300F30103011FF0B221200B100D70000
523              00F7FF1D2260FF1CFF1E22662267221E22342642264000B0203220332103FFE5
524              FF0400A200A3FF05FF03FF06FF0AFF2000A72606260525CB25CF25CE25C725C6
525              25A125A025B325B225BD25BC203B301221922190219121933013000000000000
526              000000000000000000000000000000002208220B2286228722822283222A2229
527              000000000000000000000000000000002227222800AC21D221D4220022030000
528              0000000000000000000000000000000000000000222022A52312220222072261
529              2252226A226B221A223D221D2235222B222C0000000000000000000000000000
530              212B2030266F266D266A2020202100B6000000000000000025EF000000000000
531
532       The  third  line of the file is three numbers.  The first number is the
533       fallback character (in base 16) to use when converting  from  UTF-8  to
534       this  encoding.   The  second number is a 1 if this file represents the
535       encoding for a symbol font, or 0 otherwise.  The last number  (in  base
536       10) is how many pages of data follow.
537
538       Subsequent  lines  in  the example above are pages that describe how to
539       map from the encoding into 2-byte Unicode.  The first line  in  a  page
540       identifies  the page number.  Following it are 256 double-byte numbers,
541       arranged as 16 rows of 16 numbers.  Given a character in the  encoding,
542       the  high  byte of that character is used to select which page, and the
543       low byte of that character is used as an index to  select  one  of  the
544       double-byte  numbers in that page - the value obtained being the corre‐
545       sponding Unicode character.  By examination of the example  above,  one
546       can see that the characters 0x7E and 0x8163 in shiftjis map to 203E and
547       2026 in Unicode, respectively.
548
549       Following the first page will be all the other pages, each in the  same
550       format  as  the  first: one number identifying the page followed by 256
551       double-byte Unicode characters.  If a character in the encoding maps to
552       the  Unicode  character 0000, it means that the character doesn't actu‐
553       ally exist.  If all characters on a page would map to 0000,  that  page
554       can be omitted.
555
556       Case  [4]  is  the  escape-sequence  encoding file.  The lines in this
557       type of file are in the same format as  this  example  taken  from  the
558       iso2022-jp encoding:
559              # Encoding file: iso2022-jp, escape-driven
560              E
561              init           {}
562              final          {}
563              iso8859-1      \x1b(B
564              jis0201        \x1b(J
565              jis0208        \x1b$@
566              jis0208        \x1b$B
567              jis0212        \x1b$(D
568              gb2312         \x1b$A
569              ksc5601        \x1b$(C
570
571       In  the file, the first column represents an option and the second col‐
572       umn is the associated value.  init is a string to emit or expect before
573       the  first  character  is converted, while final is a string to emit or
574       expect after the last character.  All other options are names of table-
575       based encodings; the associated value is the escape-sequence that marks
576       that encoding.  Tcl syntax is used for the values; in the  above  exam‐
577       ple, for instance, ``{}'' represents the empty string and ``\x1b'' rep‐
578       resents character 27.
579
580       When Tcl_GetEncoding encounters an encoding  name  that  has  not  been
581       loaded,  it  attempts to load an encoding file called name.enc from the
582       encoding subdirectory of each directory specified in the  library  path
583       $tcl_libPath.   If the encoding file exists, but is malformed, an error
584       message will be left in interp.
585

KEYWORDS

587       utf, encoding, convert
588
589
590
591
592
593
594Tcl                                   8.1                   Tcl_GetEncoding(3)
Impressum