1Tcl_GetEncoding(3)          Tcl Library Procedures          Tcl_GetEncoding(3)
2
3
4
5______________________________________________________________________________
6

NAME

8       Tcl_GetEncoding,        Tcl_FreeEncoding,       Tcl_GetEncodingFromObj,
9       Tcl_ExternalToUtfDString, Tcl_ExternalToUtf,  Tcl_UtfToExternalDString,
10       Tcl_UtfToExternal,  Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncod‐
11       ingName,   Tcl_SetSystemEncoding,   Tcl_GetEncodingNameFromEnvironment,
12       Tcl_GetEncodingNames,   Tcl_CreateEncoding,  Tcl_GetEncodingSearchPath,
13       Tcl_SetEncodingSearchPath,  Tcl_GetDefaultEncodingDir,  Tcl_SetDefault‐
14       EncodingDir - procedures for creating and using encodings
15

SYNOPSIS

17       #include <tcl.h>
18
19       Tcl_Encoding
20       Tcl_GetEncoding(interp, name)
21
22       void
23       Tcl_FreeEncoding(encoding)
24
25       int                                                                     │
26       Tcl_GetEncodingFromObj(interp, objPtr, encodingPtr)                     │
27
28       char *
29       Tcl_ExternalToUtfDString(encoding, src, srcLen, dstPtr)
30
31       char *
32       Tcl_UtfToExternalDString(encoding, src, srcLen, dstPtr)
33
34       int
35       Tcl_ExternalToUtf(interp, encoding, src, srcLen, flags, statePtr,
36                         dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr)
37
38       int
39       Tcl_UtfToExternal(interp, encoding, src, srcLen, flags, statePtr,
40                         dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr)
41
42       char *
43       Tcl_WinTCharToUtf(tsrc, srcLen, dstPtr)
44
45       TCHAR *
46       Tcl_WinUtfToTChar(src, srcLen, dstPtr)
47
48       const char *
49       Tcl_GetEncodingName(encoding)
50
51       int
52       Tcl_SetSystemEncoding(interp, name)
53
54       const char *                                                            │
55       Tcl_GetEncodingNameFromEnvironment(bufPtr)                              │
56
57       void
58       Tcl_GetEncodingNames(interp)
59
60       Tcl_Encoding
61       Tcl_CreateEncoding(typePtr)
62
63       Tcl_Obj *                                                               │
64       Tcl_GetEncodingSearchPath()                                             │
65
66       int                                                                     │
67       Tcl_SetEncodingSearchPath(searchPath)                                   │
68
69       const char *
70       Tcl_GetDefaultEncodingDir(void)
71
72       void
73       Tcl_SetDefaultEncodingDir(path)
74

ARGUMENTS

76       Tcl_Interp *interp (in)                           Interpreter   to  use
77                                                         for error  reporting,
78                                                         or  NULL  if no error
79                                                         reporting is desired.
80
81       const char *name (in)                             Name of  encoding  to
82                                                         load.
83
84       Tcl_Encoding encoding (in)                        The    encoding    to
85                                                         query, free,  or  use
86                                                         for  converting text.
87                                                         If encoding is  NULL,
88                                                         the   current  system
89                                                         encoding is used.
90
91       Tcl_Obj *objPtr (in)                              Name of  encoding  to │
92                                                         get token for.
93
94       Tcl_Encoding *encodingPtr (out)                   Points   to   storage │
95                                                         where encoding  token │
96                                                         is to be written.
97
98       const char *src (in)                              For               the
99                                                         Tcl_ExternalToUtf
100                                                         functions,  an  array
101                                                         of bytes in the spec‐
102                                                         ified  encoding  that
103                                                         are to  be  converted
104                                                         to  UTF-8.   For  the
105                                                         Tcl_UtfToExternal and
106                                                         Tcl_WinUtfToTChar
107                                                         functions,  an  array
108                                                         of  UTF-8  characters
109                                                         to  be  converted  to
110                                                         the  specified encod‐
111                                                         ing.
112
113       const TCHAR *tsrc (in)                            An array  of  Windows
114                                                         TCHAR  characters  to
115                                                         convert to UTF-8.
116
117       int srcLen (in)                                   Length of src or tsrc
118                                                         in   bytes.   If  the
119                                                         length  is  negative,
120                                                         the encoding-specific
121                                                         length of the  string
122                                                         is used.
123
124       Tcl_DString *dstPtr (out)                         Pointer  to an unini‐
125                                                         tialized   or    free
126                                                         Tcl_DString  in which
127                                                         the converted  result
128                                                         will be stored.
129
130       int flags (in)                                    Various flag bits OR-
131                                                         ed          together.
132                                                         TCL_ENCODING_START
133                                                         signifies  that   the
134                                                         source  buffer is the
135                                                         first  block   in   a
136                                                         (potentially   multi-
137                                                         block) input  stream,
138                                                         telling  the  conver‐
139                                                         sion routine to reset
140                                                         to  an  initial state
141                                                         and perform any  ini‐
142                                                         tialization      that
143                                                         needs to occur before
144                                                         the   first  byte  is
145                                                         converted. TCL_ENCOD‐
146                                                         ING_END     signifies
147                                                         that the source  buf‐
148                                                         fer is the last block
149                                                         in   a   (potentially
150                                                         multi-block)    input
151                                                         stream,  telling  the
152                                                         conversion routine to
153                                                         perform any finaliza‐
154                                                         tion  that  needs  to
155                                                         occur after the  last
156                                                         byte is converted and
157                                                         then to reset  to  an
158                                                         initial        state.
159                                                         TCL_ENCODING_STOPON‐
160                                                         ERROR  signifies that
161                                                         the  conversion  rou‐
162                                                         tine   should  return
163                                                         immediately      upon
164                                                         reading    a   source
165                                                         character  that  does
166                                                         not exist in the tar‐
167                                                         get encoding;  other‐
168                                                         wise  a default fall‐
169                                                         back  character  will
170                                                         automatically be sub‐
171                                                         stituted.
172
173       Tcl_EncodingState *statePtr (in/out)              Used when  converting
174                                                         a  (generally long or
175                                                         indefinite    length)
176                                                         byte   stream   in  a
177                                                         piece-by-piece  fash‐
178                                                         ion.   The conversion
179                                                         routine  stores   its
180                                                         current    state   in
181                                                         *statePtr  after  src
182                                                         (the  buffer contain‐
183                                                         ing    the    current
184                                                         piece)  has been con‐
185                                                         verted;  that   state
186                                                         information  must  be
187                                                         passed back when con‐
188                                                         verting    the   next
189                                                         piece of  the  stream
190                                                         so   the   conversion
191                                                         routine  knows   what
192                                                         state  it was in when
193                                                         it left  off  at  the
194                                                         end   of   the   last
195                                                         piece.  May be  NULL,
196                                                         in   which  case  the
197                                                         value  specified  for
198                                                         flags  is ignored and
199                                                         the source buffer  is
200                                                         assumed   to  contain
201                                                         the  complete  string
202                                                         to convert.
203
204       char *dst (out)                                   Buffer  in  which the
205                                                         converted result will
206                                                         be  stored.   No more
207                                                         than   dstLen   bytes
208                                                         will   be  stored  in
209                                                         dst.
210
211       int dstLen (in)                                   The maximum length of
212                                                         the output buffer dst
213                                                         in bytes.
214
215       int *srcReadPtr (out)                             Filled with the  num‐
216                                                         ber of bytes from src
217                                                         that  were   actually
218                                                         converted.   This may
219                                                         be  less   than   the
220                                                         original       source
221                                                         length if there was a
222                                                         problem    converting
223                                                         some  source  charac‐
224                                                         ters.  May be NULL.
225
226       int *dstWrotePtr (out)                            Filled  with the num‐
227                                                         ber  of  bytes   that
228                                                         were  actually stored
229                                                         in the output  buffer
230                                                         as  a  result  of the
231                                                         conversion.   May  be
232                                                         NULL.
233
234       int *dstCharsPtr (out)                            Filled  with the num‐
235                                                         ber   of   characters
236                                                         that   correspond  to
237                                                         the number  of  bytes
238                                                         stored  in the output
239                                                         buffer.  May be NULL.
240
241       Tcl_DString *bufPtr (out)                         Storage for the  pre‐ │
242                                                         scribed system encod‐ │
243                                                         ing name.
244
245       const Tcl_EncodingType *typePtr (in)              Structure        that
246                                                         defines a new type of
247                                                         encoding.
248
249       Tcl_Obj *searchPath (in)                          List  of   filesystem │
250                                                         directories  in which │
251                                                         to search for  encod‐ │
252                                                         ing data files.
253
254       const char *path (in)                             A  path  to the loca‐
255                                                         tion of the  encoding
256                                                         file.
257_________________________________________________________________
258

INTRODUCTION

260       These routines convert between Tcl's internal character representation,
261       UTF-8, and character representations used by various operating  systems
262       or  file systems, such as Unicode, ASCII, or Shift-JIS.  When operating
263       on strings, such as obtaining the names of  files  or  displaying
264       characters  using  international  fonts, the strings must be translated
265       into one or possibly multiple formats that the various system calls can
266       use.  For instance, on a Japanese Unix workstation, a user might obtain
267       a filename represented in the EUC-JP file encoding and  then  translate
268       the  characters  to  the jisx0208 font encoding in order to display the
269       filename in a Tk widget.  The purpose of the  encoding  package  is  to
270       help  bridge the translation gap.  UTF-8 provides an intermediate stag‐
271       ing ground for all the various encodings.  In the example  above,  text
272       would  be translated into UTF-8 from whatever file encoding the operat‐
273       ing system is using.  Then it would be translated from UTF-8 into what‐
274       ever font encoding the display routines require.
275
276       Some  basic  encodings are compiled into Tcl.  Others can be defined by
277       the user or dynamically loaded from encoding files in a  platform-inde‐
278       pendent manner.
279

DESCRIPTION

281       Tcl_GetEncoding  finds  an encoding given its name.  The name may refer
282       to a built-in Tcl encoding, a user-defined encoding registered by call‐
283       ing  Tcl_CreateEncoding,  or a dynamically-loadable encoding file.  The
284       return value is a token that represents the encoding and can be used in
285       subsequent calls to procedures such as Tcl_GetEncodingName, Tcl_FreeEn‐
286       coding, and Tcl_UtfToExternal.  If the name did not refer to any  known
287       or loadable encoding, NULL is returned and an error message is left  in
288       interp.
289
290       The encoding package maintains a database of all encodings currently in
291       use.   The first time name is seen, Tcl_GetEncoding returns an encoding
292       with a reference count of 1.  If the same  name  is  requested  further
293       times,  then the reference count for that encoding is incremented with‐
294       out the overhead of allocating a new encoding and  all  its  associated
295       data structures.
296
297       When an encoding is no longer needed, Tcl_FreeEncoding should be called
298       to release it.  When an encoding is no longer in use anywhere (i.e., it
299       has  been  freed  as many times as it has been gotten) Tcl_FreeEncoding
300       will release all storage the encoding was using and delete it from  the
301       database.
302
303       Tcl_GetEncodingFromObj treats the string representation of objPtr as an │
304       encoding name, and finds an encoding with that name, just as Tcl_GetEn‐ │
305       coding  does. When an encoding is found, it is cached within the objPtr │
306       value for future reference, the Tcl_Encoding token is  written  to  the │
307       storage pointed to by encodingPtr, and the value TCL_OK is returned. If │
308       no such encoding is found, the value  TCL_ERROR  is  returned,  and  no │
309       writing  to *encodingPtr takes place. Just as with Tcl_GetEncoding, the │
310       caller should call Tcl_FreeEncoding on  the  resulting  encoding  token │
311       when that token will no longer be used.
312
313       Tcl_ExternalToUtfDString  converts  a source buffer src from the speci‐
314       fied encoding into UTF-8.  The converted bytes are  stored  in  dstPtr,
315       which  is  then  null-terminated.   The  caller  should eventually call
316       Tcl_DStringFree to free any information stored in  dstPtr.   When  con‐
317       verting, if any of the characters in the source buffer cannot be repre‐
318       sented in the target encoding, a default  fallback  character  will  be
319       used.   The  return  value  is  a  pointer  to  the value stored in the
320       DString.
321
322       Tcl_ExternalToUtf converts a  source  buffer  src  from  the  specified
323       encoding  into UTF-8.  Up to srcLen bytes are converted from the source
324       buffer and up to dstLen converted bytes are  stored  in  dst.   In  all
325       cases,  *srcReadPtr  is  filled with the number of bytes that were suc‐
326       cessfully converted from src and *dstWrotePtr is filled with the corre‐
327       sponding  number of bytes that were stored in dst.  The return value is
328       one of the following:
329
330              TCL_OK                       All bytes of src were converted.
331
332              TCL_CONVERT_NOSPACE          The  destination  buffer  was   not
333                                           large  enough  for  all of the con‐
334                                           verted data; as many characters  as
335                                           could fit were converted though.
336
337              TCL_CONVERT_MULTIBYTE        The  last  few  bytes in the source
338                                           buffer  were  the  beginning  of  a
339                                           multibyte  sequence, but more bytes
340                                           were  needed   to   complete   this
341                                           sequence.  A subsequent call to the
342                                           conversion routine  should  pass  a
343                                           buffer  containing  the unconverted
344                                           bytes that  remained  in  src  plus
345                                           some  further bytes from the source
346                                           stream to properly convert the for‐
347                                           merly split-up multibyte sequence.
348
349              TCL_CONVERT_SYNTAX           The   source  buffer  contained  an
350                                           invalid character  sequence.   This
351                                           may  occur  if the input stream has
352                                           been damaged or if the input encod‐
353                                           ing method was misidentified.
354
355              TCL_CONVERT_UNKNOWN          The source buffer contained a char‐
356                                           acter that could not be represented
357                                           in    the   target   encoding   and
358                                           TCL_ENCODING_STOPONERROR was speci‐
359                                           fied.
360
361       Tcl_UtfToExternalDString  converts  a source buffer src from UTF-8 into
362       the specified encoding.  The converted  bytes  are  stored  in  dstPtr,
363       which  is  then terminated with the appropriate encoding-specific null.
364       The caller should eventually call Tcl_DStringFree to free any  informa‐
365       tion  stored  in  dstPtr.  When converting, if any of the characters in
366       the source buffer cannot be  represented  in  the  target  encoding,  a
367       default fallback character will be used.  The return value is a pointer
368       to the value stored in the DString.
369
370       Tcl_UtfToExternal converts a source buffer  src  from  UTF-8  into  the
371       specified  encoding.   Up to srcLen bytes are converted from the source
372       buffer and up to dstLen converted bytes are  stored  in  dst.   In  all
373       cases,  *srcReadPtr  is  filled with the number of bytes that were suc‐
374       cessfully converted from src and *dstWrotePtr is filled with the corre‐
375       sponding  number  of  bytes that were stored in dst.  The return values
376       are the same as the return values for Tcl_ExternalToUtf.
377
378       Tcl_WinUtfToTChar and Tcl_WinTCharToUtf  are  Windows-only  convenience
379       functions for converting between UTF-8 and Windows strings.  On Windows
380       95 (as with the Unix operating system), all strings  exchanged  between
381       Tcl  and  the  operating  system are “char” based.  On Windows NT, some
382       strings exchanged between Tcl and the operating system are “char”  ori‐
383       ented  while  others are in Unicode.  By convention, in Windows a TCHAR
384       is a character in the ANSI code page on Windows 95 and a Unicode  char‐
385       acter on Windows NT.
386
387       If  you planned to use the same “char” based interfaces on both Windows
388       95   and   Windows   NT,   you   could   use   Tcl_UtfToExternal    and
389       Tcl_ExternalToUtf  (or  their Tcl_DString equivalents) with an encoding
390       of NULL (the current system encoding).   On  the  other  hand,  if  you
391       planned to use the Unicode interface when running on Windows NT and the
392       “char” interfaces when running on Windows 95, you would have to perform
393       the  following  type  of  test over and over in your program (as repre‐
394       sented in pseudo-code):
395              if (running NT) {
396                  encoding <- Tcl_GetEncoding("unicode");
397                  nativeBuffer <- Tcl_UtfToExternal(encoding, utfBuffer);
398                  Tcl_FreeEncoding(encoding);
399              } else {
400                  nativeBuffer <- Tcl_UtfToExternal(NULL, utfBuffer);
401              }
402       Tcl_WinUtfToTChar and Tcl_WinTCharToUtf automatically handle this  test
403       and  use  the  proper  encoding  based on the current operating system.
404       Tcl_WinUtfToTChar  returns  a  pointer   to   a   TCHAR   string,   and
405       Tcl_WinTCharToUtf  expects  a  TCHAR  string pointer as the src string.
406       Otherwise, these functions behave  identically  to  Tcl_UtfToExternalD‐
407       String and Tcl_ExternalToUtfDString.
408
409       Tcl_GetEncodingName  is  roughly the inverse of Tcl_GetEncoding.  Given
410       an encoding, the return value is the name argument  that  was  used  to
411       create  the  encoding.   The  string returned by Tcl_GetEncodingName is
412       only guaranteed to persist until the encoding is deleted.   The  caller
413       must not modify this string.
414
415       Tcl_SetSystemEncoding  sets  the  default  encoding that should be used
416       whenever the user passes a NULL value for the encoding argument to  any
417       of  the other encoding functions.  If name is NULL, the system encoding
418       is reset to the default system encoding, binary.  If the name  did  not
419       refer  to  any known or loadable encoding, TCL_ERROR is returned and an
420       error message is left in interp.  Otherwise, this procedure  increments
421       the  reference  count of the new system encoding, decrements the refer‐
422       ence count of the old system encoding, and returns TCL_OK.
423
424       Tcl_GetEncodingNameFromEnvironment provides a means for the Tcl library │
425       to report the encoding name it believes to be the correct one to use as │
426       the system encoding, based on system calls and examination of the envi‐ │
427       ronment  suitable for the platform.  It accepts bufPtr, a pointer to an │
428       uninitialized or freed Tcl_DString and writes the encoding name to  it. │
429       The Tcl_DStringValue is returned.
430
431       Tcl_GetEncodingNames sets the interp result to a list consisting of the
432       names of all the encodings that are currently defined or can be dynami‐
433       cally  loaded, searching the encoding path specified by Tcl_SetDefault‐
434       EncodingDir.  This procedure does not ensure that the dynamically-load‐
435       able encoding files contain valid data, but merely that they exist.
436
437       Tcl_CreateEncoding  defines  a  new encoding and registers the C proce‐
438       dures that are called back to convert between the encoding  and  UTF-8.
439       Encodings  created  by Tcl_CreateEncoding are thereafter visible in the
440       database used by Tcl_GetEncoding.  Just  as  with  the  Tcl_GetEncoding
441       procedure, the return value is a token that represents the encoding and
442       can be used in subsequent calls to other encoding functions.   Tcl_Cre‐
443       ateEncoding  returns  an  encoding  with  a reference count of 1. If an
444       encoding with the specified name already exists, then its entry in  the
445       database  is  replaced  with  the  new  encoding; the token for the old
446       encoding will remain valid and continue to behave as before, but  users
447       of the new token will now call the new encoding procedures.
448
449       The  typePtr  argument to Tcl_CreateEncoding contains information about
450       the name of the encoding and the procedures that will be called to con‐
451       vert between this encoding and UTF-8.  It is defined as follows:
452
453              typedef struct Tcl_EncodingType {
454                      const char *encodingName;
455                      Tcl_EncodingConvertProc *toUtfProc;
456                      Tcl_EncodingConvertProc *fromUtfProc;
457                      Tcl_EncodingFreeProc *freeProc;
458                      ClientData clientData;
459                      int nullSize;
460              } Tcl_EncodingType;
461
462       The  encodingName  provides a string name for the encoding, by which it
463       can be referred to in other procedures  such  as  Tcl_GetEncoding.  The
464       toUtfProc refers to a callback procedure to invoke to convert text from
465       this encoding into UTF-8.  The fromUtfProc refers to a callback  proce‐
466       dure  to  invoke  to  convert  text from UTF-8 into this encoding.  The
467       freeProc refers to a callback procedure to invoke when this encoding is
468       deleted.   The  freeProc field may be NULL.  The clientData contains an
469       arbitrary one-word value passed to toUtfProc, fromUtfProc, and freeProc
470       whenever  they  are  called.   Typically,  this  is a pointer to a data
471       structure containing encoding-specific information that can be used  by
472       the callback procedures.  For instance, two very similar encodings such
473       as ascii and macRoman may use the same callback procedure, but use dif‐
474       ferent  values  of  clientData  to  control its behavior.  The nullSize
475       specifies the number of zero bytes that signify end-of-string  in  this
476       encoding.   It  must be 1 (for single-byte or multi-byte encodings like
477       ASCII or Shift-JIS) or 2  (for  double-byte  encodings  like  Unicode).
478       Constant-sized  encodings  with  3 or more bytes per character (such as
479       CNS11643) are not accepted.
480
481       The callback procedures toUtfProc and fromUtfProc should match the type
482       Tcl_EncodingConvertProc:
483
484              typedef int Tcl_EncodingConvertProc(
485                      ClientData clientData,
486                      const char *src,
487                      int srcLen,
488                      int flags,
489                      Tcl_EncodingState *statePtr,
490                      char *dst,
491                      int dstLen,
492                      int *srcReadPtr,
493                      int *dstWrotePtr,
494                      int *dstCharsPtr);
495
496       The   toUtfProc   and   fromUtfProc   procedures   are  called  by  the
497       Tcl_ExternalToUtf or Tcl_UtfToExternal family of functions  to  perform
498       the actual conversion.  The clientData parameter to these procedures is
499       the same as the clientData field specified to  Tcl_CreateEncoding  when
500       the encoding was created.  The remaining arguments to the callback pro‐
501       cedures are the same as  the  arguments,  documented  at  the  top,  to
502       Tcl_ExternalToUtf  or Tcl_UtfToExternal, with the following exceptions.
503       If the srcLen argument to one of those high-level  functions  is  nega‐
504       tive,  the value passed to the callback procedure will be the appropri‐
505       ate encoding-specific string length of src.  If any of the  srcReadPtr,
506       dstWrotePtr,  or  dstCharsPtr  arguments to one of the high-level func‐
507       tions is NULL, the corresponding value passed to the callback procedure
508       will be a non-NULL location.
509
510       The  callback  procedure  freeProc,  if non-NULL, should match the type
511       Tcl_EncodingFreeProc:
512              typedef void Tcl_EncodingFreeProc(
513                      ClientData clientData);
514
515       This freeProc function is called when the  encoding  is  deleted.   The
516       clientData  parameter  is the same as the clientData field specified to
517       Tcl_CreateEncoding when the encoding was created.
518
519       Tcl_GetEncodingSearchPath and Tcl_SetEncodingSearchPath are  called  to │
520       access and set the list of filesystem directories searched for encoding │
521       data files.                                                             │
522
523       The value returned by Tcl_GetEncodingSearchPath is the value stored  by │
524       the  last successful call to Tcl_SetEncodingSearchPath.  If no calls to │
525       Tcl_SetEncodingSearchPath have occurred, Tcl will  compute  an  initial │
526       value  based on the environment.  There is one encoding search path for │
527       the entire process, shared by all threads in the process.               │
528
529       Tcl_SetEncodingSearchPath stores searchPath and returns TCL_OK,  unless │
530       searchPath  is  not  a  valid  Tcl  list,  which causes TCL_ERROR to be │
531       returned.  The elements of searchPath  are  not  verified  as  existing │
532       readable  filesystem  directories.   When  searching  for encoding data │
533       files takes place, any non-existent or non-readable filesystem directo‐ │
534       ries on the searchPath are silently ignored.                            │
535
536       Tcl_GetDefaultEncodingDir  and  Tcl_SetDefaultEncodingDir  are obsolete │
537       interfaces best replaced with calls  to  Tcl_GetEncodingSearchPath  and │
538       Tcl_SetEncodingSearchPath.  They are called to access and set the first │
539       element of the searchPath list.   Since  Tcl  searches  searchPath  for │
540       encoding  data  files  in  list  order,  these  routines  establish the │
541       “default” directory in which to find encoding data files.
542

ENCODING FILES

544       Space would prohibit precompiling  into  Tcl  every  possible  encoding
545       algorithm, so many encodings are stored on disk as dynamically-loadable
546       encoding files.  This behavior also allows the  user  to  create  addi‐
547       tional  encoding  files  that  can  be loaded using the same mechanism.
548       These encoding files contain information about the tables and/or escape
549       sequences  used  to  map between an external encoding and Unicode.  The
550       external encoding may consist of single-byte,  multi-byte,  or  double-
551       byte characters.
552
553       Each  dynamically-loadable encoding is represented as a text file.  The
554       initial line of the file, beginning with a “#”  symbol,  is  a  comment
555       that  provides a human-readable description of the file.  The next line
556       identifies the type of encoding file.  It can be one of  the  following
557       letters:
558
559       [1] S  A  single-byte  encoding, where one character is always one byte
560              long in the encoding.  An example is  iso8859-1,  used  by  many
561              European languages.
562
563       [2] D  A  double-byte encoding, where one character is always two bytes
564              long in the encoding.  An example  is  big5,  used  for  Chinese
565              text.
566
567       [3] M  A  multi-byte encoding, where one character may be either one or
568              two bytes long.  Certain bytes are lead bytes,  indicating  that
569              another  byte must follow and that together the two bytes repre‐
570              sent one character.  Other bytes are not lead bytes  and  repre‐
571              sent  themselves.  An example is shiftjis, used by many Japanese
572              computers.
573
574       [4] E  An escape-sequence encoding, specifying that  certain  sequences
575              of bytes do not represent characters, but commands that describe
576              how following bytes should be interpreted.
577
578       The rest of the lines in the file depend on the type.
579
580       Cases [1], [2], and [3] are collectively  referred  to  as  table-based
581       encoding  files.   The  lines in a table-based encoding file are in the
582       same format as this example taken from the shiftjis encoding  (this  is
583       not the complete file):
584              # Encoding file: shiftjis, multi-byte
585              M
586              003F 0 40
587              00
588              0000000100020003000400050006000700080009000A000B000C000D000E000F
589              0010001100120013001400150016001700180019001A001B001C001D001E001F
590              0020002100220023002400250026002700280029002A002B002C002D002E002F
591              0030003100320033003400350036003700380039003A003B003C003D003E003F
592              0040004100420043004400450046004700480049004A004B004C004D004E004F
593              0050005100520053005400550056005700580059005A005B005C005D005E005F
594              0060006100620063006400650066006700680069006A006B006C006D006E006F
595              0070007100720073007400750076007700780079007A007B007C007D203E007F
596              0080000000000000000000000000000000000000000000000000000000000000
597              0000000000000000000000000000000000000000000000000000000000000000
598              0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F
599              FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F
600              FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F
601              FF90FF91FF92FF93FF94FF95FF96FF97FF98FF99FF9AFF9BFF9CFF9DFF9EFF9F
602              0000000000000000000000000000000000000000000000000000000000000000
603              0000000000000000000000000000000000000000000000000000000000000000
604              81
605              0000000000000000000000000000000000000000000000000000000000000000
606              0000000000000000000000000000000000000000000000000000000000000000
607              0000000000000000000000000000000000000000000000000000000000000000
608              0000000000000000000000000000000000000000000000000000000000000000
609              300030013002FF0CFF0E30FBFF1AFF1BFF1FFF01309B309C00B4FF4000A8FF3E
610              FFE3FF3F30FD30FE309D309E30034EDD30053006300730FC20152010FF0F005C
611              301C2016FF5C2026202520182019201C201DFF08FF0930143015FF3BFF3DFF5B
612              FF5D30083009300A300B300C300D300E300F30103011FF0B221200B100D70000
613              00F7FF1D2260FF1CFF1E22662267221E22342642264000B0203220332103FFE5
614              FF0400A200A3FF05FF03FF06FF0AFF2000A72606260525CB25CF25CE25C725C6
615              25A125A025B325B225BD25BC203B301221922190219121933013000000000000
616              000000000000000000000000000000002208220B2286228722822283222A2229
617              000000000000000000000000000000002227222800AC21D221D4220022030000
618              0000000000000000000000000000000000000000222022A52312220222072261
619              2252226A226B221A223D221D2235222B222C0000000000000000000000000000
620              212B2030266F266D266A2020202100B6000000000000000025EF000000000000
621
622       The  third  line of the file is three numbers.  The first number is the
623       fallback character (in base 16) to use when converting  from  UTF-8  to
624       this  encoding.   The  second number is a 1 if this file represents the
625       encoding for a symbol font, or 0 otherwise.  The last number  (in  base
626       10) is how many pages of data follow.
627
628       Subsequent  lines  in  the example above are pages that describe how to
629       map from the encoding into 2-byte Unicode.  The first line  in  a  page
630       identifies  the page number.  Following it are 256 double-byte numbers,
631       arranged as 16 rows of 16 numbers.  Given a character in the  encoding,
632       the  high  byte of that character is used to select which page, and the
633       low byte of that character is used as an index to  select  one  of  the
634       double-byte  numbers in that page - the value obtained being the corre‐
635       sponding Unicode character.  By examination of the example  above,  one
636       can see that the characters 0x7E and 0x8163 in shiftjis map to 203E and
637       2026 in Unicode, respectively.
638
639       Following the first page will be all the other pages, each in the  same
640       format  as  the  first: one number identifying the page followed by 256
641       double-byte Unicode characters.  If a character in the encoding maps to
642       the  Unicode character 0000, it means that the character does not actu‐
643       ally exist.  If all characters on a page would map to 0000,  that  page
644       can be omitted.
645
646       Case  [4]  is  the escape-sequence encoding file.  The lines in this
647       type of file are in the same format as  this  example  taken  from  the
648       iso2022-jp encoding:
649              # Encoding file: iso2022-jp, escape-driven
650              E
651              init           {}
652              final          {}
653              iso8859-1      \x1b(B
654              jis0201        \x1b(J
655              jis0208        \x1b$@
656              jis0208        \x1b$B
657              jis0212        \x1b$(D
658              gb2312         \x1b$A
659              ksc5601        \x1b$(C
660
661       In  the file, the first column represents an option and the second col‐
662       umn is the associated value.  init is a string to emit or expect before
663       the  first  character  is converted, while final is a string to emit or
664       expect after the last character.  All other options are names of table-
665       based encodings; the associated value is the escape-sequence that marks
666       that encoding.  Tcl syntax is used for the values; in the  above  exam‐
667       ple,  for  instance, “{}” represents the empty string and “\x1b” repre‐
668       sents character 27.
669
670       When Tcl_GetEncoding encounters an encoding  name  that  has  not  been
671       loaded,  it  attempts to load an encoding file called name.enc from the
672       encoding subdirectory of each  directory  that  Tcl  searches  for  its
673       script  library.   If  the  encoding  file exists, but is malformed, an
674       error message will be left in interp.
675

KEYWORDS

677       utf, encoding, convert
678
679
680
681Tcl                                   8.1                   Tcl_GetEncoding(3)
Impressum