1Utf(3)                      Tcl Library Procedures                      Utf(3)
2
3
4
5______________________________________________________________________________
6

NAME

8       Tcl_UniChar, Tcl_UniCharCaseMatch, Tcl_UniCharNcasecmp, Tcl_UniCharToUtf,
9       Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString,
10       Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfNcmp, Tcl_UtfNcasecmp,
11       Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast,
12       Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex,
13       Tcl_UtfBackslash - routines for manipulating UTF-8 strings
14

SYNOPSIS

16       #include <tcl.h>
17
18       typedef ... Tcl_UniChar;
19
20       int
21       Tcl_UniCharToUtf(ch, buf)
22
23       int
24       Tcl_UtfToUniChar(src, chPtr)
25
26       char *
27       Tcl_UniCharToUtfDString(uniStr, uniLength, dsPtr)
28
29       Tcl_UniChar *
30       Tcl_UtfToUniCharDString(src, length, dsPtr)
31
32       int
33       Tcl_UniCharLen(uniStr)
34
35       int
36       Tcl_UniCharNcmp(ucs, uct, numChars)
37
38       int
39       Tcl_UniCharNcasecmp(ucs, uct, numChars)
40
41       int
42       Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
43
44       int
45       Tcl_UtfNcmp(cs, ct, numChars)
46
47       int
48       Tcl_UtfNcasecmp(cs, ct, numChars)
49
50       int
51       Tcl_UtfCharComplete(src, length)
52
53       int
54       Tcl_NumUtfChars(src, length)
55
56       const char *
57       Tcl_UtfFindFirst(src, ch)
58
59       const char *
60       Tcl_UtfFindLast(src, ch)
61
62       const char *
63       Tcl_UtfNext(src)
64
65       const char *
66       Tcl_UtfPrev(src, start)
67
68       Tcl_UniChar
69       Tcl_UniCharAtIndex(src, index)
70
71       const char *
72       Tcl_UtfAtIndex(src, index)
73
74       int
75       Tcl_UtfBackslash(src, readPtr, dst)
76

ARGUMENTS

78       char *buf (out)                             Buffer in which  the  UTF-8
79                                                   representation    of    the
80                                                   Tcl_UniChar is stored.   At
81                                                   most  TCL_UTF_MAX bytes are
82                                                   stored in the buffer.
83
84       int ch (in)                                 The Tcl_UniChar to be  con‐
85                                                   verted or examined.
86
87       Tcl_UniChar *chPtr (out)                    Filled with the Tcl_UniChar
88                                                   represented by the head  of
89                                                   the UTF-8 string.
90
91       const char *src (in)                        Pointer to a UTF-8 string.
92
93       const char *cs (in)                         Pointer to a UTF-8 string.
94
95       const char *ct (in)                         Pointer to a UTF-8 string.
96
97       const Tcl_UniChar *uniStr (in)              A  null-terminated  Unicode
98                                                   string.
99
100       const Tcl_UniChar *ucs (in)                 A  null-terminated  Unicode
101                                                   string.
102
103       const Tcl_UniChar *uct (in)                 A  null-terminated  Unicode
104                                                   string.
105
106       const Tcl_UniChar *uniPattern (in)          A  null-terminated  Unicode
107                                                   string.
108
109       int length (in)                             The  length  of  the  UTF-8
110                                                   string in bytes (not  UTF-8
111                                                   characters).   If negative,
112                                                   all bytes up to  the  first
113                                                   null byte are used.
114
115       int uniLength (in)                          The  length  of the Unicode
116                                                   string in characters.  Must
117                                                   be greater than or equal to
118                                                   0.
119
120       Tcl_DString *dsPtr (in/out)                 A pointer to  a  previously
121                                                   initialized Tcl_DString.
122
123       unsigned long numChars (in)                 The number of characters to
124                                                   compare.
125
126       const char *start (in)                      Pointer to the beginning of
127                                                   a UTF-8 string.
128
129       int index (in)                              The  index  of  a character
130                                                   (not  byte)  in  the  UTF-8
131                                                   string.
132
133       int *readPtr (out)                          If  non-NULL,  filled  with
134                                                   the number of bytes in  the
135                                                   backslash sequence, includ‐
136                                                   ing the  backslash  charac‐
137                                                   ter.
138
139       char *dst (out)                             Buffer  in  which the bytes
140                                                   represented  by  the  back‐
141                                                   slash  sequence are stored.
142                                                   At most  TCL_UTF_MAX  bytes
143                                                   are stored in the buffer.
144
145       int nocase (in)                             Specifies whether the match
146                                                   should be done  case-sensi‐
147                                                   tive  (0)  or case-insensi‐
148                                                   tive (1).
149_________________________________________________________________
150
151

DESCRIPTION

153       These routines convert  between  UTF-8  strings  and  Tcl_UniChars.   A
154       Tcl_UniChar  is  a Unicode character represented as an unsigned, fixed-
155       size quantity.  A UTF-8 character is a Unicode character represented as
156       a  varying-length  sequence  of  up  to TCL_UTF_MAX bytes.  A multibyte
157       UTF-8 sequence consists of a lead byte followed by some number of trail
158       bytes.
159
160       TCL_UTF_MAX  is  the maximum number of bytes that it takes to represent
161       one Unicode character in the UTF-8 representation.
162
163       Tcl_UniCharToUtf stores the Tcl_UniChar ch as a UTF-8 string  starting
164       at buf.  The return value is the number of bytes stored in buf.
165
166       Tcl_UtfToUniChar  reads  one UTF-8 character starting at src and stores
167       it as a Tcl_UniChar in *chPtr.  The return value is the number of bytes
168       read  from  src.  The caller must ensure that the source buffer is long
169       enough such that this routine does not run off the end and  dereference
170       non-existent  or  random  memory;  if  the source buffer is known to be
171       null-terminated, this will not happen.  If the input is not  in  proper
172       UTF-8  format,  Tcl_UtfToUniChar  will  store  the first byte of src in
173       *chPtr as a Tcl_UniChar between 0x0000 and 0x00ff and return 1.
174
175       Tcl_UniCharToUtfDString converts the given  Unicode  string  to  UTF-8,
176       storing  the  result in a previously initialized Tcl_DString.  You must
177       specify uniLength, the length of the given Unicode string.  The  return
178       value  is  a pointer to the UTF-8 representation of the Unicode string.
179       Storage for the return value is appended to the end of the Tcl_DString.
180
181       Tcl_UtfToUniCharDString converts the given  UTF-8  string  to  Unicode,
182       storing  the  result in the previously initialized Tcl_DString.  In the
183       argument length, you may either specify the length of the  given  UTF-8
184       string  in  bytes  or  “-1”, in which case Tcl_UtfToUniCharDString uses
185       strlen to calculate the length.  The return value is a pointer  to  the
186       Unicode  representation  of  the  UTF-8 string.  Storage for the return
187       value is appended to the end of the Tcl_DString.  The Unicode string is
188       terminated with a Unicode null character.
189
190       Tcl_UniCharLen  corresponds  to  strlen  for  Unicode  characters.   It
191       accepts a null-terminated Unicode string and returns the number of Uni‐
192       code characters (not bytes) in that string.
193
194       Tcl_UniCharNcmp and Tcl_UniCharNcasecmp correspond to strncmp and strn‐
195       casecmp, respectively, for Unicode characters.  They accept  two  null-
196       terminated  Unicode  strings  and  the number of characters to compare.
197       Both strings are assumed to  be  at  least  numChars  characters  long.
198       Tcl_UniCharNcmp    compares   the  two  strings  character-by-character
199       according to the Unicode character ordering.   It  returns  an  integer
200       greater  than,  equal to, or less than 0 if the first string is greater
201       than,  equal  to,  or  less  than  the  second   string   respectively.
202       Tcl_UniCharNcasecmp is the Unicode case insensitive version.
203
204       Tcl_UniCharCaseMatch  is the Unicode equivalent to Tcl_StringCaseMatch.
205       It accepts a null-terminated Unicode string, a Unicode pattern,  and  a
206       flag, nocase, that is non-zero for a case-insensitive match and 0 for a
207       case-sensitive one, and returns whether the string matches the pattern.
208
209       Tcl_UtfNcmp corresponds to strncmp for UTF-8 strings.  It  accepts  two
210       null-terminated  UTF-8 strings and the number of characters to compare.
211       (Both strings are assumed to be at  least  numChars  characters  long.)
212       Tcl_UtfNcmp  compares  the two strings character-by-character according
213       to the Unicode character ordering.  It returns an integer greater than,
214       equal to, or less than 0 if the first string is greater than, equal to,
215       or less than the second string respectively.
216
217       Tcl_UtfNcasecmp corresponds to strncasecmp for UTF-8  strings.   It  is
218       similar  to  Tcl_UtfNcmp  except comparisons ignore differences in case
219       when comparing upper, lower or title case characters.
220
221       Tcl_UtfCharComplete returns 1 if the source UTF-8 string src of  length
222       bytes is long enough to be decoded by Tcl_UtfToUniChar, or 0 otherwise.
223       This function does not guarantee that  the  UTF-8  string  is  properly
224       formed.   This  routine  is  used by procedures that are operating on a
225       byte at a time and need to know if a full Tcl_UniChar has been seen.
226
227       Tcl_NumUtfChars corresponds to strlen for UTF-8  strings.   It  returns
228       the  number  of  Tcl_UniChars  that are represented by the UTF-8 string
229       src.  The length of the source string is length bytes.  If  the  length
230       is negative, all bytes up to the first null byte are used.
231
232       Tcl_UtfFindFirst corresponds to strchr for UTF-8 strings.  It returns a
233       pointer to the first occurrence of the Tcl_UniChar ch in the  null-ter‐
234       minated  UTF-8  string  src.  The null terminator is considered part of
235       the UTF-8 string.
236
237       Tcl_UtfFindLast corresponds to strrchr for UTF-8 strings.  It returns a
238       pointer to the last occurrence of the Tcl_UniChar ch in the null-termi‐
239       nated UTF-8 string src.  The null terminator is considered part of  the
240       UTF-8 string.
241
242       Given  src,  a  pointer to some location in a UTF-8 string, Tcl_UtfNext
243       returns a pointer to the next UTF-8 character in the string.  The call‐
244       er  must not ask for the next character after the last character in the
245       string if the string is not terminated by a null character.
246
247       Given src, a pointer to some location in a UTF-8 string (or to  a  null
248       byte  immediately  following  such  a  string),  Tcl_UtfPrev  returns a
249       pointer to the closest preceding byte that starts  a  UTF-8  character.
250       This function will not back up to a position before start, the start of
251       the UTF-8 string.  If src was already at start, the return  value  will
252       be start.
253
254       Tcl_UniCharAtIndex  corresponds  to a C string array dereference or the
255       Pascal Ord() function.  It returns the Tcl_UniChar represented  at  the
256       specified  character  (not  byte)  index  in the UTF-8 string src.  The
257       source string must contain at  least  index  characters.   Behavior  is
258       undefined if a negative index is given.
259
260       Tcl_UtfAtIndex  returns a pointer to the specified character (not byte)
261       index in the UTF-8 string src.  The source string must contain at least
262       index  characters.   This  is  equivalent  to calling Tcl_UtfNext index
263       times.  If a negative index is given, the return pointer points to  the
264       first character in the source string.
265
266       Tcl_UtfBackslash is a utility procedure used by several of the Tcl com‐
267       mands.  It parses a backslash sequence and stores the  properly  formed
268       UTF-8  character  represented  by  the backslash sequence in the output
269       buffer dst.  At most  TCL_UTF_MAX  bytes  are  stored  in  the  buffer.
270       If readPtr is non-NULL, Tcl_UtfBackslash sets *readPtr to the number of
271       bytes in the backslash sequence, including  the  backslash  character.
272       The return value is the number of bytes stored in the output buffer.
273
274       See  the  Tcl  manual  entry  for  information  on  the valid backslash
275       sequences.  All of the sequences described in the Tcl manual entry  are
276       supported by Tcl_UtfBackslash.
277
278

KEYWORDS

280       utf, unicode, backslash
281
282
283
284Tcl                                   8.1                               Utf(3)
Impressum