1Utf(3)                      Tcl Library Procedures                      Utf(3)
2
3
4
5______________________________________________________________________________
6

NAME

8       Tcl_UniChar,   Tcl_UniCharToUtf,  Tcl_UtfToUniChar,  Tcl_UniCharToUtfD‐
9       String,   Tcl_UtfToUniCharDString,   Tcl_UniCharLen,   Tcl_UniCharNcmp,
10       Tcl_UniCharNcasecmp,   Tcl_UniCharCaseMatch,   Tcl_UtfNcmp,   Tcl_UtfN‐
11       casecmp,   Tcl_UtfCharComplete,   Tcl_NumUtfChars,    Tcl_UtfFindFirst,
12       Tcl_UtfFindLast,  Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_Ut‐
13       fAtIndex, Tcl_UtfBackslash - routines for manipulating UTF-8 strings
14

SYNOPSIS

16       #include <tcl.h>
17
18       typedef ... Tcl_UniChar;
19
20       int
21       Tcl_UniCharToUtf(ch, buf)
22
23       int
24       Tcl_UtfToUniChar(src, chPtr)
25
26       char *
27       Tcl_UniCharToUtfDString(uniStr, uniLength, dsPtr)
28
29       Tcl_UniChar *
30       Tcl_UtfToUniCharDString(src, length, dsPtr)
31
32       int
33       Tcl_UniCharLen(uniStr)
34
35       int
36       Tcl_UniCharNcmp(ucs, uct, numChars)
37
38       int
39       Tcl_UniCharNcasecmp(ucs, uct, numChars)
40
41       int
42       Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
43
44       int
45       Tcl_UtfNcmp(cs, ct, numChars)
46
47       int
48       Tcl_UtfNcasecmp(cs, ct, numChars)
49
50       int
51       Tcl_UtfCharComplete(src, length)
52
53       int
54       Tcl_NumUtfChars(src, length)
55
56       const char *
57       Tcl_UtfFindFirst(src, ch)
58
59       const char *
60       Tcl_UtfFindLast(src, ch)
61
62       const char *
63       Tcl_UtfNext(src)
64
65       const char *
66       Tcl_UtfPrev(src, start)
67
68       Tcl_UniChar
69       Tcl_UniCharAtIndex(src, index)
70
71       const char *
72       Tcl_UtfAtIndex(src, index)
73
74       int
75       Tcl_UtfBackslash(src, readPtr, dst)
76

ARGUMENTS

78       char *buf (out)                             Buffer in which  the  UTF-8
79                                                   representation    of    the
80                                                   Tcl_UniChar is stored.   At
81                                                   most  TCL_UTF_MAX bytes are
82                                                   stored in the buffer.
83
84       int ch (in)                                 The Unicode character to be
85                                                   converted or examined.
86
87       Tcl_UniChar *chPtr (out)                    Filled with the Tcl_UniChar
88                                                   represented by the head  of
89                                                   the UTF-8 string.
90
91       const char *src (in)                        Pointer to a UTF-8 string.
92
93       const char *cs (in)                         Pointer to a UTF-8 string.
94
95       const char *ct (in)                         Pointer to a UTF-8 string.
96
97       const Tcl_UniChar *uniStr (in)              A  null-terminated  Unicode
98                                                   string.
99
100       const Tcl_UniChar *ucs (in)                 A  null-terminated  Unicode
101                                                   string.
102
103       const Tcl_UniChar *uct (in)                 A  null-terminated  Unicode
104                                                   string.
105
106       const Tcl_UniChar *uniPattern (in)          A  null-terminated  Unicode
107                                                   string.
108
109       int length (in)                             The  length  of  the  UTF-8
110                                                   string in bytes (not  UTF-8
111                                                   characters).   If negative,
112                                                   all bytes up to  the  first
113                                                   null byte are used.
114
115       int uniLength (in)                          The  length  of the Unicode
116                                                   string in characters.  Must
117                                                   be greater than or equal to
118                                                   0.
119
120       Tcl_DString *dsPtr (in/out)                 A pointer to  a  previously
121                                                   initialized Tcl_DString.
122
123       unsigned long numChars (in)                 The number of characters to
124                                                   compare.
125
126       const char *start (in)                      Pointer to the beginning of
127                                                   a UTF-8 string.
128
129       int index (in)                              The  index  of  a character
130                                                   (not  byte)  in  the  UTF-8
131                                                   string.
132
133       int *readPtr (out)                          If  non-NULL,  filled  with
134                                                   the number of bytes in  the
135                                                   backslash sequence, includ‐
136                                                   ing the  backslash  charac‐
137                                                   ter.
138
139       char *dst (out)                             Buffer  in  which the bytes
140                                                   represented  by  the  back‐
141                                                   slash  sequence are stored.
142                                                   At most  TCL_UTF_MAX  bytes
143                                                   are stored in the buffer.
144
145       int nocase (in)                             Specifies whether the match
146                                                   should be done  case-sensi‐
147                                                   tive  (0)  or case-insensi‐
148                                                   tive (1).
149______________________________________________________________________________
150
151

DESCRIPTION

153       These routines convert  between  UTF-8  strings  and  Tcl_UniChars.   A
154       Tcl_UniChar  is  a Unicode character represented as an unsigned, fixed-
155       size quantity.  A UTF-8 character is a Unicode character represented as
156       a  varying-length  sequence  of  up  to TCL_UTF_MAX bytes.  A multibyte
157       UTF-8 sequence consists of a lead byte followed by some number of trail
158       bytes.
159
160       TCL_UTF_MAX  is  the maximum number of bytes that it takes to represent
161       one Unicode character in the UTF-8 representation.
162
163       Tcl_UniCharToUtf  stores the Tcl_UniChar ch as a UTF-8 string  starting
164       at buf.  The return value is the number of bytes stored in buf.
165
166       Tcl_UtfToUniChar  reads  one UTF-8 character starting at src and stores
167       it as a Tcl_UniChar in *chPtr.  The return value is the number of bytes
168       read  from  src.  The caller must ensure that the source buffer is long
169       enough such that this routine does not run off the end and  dereference
170       non-existent  or  random  memory;  if  the source buffer is known to be
171       null-terminated, this will not happen.  If the input is not  in  proper
172       UTF-8 format, Tcl_UtfToUniChar will store the first byte of src in *ch‐
173       Ptr as a Tcl_UniChar between 0x80 and 0xFF and return 1.
174
175       Tcl_UniCharToUtfDString converts the given  Unicode  string  to  UTF-8,
176       storing  the  result in a previously initialized Tcl_DString.  You must
177       specify uniLength, the length of the given Unicode string.  The  return
178       value  is  a pointer to the UTF-8 representation of the Unicode string.
179       Storage for the return value is appended to the end of the Tcl_DString.
180
181       Tcl_UtfToUniCharDString converts the given  UTF-8  string  to  Unicode,
182       storing  the  result in the previously initialized Tcl_DString.  In the
183       argument length, you may either specify the length of the  given  UTF-8
184       string  in  bytes  or  “-1”, in which case Tcl_UtfToUniCharDString uses
185       strlen to calculate the length.  The return value is a pointer  to  the
186       Unicode  representation  of  the  UTF-8 string.  Storage for the return
187       value is appended to the end of the Tcl_DString.  The Unicode string is
188       terminated with a Unicode null character.
189
190       Tcl_UniCharLen  corresponds  to  strlen for Unicode characters.  It ac‐
191       cepts a null-terminated Unicode string and returns the number  of  Uni‐
192       code characters (not bytes) in that string.
193
194       Tcl_UniCharNcmp and Tcl_UniCharNcasecmp correspond to strncmp and strn‐
195       casecmp, respectively, for Unicode characters.  They accept  two  null-
196       terminated  Unicode  strings  and  the number of characters to compare.
197       Both strings are assumed to  be  at  least  numChars  characters  long.
198       Tcl_UniCharNcmp   compares  the  two strings character-by-character ac‐
199       cording to the Unicode  character  ordering.   It  returns  an  integer
200       greater  than,  equal to, or less than 0 if the first string is greater
201       than,  equal  to,  or  less  than  the  second   string   respectively.
202       Tcl_UniCharNcasecmp is the Unicode case insensitive version.
203
204       Tcl_UniCharCaseMatch  is the Unicode equivalent to Tcl_StringCaseMatch.
205       It accepts a null-terminated Unicode string, a Unicode pattern,  and  a
206       boolean  value specifying whether the match should be case-insensitive,
207       and returns whether the string matches the pattern.
208
209       Tcl_UtfNcmp corresponds to strncmp for UTF-8 strings.  It  accepts  two
210       null-terminated  UTF-8 strings and the number of characters to compare.
211       (Both strings are assumed to be at  least  numChars  characters  long.)
212       Tcl_UtfNcmp  compares  the two strings character-by-character according
213       to the Unicode character ordering.  It returns an integer greater than,
214       equal to, or less than 0 if the first string is greater than, equal to,
215       or less than the second string respectively.
216
217       Tcl_UtfNcasecmp corresponds to strncasecmp for UTF-8  strings.   It  is
218       similar  to  Tcl_UtfNcmp  except comparisons ignore differences in case
219       when comparing upper, lower or title case characters.
220
221       Tcl_UtfCharComplete returns 1 if the source UTF-8 string src of  length
222       bytes  is long enough to be decoded by Tcl_UtfToUniChar/Tcl_UtfNext, or
223       0 otherwise.  This function does not guarantee that the UTF-8 string is
224       properly formed.  This routine is used by procedures that are operating
225       on a byte at a time and need to know if a  full  Tcl_UniChar  has  been
226       seen.
227
228       Tcl_NumUtfChars  corresponds  to  strlen for UTF-8 strings.  It returns
229       the number of Tcl_UniChars that are represented  by  the  UTF-8  string
230       src.   The  length of the source string is length bytes.  If the length
231       is negative, all bytes up to the first null byte are used.
232
233       Tcl_UtfFindFirst corresponds to strchr for UTF-8 strings.  It returns a
234       pointer  to the first occurrence of the Tcl_UniChar ch in the null-ter‐
235       minated UTF-8 string src.  The null terminator is  considered  part  of
236       the UTF-8 string.
237
238       Tcl_UtfFindLast corresponds to strrchr for UTF-8 strings.  It returns a
239       pointer to the last occurrence of the Tcl_UniChar ch in the null-termi‐
240       nated  UTF-8 string src.  The null terminator is considered part of the
241       UTF-8 string.
242
243       Given src, a pointer to some location in a  UTF-8  string,  Tcl_UtfNext
244       returns  a  pointer  to  the  next  UTF-8 character in the string.  The
245       caller must not ask for the next character after the last character  in
246       the string if the string is not terminated by a null character. Tcl_Ut‐
247       fCharComplete can be used in that case to make sure  enough  bytes  are
248       available before calling Tcl_UtfNext.
249
250       Tcl_UtfPrev  is  used to step backward through but not beyond the UTF-8
251       string that begins at start.  If the UTF-8 string is made  up  entirely
252       of complete and well-formed characters, and src points to the lead byte
253       of one of those characters (or to the location one byte past the end of
254       the string), then repeated calls of Tcl_UtfPrev will return pointers to
255       the lead bytes of each character in the  string,  one  character  at  a
256       time, terminating when it returns start.
257
258       When the conditions of completeness and well-formedness may not be sat‐
259       isfied, a more precise description of the function  of  Tcl_UtfPrev  is
260       necessary.  It always returns a pointer greater than or equal to start;
261       that is, always a pointer to a location in the string.  It  always  re‐
262       turns  a  pointer  to  a byte that begins a character when scanning for
263       characters beginning from start. When src is greater than start, it al‐
264       ways  returns a pointer less than src and greater than or equal to (src
265       - TCL_UTF_MAX).  The character that begins at the returned  pointer  is
266       the  first  one that either includes the byte src[-1], or might include
267       it if the right trail bytes are present at src and greater. Tcl_UtfPrev
268       never  reads  the  byte  src[0]  nor  the  byte  start[-1] nor the byte
269       src[-TCL_UTF_MAX-1].
270
271       Tcl_UniCharAtIndex corresponds to a C string array dereference  or  the
272       Pascal  Ord()  function.  It returns the Tcl_UniChar represented at the
273       specified character (not byte) index in  the  UTF-8  string  src.   The
274       source  string must contain at least index characters.  Behavior is un‐
275       defined if a negative index is given.
276
277       Tcl_UtfAtIndex returns a pointer to the specified character (not  byte)
278       index in the UTF-8 string src.  The source string must contain at least
279       index characters.  This is equivalent to calling Tcl_UtfToUniChar index
280       times.   If a negative index is given, the return pointer points to the
281       first character in the source string.
282
283       Tcl_UtfBackslash is a utility procedure used by several of the Tcl com‐
284       mands.   It  parses a backslash sequence and stores the properly formed
285       UTF-8 character represented by the backslash  sequence  in  the  output
286       buffer  dst.   At  most  TCL_UTF_MAX  bytes  are  stored in the buffer.
287       If  readPtr  is  non-NULL,  *readPtr  is set to the number of bytes  in
288       the  backslash sequence, including the backslash character.  The return
289       value is the number of bytes stored in the output buffer.
290
291       See the Tcl manual entry for information on  the  valid  backslash  se‐
292       quences.   All  of  the sequences described in the Tcl manual entry are
293       supported by Tcl_UtfBackslash.
294
295

KEYWORDS

297       utf, unicode, backslash
298
299
300
301Tcl                                   8.1                               Utf(3)
Impressum