1Utf(3)                      Tcl Library Procedures                      Utf(3)
2
3
4
5______________________________________________________________________________
6

NAME

8       Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar,
9       Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen,
10       Tcl_UniCharNcmp, Tcl_UniCharNcasecmp, Tcl_UniCharCaseMatch,
11       Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars,
12       Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev,
13       Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash - routines for
14       manipulating UTF-8 strings
15

SYNOPSIS

17       #include <tcl.h>
18
19       typedef ... Tcl_UniChar;
20
21       int
22       Tcl_UniCharToUtf(ch, buf)
23
24       int
25       Tcl_UtfToUniChar(src, chPtr)
26
27       char *
28       Tcl_UniCharToUtfDString(uniStr, uniLength, dsPtr)
29
30       Tcl_UniChar *
31       Tcl_UtfToUniCharDString(src, length, dsPtr)
32
33       int
34       Tcl_UniCharLen(uniStr)
35
36       int
37       Tcl_UniCharNcmp(ucs, uct, numChars)
38
39       int
40       Tcl_UniCharNcasecmp(ucs, uct, numChars)
41
42       int
43       Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
44
45       int
46       Tcl_UtfNcmp(cs, ct, numChars)
47
48       int
49       Tcl_UtfNcasecmp(cs, ct, numChars)
50
51       int
52       Tcl_UtfCharComplete(src, length)
53
54       int
55       Tcl_NumUtfChars(src, length)
56
57       const char *
58       Tcl_UtfFindFirst(src, ch)
59
60       const char *
61       Tcl_UtfFindLast(src, ch)
62
63       const char *
64       Tcl_UtfNext(src)
65
66       const char *
67       Tcl_UtfPrev(src, start)
68
69       Tcl_UniChar
70       Tcl_UniCharAtIndex(src, index)
71
72       const char *
73       Tcl_UtfAtIndex(src, index)
74
75       int
76       Tcl_UtfBackslash(src, readPtr, dst)
77

ARGUMENTS

79       char *buf (out)                             Buffer  in  which the UTF-8
80                                                   representation    of    the
81                                                   Tcl_UniChar  is stored.  At
82                                                   most TCL_UTF_MAX bytes  are
83                                                   stored in the buffer.
84
85       int ch (in)                                 The Unicode character to be
86                                                   converted or examined.
87
88       Tcl_UniChar *chPtr (out)                    Filled with the Tcl_UniChar
89                                                   represented  by the head of
90                                                   the UTF-8 string.
91
92       const char *src (in)                        Pointer to a UTF-8 string.
93
94       const char *cs (in)                         Pointer to a UTF-8 string.
95
96       const char *ct (in)                         Pointer to a UTF-8 string.
97
98       const Tcl_UniChar *uniStr (in)              A  null-terminated  Unicode
99                                                   string.
100
101       const Tcl_UniChar *ucs (in)                 A  null-terminated  Unicode
102                                                   string.
103
104       const Tcl_UniChar *uct (in)                 A  null-terminated  Unicode
105                                                   string.
106
107       const Tcl_UniChar *uniPattern (in)          A  null-terminated  Unicode
108                                                   string.
109
110       int length (in)                             The  length  of  the  UTF-8
111                                                   string  in bytes (not UTF-8
112                                                   characters).  If  negative,
113                                                   all  bytes  up to the first
114                                                   null byte are used.
115
116       int uniLength (in)                          The length of  the  Unicode
117                                                   string in characters.  Must
118                                                   be greater than or equal to
119                                                   0.
120
121       Tcl_DString *dsPtr (in/out)                 A  pointer  to a previously
122                                                   initialized Tcl_DString.
123
124       unsigned long numChars (in)                 The number of characters to
125                                                   compare.
126
127       const char *start (in)                      Pointer to the beginning of
128                                                   a UTF-8 string.
129
130       int index (in)                              The index  of  a  character
131                                                   (not  byte)  in  the  UTF-8
132                                                   string.
133
134       int *readPtr (out)                          If  non-NULL,  filled  with
135                                                   the  number of bytes in the
136                                                   backslash sequence, includ‐
137                                                   ing  the  backslash charac‐
138                                                   ter.
139
140       char *dst (out)                             Buffer in which  the  bytes
141                                                   represented  by  the  back‐
142                                                   slash sequence are  stored.
143                                                   At  most  TCL_UTF_MAX bytes
144                                                   are stored in the buffer.
145
146       int nocase (in)                             Specifies whether the match
147                                                   should  be done case-sensi‐
148                                                   tive (0)  or  case-insensi‐
149                                                   tive (1).
150______________________________________________________________________________
151
152

DESCRIPTION

154       These  routines  convert  between  UTF-8  strings  and Tcl_UniChars.  A
155       Tcl_UniChar is a Unicode character represented as an  unsigned,  fixed-
156       size quantity.  A UTF-8 character is a Unicode character represented as
157       a varying-length sequence of up  to  TCL_UTF_MAX  bytes.   A  multibyte
158       UTF-8 sequence consists of a lead byte followed by some number of trail
159       bytes.
160
161       TCL_UTF_MAX is the maximum number of bytes that it takes  to  represent
162       one Unicode character in the UTF-8 representation.
163
164       Tcl_UniCharToUtf stores the Tcl_UniChar ch as a UTF-8 string  starting
165       at buf.  The return value is the number of bytes stored in buf.
166
167       Tcl_UtfToUniChar reads one UTF-8 character starting at src  and  stores
168       it as a Tcl_UniChar in *chPtr.  The return value is the number of bytes
169       read from src.  The caller must ensure that the source buffer  is  long
170       enough  such that this routine does not run off the end and dereference
171       non-existent or random memory; if the source  buffer  is  known  to  be
172       null-terminated,  this  will not happen.  If the input is not in proper
173       UTF-8 format, Tcl_UtfToUniChar will store the  first  byte  of  src  in
174       *chPtr as a Tcl_UniChar between 0x0000 and 0x00ff and return 1.
175
176       Tcl_UniCharToUtfDString  converts  the  given  Unicode string to UTF-8,
177       storing the result in a previously initialized Tcl_DString.   You  must
178       specify  uniLength, the length of the given Unicode string.  The return
179       value is a pointer to the UTF-8 representation of the  Unicode  string.
180       Storage for the return value is appended to the end of the Tcl_DString.
181
182       Tcl_UtfToUniCharDString  converts  the  given  UTF-8 string to Unicode,
183       storing the result in the previously initialized Tcl_DString.   In  the
184       argument  length,  you may either specify the length of the given UTF-8
185       string in bytes or “-1”, in  which  case  Tcl_UtfToUniCharDString  uses
186       strlen  to  calculate the length.  The return value is a pointer to the
187       Unicode representation of the UTF-8 string.   Storage  for  the  return
188       value is appended to the end of the Tcl_DString.  The Unicode string is
189       terminated with a Unicode null character.
190
191       Tcl_UniCharLen  corresponds  to  strlen  for  Unicode  characters.   It
192       accepts a null-terminated Unicode string and returns the number of Uni‐
193       code characters (not bytes) in that string.
194
195       Tcl_UniCharNcmp  and  Tcl_UniCharNcasecmp  correspond  to  strncmp  and
196       strncasecmp, respectively, for Unicode characters.  They accept two
197       null-terminated Unicode strings and the number of characters to compare.
198       Both  strings  are  assumed  to  be  at least numChars characters long.
199       Tcl_UniCharNcmp   compares  the  two   strings   character-by-character
200       according  to  the  Unicode  character ordering.  It returns an integer
201       greater than, equal to, or less than 0 if the first string  is  greater
202       than,   equal   to,  or  less  than  the  second  string  respectively.
203       Tcl_UniCharNcasecmp is the Unicode case insensitive version.
204
205       Tcl_UniCharCaseMatch is the Unicode equivalent of  Tcl_StringCaseMatch.
206       It  accepts  a null-terminated Unicode string, a Unicode pattern, and a
207       boolean value nocase specifying whether the match should be case-insen‐
208       sitive (1) or case-sensitive (0), and returns whether the string matches
209       the pattern.
209
210       Tcl_UtfNcmp  corresponds  to  strncmp for UTF-8 strings. It accepts two
211       null-terminated UTF-8 strings and the number of characters to  compare.
212       (Both  strings  are  assumed  to be at least numChars characters long.)
213       Tcl_UtfNcmp compares the two strings  character-by-character  according
214       to the Unicode character ordering.  It returns an integer greater than,
215       equal to, or less than 0 if the first string is greater than, equal to,
216       or less than the second string respectively.
217
218       Tcl_UtfNcasecmp  corresponds  to  strncasecmp for UTF-8 strings.  It is
219       similar to Tcl_UtfNcmp except comparisons ignore  differences  in  case
220       when comparing upper, lower or title case characters.
221
222       Tcl_UtfCharComplete  returns 1 if the source UTF-8 string src of length
223       bytes is long enough to be decoded by Tcl_UtfToUniChar, or 0 otherwise.
224       This  function  does  not  guarantee  that the UTF-8 string is properly
225       formed.  This routine is used by procedures that  are  operating  on  a
226       byte at a time and need to know if a full Tcl_UniChar has been seen.
227
228       Tcl_NumUtfChars  corresponds  to  strlen for UTF-8 strings.  It returns
229       the number of Tcl_UniChars that are represented  by  the  UTF-8  string
230       src.   The  length of the source string is length bytes.  If the length
231       is negative, all bytes up to the first null byte are used.
232
233       Tcl_UtfFindFirst corresponds to strchr for UTF-8 strings.  It returns a
234       pointer  to the first occurrence of the Tcl_UniChar ch in the null-ter‐
235       minated UTF-8 string src.  The null terminator is  considered  part  of
236       the UTF-8 string.
237
238       Tcl_UtfFindLast corresponds to strrchr for UTF-8 strings.  It returns a
239       pointer to the last occurrence of the Tcl_UniChar ch in the null-termi‐
240       nated  UTF-8 string src.  The null terminator is considered part of the
241       UTF-8 string.
242
243       Given src, a pointer to some location in a  UTF-8  string,  Tcl_UtfNext
244       returns a pointer to the next UTF-8 character in the string.  The call‐
245       er must not ask for the next character after the last character in  the
246       string if the string is not terminated by a null character.
247
248       Given  src,  a pointer to some location in a UTF-8 string (or to a null
249       byte immediately  following  such  a  string),  Tcl_UtfPrev  returns  a
250       pointer  to  the  closest preceding byte that starts a UTF-8 character.
251       This function will not back up to a position before start, the start of
252       the  UTF-8  string.  If src was already at start, the return value will
253       be start.
254
255       Tcl_UniCharAtIndex corresponds to a C string array dereference  or  the
256       Pascal  Ord()  function.  It returns the Tcl_UniChar represented at the
257       specified character (not byte) index in  the  UTF-8  string  src.   The
258       source  string  must  contain  at  least index characters.  Behavior is
259       undefined if a negative index is given.
260
261       Tcl_UtfAtIndex returns a pointer to the specified character (not  byte)
262       index in the UTF-8 string src.  The source string must contain at least
263       index characters.  This is  equivalent  to  calling  Tcl_UtfNext  index
264       times.   If a negative index is given, the return pointer points to the
265       first character in the source string.
266
267       Tcl_UtfBackslash is a utility procedure used by several of the Tcl com‐
268       mands.   It  parses a backslash sequence and stores the properly formed
269       UTF-8 character represented by the backslash  sequence  in  the  output
270       buffer  dst.   At  most  TCL_UTF_MAX  bytes  are  stored in the buffer.
271       Tcl_UtfBackslash modifies *readPtr to contain the number  of  bytes  in
272       the  backslash sequence, including the backslash character.  The return
273       value is the number of bytes stored in the output buffer.
274
275       See the Tcl  manual  entry  for  information  on  the  valid  backslash
276       sequences.   All of the sequences described in the Tcl manual entry are
277       supported by Tcl_UtfBackslash.
278
279

KEYWORDS

281       utf, unicode, backslash
282
283
284
285Tcl                                   8.1                               Utf(3)
Impressum