1Utf(3)                      Tcl Library Procedures                      Utf(3)
2
3
4
5______________________________________________________________________________
6

NAME

8       Tcl_UniChar,         Tcl_UniCharCaseMatch,         Tcl_UniCharNcasecmp,
9       Tcl_UniCharToUtf,      Tcl_UtfToUniChar,       Tcl_UniCharToUtfDString,
10       Tcl_UtfToUniCharDString,  Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfChar‐
11       Complete,    Tcl_NumUtfChars,    Tcl_UtfFindFirst,     Tcl_UtfFindLast,
12       Tcl_UtfNext,  Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_Utf‐
13       Backslash - routines for manipulating UTF-8 strings.
14

SYNOPSIS

16       #include <tcl.h>
17
18       typedef ... Tcl_UniChar;
19
20       int
21       Tcl_UniCharToUtf(ch, buf)
22
23       int
24       Tcl_UtfToUniChar(src, chPtr)
25
26       char *                                                                  │
27       Tcl_UniCharToUtfDString(uniStr, numChars, dstPtr)                       │
28
29       Tcl_UniChar *                                                           │
30       Tcl_UtfToUniCharDString(src, len, dstPtr)                               │
31
32       int
33       Tcl_UniCharLen(uniStr)
34
35       int
36       Tcl_UniCharNcmp(uniStr, uniStr, num)
37
38       int                                                                     │
39       Tcl_UniCharNcasecmp(uniStr, uniStr, num)                                │
40
41       int                                                                     │
42       Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)                        │
43
44       int
45       Tcl_UtfNcmp(src, src, num)
46
47       int
48       Tcl_UtfNcasecmp(src, src, num)
49
50       int
51       Tcl_UtfCharComplete(src, len)
52
53       int
54       Tcl_NumUtfChars(src, len)
55
56       CONST char *                                                            │
57       Tcl_UtfFindFirst(src, ch)                                               │
58
59       CONST char *                                                            │
60       Tcl_UtfFindLast(src, ch)                                                │
61
62       CONST char *                                                            │
63       Tcl_UtfNext(src)                                                        │
64
65       CONST char *                                                            │
66       Tcl_UtfPrev(src, start)                                                 │
67
68       Tcl_UniChar
69       Tcl_UniCharAtIndex(src, index)
70
71       CONST char *                                                            │
72       Tcl_UtfAtIndex(src, index)                                              │
73
74       int
75       Tcl_UtfBackslash(src, readPtr, dst)
76

ARGUMENTS

78       char                *buf       (out)     Buffer in which the UTF-8 rep‐
79                                                resentation of the Tcl_UniChar
80                                                is    stored.      At     most
81                                                TCL_UTF_MAX  bytes  are stored
82                                                in the buffer.
83
84       int                 ch         (in)      The  Tcl_UniChar  to  be  con‐
85                                                verted or examined.
86
87       Tcl_UniChar         *chPtr     (out)     Filled  with  the  Tcl_UniChar
88                                                represented by the head of the
89                                                UTF-8 string.
90
91       CONST char          *src       (in)      Pointer to a UTF-8 string.
92
93       CONST Tcl_UniChar   *uniStr    (in)      A    null-terminated   Unicode
94                                                string.
95
96       CONST Tcl_UniChar   *uniPattern(in)      A   null-terminated    Unicode
97                                                string.
98
99       int                 len        (in)      The length of the UTF-8 string
100                                                in bytes  (not  UTF-8  charac‐
101                                                ters).  If negative, all bytes
102                                                up to the first null byte  are
103                                                used.
104
105       int                 numChars   (in)      The   length  of  the  Unicode
106                                                string in characters.  Must be
107                                                greater than or equal to 0.
108
109       Tcl_DString         *dstPtr    (in/out)  A pointer to a previously-ini‐
110                                                tialized Tcl_DString.
111
112       unsigned long       num        (in)      The number  of  characters  to
113                                                compare.
114
115       CONST char          *start     (in)      Pointer  to the beginning of a
116                                                UTF-8 string.
117
118       int                 index      (in)      The index of a character  (not
119                                                byte) in the UTF-8 string.
120
121       int                 *readPtr   (out)     If  non-NULL,  filled with the
122                                                number of bytes in  the  back‐
123                                                slash  sequence, including the
124                                                backslash character.
125
126       char                *dst       (out)     Buffer in which the bytes rep‐
127                                                resented   by   the  backslash
128                                                sequence are stored.  At  most
129                                                TCL_UTF_MAX  bytes  are stored
130                                                in the buffer.                 │
131
132       int                 nocase     (in)                                     │
133                                                Specifies  whether  the  match │
134                                                should  be done case-sensitive │
135                                                (0) or case-insensitive (1).
136_________________________________________________________________
137
138

DESCRIPTION

140       These routines convert  between  UTF-8  strings  and  Tcl_UniChars.   A
141       Tcl_UniChar  is  a Unicode character represented as an unsigned, fixed-
142       size quantity.  A UTF-8 character is a Unicode character represented as
143       a  varying-length  sequence  of  up  to TCL_UTF_MAX bytes.  A multibyte
144       UTF-8 sequence consists of a lead byte followed by some number of trail
145       bytes.
146
147       TCL_UTF_MAX  is  the maximum number of bytes that it takes to represent
148       one Unicode character in the UTF-8 representation.
149
150       Tcl_UniCharToUtf stores the Tcl_UniChar ch as a UTF-8 string  starting
151       at buf.  The return value is the number of bytes stored in buf.
152
153       Tcl_UtfToUniChar  reads  one UTF-8 character starting at src and stores
154       it as a Tcl_UniChar in *chPtr.  The return value is the number of bytes
155       read  from  src.  The caller must ensure that the source buffer is long
156       enough such that this routine does not run off the end and  dereference
157       non-existent  or  random  memory;  if  the source buffer is known to be
158       null-terminated, this will not happen.  If the input is not  in  proper
159       UTF-8  format,  Tcl_UtfToUniChar  will  store  the first byte of src in
160       *chPtr as a Tcl_UniChar between 0x0000 and 0x00ff and return 1.
161
162       Tcl_UniCharToUtfDString converts the given  Unicode  string  to  UTF-8,
163       storing  the  result in a previously-initialized Tcl_DString.  You must
164       specify the length of the given Unicode string.  The return value is  a
165       pointer to the UTF-8 representation of the Unicode string.  Storage for
166       the return value is appended to the end of the Tcl_DString.
167
168       Tcl_UtfToUniCharDString converts the given  UTF-8  string  to  Unicode,
169       storing  the result in the previously-initialized Tcl_DString.  You may
170       either specify the length of the given UTF-8 string or "-1",  in  which
171       case  Tcl_UtfToUniCharDString uses strlen to calculate the length.  The
172       return value is a pointer to the Unicode representation  of  the  UTF-8
173       string.   Storage  for  the  return value is appended to the end of the
174       Tcl_DString.  The Unicode string is  terminated  with  a  Unicode  null
175       character.
176
177       Tcl_UniCharLen  corresponds  to  strlen  for  Unicode  characters.   It
178       accepts a null-terminated Unicode string and returns the number of Uni‐
179       code characters (not bytes) in that string.
180
181       Tcl_UniCharNcmp and Tcl_UniCharNcasecmp correspond to strncmp and strn‐
182       casecmp, respectively, for Unicode characters.  They accept  two  null-
183       terminated  Unicode  strings  and  the number of characters to compare.
184       Both  strings  are  assumed  to  be  at  least  num  characters   long.
185       Tcl_UniCharNcmp    compares   the  two  strings  character-by-character
186       according to the Unicode character ordering.   It  returns  an  integer
187       greater  than,  equal to, or less than 0 if the first string is greater
188       than,  equal  to,  or  less  than  the  second   string   respectively.
189       Tcl_UniCharNcasecmp is the Unicode case insensitive version.
190
191       Tcl_UniCharCaseMatch  is the Unicode equivalent to Tcl_StringCaseMatch. │
192       It accepts a null-terminated Unicode string, a Unicode pattern,  and  a │
193       boolean value specifying whether the match should ignore case, and      │
194       returns whether the string matches the pattern.
195
196       Tcl_UtfNcmp corresponds to strncmp for UTF-8 strings.  It  accepts  two
197       null-terminated  UTF-8 strings and the number of characters to compare.
198       (Both strings  are  assumed  to  be  at  least  num  characters  long.)
199       Tcl_UtfNcmp  compares  the two strings character-by-character according
200       to the Unicode character ordering.  It returns an integer greater than,
201       equal to, or less than 0 if the first string is greater than, equal to,
202       or less than the second string respectively.
203
204       Tcl_UtfNcasecmp corresponds to strncasecmp for UTF-8  strings.   It  is
205       similar  to  Tcl_UtfNcmp  except comparisons ignore differences in case
206       when comparing upper, lower or title case characters.
207
208       Tcl_UtfCharComplete returns 1 if the source UTF-8 string src of  length
209       len bytes is long enough to be decoded by Tcl_UtfToUniChar, or 0 other‐
210       wise.  This function does not guarantee that the UTF-8 string is  prop‐
211       erly  formed.  This routine is used by procedures that are operating on
212       a byte at a time and need to know if a full Tcl_UniChar has been seen.
213
214       Tcl_NumUtfChars corresponds to strlen for UTF-8  strings.   It  returns
215       the  number  of  Tcl_UniChars  that are represented by the UTF-8 string
216       src.  The length of the source string is len bytes.  If the  length  is
217       negative, all bytes up to the first null byte are used.
218
219       Tcl_UtfFindFirst corresponds to strchr for UTF-8 strings.  It returns a
220       pointer to the first occurrence of the Tcl_UniChar ch in the  null-ter‐
221       minated  UTF-8  string  src.  The null terminator is considered part of
222       the UTF-8 string.
223
224       Tcl_UtfFindLast corresponds to strrchr for UTF-8 strings.  It returns a
225       pointer to the last occurrence of the Tcl_UniChar ch in the null-termi‐
226       nated UTF-8 string src.  The null terminator is considered part of  the
227       UTF-8 string.
228
229       Given  src,  a  pointer to some location in a UTF-8 string, Tcl_UtfNext
230       returns a pointer to the next UTF-8 character in the string.  The call‐
231       er  must not ask for the next character after the last character in the
232       string if the string is not terminated by a null character.
233
234       Given src, a pointer to some location in a UTF-8 string (or to  a  null
235       byte  immediately  following  such  a  string),  Tcl_UtfPrev  returns a
236       pointer to the closest preceding byte that starts  a  UTF-8  character.
237       This function will not back up to a position before start, the start of
238       the UTF-8 string.  If src was already at start, the return  value  will
239       be start.
240
241       Tcl_UniCharAtIndex  corresponds  to a C string array dereference or the
242       Pascal Ord() function.  It returns the Tcl_UniChar represented  at  the
243       specified  character  (not  byte)  index  in the UTF-8 string src.  The
244       source string must contain at  least  index  characters.   Behavior  is
245       undefined if a negative index is given.
246
247       Tcl_UtfAtIndex  returns a pointer to the specified character (not byte)
248       index in the UTF-8 string src.  The source string must contain at least
249       index  characters.   This  is  equivalent  to calling Tcl_UtfNext index
250       times.  If a negative index is given, the return pointer points to  the
251       first character in the source string.
252
253       Tcl_UtfBackslash is a utility procedure used by several of the Tcl com‐
254       mands.  It parses a backslash sequence and stores the  properly  formed
255       UTF-8  character  represented  by  the backslash sequence in the output
256       buffer dst.  At most  TCL_UTF_MAX  bytes  are  stored  in  the  buffer.
257       Tcl_UtfBackslash  modifies  *readPtr  to contain the number of bytes in
258       the backslash sequence, including the backslash character.  The  return
259       value is the number of bytes stored in the output buffer.
260
261       See  the  Tcl  manual  entry  for  information  on  the valid backslash
262       sequences.  All of the sequences described in the Tcl manual entry  are
263       supported by Tcl_UtfBackslash.
264
265

KEYWORDS

267       utf, unicode, backslash
268
269
270
271Tcl                                   8.1                               Utf(3)
Impressum