1UNICODE_BIDI(3) Courier Unicode Library UNICODE_BIDI(3)
2
3
4
6 unicode_bidi, unicode_bidi_calc_levels, unicode_bidi_calc_types,
7 unicode_bidi_calc, unicode_bidi_reorder, unicode_bidi_cleanup,
8 unicode_bidi_cleaned_size, unicode_bidi_logical_order,
9 unicode_bidi_combinings, unicode_bidi_needs_embed, unicode_bidi_embed,
10 unicode_bidi_embed_paragraph_level, unicode_bidi_direction,
11 unicode_bidi_type, unicode_bidi_setbnl, unicode_bidi_mirror,
12 unicode_bidi_bracket_type - unicode bi-directional algorithm
13
15 #include <courier-unicode.h>
16
17 unicode_bidi_level_t lr=UNICODE_BIDI_LR;
18
19 void unicode_bidi_calc_types(const char32_t *p, size_t n,
20 unicode_bidi_type_t *types);
21
22 struct unicode_bidi_direction
23 unicode_bidi_calc_levels(const char32_t *p,
24 const unicode_bidi_type_t *types,
25 size_t n,
26 unicode_bidi_level_t *levels,
27 const unicode_bidi_level_t *initial_embedding_level);
28
29 struct unicode_bidi_direction unicode_bidi_calc(const char32_t *p,
30 size_t n,
31 unicode_bidi_level_t *levels,
32 const unicode_bidi_level_t *initial_embedding_level);
33
34 void unicode_bidi_reorder(char32_t *string,
35 unicode_bidi_level_t *levels, size_t n,
36 void (*reorder_callback)(size_t, size_t, void *),
37 void *arg);
38
39 size_t unicode_bidi_cleanup(char32_t *string,
40 unicode_bidi_level_t *levels, size_t n,
41 int options,
42 void (*removed_callback)(size_t, size_t, void *),
43 void *arg);
44
45 size_t unicode_bidi_cleaned_size(const char32_t *string, size_t n,
46 int options);
47
48 void unicode_bidi_logical_order(char32_t *string,
49 unicode_bidi_level_t *levels, size_t n,
50 unicode_bidi_level_t paragraph_embedding,
51 void (*reorder_callback)(size_t index, size_t n, void *arg),
52 void *arg);
53
54 void unicode_bidi_combinings(const char32_t *string,
55 const unicode_bidi_level_t *levels,
56 size_t n,
57 void (*combinings)(unicode_bidi_level_t level, size_t level_start, size_t n_chars, size_t comb_start, size_t n_comb_chars, void *arg),
58 void *arg);
59
60 int unicode_bidi_needs_embed(const char32_t *string,
61 const unicode_bidi_level_t *levels,
62 size_t n,
63 const unicode_bidi_level_t *paragraph_embedding);
64
65 size_t unicode_bidi_embed(const char32_t *string,
66 const unicode_bidi_level_t *levels, size_t n,
67 unicode_bidi_level_t paragraph_embedding,
68 void (*emit)(const char32_t *string, size_t n, int is_part_of_string, void *arg),
69 void *arg);
70
71 char32_t unicode_bidi_embed_paragraph_level(const char32_t *string,
72 size_t n,
73 unicode_bidi_level_t paragraph_embedding);
74
75 char32_t unicode_bidi_mirror(char32_t c);
76
77 char32_t unicode_bidi_bracket_type(char32_t c, unicode_bracket_type_t *ret);
78
79 struct unicode_bidi_direction unicode_bidi_get_direction(char32_t *c,
80 size_t n);
81
82 enum_bidi_type_t unicode_bidi_type(char32_t c);
83
84 void unicode_bidi_setbnl(char32_t *p, const unicode_bidi_type_t *types,
85 size_t n);
86
88 These functions are related to the Unicode Bi-Directional algorithm[1].
89 They implement the algorithm up to and including step L2, and provide
90 additional functionality of returning miscellaneous
91 bi-directional-related metadata of Unicode characters. There's also a
92 basic algorithm that “reverses” the bi-directional algorithm and
93 produces a Unicode string with bi-directional markers that results in
94 the same bi-directional string after reapplying the algorithm.
95
96 Calculating bi-directional rendering order
97 The following process computes the rendering order of characters
98 according to the Unicode Bi-Directional algorithm:
99
100 1. Allocate an array of unicode_bidi_type_t that's the same size as
101 the Unicode string.
102
103 2. Allocate an array of unicode_bidi_level_t that's the same size as
104 the Unicode string.
105
106 3. Use unicode_bidi_calc_types() to compute the Unicode string's
107 characters' bi-directional types, and populate the
108 unicode_bidi_type_t buffer.
109
110 4. Use unicode_bidi_calc_levels() to compute the Unicode string's
111 characters' bi-directional embedding level (executes the
112 Bi-Directional algorithm up to and including step L1). This
113 populates the unicode_bidi_level_t buffer.
114
115 5. Alternatively: allocate only the unicode_bidi_level_t array and use
116 unicode_bidi_calc(), which malloc()s the unicode_bidi_type_t
117 buffer, calls unicode_bidi_calc_levels(), and then free()s the
118 buffer.
119
120 6. Use unicode_bidi_reorder() to reverse any characters in the string,
121 according to the algorithm (step L2), with an optional callback
122 that reports which ranges of characters get reversed.
123
124 7. Use unicode_bidi_cleanup() to remove the characters from the string
125 which are used by the bi-directional algorithm, and are not needed
126 for rendering the text. unicode_bidi_cleaned_size() is available
127 to determine, in advance, how many characters will remain.
128
129 The parameters to unicode_bidi_calc_types() are:
130
131 • A pointer to the Unicode string.
132
133 • Number of characters in the Unicode string.
134
135 • A pointer to an array of unicode_bidi_type_t values. The caller is
136 responsible for allocating and deallocating this array, which has
137 the same size as the Unicode string.
138
139 The parameters to unicode_bidi_calc_levels() are:
140
141 • A pointer to the Unicode string.
142
143 • A pointer to the buffer that was passed to
144 unicode_bidi_calc_types().
145
146 • Number of characters in the Unicode string and the
147 unicode_bidi_type_t buffer.
148
149 • A pointer to an array of unicode_bidi_level_t values. The caller is
150 responsible for allocating and deallocating this array, which has
151 the same size as the Unicode string.
152
153 • An optional pointer to a UNICODE_BIDI_LR or UNICODE_BIDI_RL value.
154 This sets the default paragraph direction level. A null pointer
155 computes the default paragraph direction level based on the string,
156 as specified by the "P" rules of the bi-directional algorithm.
157
158 The parameters to unicode_bidi_calc() are the same except for the
159 unicode_bidi_type_t pointer. unicode_bidi_calc() allocates this buffer
160 by itself and calls unicode_bidi_calc_types, and destroys the buffer
161 before returning.
162
163 unicode_bidi_calc() and unicode_bidi_calc_levels() fill in the
164 unicode_bidi_level_t array with the values corresponding to the
165 embedding level of the corresponding character, according to the Unicode
166 Bidirectional Algorithm (even values for left-to-right ordering, and odd
167 values for right-to-left ordering). A value of UNICODE_BIDI_SKIP
168 designates directional markers (from step X9).
169
170 unicode_bidi_calc() and unicode_bidi_calc_levels() return the resolved
171 paragraph direction level, which always matches the passed in level, if
172 specified, else it reports the derived one. These functions return a
173 unicode_bidi_direction structure:
174
175 struct unicode_bidi_direction {
176 unicode_bidi_level_t direction;
177 int is_explicit;
178 };
179
180
181 direction gives the paragraph embedding level, UNICODE_BIDI_LR or
182 UNICODE_BIDI_RL. is_explicit indicates whether: the optional pointer
183 to a UNICODE_BIDI_LR or UNICODE_BIDI_RL value was specified (and
184 returned in direction), or whether the direction comes from a
185 character with an explicit direction indication.
186
187 unicode_bidi_reorder() takes the actual unicode string together with
188 the embedding values from unicode_bidi_calc or
189 unicode_bidi_calc_levels(), then reverses the bi-directional string, as
190 specified by step L2 of the bi-directional algorithm. The parameters to
191 unicode_bidi_reorder() are:
192
193 • A pointer to the Unicode string.
194
195 • A pointer to an array of unicode_bidi_level_t values.
196
197 • Number of characters in the Unicode string and the
198 unicode_bidi_level_t array.
199
200 • An optional reorder_callback function pointer.
201
202 A non-NULL reorder_callback gets invoked to report each reversed
203 character range. The callback's first parameter is the index of the
204 first reversed character, the second parameter is the number of
205 reversed characters, starting at the given index of the Unicode string.
206 The third parameter is the arg passthrough parameter.
207
208 unicode_bidi_reorder modifies its string and levels. reorder_callback
209 gets invoked after reversing each consecutive range of values in the
210 string and levels buffers. For example: “reorder_callback(5, 7, arg)”
211 reports that character indexes #5 through #11 got reversed.
212
213 A NULL string pointer leaves the levels buffer unchanged, but still
214 invokes the reorder_callback as if the character string, and their
215 embedding values, were reversed.
216
217 The resulting string and embedding levels are in “rendering order”, but
218 still contain bi-directional embedding, override, boundary-neutral,
219 isolate, and marker characters. unicode_bidi_cleanup removes these
220 characters and directional markers.
221
222 The parameters to unicode_bidi_cleanup() are:
223
224 • The pointer to the unicode string.
225
226 • A non-null pointer to the directional embedding level buffer, of
227 the same size as the string, also removes the corresponding values
228 from the buffer, and the remaining values in the embedding level
229 buffer get reset to levels UNICODE_BIDI_LR and UNICODE_BIDI_RL,
230 only.
231
232 • The size of the unicode string and the directional embedding buffer
233 (if not NULL).
234
235 • A bitmask that selects the following options (or 0 if no
236 options):
237
238 UNICODE_BIDI_CLEANUP_EXTRA
239 In addition to removing all embedding, override, and
240 boundary-neutral characters as specified by step X9 of the
241 bi-directional algorithm (the default behavior without this
242 flag), also remove all isolation markers and implicit markers.
243
244 UNICODE_BIDI_CLEANUP_BNL
245 Replace all characters classified as paragraph separators with
246 a newline character.
247
248 UNICODE_BIDI_CLEANUP_CANONICAL
249 A combined set of UNICODE_BIDI_CLEANUP_EXTRA and
250 UNICODE_BIDI_CLEANUP_BNL.
251
252 • A pointer to a function that gets repeatedly invoked with the index
253 of the character that gets removed from the Unicode string.
254
255 • An opaque pointer that gets forwarded to the callback.
256
257 The function pointer (if not NULL) gets invoked to report the index of
258 each removed character. The reported index is the index from the
259 original string, and the callback gets invoked in strict order, from
260 the first to the last removed character (if any).
261
262 The character string and the embedding level values resulting from
263 unicode_bidi_cleanup() with the UNICODE_BIDI_CLEANUP_CANONICAL are in
264 “canonical rendering order”. unicode_bidi_logical_order(),
265 unicode_bidi_needs_embed() and unicode_bidi_embed() require the
266 canonical rendering order for their string and embedding level values.
267
268 The parameters to unicode_bidi_cleaned_size() are a pointer to the
269 unicode string, its size, and the bitmask option to
270 unicode_bidi_cleanup().
271
272 Embedding bi-directional markers in Unicode text strings
273 unicode_bidi_logical_order() rearranges the string from rendering to
274 its logical order. unicode_bidi_embed() adds various bi-directional
275 markers to a Unicode string in canonical rendering order. The resulting
276 string is not guaranteed to be identical to the original Unicode
277 bi-directional string. The algorithm is fairly basic, but the resulting
278 bi-directional string produces the same canonical rendering order after
279 applying unicode_bidi_calc() or unicode_bidi_calc_levels(),
280 unicode_bidi_reorder() and unicode_bidi_cleanup() (with the canonical
281 option), with the same paragraph_embedding level.
282 unicode_bidi_needs_embed() attempts to heuristically determine whether
283 unicode_bidi_embed() is required.
284
285 unicode_bidi_logical_order() gets called first, followed by
286 unicode_bidi_embed() (or unicode_bidi_needs_embed() in order to
287 determine whether bi-directional markers are required). Finally,
288 unicode_bidi_embed_paragraph_level() optionally determines whether the
289 resulting string's default paragraph embedding level matches the one
290 used for the actual embedding direction, and if not returns a
291 directional marker to be prepended to the Unicode character string, as
292 a hint.
293
294 unicode_bidi_logical_order() factors in the characters' embedding
295 values, and the provided paragraph embedding value (UNICODE_BIDI_LR or
296 UNICODE_BIDI_RL), and rearranges the characters and the embedding
297 levels in left-to-right order, while simultaneously invoking the
298 supplied reorder_callback indicating each range of characters whose
299 relative order gets reversed. The reorder_callback() receives, as
300 parameters:
301
302 • The starting index of the first reversed character, in the string.
303
304 • Number of reversed characters.
305
306 • Forwarded arg pointer value.
307
308 This specifies a consecutive range of characters (and directional
309 embedding values) that get reversed (first character in the range
310 becomes the last character, and the last character becomes the first
311 character).
312
313 After unicode_bidi_logical_order(), unicode_bidi_embed() progressively
314 invokes the passed-in callback with the contents of a bi-directional
315 unicode string. The parameters to unicode_bidi_embed() are:
316
317 • The Unicode string.
318
319 • The directional embedding buffer, in canonical rendering order.
320
321 • The size of the string and the embedding level buffer.
322
323 • The paragraph embedding level, either UNICODE_BIDI_LR or
324 UNICODE_BIDI_RL.
325
326 • The pointer to the callback function.
327
328 • An opaque pointer argument that gets forwarded to the callback
329 function.
330
331 The callback receives pointers to various parts of the original string
332 that gets passed to unicode_bidi_embed(), intermixed with
333 bi-directional markers, overrides, and isolates. The callback's
334 parameters are:
335
336 • The pointer to a Unicode string.
337
338 Note
339 It is not a given that the callback receives pointers to
340 progressively increasing pointers of the original string that
341 gets passed to unicode_bidi_embed(). Some calls will be for
342 individual bi-directional markers, and unicode_bidi_embed()
343 also performs some additional internal reordering, on the fly,
344 after unicode_bidi_logical_order()'s big hammer.
345
346 • Number of characters in the Unicode string.
347
348 • Indication whether the Unicode string pointer is pointing to a part
349 of the original Unicode string that's getting embedded. Otherwise
350 this must be some marker character that's not present in the
351 original Unicode string.
352
353 • Forwarded arg pointer value.
354
355 The assembled unicode string should produce the same canonical
356 rendering order, for the same paragraph embedding level.
357 unicode_bidi_embed_paragraph_level() checks if the specified Unicode
358 string computes the given default paragraph embedding level and returns
359 0 if it matches. Otherwise it returns a directional marker that should
360 be prepended to the Unicode string to allow unicode_bidi_calc's (or
361 unicode_bidi_calc_levels()) optional paragraph embedding level
362 pointer's value to be NULL, but derive the same default embedding
363 level. The parameters to unicode_bidi_embed_paragraph_level() are:
364
365 • The Unicode string.
366
367 • The size of the string.
368
369 • The paragraph embedding level, either UNICODE_BIDI_LR or
370 UNICODE_BIDI_RL.
371
372 unicode_bidi_needs_embed() attempts to heuristically determine whether
373 the Unicode string, in logical order, requires bi-directional markers.
374 The parameters to unicode_bidi_needs_embed() are:
375
376 • The Unicode string.
377
378 • The directional embedding buffer, in logical order.
379
380 • The size of the string and the embedding level buffer.
381
382 • A pointer to an explicit paragraph embedding level, either
383 UNICODE_BIDI_LR or UNICODE_BIDI_RL; or a NULL pointer (see
384 unicode_bidi_calc_levels()'s explanation for this parameter).
385
386 unicode_bidi_needs_embed() returns 0 if the Unicode string does not
387 need explicit directional markers, or 1 if it does. This is done by
388 using unicode_bidi_calc(), unicode_bidi_reorder(),
389 unicode_bidi_logical_order and then checking if the end result is
390 different from what was passed in.
391
392 Combining character ranges
393 unicode_bidi_combinings() reports consecutive sequences of one or more
394 combining marks in bidirectional text (which can be either in rendering
395 or logical order) that have the same embedding level. It takes the
396 following parameters:
397
398 • The Unicode string.
399
400 • The directional embedding buffer, in logical or rendering order. A
401 NULL value for this pointer is equivalent to a directional
402 embedding buffer with a level of 0 for every character in the
403 Unicode string.
404
405 • Number of characters in the Unicode string.
406
407 • The pointer to the callback function.
408
409 • An opaque pointer argument that gets forwarded to the callback
410 function.
411
412 The callback function gets invoked for every consecutive sequence of
413 one or more characters that have a canonical combining class other than
414 0, and with the same embedding level. The parameters to the callback
415 function are:
416
417 • The embedding level of the combining characters.
418
419 • The starting index of a consecutive sequence of all characters with
420 the same embedding level.
421
422 • The number of characters with the same embedding level.
423
424 • The starting index of a consecutive sequence of all characters with
425 the same embedding level and a canonical combining class other than
426 0. This will always be equal to or greater than the value of the
427 second parameter.
428
429 • The number of consecutive characters with the
430 same embedding level and a canonical combining class other than 0.
431 The last character included in this sequence will always be less
432 than or equal to the last character in the sequence defined by the
433 second and the third parameters.
434
435 • The opaque pointer argument that was passed to
436 unicode_bidi_combinings.
437
438 A consecutive sequence of Unicode characters with non-0 combining
439 classes but different embedding levels gets reported individually, for
440 each consecutive sequence with the same embedding level.
441
442 This function helps with reordering the combining characters in
443 right-to-left-rendered text. Right-to-left text reversed by
444 unicode_bidi_reorder() results in combining characters preceding their
445 starter character. They get reversed no differently than any other
446 character. The same thing also occurs after
447 unicode_bidi_logical_order() reverses everything back. Use
448 unicode_bidi_combinings to identify consecutive sequences of combining
449 characters followed by their original starter.
450
451 The callback may reorder the characters identified by its fourth and the
452 fifth parameters in the manner described below.
453 unicode_bidi_combinings's string parameter is a pointer to a constant Unicode
454 string; but the callback can modify the string (via an out-of-band mutable
455 pointer) subject to the following conditions:
456
457 • The characters identified by the fourth and the fifth parameter may
458 be modified.
459
460 • If the last character in this sequence is not the last character
461 included in the range specified by the second and the third
462 parameter, then one more character after the last character may
463 also be modified.
464
465 This is, presumably, the original starter that preceded the
466 combining characters before the entire sequence was reversed.
467
468 Here's an example of a callback that reverses combining characters and
469 their immediately-following starter character:
470
471 void reorder_right_to_left_combining(unicode_bidi_level_t level,
472 size_t level_start,
473 size_t n_chars,
474 size_t comb_start,
475 size_t n_comb_chars,
476 void *arg)
477 {
478 /* Let's say that this is the Unicode string */
479 char32_t *buf=(char32_t *)arg;
480
481 if ((level & 1) == 0)
482 return; /* Left-to-right text not reversed */
483
484 char32_t *b=buf+comb_start;
485 char32_t *e=b+n_comb_chars;
486
487 /*
488 ** Include the starter characters in the reversed range.
489 ** The semantics of the combining characters with different
490 ** embedding levels -- so they get reported here separately -- is
491 ** not specified. This will reverse just the combining marks, and
492 ** they're on their own.
493 */
494
495 if (comb_start + n_comb_chars < level_start + n_chars)
496 ++e;
497
498 while (b < e)
499 {
500 char32_t t;
501
502 --e;
503 t=*b;
504 *b=*e;
505 *e=t;
506 ++b;
507 }
508 }
509
510 Miscellaneous utility functions
511 unicode_bidi_get_direction takes a pointer to a unicode string, the
512 number of characters in the unicode string, and determines the default
513 paragraph embedding level. unicode_bidi_get_direction returns a struct
514 with the following fields:
515
516 direction
517 This value is either UNICODE_BIDI_LR or UNICODE_BIDI_RL (left to
518 right or right to left).
519
520 is_explicit
521 This value is a flag. A non-0 value indicates that the embedding
522 level was derived from an explicit character type (L, R or AL) from
523 the string. A 0 value indicates the default paragraph direction, no
524 explicit character was found in the string.
525
526 unicode_bidi_type looks up each character's bi-directional character
527 type.
528
529 unicode_bidi_setbnl takes a pointer to a unicode string, a pointer to
530 an array of unicode_bidi_type_t values and the number of characters in the
531 string and the array. unicode_bidi_setbnl replaces all paragraph
532 separators in the unicode string with a newline character (same as the
533 UNICODE_BIDI_CLEANUP_BNL option to unicode_bidi_cleanup).
534
535 unicode_bidi_mirror returns the glyph that's a mirror image of the
536 parameter (i.e. an open parenthesis for a close parenthesis, and vice
537 versa); or the same value if there is no mirror image (this is the
538 Bidi_Mirrored=Yes property).
539
540 unicode_bidi_bracket_type looks up each bracket character and returns
541 its opposite, or the same value if the character is not a bracket that
542 has an opposing bracket character (this is the Bidi_Paired_Bracket_Type
543 property). A non-NULL ret gets initialized to either UNICODE_BIDI_o,
544 UNICODE_BIDI_c or UNICODE_BIDI_n.
545
547 TR-9[1], unicode::bidi(3), courier-unicode(7)
548
550 Sam Varshavchik
551 Author
552
554 1. Unicode Bi-Directional algorithm
555 https://www.unicode.org/reports/tr9/tr9-42.html
556
557
558
559Courier Unicode Library 04/16/2022 UNICODE_BIDI(3)