1UNICODE_BIDI(3)             Courier Unicode Library            UNICODE_BIDI(3)
2
3
4

NAME

6       unicode_bidi, unicode_bidi_calc_levels, unicode_bidi_calc_types,
7       unicode_bidi_calc, unicode_bidi_reorder, unicode_bidi_cleanup,
8       unicode_bidi_cleaned_size, unicode_bidi_logical_order,
9       unicode_bidi_combinings, unicode_bidi_needs_embed, unicode_bidi_embed,
10       unicode_bidi_embed_paragraph_level, unicode_bidi_direction,
11       unicode_bidi_type, unicode_bidi_setbnl, unicode_bidi_mirror,
12       unicode_bidi_bracket_type - unicode bi-directional algorithm
13

SYNOPSIS

15       #include <courier-unicode.h>
16
17       unicode_bidi_level_t lr=UNICODE_BIDI_LR;
18
19       void unicode_bidi_calc_types(const char32_t *p, size_t n,
20                                    unicode_bidi_type_t *types);
21
22       struct unicode_bidi_direction
23                                                              unicode_bidi_calc_levels(const char32_t *p,
24                                                              const unicode_bidi_type_t *types,
25                                                              size_t n,
26                                                              unicode_bidi_level_t *levels,
27                                                              const unicode_bidi_level_t *initial_embedding_level);
28
29       struct unicode_bidi_direction unicode_bidi_calc(const char32_t *p,
30                                                       size_t n,
31                                                       unicode_bidi_level_t *levels,
32                                                       const unicode_bidi_level_t *initial_embedding_level);
33
34       void unicode_bidi_reorder(char32_t *string,
35                                 unicode_bidi_level_t *levels, size_t n,
36                                 void (*reorder_callback)(size_t, size_t, void *),
37                                 void *arg);
38
39       size_t unicode_bidi_cleanup(char32_t *string,
40                                   unicode_bidi_level_t *levels, size_t n,
41                                   int options,
42                                   void (*removed_callback)(size_t, size_t, void *),
43                                   void *arg);
44
45       size_t unicode_bidi_cleaned_size(const char32_t *string, size_t n,
46                                        int options);
47
48       void unicode_bidi_logical_order(char32_t *string,
49                                       unicode_bidi_level_t *levels, size_t n,
50                                       unicode_bidi_level_t paragraph_embedding,
51                                       void (*reorder_callback)(size_t index, size_t n, void *arg),
52                                       void *arg);
53
54       void unicode_bidi_combinings(const char32_t *string,
55                                    const unicode_bidi_level_t *levels,
56                                    size_t n,
57                                    void (*combinings)(unicode_bidi_level_t level, size_t level_start, size_t n_chars, size_t comb_start, size_t n_comb_chars, void *arg),
58                                    void *arg);
59
60       int unicode_bidi_needs_embed(const char32_t *string,
61                                    const unicode_bidi_level_t *levels,
62                                    size_t n,
63                                    const unicode_bidi_level_t *paragraph_embedding);
64
65       size_t unicode_bidi_embed(const char32_t *string,
66                                 const unicode_bidi_level_t *levels, size_t n,
67                                 unicode_bidi_level_t paragraph_embedding,
68                                 void (*emit)(const char32_t *string, size_t n, int is_part_of_string, void *arg),
69                                 void *arg);
70
71       char32_t unicode_bidi_embed_paragraph_level(const char32_t *string,
72                                                   size_t n,
73                                                   unicode_bidi_level_t paragraph_embedding);
74
75       char32_t unicode_bidi_mirror(char32_t c);
76
77       char32_t unicode_bidi_bracket_type(char32_t c, unicode_bracket_type_t *ret);
78
79       struct unicode_bidi_direction unicode_bidi_get_direction(char32_t *c,
80                                                                size_t n);
81
82       enum_bidi_type_t unicode_bidi_type(char32_t c);
83
84       void unicode_bidi_setbnl(char32_t *p, const unicode_bidi_type_t *types,
85                                size_t n);
86

DESCRIPTION

88       These functions are related to the Unicode Bi-Directional algorithm[1].
89       They implement the algorithm up to and including step L2, and provide
90       additional functionality of returning miscellaneous
91       bi-directional-related metadata of Unicode characters. There's also a
92       basic algorithm that “reverses” the bi-directional algorithm and
93       produces a Unicode string with bi-directional markers that results in
94       the same bi-directional string after reapplying the algorithm.
95
96   Calculating bi-directional rendering order
97       The following process computes the rendering order of characters
98       according to the Unicode Bi-Directional algorithm:
99
100        1. Allocate an array of unicode_bidi_type_t that's the same size as
101           the Unicode string.
102
103        2. Allocate an array of unicode_bidi_level_t that's the same size as
104           the Unicode string.
105
106        3. Use unicode_bidi_calc_types() to compute the Unicode string's
107           characters' bi-directional types, and populate the
108           unicode_bidi_type_t buffer.
109
110        4. Use unicode_bidi_calc_levels() to compute the Unicode string's
111           characters' bi-directional embedding level (executes the
112           Bi-Directional algorithm up to and including step L1). This
113           populates the unicode_bidi_level_t buffer.
114
115        5. Alternatively: allocate only the unicode_bidi_level_t array and use
116           unicode_bidi_calc(), which malloc()s the unicode_bidi_type_t
117           buffer, calls unicode_bidi_calc_levels(), and then free()s the
118           buffer.
119
120        6. Use unicode_bidi_reorder() to reverse any characters in the string,
121           according to the algorithm (step L2), with an optional callback
122           that reports which ranges of characters get reversed.
123
124        7. Use unicode_bidi_cleanup() to remove the characters from the string
125           which are used by the bi-directional algorithm, and are not needed
126           for rendering the text.  unicode_bidi_cleaned_size() is available
127           to determine, in advance, how many characters will remain.
128
129       The parameters to unicode_bidi_calc_types() are:
130
131       •   A pointer to the Unicode string.
132
133       •   Number of characters in the Unicode string.
134
135       •   A pointer to an array of unicode_bidi_type_t values. The caller is
136           responsible for allocating and deallocating this array, which has
137           the same size as the Unicode string.
138
139       The parameters to unicode_bidi_calc_levels() are:
140
141       •   A pointer to the Unicode string.
142
143       •   A pointer to the buffer that was passed to
144           unicode_bidi_calc_types().
145
146       •   Number of characters in the Unicode string and the
147           unicode_bidi_type_t buffer.
148
149       •   A pointer to an array of unicode_bidi_level_t values. The caller is
150           responsible for allocating and deallocating this array, which has
151           the same size as the Unicode string.
152
153       •   An optional pointer to a UNICODE_BIDI_LR or UNICODE_BIDI_RL value.
154           This sets the default paragraph direction level. A null pointer
155           computes the default paragraph direction level based on the string,
156           as specified by the "P" rules of the bi-directional algorithm.
157
158       The parameters to unicode_bidi_calc() are the same except for the
159       unicode_bidi_type_t pointer.  unicode_bidi_calc() allocates this buffer
160       by itself and calls unicode_bidi_calc_types, and destroys the buffer
161       before returning.
162
163       unicode_bidi_calc() and unicode_bidi_calc_levels() fill in the
164       unicode_bidi_level_t array with the values corresponding to the
165       embedding level of the corresponding character, according to the Unicode
166       Bidirectional Algorithm (even values for left-to-right ordering, and odd
167       values for right-to-left ordering). A value of UNICODE_BIDI_SKIP
168       designates directional markers (from step X9).
169
170       unicode_bidi_calc() and unicode_bidi_calc_levels() return the resolved
171       paragraph direction level, which always matches the passed in level, if
172       specified, else it reports the derived one. These functions return a
173       unicode_bidi_direction structure:
174
175       struct unicode_bidi_direction {
176           unicode_bidi_level_t   direction;
177           int                    is_explicit;
178       };
179
180
181       direction gives the paragraph embedding level, UNICODE_BIDI_LR or
182       UNICODE_BIDI_RL.  is_explicit indicates whether: the optional pointer
183       to a UNICODE_BIDI_LR or UNICODE_BIDI_RL value was specified (and
184       returned in direction), or whether the direction comes from a
185       character with an explicit direction indication.
186
187       unicode_bidi_reorder() takes the actual unicode string together with
188       the embedding values from unicode_bidi_calc or
189       unicode_bidi_calc_levels(), then reverses the bi-directional string, as
190       specified by step L2 of the bi-directional algorithm. The parameters to
191       unicode_bidi_reorder() are:
192
193       •   A pointer to the Unicode string.
194
195       •   A pointer to an array of unicode_bidi_level_t values.
196
197       •   Number of characters in the Unicode string and the
198           unicode_bidi_level_t array.
199
200       •   An optional reorder_callback function pointer.
201
202       A non-NULL reorder_callback gets invoked to report each reversed
203       character range. The callback's first parameter is the index of the
204       first reversed character, the second parameter is the number of
205       reversed characters, starting at the given index of the Unicode string.
206       The third parameter is the arg passthrough parameter.
207
208       unicode_bidi_reorder modifies its string and levels.  reorder_callback
209       gets invoked after reversing each consecutive range of values in the
210       string and levels buffers. For example: “reorder_callback(5, 7, arg)”
211       reports that character indexes #5 through #11 got reversed.
212
213       A NULL string pointer leaves the levels buffer unchanged, but still
214       invokes the reorder_callback as if the character string, and their
215       embedding values, were reversed.
216
217       The resulting string and embedding levels are in “rendering order”, but
218       still contain bi-directional embedding, override, boundary-neutral,
219       isolate, and marker characters.  unicode_bidi_cleanup removes these
220       characters and directional markers.
221
222       The parameters to unicode_bidi_cleanup() are:
223
224       •   The pointer to the unicode string.
225
226       •   A non-null pointer to the directional embedding level buffer, of
227           the same size as the string, also removes the corresponding values
228           from the buffer, and the remaining values in the embedding level
229           buffer get reset to levels UNICODE_BIDI_LR and UNICODE_BIDI_RL,
230           only.
231
232       •   The size of the unicode string and the directional embedding buffer
233           (if not NULL).
234
235       •   A bitmask that selects the following options (or 0 if no
236           options):
237
238           UNICODE_BIDI_CLEANUP_EXTRA
239               In addition to removing all embedding, override, and
240               boundary-neutral characters as specified by step X9 of the
241               bi-directional algorithm (the default behavior without this
242               flag), also remove all isolation markers and implicit markers.
243
244           UNICODE_BIDI_CLEANUP_BNL
245               Replace all characters classified as paragraph separators with
246               a newline character.
247
248           UNICODE_BIDI_CLEANUP_CANONICAL
249               A combined set of UNICODE_BIDI_CLEANUP_EXTRA and
250               UNICODE_BIDI_CLEANUP_BNL.
251
252       •   A pointer to a function that gets repeatedly invoked with the index
253           of the character that gets removed from the Unicode string.
254
255       •   An opaque pointer that gets forwarded to the callback.
256
257       The function pointer (if not NULL) gets invoked to report the index of
258       each removed character. The reported index is the index from the
259       original string, and the callback gets invoked in strict order, from
260       the first to the last removed character (if any).
261
262       The character string and the embedding level values resulting from
263       unicode_bidi_cleanup() with the UNICODE_BIDI_CLEANUP_CANONICAL are in
264       “canonical rendering order”.  unicode_bidi_logical_order(),
265       unicode_bidi_needs_embed() and unicode_bidi_embed() require the
266       canonical rendering order for their string and embedding level values.
267
268       The parameters to unicode_bidi_cleaned_size() are a pointer to the
269       unicode string, its size, and the bitmask option to
270       unicode_bidi_cleanup().
271
272   Embedding bi-directional markers in Unicode text strings
273       unicode_bidi_logical_order() rearranges the string from rendering to
274       its logical order.  unicode_bidi_embed() adds various bi-directional
275       markers to a Unicode string in canonical rendering order. The resulting
276       string is not guaranteed to be identical to the original Unicode
277       bi-directional string. The algorithm is fairly basic, but the resulting
278       bi-directional string produces the same canonical rendering order after
279       applying unicode_bidi_calc() or unicode_bidi_calc_levels(),
280       unicode_bidi_reorder() and unicode_bidi_cleanup() (with the canonical
281       option), with the same paragraph_embedding level.
282       unicode_bidi_needs_embed() attempts to heuristically determine whether
283       unicode_bidi_embed() is required.
284
285       unicode_bidi_logical_order() gets called first, followed by
286       unicode_bidi_embed() (or unicode_bidi_needs_embed() in order to
287       determine whether bi-directional markers are required). Finally,
288       unicode_bidi_embed_paragraph_level() optionally determines whether the
289       resulting string's default paragraph embedding level matches the one
290       used for the actual embedding direction, and if not returns a
291       directional marker to be prepended to the Unicode character string, as
292       a hint.
293
294       unicode_bidi_logical_order() factors in the characters' embedding
295       values, and the provided paragraph embedding value (UNICODE_BIDI_LR or
296       UNICODE_BIDI_RL), and rearranges the characters and the embedding
297       levels in left-to-right order, while simultaneously invoking the
298       supplied reorder_callback indicating each range of characters whose
299       relative order gets reversed. The reorder_callback() receives, as
300       parameters:
301
302       •   The starting index of the first reversed character, in the string.
303
304       •   Number of reversed characters.
305
306       •   Forwarded arg pointer value.
307
308       This specifies a consecutive range of characters (and directional
309       embedding values) that get reversed (first character in the range
310       becomes the last character, and the last character becomes the first
311       character).
312
313       After unicode_bidi_logical_order(), unicode_bidi_embed() progressively
314       invokes the passed-in callback with the contents of a bi-directional
315       unicode string. The parameters to unicode_bidi_embed() are:
316
317       •   The Unicode string.
318
319       •   The directional embedding buffer, in canonical rendering order.
320
321       •   The size of the string and the embedding level buffer.
322
323       •   The paragraph embedding level, either UNICODE_BIDI_LR or
324           UNICODE_BIDI_RL.
325
326       •   The pointer to the callback function.
327
328       •   An opaque pointer argument that gets forwarded to the callback
329           function.
330
331       The callback receives pointers to various parts of the original string
332       that gets passed to unicode_bidi_embed(), intermixed with
333       bi-directional markers, overrides, and isolates. The callback's
334       parameters are:
335
336       •   The pointer to a Unicode string.
337
338               Note
339               It is not a given that the callback receives pointers to
340               progressively increasing pointers of the original string that
341               gets passed to unicode_bidi_embed(). Some calls will be for
342               individual bi-directional markers, and unicode_bidi_embed()
343               also performs some additional internal reordering, on the fly,
344               after unicode_bidi_logical_order()'s big hammer.
345
346       •   Number of characters in the Unicode string.
347
348       •   Indication whether the Unicode string pointer is pointing to a part
349           of the original Unicode string that's getting embedded. Otherwise
350           this must be some marker character that's not present in the
351           original Unicode string.
352
353       •   Forwarded arg pointer value.
354
355       The assembled unicode string should produce the same canonical
356       rendering order, for the same paragraph embedding level.
357       unicode_bidi_embed_paragraph_level() checks if the specified Unicode
358       string computes the given default paragraph embedding level and returns
359       0 if it matches. Otherwise it returns a directional marker that should
360       be prepended to the Unicode string to allow unicode_bidi_calc's (or
361       unicode_bidi_calc_levels()) optional paragraph embedding level
362       pointer's value to be NULL, but derive the same default embedding
363       level. The parameters to unicode_bidi_embed_paragraph_level() are:
364
365       •   The Unicode string.
366
367       •   The size of the string.
368
369       •   The paragraph embedding level, either UNICODE_BIDI_LR or
370           UNICODE_BIDI_RL.
371
372       unicode_bidi_needs_embed() attempts to heuristically determine whether
373       the Unicode string, in logical order, requires bi-directional markers.
374       The parameters to unicode_bidi_needs_embed() are:
375
376       •   The Unicode string.
377
378       •   The directional embedding buffer, in logical order.
379
380       •   The size of the string and the embedding level buffer.
381
382       •   A pointer to an explicit paragraph embedding level, either
383           UNICODE_BIDI_LR or UNICODE_BIDI_RL; or a NULL pointer (see
384           unicode_bidi_calc_levels()'s explanation for this parameter).
385
386       unicode_bidi_needs_embed() returns 0 if the Unicode string does not
387       need explicit directional markers, or 1 if it does. This is done by
388       using unicode_bidi_calc(), unicode_bidi_reorder(),
389       unicode_bidi_logical_order and then checking if the end result is
390       different from what was passed in.
391
392   Combining character ranges
393       unicode_bidi_combinings() reports consecutive sequences of one or more
394       combining marks in bidirectional text (which can be either in rendering
395       or logical order) that have the same embedding level. It takes the
396       following parameters:
397
398       •   The Unicode string.
399
400       •   The directional embedding buffer, in logical or rendering order. A
401           NULL value for this pointer is equivalent to a directional
402           embedding buffer with a level of 0 for every character in the
403           Unicode string.
404
405       •   Number of characters in the Unicode string.
406
407       •   The pointer to the callback function.
408
409       •   An opaque pointer argument that gets forwarded to the callback
410           function.
411
412       The callback function gets invoked for every consecutive sequence of
413       one or more characters that have a canonical combining class other than
414       0, and with the same embedding level. The parameters to the callback
415       function are:
416
417       •   The embedding level of the combining characters.
418
419       •   The starting index of a consecutive sequence of all characters with
420           the same embedding level.
421
422       •   The number of characters with the same embedding level.
423
424       •   The starting index of a consecutive sequence of all characters with
425           the same embedding level and a canonical combining class other than
426           0. This will always be equal to or greater than the value of the
427           second parameter.
428
429       •   The number of consecutive characters with the same embedding level
430           and a canonical combining class other than 0.
431           The last character included in this sequence will always be less
432           than or equal to the last character in the sequence defined by the
433           second and the third parameters.
434
435       •   The opaque pointer argument that was passed to
436           unicode_bidi_combinings.
437
438       A consecutive sequence of Unicode characters with non-0 combining
439       classes but different embedding levels gets reported individually, for
440       each consecutive sequence with the same embedding level.
441
442       This function helps with reordering the combining characters in
443       right-to-left-rendered text. Right-to-left text reversed by
444       unicode_bidi_reorder() results in combining characters preceding their
445       starter character. They get reversed no differently than any other
446       character. The same thing also occurs after
447       unicode_bidi_logical_order() reverses everything back. Use
448       unicode_bidi_combinings to identify consecutive sequences of combining
449       characters followed by their original starter.
450
451       The callback may reorder the characters identified by its fourth and
452       fifth parameters (comb_start and n_comb_chars) in the manner described
453       below. unicode_bidi_combinings's string parameter is a pointer to a
454       constant Unicode string; but the callback can modify the string (via an
455       out-of-band mutable pointer) subject to the following conditions:
456
457       •   The characters identified by the fourth and the fifth parameters
458           may be modified.
459
460       •   If the last character in this sequence is not the last character
461           included in the range specified by the second and the third
462           parameters, then one more character after the last character may
463           also be modified.
464
465           This is, presumably, the original starter that preceded the
466           combining characters before the entire sequence was reversed.
467
468       Here's an example of a callback that reverses combining characters and
469       their immediately-following starter character:
470
471           void reorder_right_to_left_combining(unicode_bidi_level_t level,
472                                                size_t level_start,
473                                                size_t n_chars,
474                                                size_t comb_start,
475                                                size_t n_comb_chars,
476                                                void *arg)
477           {
478               /* Let's say that this is the Unicode string */
479               char32_t *buf=(char32_t *)arg;
480
481               if ((level & 1) == 0)
482                   return; /* Left-to-right text not reversed */
483
484               char32_t *b=buf+comb_start;
485               char32_t *e=b+n_comb_chars;
486
487               /*
488               ** Include the starter characters in the reversed range.
489               ** The semantics of the combining characters with different
490               ** embedding levels -- so they get reported here separately -- is
491               ** not specified. This will reverse just the combining marks, and
492               ** they're on their own.
493               */
494
495               if (comb_start + n_comb_chars < level_start + n_chars)
496                   ++e;
497
498               while (b < e)
499               {
500                   char32_t t;
501
502                   --e;
503                   t=*b;
504                   *b=*e;
505                   *e=t;
506                   ++b;
507               }
508           }
509
510   Miscellaneous utility functions
511       unicode_bidi_get_direction takes a pointer to a unicode string, the
512       number of characters in the unicode string, and determines the default
513       paragraph embedding level.  unicode_bidi_get_direction returns a struct
514       with the following fields:
515
516       direction
517           This value is either UNICODE_BIDI_LR or UNICODE_BIDI_RL (left to
518           right or right to left).
519
520       is_explicit
521           This value is a flag. A non-0 value indicates that the embedding
522           level was derived from an explicit character type (L, R or AL) from
523           the string. A 0 value indicates the default paragraph direction, no
524           explicit character was found in the string.
525
526       unicode_bidi_type looks up each character's bi-directional character
527       type.
528
529       unicode_bidi_setbnl takes a pointer to a unicode string, a pointer to
530       an array of enum_bidi_type_t values and the number of characters in the
531       string and the array.  unicode_bidi_setbnl replaces all paragraph
532       separators in the unicode string with a newline character (same as the
533       UNICODE_BIDI_CLEANUP_BNL option to unicode_bidi_cleanup).
534
535       unicode_bidi_mirror returns the glyph that's a mirror image of the
536       parameter (i.e. an open parenthesis for a close parenthesis, and vice
537       versa); or the same value if there is no mirror image (this is the
538       Bidi_Mirrored=Yes property).
539
540       unicode_bidi_bracket_type looks up each bracket character and returns
541       its opposite, or the same value if the character is not a bracket that
542       has an opposing bracket character (this is the Bidi_Paired_Bracket_Type
543       property). A non-NULL ret gets initialized to either UNICODE_BIDI_o,
544       UNICODE_BIDI_c or UNICODE_BIDI_n.
545

SEE ALSO

547       TR-9[1], unicode::bidi(3), courier-unicode(7).
548

AUTHOR

550       Sam Varshavchik
551           Author
552

NOTES

554        1. Unicode Bi-Directional algorithm
555           https://www.unicode.org/reports/tr9/tr9-42.html
556
557
558
559Courier Unicode Library           04/16/2022                   UNICODE_BIDI(3)
Impressum