File: //usr/local/include/courier-unicode.h
#ifndef courier_unicode_h
#define courier_unicode_h
/*
** Copyright 2000-2021 Double Precision, Inc.
** See COPYING for distribution information.
**
*/
#ifdef __cplusplus
#include <string>
#include <vector>
#include <list>
#include <functional>
#include <tuple>
extern "C" {
#endif
#if 0
}
#endif
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <locale.h>
#ifdef __cplusplus
#else
#if 1
#include <uchar.h>
#else
typedef uint32_t char32_t;
#endif
#endif
#define COURIER_UNICODE_VERSION 2002003
/*
** The system default character set, from the locale.
*/
extern const char *unicode_default_chset();
/*
** The current locale character set.
*/
extern const char *unicode_locale_chset();
#if 1
extern const char *unicode_locale_chset_l(locale_t l);
#endif
/* Unicode upper/lower/title case conversion functions */
extern char32_t unicode_uc(char32_t);
extern char32_t unicode_lc(char32_t);
extern char32_t unicode_tc(char32_t);
/*
** Look up HTML 4.0/XHTML entity.
**
** n="amp", etc...
**
** Returns the unicode entity value, or 0 if no such entity is defined.
*/
char32_t unicode_html40ent_lookup(const char *n);
/*
** East Asian Width lookup.
**
** unicode_eastasia looks up the EastAsianWidth property for the given
** Unicode character.
*/
#define UNICODE_EASTASIA_A 'A' /* Ambiguous */
#define UNICODE_EASTASIA_F 'F' /* Full width */
#define UNICODE_EASTASIA_H 'H' /* Half width */
#define UNICODE_EASTASIA_N '/' /* Unassigned */
#define UNICODE_EASTASIA_Na 'N' /* Narrow */
#define UNICODE_EASTASIA_W 'W' /* Wide */
typedef char unicode_eastasia_t;
unicode_eastasia_t unicode_eastasia(char32_t);
/*
**
** Return "width" of unicode character.
**
** This is defined as follows: for characters having the F or W property in
** tr11 (EastAsianWidth), unicode_wcwidth() returns 2.
**
** Otherwise, characters having the BK, CR, LF, CM, NL, WJ, and ZW line
** breaking property as per tr14, unicode_wcwdith() returns 0. For all other
** cases, 1.
**
** This provides a rough estimate of the "width" of the character if its
** shown on a text console.
*/
extern int unicode_wcwidth(char32_t c);
extern size_t unicode_wcwidth_str(const char32_t *c);
/* Internal unicode table lookup functions */
extern uint8_t unicode_tab_lookup(char32_t ch,
const size_t *unicode_starting_indextab,
const char32_t *unicode_starting_pagetab,
size_t unicode_tab_sizeof,
const uint8_t (*unicode_rangetab)[2],
size_t unicode_rangetab_sizeof,
const uint8_t *unicode_classtab,
uint8_t uclass);
extern uint32_t unicode_tab32_lookup(char32_t ch,
const size_t *unicode_starting_indextab,
const char32_t *unicode_starting_pagetab,
size_t unicode_tab_sizeof,
const uint8_t (*unicode_rangetab)[2],
size_t unicode_rangetab_sizeof,
const uint32_t *unicode_classtab,
uint32_t uclass);
/*
** Look up unicode categorization, see http://unicode.org/notes/tn36/
**
** Returns a 32 bit value with four unicode categories encoded in the
** bits defined by UNICODE_CATEGORY_1..4
*/
#define UNICODE_CATEGORY_1 0xFF000000
#define UNICODE_CATEGORY_2 0x00FF0000
#define UNICODE_CATEGORY_3 0x0000FF00
#define UNICODE_CATEGORY_4 0x000000FF
#include <courier-unicode-categories-tab.h>
uint32_t unicode_category_lookup(char32_t);
/*
** Return non-0 for TAB, and all UNICODE_CATEGORY_2_SPACE.
*/
extern int unicode_isblank(char32_t ch);
/*
** The unicode-ish isspace(). In addition to return non-0 for
** unicode_isblank(), this also returns non-0 for unicode characters
** with linebreaking properties of BK, CR, LF, NL, and SP.
*/
extern int unicode_isspace(char32_t ch);
/*
** Return non-0 for all UNICODE_CATEGORY_1_LETTER
*/
extern int unicode_isalpha(char32_t ch);
/*
** Return non-0 for all UNICODE_CATEGORY_1_NUMBER | UNICODE_CATEGORY_2_DIGIT,
** only (no third categories).
*/
extern int unicode_isdigit(char32_t ch);
/*
** Return non-0 for all unicode_isalpha() or unicode_isdigit().
*/
extern int unicode_isalnum(char32_t ch);
/*
** Returns non-0 for all codepoints above SPACE which are not
** unicode_isspace().
*/
extern int unicode_isgraph(char32_t ch);
/*
** Return non-0 for all UNICODE_CATEGORY_1_PUNCTUATION.
*/
extern int unicode_ispunct(char32_t ch);
/*
** Return non-0 for all unicode_isalpha() for which the character is
** equal to unicode_lc() of itself.
*/
extern int unicode_islower(char32_t ch);
/*
** Return non-0 for all unicode_isalpha() for which the character is
** equal to unicode_uc() of itself.
*/
extern int unicode_isupper(char32_t ch);
/*
** Implementation of Unicode emoji classification, as per
** http://www.unicode.org/reports/tr51/tr51-18.html
**
** Given a char32_t, returns the character's emoji value, which is a bitmask:
**
*/
#define UNICODE_EMOJI_NONE 0
#define UNICODE_EMOJI 1
#define UNICODE_EMOJI_PRESENTATION 2
#define UNICODE_EMOJI_MODIFIER 4
#define UNICODE_EMOJI_MODIFIER_BASE 8
#define UNICODE_EMOJI_COMPONENT 16
#define UNICODE_EMOJI_EXTENDED_PICTOGRAPHIC 32
typedef unsigned char unicode_emoji_t;
extern unicode_emoji_t unicode_emoji_lookup(char32_t);
/* Look up just one of the properties, returns non-0 if the char has it */
extern int unicode_emoji(char32_t);
extern int unicode_emoji_presentation(char32_t);
extern int unicode_emoji_modifier(char32_t);
extern int unicode_emoji_modifier_base(char32_t);
extern int unicode_emoji_component(char32_t);
extern int unicode_emoji_extended_pictographic(char32_t);
/*
** Implementation of grapheme cluster boundary rules, as per
** http://www.unicode.org/reports/tr29/tr29-37.html
** including GB9a and GB9b.
**
** unicode_grapheme_break_init() allocates an opaque
** unicode_grapheme_break_info_t handle, and
** unicode_grapheme_break_destroy() destroys it.
**
** Passing the handle to unicode_grapheme_break_next() returns non-0 if
** there's a grapheme break before the given character (and after the
** character that got passed in the previous call).
**
** The first call to unicode_grapheme_break_next() returns 1, as per GB1.
**
** unicode_grapheme_break() is a simplified interface that returns non-0
** if there is a grapheme break between the two characters. This simplified
** interface is equivalent to calling unicode_grapheme_break_init(),
** followed by two calls to unicode_grapheme_break_next(), and finally
** unicode_grapheme_break_deinit(), and returns the result of the second
** call to unicode_grapheme_break_next().
*/
struct unicode_grapheme_break_info_s;
typedef struct unicode_grapheme_break_info_s *unicode_grapheme_break_info_t;
extern unicode_grapheme_break_info_t unicode_grapheme_break_init();
extern int unicode_grapheme_break_next(unicode_grapheme_break_info_t, char32_t);
extern void unicode_grapheme_break_deinit(unicode_grapheme_break_info_t);
extern int unicode_grapheme_break(char32_t a, char32_t b);
typedef enum {
#include <courier-unicode-script-tab.h>
} unicode_script_t;
/*
** Look up the unicode script property, as per
** http://www.unicode.org/reports/tr24/tr24-31.html
*/
unicode_script_t unicode_script(char32_t a);
/*
** Implementation of line break rules, as per
** http://www.unicode.org/reports/tr14/tr14-45.html
**
** Invoke unicode_lb_init() to initialize the linebreaking algorithm. The
** first parameter is a callback function that gets invoked with two
** arguments: UNICODE_LB_{MANDATORY|NONE|ALLOWED}, and a passthrough argument.
** The second parameter to unicode_lb_init() is the opaque passthrough
** pointer, that is passed as the second argument to the callback function
** with no further interpretation.
**
** unicode_lb_init() returns an opaque handle. Invoke unicode_lb_next(),
** passing the handle and one unicode character. Repeatedly invoke
** unicode_lb_next() to specify the input string for the linebreaking
** algorithm, then invoke unicode_lb_end() to finish calculating the
** linebreaking algorithm, and deallocate the opaque linebreaking handle.
**
** The callback function gets invoked once for each invocation of
** unicode_lb_next(). The contract is that before unicode_lb_end() returns,
** the callback function will get invoked the exact number of times that
** unicode_lb_next(), as long as each invocation of the callback function
** returned 0; nothing more, nothing less. The first parameter to the callback
** function will be one of the following values:
**
** UNICODE_LB_MANDATORY - a linebreak is MANDATORY before the corresponding
** character.
** UNICODE_LB_NONE - a linebreak is PROHIBITED before the corresponding
** character.
** UNICODE_LB_ALLOWED - a linebreak is OPTIONAL before the corresponding
** character (the preceding character is a space, or an equivalent).
**
** The callback function should return 0. A non-zero value indicates an
** error, which gets propagated up to the caller. The contract that the
** callback function gets invoked the same number of times that
** unicode_lb_next() gets invoked is now broken.
*/
#define UNICODE_LB_MANDATORY -1
#define UNICODE_LB_NONE 0
#define UNICODE_LB_ALLOWED 1
struct unicode_lb_info;
typedef struct unicode_lb_info *unicode_lb_info_t;
/*
** Allocate a linebreaking handle.
*/
extern unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
void *cb_arg);
/*
** Feed the next character through the linebreaking algorithm.
** A non-zero return code indicates that the callback function was invoked
** and it returned a non-zero return code (which is propagated as a return
** value). unicode_lb_end() must still be invoked, in this case.
**
** A zero return code indicates that if the callback function was invoked,
** it returned 0.
*/
extern int unicode_lb_next(unicode_lb_info_t i, char32_t ch);
/*
** Convenience function that invokes unicode_lb_next() with a list of
** unicode chars. Returns 0 if all invocations of unicode_lb_next() returned
** 0, or the first non-zero return value from unicode_lb_next().
*/
extern int unicode_lb_next_cnt(unicode_lb_info_t i,
const char32_t *chars,
size_t cnt);
/*
** Finish the linebreaking algorithm.
**
** A non-zero return code indicates that the callback function was invoked
** and it returned a non-zero return code (which is propagated as a return
** value).
**
** A zero return code indicates that if the callback function was invoked,
** it returned 0, and that the callback function was invoked exactly the same
** number of times that unicode_lb_next() was invoked.
**
** In all case, the linebreak handle will no longer be valid when this
** function returns.
*/
extern int unicode_lb_end(unicode_lb_info_t i);
/*
** An alternative linebreak API where the callback function receives the
** original unicode character in addition to its linebreak value.
**
** User unicode_lbc_init(), unicode_lbc_next(), and unicode_lbc_end(), whose
** semantics are the same as their _lb_ counterparts.
*/
struct unicode_lbc_info;
typedef struct unicode_lbc_info *unicode_lbc_info_t;
extern unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, char32_t,
void *),
void *cb_arg);
extern int unicode_lbc_next(unicode_lbc_info_t i, char32_t ch);
extern int unicode_lbc_next_cnt(unicode_lbc_info_t i,
const char32_t *chars,
size_t cnt);
extern int unicode_lbc_end(unicode_lbc_info_t i);
/*
** Set linebreaking options.
**
** OPTIONS SUBJECT TO CHANGE.
*/
extern void unicode_lb_set_opts(unicode_lb_info_t i, int opts);
extern void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts);
/*
** Tailorization of LB24: Prevent pluses, as in "C++", from breaking.
**
** Adds the following to LB24:
**
** PR x PR
**
** AL x PR
**
** ID x PR
**/
#define UNICODE_LB_OPT_PRBREAK 0x0001
/*
** Tailored "/" breaking rules. This prevents breaking after the "/"
** character. And provides an exception to the "x SY" rule in LB13.
**
** Adds the following rule to LB13:
**
** SY x EX
**
** SY x AL
**
** SY x ID
**
** SP รท SY, which takes precedence over "x SY".
*/
#define UNICODE_LB_OPT_SYBREAK 0x0002
/*
** Tailored / breaking rules.
**
** This reclassifies U+2013 and U+2014 as class WJ, prohibiting breaks before
** and after mdash and ndash.
*/
#define UNICODE_LB_OPT_DASHWJ 0x0004
/*
** Implemention of word break rules, as per
** http://www.unicode.org/reports/tr29/tr29-37.html
**
** Invoke unicode_wb_init() to initialize the wordbreaking algorithm. The
** first parameter is a callback function that gets invoked with two
** arguments: an int flag, and a passthrough argument. The second parameter to
** unicode_wb_init() is the opaque passthrough pointer, that is passed as the
** second argument to the callback function with no further interpretation.
**
** unicode_wb_init() returns an opaque handle. Invoke unicode_wb_next(),
** passing the handle and one unicode character. Repeatedly invoke
** unicode_wb_next() to specify the input string for the wordbreaking
** algorithm, then invoke unicode_wb_end() to finish calculating the
** wordbreaking algorithm, and deallocate the opaque wordbreaking handle.
**
** The callback function gets invoked once for each invocation of
** unicode_wb_next(). The contract is that before unicode_wb_end() returns,
** the callback function will get invoked the exact number of times that
** unicode_wb_next(), as long as each invocation of the callback function
** returned 0; nothing more, nothing less. The first parameter to the callback
** function will be an int. A non-zero value indicates that there is a word
** break between this character and the preceding one.
**
** The callback function should return 0. A non-zero value indicates an
** error, which gets propagated up to the caller. The contract that the
** callback function gets invoked the same number of times that
** unicode_lb_next() gets invoked is now broken.
*/
struct unicode_wb_info;
typedef struct unicode_wb_info *unicode_wb_info_t;
/*
** Allocate a wordbreaking handle.
*/
extern unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
void *cb_arg);
/*
** Feed the next character through the wordbreaking algorithm.
** A non-zero return code indicates that the callback function was invoked
** and it returned a non-zero return code (which is propagated as a return
** value). unicode_wb_end() must still be invoked, in this case.
**
** A zero return code indicates that if the callback function was invoked,
** it returned 0.
*/
extern int unicode_wb_next(unicode_wb_info_t i, char32_t ch);
/*
** Convenience function that invokes unicode_wb_next() with a list of
** unicode chars. Returns 0 if all invocations of unicode_wb_next() returned
** 0, or the first non-zero return value from unicode_wb_next().
*/
extern int unicode_wb_next_cnt(unicode_wb_info_t i,
const char32_t *chars,
size_t cnt);
/*
** Finish the wordbreaking algorithm.
**
** A non-zero return code indicates that the callback function was invoked
** and it returned a non-zero return code (which is propagated as a return
** value).
**
** A zero return code indicates that if the callback function was invoked,
** it returned 0, and that the callback function was invoked exactly the same
** number of times that unicode_wb_next() was invoked.
**
** In all case, the wordbreak handle will no longer be valid when this
** function returns.
*/
extern int unicode_wb_end(unicode_wb_info_t i);
/*
** Search for a word boundary.
**
** Obtain a handle by calling unicode_wbscan_init(), then invoke
** unicode_wbscan_next() to provide a unicode stream, then invoke
** unicode_wbscan_end(). unicode_wbscan_end() returns the number of unicode
** characters from the beginning of the stream until the first word boundary.
**
** You may prematurely stop calling unicode_wbscan_next() once it returns a
** non-0 value, which means that there is sufficient context to compute the
** first word boundary, and all further calls to unicode_wbscan_next() will
** be internal no-ops.
*/
struct unicode_wbscan_info;
typedef struct unicode_wbscan_info *unicode_wbscan_info_t;
unicode_wbscan_info_t unicode_wbscan_init();
int unicode_wbscan_next(unicode_wbscan_info_t i, char32_t ch);
size_t unicode_wbscan_end(unicode_wbscan_info_t i);
/* Unicode directional markers */
#define UNICODE_LRM 0x200E /* Left-to-right marker */
#define UNICODE_RLM 0x200F /* Right-to-left marker */
#define UNICODE_ALM 0x061C /* Right-to-left Arabic marker */
#define UNICODE_LRI 0x2066 /* Left-to-right isolate */
#define UNICODE_RLI 0x2067 /* Right-to-left isolate */
#define UNICODE_PDI 0x2069 /* Pop isolate */
#define UNICODE_RLO 0x202e /* Right-to-left override */
#define UNICODE_LRO 0x202d /* Left-to-right override */
#define UNICODE_PDF 0x202c /* Pop directional override */
#ifdef __cplusplus
#if __cplusplus >= 201103L
namespace unicode {
namespace literals {
constexpr char32_t LRM[]={UNICODE_LRM, 0};
constexpr char32_t RLM[]={UNICODE_RLM, 0};
constexpr char32_t ALM[]={UNICODE_ALM, 0};
constexpr char32_t LRI[]={UNICODE_LRI, 0};
constexpr char32_t RLI[]={UNICODE_RLI, 0};
constexpr char32_t PDI[]={UNICODE_PDI, 0};
constexpr char32_t RLO[]={UNICODE_RLO, 0};
constexpr char32_t LRO[]={UNICODE_LRO, 0};
constexpr char32_t PDF[]={UNICODE_PDF, 0};
}
}
#endif
#endif
typedef char unicode_bidi_bracket_type_t;
#define UNICODE_BIDI_n 'n'
#define UNICODE_BIDI_o 'o'
#define UNICODE_BIDI_c 'c'
extern char32_t unicode_bidi_mirror(char32_t c);
extern char32_t unicode_bidi_bracket_type(char32_t c,
unicode_bidi_bracket_type_t *ret);
typedef unsigned char unicode_bidi_level_t;
#define UNICODE_BIDI_LR ((unicode_bidi_level_t)0)
#define UNICODE_BIDI_RL ((unicode_bidi_level_t)1)
#define UNICODE_BIDI_SKIP ((unicode_bidi_level_t)254)
/*
** What unicode_bidi_direction returns.
*/
struct unicode_bidi_direction {
/* Direction of the given text */
unicode_bidi_level_t direction;
/*
** The direction is explicit, if not direction is UNICODE_BIDI_LR by
** default.
*/
int is_explicit;
};
struct unicode_bidi_direction unicode_bidi_get_direction(const char32_t *p,
size_t n);
struct unicode_bidi_direction unicode_bidi_calc(const char32_t *p, size_t n,
unicode_bidi_level_t *bufp,
const unicode_bidi_level_t *
initial_embedding_level);
extern void unicode_bidi_reorder(char32_t *p,
unicode_bidi_level_t *levels,
size_t n,
void (*reorder_callback)(size_t, size_t,
void *),
void *arg);
/* BIDI_TYPE_LIST */
typedef enum {
UNICODE_BIDI_TYPE_AL,
UNICODE_BIDI_TYPE_AN,
UNICODE_BIDI_TYPE_B,
UNICODE_BIDI_TYPE_BN,
UNICODE_BIDI_TYPE_CS,
UNICODE_BIDI_TYPE_EN,
UNICODE_BIDI_TYPE_ES,
UNICODE_BIDI_TYPE_ET,
UNICODE_BIDI_TYPE_FSI,
UNICODE_BIDI_TYPE_L,
UNICODE_BIDI_TYPE_LRE,
UNICODE_BIDI_TYPE_LRI,
UNICODE_BIDI_TYPE_LRO,
UNICODE_BIDI_TYPE_NSM,
UNICODE_BIDI_TYPE_ON,
UNICODE_BIDI_TYPE_PDF,
UNICODE_BIDI_TYPE_PDI,
UNICODE_BIDI_TYPE_R,
UNICODE_BIDI_TYPE_RLE,
UNICODE_BIDI_TYPE_RLI,
UNICODE_BIDI_TYPE_RLO,
UNICODE_BIDI_TYPE_S,
UNICODE_BIDI_TYPE_WS,
} enum_bidi_type_t;
extern enum_bidi_type_t unicode_bidi_type(char32_t c);
extern void unicode_bidi_calc_types(const char32_t *p, size_t n,
enum_bidi_type_t *buf);
extern void unicode_bidi_setbnl(char32_t *p,
const enum_bidi_type_t *types,
size_t n);
extern struct unicode_bidi_direction
unicode_bidi_calc_levels(const char32_t *p,
const enum_bidi_type_t
*types,
size_t n,
unicode_bidi_level_t *bufp,
const unicode_bidi_level_t
*initial_embedding_level);
/* Bitmask options to unicode_bidi_cleanup */
/*
In addition to removing embedding, override, and boundary-neutral
characters also remove isolation markers and implicit markers.
*/
#define UNICODE_BIDI_CLEANUP_EXTRA 1
/*
Replace all characters classified as paragraph separators by a newline
character.
*/
#define UNICODE_BIDI_CLEANUP_BNL 2
/*
Options for canonical rendering order.
*/
#define UNICODE_BIDI_CLEANUP_CANONICAL \
(UNICODE_BIDI_CLEANUP_EXTRA | UNICODE_BIDI_CLEANUP_BNL)
#ifdef __cplusplus
#if __cplusplus >= 201103L
namespace unicode {
namespace literals {
constexpr int CLEANUP_EXTRA=UNICODE_BIDI_CLEANUP_EXTRA;
constexpr int CLEANUP_BNL=UNICODE_BIDI_CLEANUP_BNL;
constexpr int CLEANUP_CANONICAL=UNICODE_BIDI_CLEANUP_CANONICAL;
}
}
#endif
#endif
extern size_t unicode_bidi_cleanup(char32_t *string,
unicode_bidi_level_t *levels,
size_t n,
int options,
void (*removed_callback)(size_t, void *),
void *);
extern size_t unicode_bidi_cleaned_size(const char32_t *string,
size_t n,
int options);
extern void unicode_bidi_logical_order(char32_t *string,
unicode_bidi_level_t *levels,
size_t n,
unicode_bidi_level_t paragraph_embedding,
void (*reorder_callback)(size_t, size_t,
void *),
void *arg);
extern int unicode_bidi_needs_embed(const char32_t *string,
const unicode_bidi_level_t *levels,
size_t n,
const unicode_bidi_level_t *
paragraph_embedding);
extern void unicode_bidi_embed(const char32_t *string,
const unicode_bidi_level_t *levels,
size_t n,
unicode_bidi_level_t paragraph_embedding,
void (*emit)(const char32_t *string,
size_t n,
int,
void *arg),
void *arg);
extern char32_t unicode_bidi_embed_paragraph_level(const char32_t *str,
size_t n,
unicode_bidi_level_t);
extern void unicode_bidi_combinings(const char32_t *str,
const unicode_bidi_level_t *levels,
size_t n,
void (*combinings)
(unicode_bidi_level_t level,
size_t level_start,
size_t n_chars,
size_t comb_start,
size_t n_comb_chars,
void *arg),
void *arg);
/*
** unicode_canonical() returns the canonical mapping of the given Unicode
** character. The returned structure specifies:
**
** - A pointer to the canonical decomposition of the given character.
** - Number of characters in the canonical decomposition.
** - An optional formatting tag.
**
** A null pointer, and a 0 character count gets returned for characters
** without a canonical decomposition.
**
*/
typedef enum {
UNICODE_CANONICAL_FMT_NONE=0,
UNICODE_CANONICAL_FMT_CIRCLE,
UNICODE_CANONICAL_FMT_COMPAT,
UNICODE_CANONICAL_FMT_FINAL,
UNICODE_CANONICAL_FMT_FONT,
UNICODE_CANONICAL_FMT_FRACTION,
UNICODE_CANONICAL_FMT_INITIAL,
UNICODE_CANONICAL_FMT_ISOLATED,
UNICODE_CANONICAL_FMT_MEDIAL,
UNICODE_CANONICAL_FMT_NARROW,
UNICODE_CANONICAL_FMT_NOBREAK,
UNICODE_CANONICAL_FMT_SMALL,
UNICODE_CANONICAL_FMT_SQUARE,
UNICODE_CANONICAL_FMT_SUB,
UNICODE_CANONICAL_FMT_SUPER,
UNICODE_CANONICAL_FMT_VERTICAL,
UNICODE_CANONICAL_FMT_WIDE,
} unicode_canonical_fmt_t;
typedef struct {
const char32_t *canonical_chars;
size_t n_canonical_chars;
unicode_canonical_fmt_t format;
} unicode_canonical_t;
extern unicode_canonical_t unicode_canonical(char32_t);
/*
** A buffer that holds unicode characters, and dynamically grows as needed.
*/
struct unicode_buf {
char32_t *ptr; /* The unicode characters */
size_t size, /* Buffer size */
len, /* How many characters in ptr are initialized */
max; /* Maximum size the buffer can grow to */
};
/*
** Initialize a buffer. Constructor.
*/
void unicode_buf_init(/* Initialize this structure. ptr, size, len cleared */
struct unicode_buf *p,
/*
** Maximum size the buffer can grow to. (size_t)-1
** means unlimited.
*/
size_t max);
/*
** Like unicode_buf_init, and initialize the new buffer with the contents of
** another buffer. The maximum size of the initialized buffer is exactly the
** number of characters in the existing buffer. This copies a buffer using
** the minimum amount of heap space.
*/
#define unicode_buf_init_copy(a,b) \
do { \
unicode_buf_init((a), unicode_buf_len(b)); \
unicode_buf_append_buf((a),(b)); \
} while (0)
/*
** Deinitialize the buffer. Destructor. Frees memory.
*/
void unicode_buf_deinit(struct unicode_buf *p);
/*
** Official way to access the characters in the unicode buffer.
*/
#define unicode_buf_ptr(p) ((p)->ptr)
/*
** Official way of obtaining the number of characters in the unicode buffer.
*/
#define unicode_buf_len(p) ((p)->len)
/*
** Remove all existing characters from an initialized buffer. Sets len to 0.
*/
#define unicode_buf_clear(p) ((p)->len=0)
/*
** Append characters to the existing characters in the unicode buffer.
** The buffer grows, if needed. If the buffer would exceed its maximum size,
** the extra characters get truncated.
**
** Returns 0 if the characters were appended. -1 for a malloc failure.
*/
int unicode_buf_append(struct unicode_buf *p, /* The buffer */
const char32_t *uc, /* Characters to append */
size_t l); /* How many of them */
/*
** Convert an iso-8859-1 char string and invoke unicode_buf_append().
*/
void unicode_buf_append_char(struct unicode_buf *dst,
const char *str,
size_t cnt);
/*
** Remove some portion of the unicode buffer
*/
void unicode_buf_remove(struct unicode_buf *p, /* The buffer */
size_t pos, /* Offset in buffer */
size_t cnt); /* How many to remove */
/*
** Append the contents of an existing buffer to another one.
*/
#define unicode_buf_append_buf(a,b) \
unicode_buf_append((a), unicode_buf_ptr(b), unicode_buf_len(b))
/*
** The equivalent of strcmp() for unicode buffers.
*/
int unicode_buf_cmp(const struct unicode_buf *a,
const struct unicode_buf *b);
/*
** The equivalent of unicode_buf_cmp, except that the second buffer is an
** iso-8859-1 string.
*/
int unicode_buf_cmp_str(const struct unicode_buf *p,
const char *c, /* iso-8859-1 string */
size_t cl); /* Number of chars in c */
/*
** A wrapper for iconv(3). This wrapper provides a different API for iconv(3).
** A handle gets created by unicode_convert_init().
** unicode_convert_init() receives a pointer to the output function
** which receives converted character text.
**
** The output function receives a pointer to the converted character text, and
** the number of characters in the converted text.
**
** The character text to convert gets passed, repeatedly, to
** unicode_convert(). Each call to unicode_convert() results in
** the output function being invoked, zero or more times, with the converted
** text. Finally, unicode_convert_deinit() stops the conversion and
** deallocates the conversion handle.
**
** Internal buffering takes place. unicode_convert_deinit() may result
** in the output function being called one or more times, to receive the final
** part of the converted character stream.
**
** The output function should return 0. A non-0 value causes
** unicode_convert() and/or unicode_convert_deinit() returning
** non-0.
*/
struct unicode_convert_hdr;
typedef struct unicode_convert_hdr *unicode_convert_handle_t;
/*
** unicode_convert_init() returns a non-NULL handle for the requested
** conversion, or NULL if the requested conversion is not available.
*/
unicode_convert_handle_t
unicode_convert_init(/* Convert from this chset */
const char *src_chset,
/* Convert to this chset */
const char *dst_chset,
/* The output function */
int (*output_func)(const char *, size_t, void *),
/* Passthrough arg */
void *convert_arg);
/*
** Repeatedly pass the character text to convert to unicode_convert().
**
** Returns non-0 if the output function returned non-0, or 0 if all invocations
** of the output function returned 0.
*/
int unicode_convert(/* The conversion handle */
unicode_convert_handle_t handle,
/* Text to convert */
const char *text,
/* Number of bytes to convert */
size_t cnt);
/*
** Finish character set conversion. The handle gets deallocated.
**
** May still result in one or more invocations of the output function.
** Returns non-zero if any previous invocation of the output function returned
** non-zero (this includes any invocations of the output function resulting
** from this call, or prior unicode_convert() calls), or 0 if all
** invocations of the output function returned 0.
**
** If the errptr is not NULL, *errptr is set to non-zero if there were any
** conversion errors -- if there was any text that could not be converted to
** the destination character text.
*/
int unicode_convert_deinit(unicode_convert_handle_t handle,
int *errptr);
/*
** Specialization: save converted character text in a buffer.
**
** Implementation: call unicode_convert_tocbuf_init() instead of
** unicode_convert_init(), then call unicode_convert() and
** unicode_convert_deinit(), as usual.
**
** If unicode_convert_deinit() returns 0, *cbufptr_ret gets initialized to a
** malloc()ed buffer, and the number of converted characters, the size of the
** malloc()ed buffer, are placed into *csize_ret arguments, that were passed
** to unicode_convert_tou_init().
**
** Note: if the converted string is an empty string, *cbufsize_ret is set to 0,
** but *cbufptr_ptr still gets initialized (to a dummy malloced buffer).
**
** The optional nullterminate places a trailing \0 character after the
** converted string (this is included in *cbufsize_ret).
*/
unicode_convert_handle_t
unicode_convert_tocbuf_init(/* Convert from this chset */
const char *src_chset,
/* Convert to this chset */
const char *dst_chset,
/* malloced buffer */
char **cbufptr_ret,
/* size of the malloced buffer */
size_t *cbufsize_ret,
/* null terminate the resulting string */
int nullterminate
);
/*
** Specialization: convert some character text to a char32_t array.
**
** This is like unicode_convert_tocbuf_init(), but converts to a char32_t
** array.
**
** The returned *ucsize_ret is initialized with the number of char32_ts,
** rather than the byte count.
**
** In all other ways, this function behaves identically to
** unicode_convert_tocbuf_init().
*/
unicode_convert_handle_t
unicode_convert_tou_init(/* Convert from this chset */
const char *src_chset,
/* malloc()ed buffer pointer, on exit. */
char32_t **ucptr_ret,
/* size of the malloc()ed buffer, upon exit */
size_t *ucsize_ret,
/* If true, terminate with U+0x0000, for convenience */
int nullterminate
);
/*
** Specialization: convert a char32_t array to some character text.
**
** This is the opposite of unicode_convert_tou_init(). Call this to
** initialize the conversion handle, then use unicode_convert_uc()
** instead of unicode_convert.
*/
unicode_convert_handle_t
unicode_convert_fromu_init(/* Convert to this chset */
const char *dst_chset,
/* malloc()ed buffer pointer, on exit. */
char **cbufptr_ret,
/* size of the malloc()ed buffer, upon exit */
size_t *cbufsize_ret,
/* If true, terminate with U+0x0000, for convenience */
int nullterminate
);
int unicode_convert_uc(/* The conversion handle */
unicode_convert_handle_t handle,
/* Text to convert */
const char32_t *text,
/* Number of bytes to convert */
size_t cnt);
/*
** Initialize conversion to UTF-8.
**
** This is a wrapper for unicode_convert_tocbuf_init() that specifies the
** destination charset as UTF-8.
*/
unicode_convert_handle_t
unicode_convert_tocbuf_toutf8_init(const char *src_chset,
char **cbufptr_ret,
size_t *cbufsize_ret,
int nullterminate);
/*
** Initialize conversion from UTF-8.
**
** This is a wrapper for unicode_convert_tocbuf_init() that specifies the
** source charset as UTF-8.
*/
unicode_convert_handle_t
unicode_convert_tocbuf_fromutf8_init(const char *dst_chset,
char **cbufptr_ret,
size_t *cbufsize_ret,
int nullterminate);
/*
** Convert a character string to UTF-8.
**
** Returns a malloc-ed buffer holding the UTF-8 string, or NULL if an
** error occured.
*/
char *unicode_convert_toutf8(/* Text to convert to UTF-8 */
const char *text,
/* Character set to convert to UTF-8 */
const char *charset,
/*
** If non-NULL, and a non-NULL pointer is
** returned, *error is set to non-zero if
** a character conversion error has occured.
*/
int *error);
/*
** Convert UTF-8 text to another character set.
**
** Returns a malloc-ed buffer holding the string converted to the specified
** character set, or NULL if an error occured.
*/
char *unicode_convert_fromutf8(/* A UTF-8 string */
const char *text,
/*
** Convert the UTF-8 string to this character
** set.
*/
const char *charset,
/*
** If non-NULL, and a non-NULL pointer is
** returned, *error is set to non-zero if
** a character conversion error has occured.
*/
int *error);
/*
** Convert one charset to another charset, placing the result in a malloc-ed
** buffer.
**
** Returns a malloc-ed buffer holding the string converted to the specified
** character set, or NULL if an error occured.
*/
char *unicode_convert_tobuf(/* A string to convert */
const char *text,
/*
** String's charset.
*/
const char *charset,
/*
** Destination charset
*/
const char *dstcharset,
/*
** If non-NULL, and a non-NULL pointer is
** returned, *error is set to non-zero if
** a character conversion error has occured.
*/
int *error);
/*
** Convenience function: call unicode_convert_tou_init(), feed the
** character string through unicode_convert(), then call
** unicode_convert_deinit().
**
** If this function returns 0, *uc and *ucsize is set to a malloced buffer+size
** holding the unicode char array.
*/
int unicode_convert_tou_tobuf(/* Character text to convert */
const char *text,
/* Number of characters */
size_t text_l,
/* text's charset */
const char *charset,
/*
** If this function returns 0, this gets
** initialized
*/
char32_t **uc,
/*
** Size of the allocated buffer
*/
size_t *ucsize,
/*
** If not null and this function returns 0,
** this is set to non-0 if there
** was a conversion error (but the output
** buffer gets still allocated and
** initialized)
*/
int *err);
/*
** Convenience function: call unicode_convert_fromu_init(), feed the
** unicode_array through unicode_convert_uc(), then call
** unicode_convert_deinit().
**
** If this function returns 0, *uc and *ucsize is set to a malloced buffer+size
** holding the converted character string
*/
int unicode_convert_fromu_tobuf(/* Unicode array to convert to a char str */
const char32_t *utext,
/*
** Size of the unicode array.
** If this is (size_t)-1, utext is a
** 0-terminated array.
*/
size_t utext_l,
/*
** Convert the unicode array to this charset.
*/
const char *charset,
/*
** If unicode_convert_fromu_tobuf()
** returns 0, this is initialized to a
** malloced buffer with a 0-terminated
** string is kept.
*/
char **c,
/*
** Size of the initialized array, including
** the 0-terminator.
*/
size_t *csize,
/*
** If unicode_convert_fromu_tobuf()
** returns 0 and this is not NULL,
** *err is set to non-0 if there was a
** conversion error to the requested
** character set.
*/
int *err);
/*
** Convenience function: convert a string in a given character set
** to/from uppercase, lowercase, or something else.
**
** This is done by calling unicode_convert_tou_tobuf() first,
** applying the title_func and char_func, then using
** unicode_convert_fromu_tobuf().
**
** A NULL return indicates that the requested conversion cannot be performed.
*/
char *unicode_convert_tocase( /* String to convert */
const char *str,
/* String's character set */
const char *charset,
/*
** Conversion of the first character in
** str: unicode_uc, unicode_lc, or unicode_tc:
*/
char32_t (*first_char_func)(char32_t),
/*
** Conversion of the second and the remaining
** character in str. If NULL, same as
** first_char_func.
*/
char32_t (*char_func)(char32_t));
/* Either UCS-4BE or UCS-4LE, matching the native char32_t endianness */
extern const char unicode_u_ucs4_native[];
/* Either UCS-2BE or UCS-2LE, matching the native char32_t endianness */
extern const char unicode_u_ucs2_native[];
/*
** Modified-UTF7 encoding used for IMAP folder names. Pass it for a charset
** parameter.
**
** This can be followed by a " " and up to 15 characters to be escaped in
** addition to unicode chars.
*/
#define unicode_x_imap_modutf7 "x-imap-modutf7"
/*
** EAI-capable Courier-IMAP does not use modified-UTF7, and uses UTF-8.
**
** However, to support SMAP we will still need to encode/decode some
** special characters.
**
** The characters U+0000-U+001F, and ./~:\
**
** They are encoded as a backslash followed by three octal digits.
*/
#define unicode_x_smap_modutf8 "x-smap-modutf8"
extern uint8_t unicode_ccc(char32_t ch);
#define UNICODE_DECOMPOSE_FLAG_QC 1
#define UNICODE_DECOMPOSE_FLAG_COMPAT 2
typedef struct unicode_decomposition {
char32_t *string;
size_t string_size;
int decompose_flags;
int (*reallocate)(struct unicode_decomposition *info,
const size_t *offsets,
const size_t *sizes,
size_t n);
void *arg;
} unicode_decomposition_t;
extern void unicode_decomposition_init(unicode_decomposition_t *,
char32_t *string,
size_t string_size,
void *arg);
extern void unicode_decomposition_deinit(unicode_decomposition_t *);
extern int unicode_decompose(unicode_decomposition_t *);
extern size_t unicode_decompose_reallocate_size(unicode_decomposition_t *,
const size_t *sizes,
size_t n);
struct unicode_compose_info {
size_t index;
size_t n_composed;
char32_t *composition;
size_t n_composition;
};
typedef struct {
struct unicode_compose_info **compositions;
size_t n_compositions;
} unicode_composition_t;
#define UNICODE_COMPOSE_FLAG_REMOVEUNUSED 1
#define UNICODE_COMPOSE_FLAG_ONESHOT 128
int unicode_composition_init(const char32_t *string,
size_t string_size,
int flags,
unicode_composition_t *info);
void unicode_composition_deinit(unicode_composition_t *info);
size_t unicode_composition_apply(char32_t *string,
size_t string_size,
unicode_composition_t *info);
int unicode_compose(char32_t *string,
size_t string_size,
int flags,
size_t *new_size);
#if 0
{
#endif
#ifdef __cplusplus
}
extern size_t unicode_wcwidth(const std::u32string &uc);
namespace unicode {
#if 0
};
#endif
/*
** Various character sets
*/
extern const char ucs_4[], ucs_2[], utf_8[], iso_8859_1[];
/*
** Interface to iconv.
**
** Subclass converted(). Invoke begin(), then operator(), repeatedly,
** then end().
**
** converted() receives the converted text.
*/
class iconvert {
unicode_convert_handle_t handle;
public:
iconvert();
virtual ~iconvert();
/* Start conversion.
** Returns false if the requested conversion cannot be done.
**/
bool begin(/* Convert from */
const std::string &src_chset,
/* Convert to */
const std::string &dst_chset);
/* Feed iconv(3). Returns false if the conversion was aborted.
*/
bool operator()(const char *, size_t);
bool operator()(const char32_t *, size_t);
/*
** Get the results here. If the subclass returns a non-0
** value, the conversion is aborted.
*/
virtual int converted(const char *, size_t);
/*
** End of conversion.
**
** Returns true if all calls to converted() returned 0,
** false if the conversion was aborted.
**
** errflag is set to true if there was a character that could
** not be converted, and passed to converted().
*/
bool end(bool &errflag)
{
return end(&errflag);
}
bool end()
{
return end(NULL);
}
/* Convert between two different charsets */
static std::string convert(const std::string &text,
const std::string &charset,
const std::string &dstcharset,
bool &errflag);
/* Convert between two different charsets */
static std::string convert(const std::string &text,
const std::string &charset,
const std::string &dstcharset)
{
bool dummy;
return convert(text, charset, dstcharset, dummy);
}
/* Convert from unicode to a charset */
static std::string convert(const std::u32string &uc,
const std::string &dstcharset,
bool &errflag);
/* Convert from unicode to a charset */
static std::string convert(const std::u32string &uc,
const std::string &dstcharset)
{
bool dummy;
return convert(uc, dstcharset, dummy);
}
/* Convert charset to unicode */
static bool convert(const std::string &text,
const std::string &charset,
std::u32string &uc);
/* Convert to upper/lower/title case */
static std::string
convert_tocase(/* Text string */
const std::string &text,
/* Its charset */
const std::string &charset,
/* First character: unicode_uc, unicode_lc, or unicode_tc */
char32_t (*first_char_func)(char32_t),
/* If not NULL, second and subsequent chars */
char32_t (*char_func)(char32_t)
=NULL)
{
bool dummy;
return convert_tocase(text, charset, dummy,
first_char_func,
char_func);
}
/* Convert to upper/lower/title case */
static std::string
convert_tocase(/* Text string */
const std::string &text,
/* Its charset */
const std::string &charset,
/* Set if there's a conversion error */
bool &err,
/* First character: unicode_uc, unicode_lc, or unicode_tc */
char32_t (*first_char_func)(char32_t),
/* If not NULL, second and subsequent chars */
char32_t (*char_func)(char32_t)
=NULL);
private:
bool end(bool *);
public:
class tou;
class fromu;
};
/* Convert output of iconvert to char32_ts. */
class iconvert::tou : public iconvert {
public:
bool begin(const std::string &chset);
virtual int converted(const char32_t *, size_t);
using iconvert::operator();
private:
int converted(const char *ptr, size_t cnt);
public:
template<typename iter_t> class to_iter_class;
template<typename input_iter_t,
typename output_iter_t>
static output_iter_t convert(input_iter_t from_iter,
input_iter_t to_iter,
const std::string &chset,
bool &flag,
output_iter_t out_iter);
template<typename input_iter_t>
static bool convert(input_iter_t from_iter,
input_iter_t to_iter,
const std::string &chset,
std::u32string &out_buf)
{
bool flag;
out_buf.clear();
std::back_insert_iterator<std::u32string >
insert_iter(out_buf);
convert(from_iter, to_iter, chset, flag, insert_iter);
return flag;
}
static std::pair<std::u32string, bool>
convert(const std::string &str,
const std::string &chset);
};
/* Helper class that saves unicode output into an output iterator */
template<typename iter_t>
class iconvert::tou::to_iter_class : public iconvert::tou {
iter_t iter;
public:
to_iter_class(iter_t iterValue)
: iter(iterValue) {}
using tou::operator();
operator iter_t() const { return iter; }
private:
int converted(const char32_t *ptr, size_t cnt)
{
while (cnt)
{
*iter=*ptr;
++iter;
++ptr;
--cnt;
}
return 0;
}
};
template<typename input_iter_t,
typename output_iter_t>
output_iter_t iconvert::tou::convert(input_iter_t from_iter,
input_iter_t to_iter,
const std::string &chset,
bool &flag,
output_iter_t out_iter)
{
class to_iter_class<output_iter_t> out(out_iter);
if (!out.begin(chset))
return out;
std::vector<char> string;
while (from_iter != to_iter)
{
string.push_back(*from_iter++);
if (string.size() > 31)
{
out(&string[0], string.size());
string.clear();
}
}
if (string.size() > 0)
out(&string[0], string.size());
out.end(flag);
return out;
}
/* Convert output of iconvert from char32_ts. */
class iconvert::fromu : public iconvert {
public:
bool begin(const std::string &chset);
using iconvert::operator();
template<typename iter_t> class to_iter_class;
template<typename input_iter_t,
typename output_iter_t>
static output_iter_t convert(input_iter_t from_iter,
input_iter_t to_iter,
const std::string &chset,
output_iter_t out_iter,
bool &errflag);
template<typename input_iter_t>
static void convert(input_iter_t from_iter,
input_iter_t to_iter,
const std::string &chset,
std::string &out_buf,
bool &errflag)
{
out_buf="";
std::back_insert_iterator<std::string>
insert_iter(out_buf);
convert(from_iter, to_iter, chset, insert_iter,
errflag);
}
static std::pair<std::string, bool>
convert(const std::u32string &ubuf,
const std::string &chset);
};
/* Helper class that saves unicode output into an output iterator */
template<typename iter_t>
class iconvert::fromu::to_iter_class : public iconvert::fromu {
iter_t iter;
public:
to_iter_class(iter_t iterValue)
: iter(iterValue) {}
using fromu::operator();
operator iter_t() const { return iter; }
private:
int converted(const char *ptr, size_t cnt)
{
while (cnt)
{
*iter=*ptr;
++iter;
++ptr;
--cnt;
}
return 0;
}
};
template<typename input_iter_t,
typename output_iter_t>
output_iter_t iconvert::fromu::convert(input_iter_t from_iter,
input_iter_t to_iter,
const std::string &chset,
output_iter_t out_iter,
bool &errflag)
{
errflag=true;
class to_iter_class<output_iter_t> out(out_iter);
if (!out.begin(chset))
return out;
std::u32string string;
while (from_iter != to_iter)
{
string.push_back(*from_iter++);
if (string.size() > 31)
{
out(&string[0], string.size());
string.clear();
}
}
if (string.size() > 0)
out(&string[0], string.size());
out.end(errflag);
return out;
}
/*
** Unicode linebreaking algorithm, tr14.
*/
extern "C" int linebreak_trampoline(int value, void *ptr);
extern "C" int linebreakc_trampoline(int value, char32_t ch,
void *ptr);
/*
** Subclass linebreak_callback_base, implement operator()(int).
**
** Use operator<< or operator()(iterator, iterator) to feed
** char32_ts into the linebreaking algorithm. The subclass receives
** UNICODE_LB values, as they become available.
*/
class linebreak_callback_base {
unicode_lb_info_t handle;
int opts;
#if __cplusplus >= 201103L
public:
linebreak_callback_base(const linebreak_callback_base &)=delete;
linebreak_callback_base &operator=(const
linebreak_callback_base &)=delete;
private:
#else
linebreak_callback_base(const linebreak_callback_base &);
/* NOT IMPLEMENTED */
linebreak_callback_base &operator=(const
linebreak_callback_base &);
/* NOT IMPLEMENTED */
#endif
public:
linebreak_callback_base();
virtual ~linebreak_callback_base();
void finish();
void set_opts(int opts);
friend int linebreak_trampoline(int, void *);
linebreak_callback_base &operator<<(char32_t uc);
template<typename iter_type>
linebreak_callback_base &operator()(iter_type beg_iter,
iter_type end_iter)
{
while (beg_iter != end_iter)
operator<<(*beg_iter++);
return *this;
}
template<typename container_type>
linebreak_callback_base &operator()(const container_type &vec)
{
return operator()(vec.begin(), vec.end());
}
private:
virtual int callback(int);
};
class linebreak_callback_save_buf : public linebreak_callback_base {
public:
std::list<int> lb_buf;
linebreak_callback_save_buf();
~linebreak_callback_save_buf();
using linebreak_callback_base::operator<<;
using linebreak_callback_base::operator();
private:
int callback(int value);
};
/*
** Convert an input iterator sequence over char32_ts into
** an input iterator sequence over linebreak values.
*/
template<typename input_t> class linebreak_iter
: public std::iterator<std::input_iterator_tag, int, void>
{
mutable input_t iter_value, end_iter_value;
mutable linebreak_callback_save_buf *buf;
void fill() const
{
if (buf == NULL)
return;
while (buf->lb_buf.empty())
{
if (iter_value == end_iter_value)
{
buf->finish();
if (buf->lb_buf.empty())
{
delete buf;
buf=NULL;
}
break;
}
buf->operator<<(*iter_value++);
}
}
mutable value_type bufvalue;
public:
linebreak_iter(const input_t &iter_valueArg,
const input_t &iter_endvalueArg)
: iter_value(iter_valueArg),
end_iter_value(iter_endvalueArg),
buf(new linebreak_callback_save_buf)
{
}
linebreak_iter() : buf(NULL)
{
}
void set_opts(int opts)
{
if (buf)
buf->set_opts(opts);
}
~linebreak_iter()
{
if (buf)
delete buf;
}
linebreak_iter(const linebreak_iter<input_t> &v)
: buf(NULL)
{
operator=(v);
}
linebreak_iter<input_t> &operator=(const
linebreak_iter<input_t> &v)
{
if (buf)
delete buf;
buf=v.buf;
iter_value=v.iter_value;
end_iter_value=v.end_iter_value;
v.buf=NULL;
return *this;
}
bool operator==(const linebreak_iter<input_t> &v) const
{
fill();
v.fill();
return buf == NULL && v.buf == NULL;
}
bool operator!=(const linebreak_iter<input_t> &v) const
{
return !operator==(v);
}
value_type operator*() const
{
fill();
return buf == NULL ? UNICODE_LB_MANDATORY:
buf->lb_buf.front();
}
linebreak_iter<input_t> &operator++()
{
bufvalue=operator*();
if (buf)
buf->lb_buf.pop_front();
return *this;
}
const value_type *operator++(int)
{
operator++();
return &bufvalue;
}
};
/*
** Like linebreak_callback_base, except the subclass receives both
** the linebreaking value, and the unicode character.
*/
class linebreakc_callback_base {
unicode_lbc_info_t handle;
int opts;
#if __cplusplus >= 201103L
public:
linebreakc_callback_base(const linebreakc_callback_base &)
=delete;
linebreakc_callback_base &operator=(const
linebreakc_callback_base
&)=delete;
private:
#else
linebreakc_callback_base(const linebreakc_callback_base &);
/* NOT IMPLEMENTED */
linebreakc_callback_base &operator=(const
linebreakc_callback_base
&);
/* NOT IMPLEMENTED */
#endif
public:
linebreakc_callback_base();
virtual ~linebreakc_callback_base();
void finish();
void set_opts(int opts);
friend int linebreakc_trampoline(int, char32_t, void *);
linebreakc_callback_base &operator<<(char32_t uc);
template<typename iter_type>
linebreakc_callback_base &operator()(iter_type beg_iter,
iter_type end_iter)
{
while (beg_iter != end_iter)
operator<<(*beg_iter++);
return *this;
}
linebreakc_callback_base &operator<<(const
std::u32string
&vec)
{
return operator()(vec.begin(), vec.end());
}
private:
virtual int callback(int, char32_t);
};
class linebreakc_callback_save_buf : public linebreakc_callback_base {
public:
std::list<std::pair<int, char32_t> > lb_buf;
linebreakc_callback_save_buf();
~linebreakc_callback_save_buf();
using linebreakc_callback_base::operator<<;
using linebreakc_callback_base::operator();
private:
int callback(int, char32_t);
};
/*
** Convert an input iterator sequence over char32_ts into
** an input iterator sequence over std::pair<int, char32_t>,
** the original unicode character, and the linebreaking value before
** the character.
*/
template<typename input_t> class linebreakc_iter
: public std::iterator<std::input_iterator_tag,
std::pair<int, char32_t>, void>
{
mutable input_t iter_value, end_iter_value;
mutable linebreakc_callback_save_buf *buf;
void fill() const
{
if (buf == NULL)
return;
while (buf->lb_buf.empty())
{
if (iter_value == end_iter_value)
{
buf->finish();
if (buf->lb_buf.empty())
{
delete buf;
buf=NULL;
}
break;
}
buf->operator<<(*iter_value);
++iter_value;
}
}
mutable value_type bufvalue;
public:
linebreakc_iter(const input_t &iter_valueArg,
const input_t &iter_endvalueArg)
: iter_value(iter_valueArg),
end_iter_value(iter_endvalueArg),
buf(new linebreakc_callback_save_buf)
{
}
linebreakc_iter() : buf(NULL)
{
}
~linebreakc_iter()
{
if (buf)
delete buf;
}
linebreakc_iter(const linebreakc_iter<input_t> &v)
: buf(NULL)
{
operator=(v);
}
linebreakc_iter<input_t> &operator=(const
linebreakc_iter<input_t> &v)
{
if (buf)
delete buf;
buf=v.buf;
iter_value=v.iter_value;
end_iter_value=v.end_iter_value;
v.buf=NULL;
return *this;
}
bool operator==(const linebreakc_iter<input_t> &v) const
{
fill();
v.fill();
return buf == NULL && v.buf == NULL;
}
bool operator!=(const linebreakc_iter<input_t> &v) const
{
return !operator==(v);
}
value_type operator*() const
{
fill();
return buf == NULL ?
std::make_pair(UNICODE_LB_MANDATORY,
(char32_t)0):
buf->lb_buf.front();
}
linebreakc_iter<input_t> &operator++()
{
bufvalue=operator*();
if (buf)
buf->lb_buf.pop_front();
return *this;
}
const value_type *operator++(int)
{
operator++();
return &bufvalue;
}
};
/*
** Subclass wordbreak_callback_base, implement operator()(int).
**
** Use operator<< or operator()(iterator, iterator) to feed
** char32_ts into the wordbreaking algorithm. The subclass receives
** word flags, as they become available.
*/
extern "C" int wordbreak_trampoline(int value, void *ptr);
class wordbreak_callback_base {
unicode_wb_info_t handle;
#if __cplusplus >= 201103L
public:
wordbreak_callback_base(const wordbreak_callback_base &)=delete;
wordbreak_callback_base &operator=(const
wordbreak_callback_base &)
=delete;
private:
#else
wordbreak_callback_base(const wordbreak_callback_base &);
/* NOT IMPLEMENTED */
wordbreak_callback_base &operator=(const
wordbreak_callback_base &);
/* NOT IMPLEMENTED */
#endif
public:
wordbreak_callback_base();
virtual ~wordbreak_callback_base();
void finish();
friend int wordbreak_trampoline(int, void *);
wordbreak_callback_base &operator<<(char32_t uc);
template<typename iter_type>
wordbreak_callback_base &operator()(iter_type beg_iter,
iter_type end_iter)
{
while (beg_iter != end_iter)
operator<<(*beg_iter++);
return *this;
}
wordbreak_callback_base &operator<<(const
std::u32string
&vec)
{
return operator()(vec.begin(), vec.end());
}
private:
virtual int callback(bool);
};
/*
** A C++ wrapper for unicode_wbscan.
*/
class wordbreakscan {
unicode_wbscan_info_t handle;
#if __cplusplus >= 201103L
public:
wordbreakscan(const wordbreakscan &)=delete;
wordbreakscan &operator=(const wordbreakscan &)=delete;
private:
#else
wordbreakscan(const wordbreakscan &);
/* NOT IMPLEMENTED */
wordbreakscan &operator=(const wordbreakscan &);
/* NOT IMPLEMENTED */
#endif
public:
wordbreakscan();
~wordbreakscan();
bool operator<<(char32_t uc);
size_t finish();
};
//! Convert string in unicode_default_chset() to lowercase
std::string tolower(const std::string &string);
//! Convert string in unicode_default_chset() to uppercase
std::string toupper(const std::string &string);
//! Convert string in the given character set to lowercase
std::string tolower(const std::string &string,
const std::string &charset);
//! Convert string in the given character set to uppercase
std::string toupper(const std::string &string,
const std::string &charset);
//! Convert unicode to lowercase
std::u32string tolower(const std::u32string &u);
//! Convert unicode to uppercase
std::u32string toupper(const std::u32string &u);
//! Calculate bidirectional character types
//! Passed as a parameter to bidi_calc(), supplying the string and the
//! calculated bidirectional types.
struct bidi_calc_types {
const std::u32string &s;
//! Calculated bidirectional types.
std::vector<enum_bidi_type_t> types;
//! A reference to an existing std::u32string
//! bidi_calc_types can be constructed only from a reference to
//! an existing std::u32string.
bidi_calc_types(const std::u32string &);
#if __cplusplus >= 201103L
//! Deleted constructor
//! bidi_calc_types cannot be constructed from a temporary
//! std::u32string.
bidi_calc_types(std::u32string &&)=delete;
#endif
//! Replace all paragraph breaks by newlines.
void setbnl(std::u32string &);
//! Destructor
~bidi_calc_types();
};
//! Calculate bidirectional embedding levels
//! Returns the bidirectional embedding levels, and the paragraph
//! embedding level.
//!
//! The first parameter can be implicitly converted from an existing
//! std::u32string object. Alternatively a bidi_calc_types helper
//! can be constructed explicitly, and then passed in directly.
std::tuple<std::vector<unicode_bidi_level_t>,
struct unicode_bidi_direction> bidi_calc(const bidi_calc_types &s);
//! Calculate bidirectional embedding levels
//! Overload calculates the embedding levels using a predetermined
//! paragraph embedding level.
//!
//! Returns the bidirectional embedding levels, and the same paragraph
//! embedding level.
std::tuple<std::vector<unicode_bidi_level_t>,
struct unicode_bidi_direction> bidi_calc(const bidi_calc_types &s,
unicode_bidi_level_t level);
//! Reorder bidirectional text
//! Reorders the string and levels in place.
//!
//! Non-0 return value indicates the string and levels' sizes do not match.
int bidi_reorder(std::u32string &string,
std::vector<unicode_bidi_level_t> &levels,
const std::function<void (size_t, size_t)>
&reorder_callback=[](size_t, size_t){},
size_t starting_pos=0,
size_t n=(size_t)-1);
//! Dry-run reorder bidirectional text
void bidi_reorder(std::vector<unicode_bidi_level_t> &levels,
const std::function<void (size_t, size_t)>
&reorder_callback=[](size_t, size_t){},
size_t starting_pos=0,
size_t n=(size_t)-1);
//! Remove directional markers
//! Removes them from the string, in place. Optional lambda gets notified
//! of the index (in the original string, of each removed marker.
void bidi_cleanup(std::u32string &string,
const std::function<void (size_t)> &removed_callback=
[](size_t) {},
int cleanup_options=0);
//! Also remove them from the embedding direction level buffer.
//! Returns non-0 in case of non-matching level buffer size.
int bidi_cleanup(std::u32string &string,
std::vector<unicode_bidi_level_t> &levels,
const std::function<void (size_t)> &removed_callback=
[](size_t) {},
int cleanup_options=0);
//! Clean up a substring of the unicode string.
//! The substring gets specified by starting_pos and n.
//!
//! \note
//! The removed position parameter to the removed_callback is based on
//! the given starting_position. Add starting_pos to it to get the
//! actual removed index.
//!
//! Returns non-0 in case of non-matching level buffer size.
//!
//! The final size of the returned string is determined by counting
//! how many calls to removed_callback were made.
int bidi_cleanup(std::u32string &string,
std::vector<unicode_bidi_level_t> &levels,
const std::function<void (size_t)> &removed_callback,
int cleanup_options,
size_t starting_pos,
size_t n);
//! Convert Unicode string from canonical rendering order to logical order.
int bidi_logical_order(std::u32string &string,
std::vector<unicode_bidi_level_t> &levels,
unicode_bidi_level_t paragraph_embedding,
const std::function<void (size_t, size_t)>
&lambda=[](size_t,size_t){},
size_t starting_pos=0,
size_t n=(size_t)-1);
//! Convert Unicode string from canonical rendering order to logical order.
void bidi_logical_order(std::vector<unicode_bidi_level_t> &levels,
unicode_bidi_level_t paragraph_embedding,
const std::function<void (size_t, size_t)> &lambda,
size_t starting_pos=0,
size_t n=(size_t)-1);
//! Whether directional and isolation markers are needed.
bool bidi_needs_embed(const std::u32string &string,
const std::vector<unicode_bidi_level_t> &levels,
const unicode_bidi_level_t *paragraph_embedding=0,
size_t starting_pos=0,
size_t n=(size_t)-1);
//! Embed directional and isolation markers
//! Non-0 return value indicates the string and levels' sizes do not match.
//!
//! The lambda gets called repeatedly, to specify the contents of the
//! string with embedded direction markers.
int bidi_embed(const std::u32string &string,
const std::vector<unicode_bidi_level_t> &levels,
unicode_bidi_level_t paragraph_embedding,
const std::function<void (const char32_t *string,
size_t n,
bool is_part_of_string)> &lambda);
//! Embed directional and isolation markers
//! \overload
//!
//! Provides a lambda that collects the new string, and returns it. An
//! empty string gets returned if the string and levels' sizes do not match.
std::u32string bidi_embed(const std::u32string &string,
const std::vector<unicode_bidi_level_t> &levels,
unicode_bidi_level_t paragraph_embedding);
//! Identify contiguous sequences of combining characters
//! Bounded by each embedding level.
void bidi_combinings(const std::u32string &string,
const std::vector<unicode_bidi_level_t> &levels,
const std::function<void (unicode_bidi_level_t level,
size_t level_start,
size_t n_chars,
size_t comb_start,
size_t n_comb_chars)> &callback);
//! Identify contiguous sequences of composition characters
void bidi_combinings(const std::u32string &string,
const std::function<void (unicode_bidi_level_t level,
size_t level_start,
size_t n_chars,
size_t comb_start,
size_t n_comb_chars)> &callback);
//! Check if a directional marker needs to be inserted
//! In order for the unicode string to have the specified default
//! paragraph embedding level.
char32_t bidi_embed_paragraph_level(const std::u32string &string,
unicode_bidi_level_t level);
//! Compute default direction of text
unicode_bidi_direction bidi_get_direction(const std::u32string &string,
size_t starting_pos=0,
size_t n=(size_t)-1);
//! Override bidi direction.
std::u32string bidi_override(const std::u32string &s,
unicode_bidi_level_t direction,
int cleanup_options=0);
constexpr int decompose_flag_qc=UNICODE_DECOMPOSE_FLAG_QC;
constexpr int decompose_flag_compat=UNICODE_DECOMPOSE_FLAG_COMPAT;
void decompose_default_reallocate(std::u32string &,
const std::vector<std::tuple<size_t, size_t>>
&);
void decompose(std::u32string &,
int flags=0,
const std::function<void (std::u32string &,
const std::vector<std::tuple<size_t,
size_t>>)> &
=decompose_default_reallocate);
constexpr int compose_flag_removeunused=UNICODE_COMPOSE_FLAG_REMOVEUNUSED;
constexpr int compose_flag_oneshot=UNICODE_COMPOSE_FLAG_ONESHOT;
void compose_default_callback(unicode_composition_t &);
void compose(std::u32string &string,
int flags=0,
const std::function<void (unicode_composition_t &)> &cb=
compose_default_callback);
#if 0
{
#endif
}
#endif
#endif