Synopsis#include <glib.h> typedef gunichar; typedef gunichar2; gboolean g_unichar_validate (gunichar ch); gboolean g_unichar_isalnum (gunichar c); gboolean g_unichar_isalpha (gunichar c); gboolean g_unichar_iscntrl (gunichar c); gboolean g_unichar_isdefined (gunichar c); gboolean g_unichar_isdigit (gunichar c); gboolean g_unichar_isgraph (gunichar c); gboolean g_unichar_islower (gunichar c); gboolean g_unichar_ismark (gunichar c); gboolean g_unichar_isprint (gunichar c); gboolean g_unichar_ispunct (gunichar c); gboolean g_unichar_isspace (gunichar c); gboolean g_unichar_istitle (gunichar c); gboolean g_unichar_isupper (gunichar c); gboolean g_unichar_isxdigit (gunichar c); gboolean g_unichar_iswide (gunichar c); gboolean g_unichar_iswide_cjk (gunichar c); gboolean g_unichar_iszerowidth (gunichar c); gunichar g_unichar_toupper (gunichar c); gunichar g_unichar_tolower (gunichar c); gunichar g_unichar_totitle (gunichar c); gint g_unichar_digit_value (gunichar c); gint g_unichar_xdigit_value (gunichar c); enum GUnicodeType; GUnicodeType g_unichar_type (gunichar c); enum GUnicodeBreakType; GUnicodeBreakType g_unichar_break_type (gunichar c); gint g_unichar_combining_class (gunichar uc); void g_unicode_canonical_ordering (gunichar *string, gsize len); gunichar* g_unicode_canonical_decomposition (gunichar ch, gsize *result_len); gboolean g_unichar_get_mirror_char (gunichar ch, gunichar *mirrored_ch); enum GUnicodeScript; GUnicodeScript g_unichar_get_script (gunichar ch); #define g_utf8_next_char (p) gunichar g_utf8_get_char (const gchar *p); gunichar g_utf8_get_char_validated (const gchar *p, gssize max_len); gchar* g_utf8_offset_to_pointer (const gchar *str, glong offset); glong g_utf8_pointer_to_offset (const gchar *str, const gchar *pos); gchar* g_utf8_prev_char (const gchar *p); gchar* g_utf8_find_next_char (const gchar *p, const gchar *end); gchar* g_utf8_find_prev_char (const gchar *str, const gchar *p); glong g_utf8_strlen (const gchar *p, gssize max); gchar* 
g_utf8_strncpy (gchar *dest, const gchar *src, gsize n); gchar* g_utf8_strchr (const gchar *p, gssize len, gunichar c); gchar* g_utf8_strrchr (const gchar *p, gssize len, gunichar c); gchar* g_utf8_strreverse (const gchar *str, gssize len); gboolean g_utf8_validate (const gchar *str, gssize max_len, const gchar **end); gchar* g_utf8_strup (const gchar *str, gssize len); gchar* g_utf8_strdown (const gchar *str, gssize len); gchar* g_utf8_casefold (const gchar *str, gssize len); gchar* g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode); enum GNormalizeMode; gint g_utf8_collate (const gchar *str1, const gchar *str2); gchar* g_utf8_collate_key (const gchar *str, gssize len); gchar* g_utf8_collate_key_for_filename (const gchar *str, gssize len); gunichar2* g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error); gunichar* g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error); gunichar* g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written); gunichar* g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error); gchar* g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error); gunichar2* g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error); gchar* g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error); gint g_unichar_to_utf8 (gunichar c, gchar *outbuf); Description
This section describes a number of functions for dealing with
Unicode characters and strings. There are analogues of the
traditional The implementations of the Unicode functions in GLib are based on the Unicode Character Data tables, which are available from www.unicode.org. GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1, GLib 2.12 supports Unicode 5.0, GLib 2.16.3 supports Unicode 5.1. Detailsg_unichar_validate ()gboolean g_unichar_validate (gunichar ch);
Checks whether
g_unichar_isalnum ()gboolean g_unichar_isalnum (gunichar c);
Determines whether a character is alphanumeric.
Given some UTF-8 text, obtain a character value
with g_utf8_get_char().
g_unichar_isalpha ()gboolean g_unichar_isalpha (gunichar c);
Determines whether a character is alphabetic (i.e. a letter).
Given some UTF-8 text, obtain a character value with g_utf8_get_char().
g_unichar_iscntrl ()gboolean g_unichar_iscntrl (gunichar c);
Determines whether a character is a control character.
Given some UTF-8 text, obtain a character value with g_utf8_get_char().
g_unichar_isdefined ()gboolean g_unichar_isdefined (gunichar c); Determines if a given character is assigned in the Unicode standard.
g_unichar_isdigit ()gboolean g_unichar_isdigit (gunichar c);
Determines whether a character is numeric (i.e. a digit). This
covers ASCII 0-9 and also digits in other languages/scripts. Given
some UTF-8 text, obtain a character value with g_utf8_get_char().
g_unichar_isgraph ()gboolean g_unichar_isgraph (gunichar c);
Determines whether a character is printable and not a space
(returns
g_unichar_islower ()gboolean g_unichar_islower (gunichar c);
Determines whether a character is a lowercase letter.
Given some UTF-8 text, obtain a character value with g_utf8_get_char().
g_unichar_ismark ()gboolean g_unichar_ismark (gunichar c);
Determines whether a character is a mark (non-spacing mark,
combining mark, or enclosing mark in Unicode speak).
Given some UTF-8 text, obtain a character value
with g_utf8_get_char(). Note: in most cases where isalpha characters are allowed, ismark characters should be allowed too, as they are essential for writing most European languages as well as many non-Latin scripts.
Since 2.14 g_unichar_isprint ()gboolean g_unichar_isprint (gunichar c);
Determines whether a character is printable.
Unlike
g_unichar_ispunct ()gboolean g_unichar_ispunct (gunichar c);
Determines whether a character is punctuation or a symbol.
Given some UTF-8 text, obtain a character value with g_utf8_get_char().
g_unichar_isspace ()gboolean g_unichar_isspace (gunichar c);
Determines whether a character is a space, tab, or line separator
(newline, carriage return, etc.). Given some UTF-8 text, obtain a
character value with g_utf8_get_char(). (Note: don't use this to do word breaking; you have to use Pango or equivalent to get word breaking right, as the algorithm is fairly complex.)
g_unichar_istitle ()gboolean g_unichar_istitle (gunichar c); Determines if a character is titlecase. Some characters in Unicode which are composites, such as the DZ digraph, have three case variants instead of just two. The titlecase form is used at the beginning of a word where only the first letter is capitalized. The titlecase form of the DZ digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
g_unichar_isupper ()gboolean g_unichar_isupper (gunichar c); Determines if a character is uppercase.
g_unichar_isxdigit ()gboolean g_unichar_isxdigit (gunichar c); Determines if a character is a hexadecimal digit.
g_unichar_iswide ()gboolean g_unichar_iswide (gunichar c); Determines if a character is typically rendered in a double-width cell.
g_unichar_iswide_cjk ()gboolean g_unichar_iswide_cjk (gunichar c);
Determines if a character is typically rendered in a double-width
cell under legacy East Asian locales. If a character is wide according to
Since 2.12 g_unichar_iszerowidth ()gboolean g_unichar_iszerowidth (gunichar c);
Determines if a given character typically takes zero width when rendered.
The return value is
A typical use of this function is with one of
Since 2.14 g_unichar_toupper ()gunichar g_unichar_toupper (gunichar c); Converts a character to uppercase.
g_unichar_tolower ()gunichar g_unichar_tolower (gunichar c); Converts a character to lower case.
g_unichar_totitle ()gunichar g_unichar_totitle (gunichar c); Converts a character to titlecase.
g_unichar_digit_value ()gint g_unichar_digit_value (gunichar c); Determines the numeric value of a character as a decimal digit.
g_unichar_xdigit_value ()gint g_unichar_xdigit_value (gunichar c); Determines the numeric value of a character as a hexadecimal digit.
enum GUnicodeTypetypedef enum
{
G_UNICODE_CONTROL,
G_UNICODE_FORMAT,
G_UNICODE_UNASSIGNED,
G_UNICODE_PRIVATE_USE,
G_UNICODE_SURROGATE,
G_UNICODE_LOWERCASE_LETTER,
G_UNICODE_MODIFIER_LETTER,
G_UNICODE_OTHER_LETTER,
G_UNICODE_TITLECASE_LETTER,
G_UNICODE_UPPERCASE_LETTER,
G_UNICODE_COMBINING_MARK,
G_UNICODE_ENCLOSING_MARK,
G_UNICODE_NON_SPACING_MARK,
G_UNICODE_DECIMAL_NUMBER,
G_UNICODE_LETTER_NUMBER,
G_UNICODE_OTHER_NUMBER,
G_UNICODE_CONNECT_PUNCTUATION,
G_UNICODE_DASH_PUNCTUATION,
G_UNICODE_CLOSE_PUNCTUATION,
G_UNICODE_FINAL_PUNCTUATION,
G_UNICODE_INITIAL_PUNCTUATION,
G_UNICODE_OTHER_PUNCTUATION,
G_UNICODE_OPEN_PUNCTUATION,
G_UNICODE_CURRENCY_SYMBOL,
G_UNICODE_MODIFIER_SYMBOL,
G_UNICODE_MATH_SYMBOL,
G_UNICODE_OTHER_SYMBOL,
G_UNICODE_LINE_SEPARATOR,
G_UNICODE_PARAGRAPH_SEPARATOR,
G_UNICODE_SPACE_SEPARATOR
} GUnicodeType;
These are the possible character classifications from the Unicode specification. See http://www.unicode.org/Public/UNIDATA/UnicodeData.html.
g_unichar_type ()GUnicodeType g_unichar_type (gunichar c); Classifies a Unicode character by type.
enum GUnicodeBreakTypetypedef enum
{
G_UNICODE_BREAK_MANDATORY,
G_UNICODE_BREAK_CARRIAGE_RETURN,
G_UNICODE_BREAK_LINE_FEED,
G_UNICODE_BREAK_COMBINING_MARK,
G_UNICODE_BREAK_SURROGATE,
G_UNICODE_BREAK_ZERO_WIDTH_SPACE,
G_UNICODE_BREAK_INSEPARABLE,
G_UNICODE_BREAK_NON_BREAKING_GLUE,
G_UNICODE_BREAK_CONTINGENT,
G_UNICODE_BREAK_SPACE,
G_UNICODE_BREAK_AFTER,
G_UNICODE_BREAK_BEFORE,
G_UNICODE_BREAK_BEFORE_AND_AFTER,
G_UNICODE_BREAK_HYPHEN,
G_UNICODE_BREAK_NON_STARTER,
G_UNICODE_BREAK_OPEN_PUNCTUATION,
G_UNICODE_BREAK_CLOSE_PUNCTUATION,
G_UNICODE_BREAK_QUOTATION,
G_UNICODE_BREAK_EXCLAMATION,
G_UNICODE_BREAK_IDEOGRAPHIC,
G_UNICODE_BREAK_NUMERIC,
G_UNICODE_BREAK_INFIX_SEPARATOR,
G_UNICODE_BREAK_SYMBOL,
G_UNICODE_BREAK_ALPHABETIC,
G_UNICODE_BREAK_PREFIX,
G_UNICODE_BREAK_POSTFIX,
G_UNICODE_BREAK_COMPLEX_CONTEXT,
G_UNICODE_BREAK_AMBIGUOUS,
G_UNICODE_BREAK_UNKNOWN,
G_UNICODE_BREAK_NEXT_LINE,
G_UNICODE_BREAK_WORD_JOINER,
G_UNICODE_BREAK_HANGUL_L_JAMO,
G_UNICODE_BREAK_HANGUL_V_JAMO,
G_UNICODE_BREAK_HANGUL_T_JAMO,
G_UNICODE_BREAK_HANGUL_LV_SYLLABLE,
G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE
} GUnicodeBreakType;
These are the possible line break classifications.
The five Hangul types were added in Unicode 4.1, so they were
introduced in GLib 2.10. Note that new types may be added in the future.
Applications should be ready to handle unknown values.
They may be regarded as
g_unichar_break_type ()GUnicodeBreakType g_unichar_break_type (gunichar c);
Determines the break type of
g_unichar_combining_class ()gint g_unichar_combining_class (gunichar uc); Determines the canonical combining class of a Unicode character.
Since 2.14 g_unicode_canonical_ordering ()void g_unicode_canonical_ordering (gunichar *string, gsize len); Computes the canonical ordering of a string in-place. This rearranges decomposed characters in the string according to their combining classes. See the Unicode manual for more information.
g_unicode_canonical_decomposition ()gunichar* g_unicode_canonical_decomposition (gunichar ch, gsize *result_len); Computes the canonical decomposition of a Unicode character.
g_unichar_get_mirror_char ()gboolean g_unichar_get_mirror_char (gunichar ch, gunichar *mirrored_ch); In Unicode, some characters are mirrored. This means that their images are mirrored horizontally in text that is laid out from right to left. For instance, "(" would become its mirror image, ")", in right-to-left text.
If
Since 2.4 enum GUnicodeScripttypedef enum
{ /* ISO 15924 code */
G_UNICODE_SCRIPT_INVALID_CODE = -1,
G_UNICODE_SCRIPT_COMMON = 0, /* Zyyy */
G_UNICODE_SCRIPT_INHERITED, /* Qaai */
G_UNICODE_SCRIPT_ARABIC, /* Arab */
G_UNICODE_SCRIPT_ARMENIAN, /* Armn */
G_UNICODE_SCRIPT_BENGALI, /* Beng */
G_UNICODE_SCRIPT_BOPOMOFO, /* Bopo */
G_UNICODE_SCRIPT_CHEROKEE, /* Cher */
G_UNICODE_SCRIPT_COPTIC, /* Qaac */
G_UNICODE_SCRIPT_CYRILLIC, /* Cyrl (Cyrs) */
G_UNICODE_SCRIPT_DESERET, /* Dsrt */
G_UNICODE_SCRIPT_DEVANAGARI, /* Deva */
G_UNICODE_SCRIPT_ETHIOPIC, /* Ethi */
G_UNICODE_SCRIPT_GEORGIAN, /* Geor (Geon, Geoa) */
G_UNICODE_SCRIPT_GOTHIC, /* Goth */
G_UNICODE_SCRIPT_GREEK, /* Grek */
G_UNICODE_SCRIPT_GUJARATI, /* Gujr */
G_UNICODE_SCRIPT_GURMUKHI, /* Guru */
G_UNICODE_SCRIPT_HAN, /* Hani */
G_UNICODE_SCRIPT_HANGUL, /* Hang */
G_UNICODE_SCRIPT_HEBREW, /* Hebr */
G_UNICODE_SCRIPT_HIRAGANA, /* Hira */
G_UNICODE_SCRIPT_KANNADA, /* Knda */
G_UNICODE_SCRIPT_KATAKANA, /* Kana */
G_UNICODE_SCRIPT_KHMER, /* Khmr */
G_UNICODE_SCRIPT_LAO, /* Laoo */
G_UNICODE_SCRIPT_LATIN, /* Latn (Latf, Latg) */
G_UNICODE_SCRIPT_MALAYALAM, /* Mlym */
G_UNICODE_SCRIPT_MONGOLIAN, /* Mong */
G_UNICODE_SCRIPT_MYANMAR, /* Mymr */
G_UNICODE_SCRIPT_OGHAM, /* Ogam */
G_UNICODE_SCRIPT_OLD_ITALIC, /* Ital */
G_UNICODE_SCRIPT_ORIYA, /* Orya */
G_UNICODE_SCRIPT_RUNIC, /* Runr */
G_UNICODE_SCRIPT_SINHALA, /* Sinh */
G_UNICODE_SCRIPT_SYRIAC, /* Syrc (Syrj, Syrn, Syre) */
G_UNICODE_SCRIPT_TAMIL, /* Taml */
G_UNICODE_SCRIPT_TELUGU, /* Telu */
G_UNICODE_SCRIPT_THAANA, /* Thaa */
G_UNICODE_SCRIPT_THAI, /* Thai */
G_UNICODE_SCRIPT_TIBETAN, /* Tibt */
G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL, /* Cans */
G_UNICODE_SCRIPT_YI, /* Yiii */
G_UNICODE_SCRIPT_TAGALOG, /* Tglg */
G_UNICODE_SCRIPT_HANUNOO, /* Hano */
G_UNICODE_SCRIPT_BUHID, /* Buhd */
G_UNICODE_SCRIPT_TAGBANWA, /* Tagb */
/* Unicode-4.0 additions */
G_UNICODE_SCRIPT_BRAILLE, /* Brai */
G_UNICODE_SCRIPT_CYPRIOT, /* Cprt */
G_UNICODE_SCRIPT_LIMBU, /* Limb */
G_UNICODE_SCRIPT_OSMANYA, /* Osma */
G_UNICODE_SCRIPT_SHAVIAN, /* Shaw */
G_UNICODE_SCRIPT_LINEAR_B, /* Linb */
G_UNICODE_SCRIPT_TAI_LE, /* Tale */
G_UNICODE_SCRIPT_UGARITIC, /* Ugar */
/* Unicode-4.1 additions */
G_UNICODE_SCRIPT_NEW_TAI_LUE, /* Talu */
G_UNICODE_SCRIPT_BUGINESE, /* Bugi */
G_UNICODE_SCRIPT_GLAGOLITIC, /* Glag */
G_UNICODE_SCRIPT_TIFINAGH, /* Tfng */
G_UNICODE_SCRIPT_SYLOTI_NAGRI, /* Sylo */
G_UNICODE_SCRIPT_OLD_PERSIAN, /* Xpeo */
G_UNICODE_SCRIPT_KHAROSHTHI, /* Khar */
/* Unicode-5.0 additions */
G_UNICODE_SCRIPT_UNKNOWN, /* Zzzz */
G_UNICODE_SCRIPT_BALINESE, /* Bali */
G_UNICODE_SCRIPT_CUNEIFORM, /* Xsux */
G_UNICODE_SCRIPT_PHOENICIAN, /* Phnx */
G_UNICODE_SCRIPT_PHAGS_PA, /* Phag */
G_UNICODE_SCRIPT_NKO, /* Nkoo */
/* Unicode-5.1 additions */
G_UNICODE_SCRIPT_KAYAH_LI, /* Kali */
G_UNICODE_SCRIPT_LEPCHA, /* Lepc */
G_UNICODE_SCRIPT_REJANG, /* Rjng */
G_UNICODE_SCRIPT_SUNDANESE, /* Sund */
G_UNICODE_SCRIPT_SAURASHTRA, /* Saur */
G_UNICODE_SCRIPT_CHAM, /* Cham */
G_UNICODE_SCRIPT_OL_CHIKI, /* Olck */
G_UNICODE_SCRIPT_VAI, /* Vaii */
G_UNICODE_SCRIPT_CARIAN, /* Cari */
G_UNICODE_SCRIPT_LYCIAN, /* Lyci */
G_UNICODE_SCRIPT_LYDIAN /* Lydi */
} GUnicodeScript;
The GUnicodeScript enumeration identifies different writing systems. The values correspond to the names as defined in the Unicode standard. The enumeration has been added in GLib 2.14, and is interchangeable with PangoScript. Note that new types may be added in the future. Applications should be ready to handle unknown values. See Unicode Standard Annex #24: Script names.
g_unichar_get_script ()GUnicodeScript g_unichar_get_script (gunichar ch);
Looks up the GUnicodeScript for a particular character (as defined
by Unicode Standard Annex 24). No check is made for
This function is equivalent to
Since 2.14 g_utf8_next_char()#define g_utf8_next_char(p)
Skips to the next character in a UTF-8 string. The string must be
valid; this macro is as fast as possible, and has no error-checking.
You would use this macro to iterate over a string character by
character. The macro returns the start of the next UTF-8 character.
Before using this macro, use
g_utf8_get_char ()gunichar g_utf8_get_char (const gchar *p);
Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
If
g_utf8_get_char_validated ()gunichar g_utf8_get_char_validated (const gchar *p, gssize max_len); Convert a sequence of bytes encoded as UTF-8 to a Unicode character. This function checks for incomplete characters, for invalid characters such as characters that are out of the range of Unicode, and for overlong encodings of valid characters.
g_utf8_offset_to_pointer ()gchar* g_utf8_offset_to_pointer (const gchar *str, glong offset); Converts from an integer character offset to a pointer to a position within the string.
Since 2.10, this function allows to pass a negative
g_utf8_pointer_to_offset ()glong g_utf8_pointer_to_offset (const gchar *str, const gchar *pos); Converts from a pointer to a position within a string to an integer character offset.
Since 2.10, this function allows
g_utf8_prev_char ()gchar* g_utf8_prev_char (const gchar *p);
Finds the previous UTF-8 character in the string before
g_utf8_find_next_char ()gchar* g_utf8_find_next_char (const gchar *p, const gchar *end);
Finds the start of the next UTF-8 character in the string after
g_utf8_find_prev_char ()gchar* g_utf8_find_prev_char (const gchar *str, const gchar *p);
Given a position
g_utf8_strlen ()glong g_utf8_strlen (const gchar *p, gssize max); Returns the length of the string in characters.
g_utf8_strncpy ()gchar* g_utf8_strncpy (gchar *dest, const gchar *src, gsize n);
Like the standard C
g_utf8_strchr ()gchar* g_utf8_strchr (const gchar *p, gssize len, gunichar c);
Finds the leftmost occurrence of the given Unicode character
in a UTF-8 encoded string, while limiting the search to
g_utf8_strrchr ()gchar* g_utf8_strrchr (const gchar *p, gssize len, gunichar c);
Find the rightmost occurrence of the given Unicode character
in a UTF-8 encoded string, while limiting the search to
g_utf8_strreverse ()gchar* g_utf8_strreverse (const gchar *str, gssize len);
Reverses a UTF-8 string. This function is intended for programmatic uses of reversed strings. It pays no attention to decomposed characters, combining marks, byte order marks, directional indicators (LRM, LRO, etc) and similar characters which might need special handling when reversing a string for display purposes.
Note that unlike
Since 2.2 g_utf8_validate ()gboolean g_utf8_validate (const gchar *str, gssize max_len, const gchar **end);
Validates UTF-8 encoded text.
Note that
Returns
g_utf8_strup ()gchar* g_utf8_strup (const gchar *str, gssize len); Converts all Unicode characters in the string that have a case to uppercase. The exact manner that this is done depends on the current locale, and may result in the number of characters in the string increasing. (For instance, the German ess-zet will be changed to SS.)
g_utf8_strdown ()gchar* g_utf8_strdown (const gchar *str, gssize len); Converts all Unicode characters in the string that have a case to lowercase. The exact manner that this is done depends on the current locale, and may result in the number of characters in the string changing.
g_utf8_casefold ()gchar* g_utf8_casefold (const gchar *str, gssize len);
Converts a string into a form that is independent of case. The
result will not correspond to any particular case, but can be
compared for equality or ordered with the results of calling
Note that calling
g_utf8_normalize ()gchar* g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode);
Converts a string into canonical form, standardizing
such issues as whether a character with an accent
is represented as a base character and combining
accent or as a single precomposed character. The
string has to be valid UTF-8, otherwise
The normalization mode
enum GNormalizeModetypedef enum {
G_NORMALIZE_DEFAULT,
G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
G_NORMALIZE_DEFAULT_COMPOSE,
G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
G_NORMALIZE_ALL,
G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
G_NORMALIZE_ALL_COMPOSE,
G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
Defines how a Unicode string is transformed in a canonical form, standardizing such issues as whether a character with an accent is represented as a base character and combining accent or as a single precomposed character. Unicode strings should generally be normalized before comparing them.
g_utf8_collate ()gint g_utf8_collate (const gchar *str1, const gchar *str2);
Compares two strings for ordering using the linguistically
correct rules for the current locale.
When sorting a large number of strings, it will be significantly
faster to obtain collation keys with
g_utf8_collate_key ()gchar* g_utf8_collate_key (const gchar *str, gssize len);
Converts a string into a collation key that can be compared
with other collation keys produced by the same function using
The results of comparing the collation keys of two strings
with Note that this function depends on the current locale.
g_utf8_collate_key_for_filename ()gchar* g_utf8_collate_key_for_filename (const gchar *str, gssize len);
Converts a string into a collation key that can be compared
with other collation keys produced by the same function using strcmp(). In order to sort filenames correctly, this function treats the dot '.' as a special case. Most dictionary orderings seem to consider it insignificant, thus producing the ordering "event.c" "eventgenerator.c" "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we would like to treat numbers intelligently so that "file1" "file10" "file5" is sorted as "file1" "file5" "file10". Note that this function depends on the current locale.
Since 2.8 g_utf8_to_utf16 ()gunichar2* g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error); Convert a string from UTF-8 to UTF-16. A 0 character will be added to the result after the converted text.
g_utf8_to_ucs4 ()gunichar* g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error); Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4. A trailing 0 will be added to the string after the converted text.
g_utf8_to_ucs4_fast ()gunichar* g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written);
Convert a string from UTF-8 to a 32-bit fixed width
representation as UCS-4, assuming valid UTF-8 input.
This function is roughly twice as fast as
g_utf16_to_ucs4 ()gunichar* g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error); Convert a string from UTF-16 to UCS-4. The result will be nul-terminated.
g_utf16_to_utf8 ()gchar* g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error); Convert a string from UTF-16 to UTF-8. The result will be terminated with a 0 byte.
Note that the input is expected to be already in native endianness;
an initial byte-order-mark character is not handled specially.
g_ucs4_to_utf16 ()gunichar2* g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error); Convert a string from UCS-4 to UTF-16. A 0 character will be added to the result after the converted text.
g_ucs4_to_utf8 ()gchar* g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error); Convert a string from a 32-bit fixed width representation as UCS-4 to UTF-8. The result will be terminated with a 0 byte.
g_unichar_to_utf8 ()gint g_unichar_to_utf8 (gunichar c, gchar *outbuf); Converts a single character to UTF-8.
| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||