// icucommon.h is autogenerated and merged from the ICU header files. // Code unused or not supported in the Windows ICU SDK has been removed. #if (NTDDI_VERSION >= NTDDI_WIN10_RS2) #ifndef SUPPRESS_LEGACY_ICU_HEADER_WARNINGS // For more information on the ICU breaking change to use char16_t by default, please see the page here: // https://go.microsoft.com/fwlink/?linkid=851033 #pragma message("The wchar_t versions of the ICU headers are no longer being updated, please use the char16_t based header icu.h instead; see https://go.microsoft.com/fwlink/?linkid=851033 for more info. To suppress this warning, define the macro SUPPRESS_LEGACY_ICU_HEADER_WARNINGS before including this header.") #endif /* SUPPRESS_LEGACY_ICU_HEADER_WARNINGS */ // Default Windows SDK ICU configuration options. // Alternate selections are not supported in the Windows SDK. #define U_DISABLE_RENAMING 1 #define U_SHOW_CPLUSPLUS_API 0 #define U_DEFAULT_SHOW_DRAFT 0 #define U_HIDE_DRAFT_API 1 #define U_HIDE_DEPRECATED_API 1 #define U_HIDE_OBSOLETE_API 1 #define U_HIDE_INTERNAL_API 1 #define U_HAVE_STD_STRING 0 #define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1 // appendable.h // No supported content // brkiter.h // No supported content // bytestream.h // No supported content // bytestrie.h // No supported content // bytestriebuilder.h // No supported content // chariter.h // No supported content // dbbi.h // No supported content // docmain.h // No supported content // dtintrv.h // No supported content // enumset.h // No supported content // errorcode.h // No supported content // filteredbrk.h // No supported content // icuplug.h // No supported content // idna.h // No supported content // listformatter.h // No supported content // localpointer.h // No supported content // locdspnm.h // No supported content // locid.h // No supported content // normalizer2.h // No supported content // normlzr.h // No supported content // parsepos.h // No supported content // rbbi.h // No supported content // rep.h // No 
supported content // resbund.h // No supported content // schriter.h // No supported content // simpleformatter.h // No supported content // std_string.h // No supported content // strenum.h // No supported content // stringpiece.h // No supported content // symtable.h // No supported content // ucharstrie.h // No supported content // ucharstriebuilder.h // No supported content // uchriter.h // No supported content // uconfig.h /* ********************************************************************** * Copyright (C) 2002-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: uconfig.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2002sep19 * created by: Markus W. Scherer */ #ifndef __UCONFIG_H__ #define __UCONFIG_H__ /*! * \file * \brief User-configurable settings * * Miscellaneous switches: * * A number of macros affect a variety of minor aspects of ICU. * Most of them used to be defined elsewhere (e.g., in utypes.h or platform.h) * and moved here to make them easier to find. * * Switches for excluding parts of ICU library code modules: * * Changing these macros allows building partial, smaller libraries for special purposes. * By default, all modules are built. * The switches are fairly coarse, controlling large modules. * Basic services cannot be turned off. * * Building with any of these options does not guarantee that the * ICU build process will completely work. It is recommended that * the ICU libraries and data be built using the normal build. * At that time you should remove the data used by those services. * After building the ICU data library, you should rebuild the ICU * libraries with these switches customized to your needs. * * @stable ICU 2.4 */ /** * If this switch is defined, ICU will attempt to load a header file named "uconfig_local.h" * prior to determining default settings for uconfig variables. 
* * @internal ICU 4.0 */ #if defined(UCONFIG_USE_LOCAL) #include "uconfig_local.h" #endif /** * \def U_DEBUG * Determines whether to include debugging code. * Automatically set on Windows, but most compilers do not have * related predefined macros. * @internal */ #ifdef U_DEBUG /* Use the predefined value. */ #elif defined(_DEBUG) /* * _DEBUG is defined by Visual Studio debug compilation. * Do *not* test for its NDEBUG macro: It is an orthogonal macro * which disables assert(). */ # define U_DEBUG 1 # else # define U_DEBUG 0 #endif /** * Determines wheter to enable auto cleanup of libraries. * @internal */ #ifndef UCLN_NO_AUTO_CLEANUP #define UCLN_NO_AUTO_CLEANUP 1 #endif /** * \def U_DISABLE_RENAMING * Determines whether to disable renaming or not. * @internal */ #ifndef U_DISABLE_RENAMING #endif /** * \def U_NO_DEFAULT_INCLUDE_UTF_HEADERS * Determines whether utypes.h includes utf.h, utf8.h, utf16.h and utf_old.h. * utypes.h includes those headers if this macro is defined to 0. * Otherwise, each those headers must be included explicitly when using one of their macros. * Defaults to 0 for backward compatibility, except inside ICU. * @stable ICU 49 */ /** * \def U_OVERRIDE_CXX_ALLOCATION * Determines whether to override new and delete. * ICU is normally built such that all of its C++ classes, via their UMemory base, * override operators new and delete to use its internal, customizable, * non-exception-throwing memory allocation functions. (Default value 1 for this macro.) * * This is especially important when the application and its libraries use multiple heaps. * For example, on Windows, this allows the ICU DLL to be used by * applications that statically link the C Runtime library. * * @stable ICU 2.2 */ #ifndef U_OVERRIDE_CXX_ALLOCATION #define U_OVERRIDE_CXX_ALLOCATION 1 #endif /** * \def U_ENABLE_TRACING * Determines whether to enable tracing. 
* @internal */ #ifndef U_ENABLE_TRACING #define U_ENABLE_TRACING 0 #endif /** * \def UCONFIG_ENABLE_PLUGINS * Determines whether to enable ICU plugins. * @internal */ #ifndef UCONFIG_ENABLE_PLUGINS #define UCONFIG_ENABLE_PLUGINS 0 #endif /** * \def U_ENABLE_DYLOAD * Whether to enable Dynamic loading in ICU. * @internal */ #ifndef U_ENABLE_DYLOAD #define U_ENABLE_DYLOAD 1 #endif /** * \def U_CHECK_DYLOAD * Whether to test Dynamic loading as an OS capability. * @internal */ #ifndef U_CHECK_DYLOAD #define U_CHECK_DYLOAD 1 #endif /** * \def U_DEFAULT_SHOW_DRAFT * Do we allow ICU users to use the draft APIs by default? * @internal */ #ifndef U_DEFAULT_SHOW_DRAFT #define U_DEFAULT_SHOW_DRAFT 1 #endif /*===========================================================================*/ /* Custom icu entry point renaming */ /*===========================================================================*/ /** * \def U_HAVE_LIB_SUFFIX * 1 if a custom library suffix is set. * @internal */ #ifdef U_HAVE_LIB_SUFFIX /* Use the predefined value. */ #elif defined(U_LIB_SUFFIX_C_NAME) # define U_HAVE_LIB_SUFFIX 1 #endif /** * \def U_LIB_SUFFIX_C_NAME_STRING * Defines the library suffix as a string with C syntax. * @internal */ #ifdef U_LIB_SUFFIX_C_NAME_STRING /* Use the predefined value. */ #elif defined(U_LIB_SUFFIX_C_NAME) # define CONVERT_TO_STRING(s) #s # define U_LIB_SUFFIX_C_NAME_STRING CONVERT_TO_STRING(U_LIB_SUFFIX_C_NAME) #else # define U_LIB_SUFFIX_C_NAME_STRING "" #endif /* common/i18n library switches --------------------------------------------- */ /** * \def UCONFIG_ONLY_COLLATION * This switch turns off modules that are not needed for collation. * * It does not turn off legacy conversion because that is necessary * for ICU to work on EBCDIC platforms (for the default converter). * If you want "only collation" and do not build for EBCDIC, * then you can define UCONFIG_NO_CONVERSION or UCONFIG_NO_LEGACY_CONVERSION to 1 as well. 
* * @stable ICU 2.4 */ #ifndef UCONFIG_ONLY_COLLATION # define UCONFIG_ONLY_COLLATION 0 #endif #if UCONFIG_ONLY_COLLATION /* common library */ # define UCONFIG_NO_BREAK_ITERATION 1 # define UCONFIG_NO_IDNA 1 /* i18n library */ # if UCONFIG_NO_COLLATION # error Contradictory collation switches in uconfig.h. # endif # define UCONFIG_NO_FORMATTING 1 # define UCONFIG_NO_TRANSLITERATION 1 # define UCONFIG_NO_REGULAR_EXPRESSIONS 1 #endif /* common library switches -------------------------------------------------- */ /** * \def UCONFIG_NO_FILE_IO * This switch turns off all file access in the common library * where file access is only used for data loading. * ICU data must then be provided in the form of a data DLL (or with an * equivalent way to link to the data residing in an executable, * as in building a combined library with both the common library's code and * the data), or via udata_setCommonData(). * Application data must be provided via udata_setAppData() or by using * "open" functions that take pointers to data, for example ucol_openBinary(). * * File access is not used at all in the i18n library. * * File access cannot be turned off for the icuio library or for the ICU * test suites and ICU tools. * * @stable ICU 3.6 */ #ifndef UCONFIG_NO_FILE_IO # define UCONFIG_NO_FILE_IO 0 #endif #if UCONFIG_NO_FILE_IO && defined(U_TIMEZONE_FILES_DIR) # error Contradictory file io switches in uconfig.h. #endif /** * \def UCONFIG_NO_CONVERSION * ICU will not completely build with this switch turned on. * This switch turns off all converters. * * You may want to use this together with U_CHARSET_IS_UTF8 defined to 1 * in utypes.h if char* strings in your environment are always in UTF-8. 
* * @stable ICU 3.2 * @see U_CHARSET_IS_UTF8 */ #ifndef UCONFIG_NO_CONVERSION # define UCONFIG_NO_CONVERSION 0 #endif #if UCONFIG_NO_CONVERSION # define UCONFIG_NO_LEGACY_CONVERSION 1 #endif /** * \def UCONFIG_ONLY_HTML_CONVERSION * This switch turns off all of the converters NOT listed in * the HTML encoding standard: * http://www.w3.org/TR/encoding/#names-and-labels * * This is not possible on EBCDIC platforms * because they need ibm-37 or ibm-1047 default converters. * * @stable ICU 55 */ #ifndef UCONFIG_ONLY_HTML_CONVERSION # define UCONFIG_ONLY_HTML_CONVERSION 0 #endif /** * \def UCONFIG_NO_LEGACY_CONVERSION * This switch turns off all converters except for * - Unicode charsets (UTF-7/8/16/32, CESU-8, SCSU, BOCU-1) * - US-ASCII * - ISO-8859-1 * * Turning off legacy conversion is not possible on EBCDIC platforms * because they need ibm-37 or ibm-1047 default converters. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_LEGACY_CONVERSION # define UCONFIG_NO_LEGACY_CONVERSION 0 #endif /** * \def UCONFIG_NO_NORMALIZATION * This switch turns off normalization. * It implies turning off several other services as well, for example * collation and IDNA. * * @stable ICU 2.6 */ #ifndef UCONFIG_NO_NORMALIZATION # define UCONFIG_NO_NORMALIZATION 0 #elif UCONFIG_NO_NORMALIZATION /* common library */ /* ICU 50 CJK dictionary BreakIterator uses normalization */ # define UCONFIG_NO_BREAK_ITERATION 1 /* IDNA (UTS #46) is implemented via normalization */ # define UCONFIG_NO_IDNA 1 /* i18n library */ # if UCONFIG_ONLY_COLLATION # error Contradictory collation switches in uconfig.h. # endif # define UCONFIG_NO_COLLATION 1 # define UCONFIG_NO_TRANSLITERATION 1 #endif /** * \def UCONFIG_NO_BREAK_ITERATION * This switch turns off break iteration. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_BREAK_ITERATION # define UCONFIG_NO_BREAK_ITERATION 0 #endif /** * \def UCONFIG_NO_IDNA * This switch turns off IDNA. 
* * @stable ICU 2.6 */ #ifndef UCONFIG_NO_IDNA # define UCONFIG_NO_IDNA 0 #endif /** * \def UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE * Determines the default UMessagePatternApostropheMode. * See the documentation for that enum. * * @stable ICU 4.8 */ #ifndef UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE # define UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE UMSGPAT_APOS_DOUBLE_OPTIONAL #endif /* i18n library switches ---------------------------------------------------- */ /** * \def UCONFIG_NO_COLLATION * This switch turns off collation and collation-based string search. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_COLLATION # define UCONFIG_NO_COLLATION 0 #endif /** * \def UCONFIG_NO_FORMATTING * This switch turns off formatting and calendar/timezone services. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_FORMATTING # define UCONFIG_NO_FORMATTING 0 #endif /** * \def UCONFIG_NO_TRANSLITERATION * This switch turns off transliteration. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_TRANSLITERATION # define UCONFIG_NO_TRANSLITERATION 0 #endif /** * \def UCONFIG_NO_REGULAR_EXPRESSIONS * This switch turns off regular expressions. * * @stable ICU 2.4 */ #ifndef UCONFIG_NO_REGULAR_EXPRESSIONS # define UCONFIG_NO_REGULAR_EXPRESSIONS 0 #endif /** * \def UCONFIG_NO_SERVICE * This switch turns off service registration. * * @stable ICU 3.2 */ #ifndef UCONFIG_NO_SERVICE # define UCONFIG_NO_SERVICE 0 #endif /** * \def UCONFIG_HAVE_PARSEALLINPUT * This switch turns on the "parse all input" attribute. Binary incompatible. * * @internal */ #ifndef UCONFIG_HAVE_PARSEALLINPUT # define UCONFIG_HAVE_PARSEALLINPUT 1 #endif /** * \def UCONFIG_FORMAT_FASTPATHS_49 * This switch turns on other formatting fastpaths. Binary incompatible in object DecimalFormat and DecimalFormatSymbols * * @internal */ #ifndef UCONFIG_FORMAT_FASTPATHS_49 # define UCONFIG_FORMAT_FASTPATHS_49 1 #endif /** * \def UCONFIG_NO_FILTERED_BREAK_ITERATION * This switch turns off filtered break iteration code. 
* * @internal */ #ifndef UCONFIG_NO_FILTERED_BREAK_ITERATION # define UCONFIG_NO_FILTERED_BREAK_ITERATION 0 #endif #endif // udata.h // No supported content // unifilt.h // No supported content // unifunct.h // No supported content // unimatch.h // No supported content // uniset.h // No supported content // unorm.h // No supported content // urename.h // No supported content // usetiter.h // No supported content // utf32.h // No supported content // utf_old.h // No supported content // uvernum.h // No supported content // platform.h /* ****************************************************************************** * * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * * FILE NAME : platform.h * * Date Name Description * 05/13/98 nos Creation (content moved here from ptypes.h). * 03/02/99 stephen Added AS400 support. * 03/30/99 stephen Added Linux support. * 04/13/99 stephen Reworked for autoconf. ****************************************************************************** */ #ifndef _PLATFORM_H #define _PLATFORM_H /** * \file * \brief Basic types for the platform. * * This file used to be generated by autoconf/configure. * Starting with ICU 49, platform.h is a normal source file, * to simplify cross-compiling and working with non-autoconf/make build systems. * * When a value in this file does not work on a platform, then please * try to derive it from the U_PLATFORM value * (for which we might need a new value constant in rare cases) * and/or from other macros that are predefined by the compiler * or defined in standard (POSIX or platform or compiler) headers. * * As a temporary workaround, you can add an explicit #define for some macros * before it is first tested, or add an equivalent -D macro definition * to the compiler's command line. * * Note: Some compilers provide ways to show the predefined macros. 
* For example, with gcc you can compile an empty .c file and have the compiler * print the predefined macros with * \code * gcc -E -dM -x c /dev/null | sort * \endcode * (You can provide an actual empty .c file rather than /dev/null. * -x c++ is for C++.) */ /** * Define some things so that they can be documented. * @internal */ #ifdef U_IN_DOXYGEN /* * Problem: "platform.h:335: warning: documentation for unknown define U_HAVE_STD_STRING found." means that U_HAVE_STD_STRING is not documented. * Solution: #define any defines for non @internal API here, so that they are visible in the docs. If you just set PREDEFINED in Doxyfile.in, they won't be documented. */ /* None for now. */ #endif /** * \def U_PLATFORM * The U_PLATFORM macro defines the platform we're on. * * We used to define one different, value-less macro per platform. * That made it hard to know the set of relevant platforms and macros, * and hard to deal with variants of platforms. * * Starting with ICU 49, we define platforms as numeric macros, * with ranges of values for related platforms and their variants. * The U_PLATFORM macro is set to one of these values. * * Historical note from the Solaris Wikipedia article: * AT&T and Sun collaborated on a project to merge the most popular Unix variants * on the market at that time: BSD, System V, and Xenix. * This became Unix System V Release 4 (SVR4). * * @internal */ /** Unknown platform. @internal */ #define U_PF_UNKNOWN 0 /** Windows @internal */ #define U_PF_WINDOWS 1000 /** MinGW. Windows, calls to Win32 API, but using GNU gcc and binutils. @internal */ #define U_PF_MINGW 1800 /** * Cygwin. Windows, calls to cygwin1.dll for Posix functions, * using MSVC or GNU gcc and binutils. * @internal */ #define U_PF_CYGWIN 1900 /* Reserve 2000 for U_PF_UNIX? */ /** HP-UX is based on UNIX System V. @internal */ #define U_PF_HPUX 2100 /** Solaris is a Unix operating system based on SVR4. 
@internal */ #define U_PF_SOLARIS 2600 /** BSD is a UNIX operating system derivative. @internal */ #define U_PF_BSD 3000 /** AIX is based on UNIX System V Releases and 4.3 BSD. @internal */ #define U_PF_AIX 3100 /** IRIX is based on UNIX System V with BSD extensions. @internal */ #define U_PF_IRIX 3200 /** * Darwin is a POSIX-compliant operating system, composed of code developed by Apple, * as well as code derived from NeXTSTEP, BSD, and other projects, * built around the Mach kernel. * Darwin forms the core set of components upon which Mac OS X, Apple TV, and iOS are based. * (Original description modified from WikiPedia.) * @internal */ #define U_PF_DARWIN 3500 /** iPhone OS (iOS) is a derivative of Mac OS X. @internal */ #define U_PF_IPHONE 3550 /** QNX is a commercial Unix-like real-time operating system related to BSD. @internal */ #define U_PF_QNX 3700 /** Linux is a Unix-like operating system. @internal */ #define U_PF_LINUX 4000 /** * Native Client is pretty close to Linux. * See https://developer.chrome.com/native-client and * http://www.chromium.org/nativeclient * @internal */ #define U_PF_BROWSER_NATIVE_CLIENT 4020 /** Android is based on Linux. @internal */ #define U_PF_ANDROID 4050 /* Maximum value for Linux-based platform is 4499 */ /** z/OS is the successor to OS/390 which was the successor to MVS. @internal */ #define U_PF_OS390 9000 /** "IBM i" is the current name of what used to be i5/OS and earlier OS/400. @internal */ #define U_PF_OS400 9400 #ifdef U_PLATFORM /* Use the predefined value. */ #elif defined(__MINGW32__) # define U_PLATFORM U_PF_MINGW #elif defined(__CYGWIN__) # define U_PLATFORM U_PF_CYGWIN #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) # define U_PLATFORM U_PF_WINDOWS #elif defined(__ANDROID__) # define U_PLATFORM U_PF_ANDROID /* Android wchar_t support depends on the API level. 
*/ # include #elif defined(__native_client__) # define U_PLATFORM U_PF_BROWSER_NATIVE_CLIENT #elif defined(linux) || defined(__linux__) || defined(__linux) # define U_PLATFORM U_PF_LINUX #elif defined(__APPLE__) && defined(__MACH__) # include # if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE /* variant of TARGET_OS_MAC */ # define U_PLATFORM U_PF_IPHONE # else # define U_PLATFORM U_PF_DARWIN # endif #elif defined(BSD) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__MirBSD__) # if defined(__FreeBSD__) # include # endif # define U_PLATFORM U_PF_BSD #elif defined(sun) || defined(__sun) /* Check defined(__SVR4) || defined(__svr4__) to distinguish Solaris from SunOS? */ # define U_PLATFORM U_PF_SOLARIS # if defined(__GNUC__) /* Solaris/GCC needs this header file to get the proper endianness. Normally, this * header file is included with stddef.h but on Solairs/GCC, the GCC version of stddef.h * is included which does not include this header file. */ # include # endif #elif defined(_AIX) || defined(__TOS_AIX__) # define U_PLATFORM U_PF_AIX #elif defined(_hpux) || defined(hpux) || defined(__hpux) # define U_PLATFORM U_PF_HPUX #elif defined(sgi) || defined(__sgi) # define U_PLATFORM U_PF_IRIX #elif defined(__QNX__) || defined(__QNXNTO__) # define U_PLATFORM U_PF_QNX #elif defined(__TOS_MVS__) # define U_PLATFORM U_PF_OS390 #elif defined(__OS400__) || defined(__TOS_OS400__) # define U_PLATFORM U_PF_OS400 #else # define U_PLATFORM U_PF_UNKNOWN #endif /** * \def CYGWINMSVC * Defined if this is Windows with Cygwin, but using MSVC rather than gcc. * Otherwise undefined. * @internal */ /* Commented out because this is already set in mh-cygwin-msvc #if U_PLATFORM == U_PF_CYGWIN && defined(_MSC_VER) # define CYGWINMSVC #endif */ /** * \def U_PLATFORM_USES_ONLY_WIN32_API * Defines whether the platform uses only the Win32 API. * Set to 1 for Windows/MSVC and MinGW but not Cygwin. 
* @internal */ #ifdef U_PLATFORM_USES_ONLY_WIN32_API /* Use the predefined value. */ #elif (U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_MINGW) || defined(CYGWINMSVC) # define U_PLATFORM_USES_ONLY_WIN32_API 1 #else /* Cygwin implements POSIX. */ # define U_PLATFORM_USES_ONLY_WIN32_API 0 #endif /** * \def U_PLATFORM_HAS_WIN32_API * Defines whether the Win32 API is available on the platform. * Set to 1 for Windows/MSVC, MinGW and Cygwin. * @internal */ #ifdef U_PLATFORM_HAS_WIN32_API /* Use the predefined value. */ #elif U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN # define U_PLATFORM_HAS_WIN32_API 1 #else # define U_PLATFORM_HAS_WIN32_API 0 #endif /** * \def U_PLATFORM_HAS_WINUWP_API * Defines whether target is intended for Universal Windows Platform API * Set to 1 for Windows10 Release Solution Configuration * @internal */ #ifdef U_PLATFORM_HAS_WINUWP_API /* Use the predefined value. */ #else # define U_PLATFORM_HAS_WINUWP_API 0 #endif /** * \def U_PLATFORM_IMPLEMENTS_POSIX * Defines whether the platform implements (most of) the POSIX API. * Set to 1 for Cygwin and most other platforms. * @internal */ #ifdef U_PLATFORM_IMPLEMENTS_POSIX /* Use the predefined value. */ #elif U_PLATFORM_USES_ONLY_WIN32_API # define U_PLATFORM_IMPLEMENTS_POSIX 0 #else # define U_PLATFORM_IMPLEMENTS_POSIX 1 #endif /** * \def U_PLATFORM_IS_LINUX_BASED * Defines whether the platform is Linux or one of its derivatives. * @internal */ #ifdef U_PLATFORM_IS_LINUX_BASED /* Use the predefined value. */ #elif U_PF_LINUX <= U_PLATFORM && U_PLATFORM <= 4499 # define U_PLATFORM_IS_LINUX_BASED 1 #else # define U_PLATFORM_IS_LINUX_BASED 0 #endif /** * \def U_PLATFORM_IS_DARWIN_BASED * Defines whether the platform is Darwin or one of its derivatives. * @internal */ #ifdef U_PLATFORM_IS_DARWIN_BASED /* Use the predefined value. 
*/ #elif U_PF_DARWIN <= U_PLATFORM && U_PLATFORM <= U_PF_IPHONE # define U_PLATFORM_IS_DARWIN_BASED 1 #else # define U_PLATFORM_IS_DARWIN_BASED 0 #endif /** * \def U_HAVE_STDINT_H * Defines whether stdint.h is available. It is a C99 standard header. * We used to include inttypes.h which includes stdint.h but we usually do not need * the additional definitions from inttypes.h. * @internal */ #ifdef U_HAVE_STDINT_H /* Use the predefined value. */ #elif U_PLATFORM_USES_ONLY_WIN32_API # if defined(__BORLANDC__) || U_PLATFORM == U_PF_MINGW || (defined(_MSC_VER) && _MSC_VER>=1600) /* Windows Visual Studio 9 and below do not have stdint.h & inttypes.h, but VS 2010 adds them. */ # define U_HAVE_STDINT_H 1 # else # define U_HAVE_STDINT_H 0 # endif #elif U_PLATFORM == U_PF_SOLARIS /* Solaris has inttypes.h but not stdint.h. */ # define U_HAVE_STDINT_H 0 #elif U_PLATFORM == U_PF_AIX && !defined(_AIX51) && defined(_POWER) /* PPC AIX <= 4.3 has inttypes.h but not stdint.h. */ # define U_HAVE_STDINT_H 0 #else # define U_HAVE_STDINT_H 1 #endif /** * \def U_HAVE_INTTYPES_H * Defines whether inttypes.h is available. It is a C99 standard header. * We include inttypes.h where it is available but stdint.h is not. * @internal */ #ifdef U_HAVE_INTTYPES_H /* Use the predefined value. */ #elif U_PLATFORM == U_PF_SOLARIS /* Solaris has inttypes.h but not stdint.h. */ # define U_HAVE_INTTYPES_H 1 #elif U_PLATFORM == U_PF_AIX && !defined(_AIX51) && defined(_POWER) /* PPC AIX <= 4.3 has inttypes.h but not stdint.h. */ # define U_HAVE_INTTYPES_H 1 #else /* Most platforms have both inttypes.h and stdint.h, or neither. */ # define U_HAVE_INTTYPES_H U_HAVE_STDINT_H #endif /** * \def U_IOSTREAM_SOURCE * Defines what support for C++ streams is available. * * If U_IOSTREAM_SOURCE is set to 199711, then <iostream> is available * (the ISO/IEC C++ FDIS was published in November 1997), and then * one should qualify streams using the std namespace in ICU header * files. 
* Starting with ICU 49, this is the only supported version. * * If U_IOSTREAM_SOURCE is set to 198506, then <iostream.h> is * available instead (in June 1985 Stroustrup published * "An Extensible I/O Facility for C++" at the summer USENIX conference). * Starting with ICU 49, this version is not supported any more. * * If U_IOSTREAM_SOURCE is 0 (or any value less than 199711), * then C++ streams are not available and * support for them will be silently suppressed in ICU. * * @internal */ #ifndef U_IOSTREAM_SOURCE #define U_IOSTREAM_SOURCE 199711 #endif /** * \def U_HAVE_STD_STRING * Defines whether the standard C++ (STL) <string> header is available. * @internal */ /*===========================================================================*/ /** @{ Compiler and environment features */ /*===========================================================================*/ /** * \def U_GCC_MAJOR_MINOR * Indicates whether the compiler is gcc (test for != 0), * and if so, contains its major (times 100) and minor version numbers. * If the compiler is not gcc, then U_GCC_MAJOR_MINOR == 0. * * For example, for testing for whether we have gcc, and whether it's 4.6 or higher, * use "#if U_GCC_MAJOR_MINOR >= 406". * @internal */ #ifdef __GNUC__ # define U_GCC_MAJOR_MINOR (__GNUC__ * 100 + __GNUC_MINOR__) #else # define U_GCC_MAJOR_MINOR 0 #endif /** * \def U_IS_BIG_ENDIAN * Determines the endianness of the platform. * @internal */ # define U_IS_BIG_ENDIAN 0 /** * \def U_HAVE_PLACEMENT_NEW * Determines whether to override placement new and delete for STL. * @stable ICU 2.6 */ #ifdef U_HAVE_PLACEMENT_NEW /* Use the predefined value. */ #elif defined(__BORLANDC__) # define U_HAVE_PLACEMENT_NEW 0 #else # define U_HAVE_PLACEMENT_NEW 1 #endif /** * \def U_HAVE_DEBUG_LOCATION_NEW * Define this to define the MFC debug version of the operator new. * * @stable ICU 3.4 */ #ifdef U_HAVE_DEBUG_LOCATION_NEW /* Use the predefined value. 
*/ #elif defined(_MSC_VER) # define U_HAVE_DEBUG_LOCATION_NEW 1 #else # define U_HAVE_DEBUG_LOCATION_NEW 0 #endif /* Compatibility with non clang compilers: http://clang.llvm.org/docs/LanguageExtensions.html */ #ifndef __has_attribute # define __has_attribute(x) 0 #endif #ifndef __has_cpp_attribute # define __has_cpp_attribute(x) 0 #endif #ifndef __has_builtin # define __has_builtin(x) 0 #endif #ifndef __has_feature # define __has_feature(x) 0 #endif #ifndef __has_extension # define __has_extension(x) 0 #endif #ifndef __has_warning # define __has_warning(x) 0 #endif /** * \def U_MALLOC_ATTR * Attribute to mark functions as malloc-like * @internal */ #if defined(__GNUC__) && __GNUC__>=3 # define U_MALLOC_ATTR __attribute__ ((__malloc__)) #else # define U_MALLOC_ATTR #endif /** * \def U_ALLOC_SIZE_ATTR * Attribute to specify the size of the allocated buffer for malloc-like functions * @internal */ #if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) || __has_attribute(alloc_size) # define U_ALLOC_SIZE_ATTR(X) __attribute__ ((alloc_size(X))) # define U_ALLOC_SIZE_ATTR2(X,Y) __attribute__ ((alloc_size(X,Y))) #else # define U_ALLOC_SIZE_ATTR(X) # define U_ALLOC_SIZE_ATTR2(X,Y) #endif /** * \def U_CPLUSPLUS_VERSION * 0 if no C++; 1, 11, 14, ... if C++. * Support for specific features cannot always be determined by the C++ version alone. * @internal */ #ifdef U_CPLUSPLUS_VERSION # if U_CPLUSPLUS_VERSION != 0 && !defined(__cplusplus) # undef U_CPLUSPLUS_VERSION # define U_CPLUSPLUS_VERSION 0 # endif /* Otherwise use the predefined value. */ #elif !defined(__cplusplus) # define U_CPLUSPLUS_VERSION 0 #elif __cplusplus >= 201402L # define U_CPLUSPLUS_VERSION 14 #elif __cplusplus >= 201103L # define U_CPLUSPLUS_VERSION 11 #else // C++98 or C++03 # define U_CPLUSPLUS_VERSION 1 #endif /** * \def U_HAVE_RVALUE_REFERENCES * Set to 1 if the compiler supports rvalue references. * C++11 feature, necessary for move constructor & move assignment. 
* @internal */ #ifdef U_HAVE_RVALUE_REFERENCES /* Use the predefined value. */ #elif U_CPLUSPLUS_VERSION >= 11 || __has_feature(cxx_rvalue_references) \ || defined(__GXX_EXPERIMENTAL_CXX0X__) \ || (defined(_MSC_VER) && _MSC_VER >= 1600) /* Visual Studio 2010 */ # define U_HAVE_RVALUE_REFERENCES 1 #else # define U_HAVE_RVALUE_REFERENCES 0 #endif /** * \def U_NOEXCEPT * "noexcept" if supported, otherwise empty. * Some code, especially STL containers, uses move semantics of objects only * if the move constructor and the move operator are declared as not throwing exceptions. * @internal */ #ifdef U_NOEXCEPT /* Use the predefined value. */ #elif defined(_HAS_EXCEPTIONS) && !_HAS_EXCEPTIONS /* Visual Studio */ # define U_NOEXCEPT #elif U_CPLUSPLUS_VERSION >= 11 || __has_feature(cxx_noexcept) || __has_extension(cxx_noexcept) \ || (defined(_MSC_VER) && _MSC_VER >= 1900) /* Visual Studio 2015 */ # define U_NOEXCEPT noexcept #else # define U_NOEXCEPT #endif /** * \def U_FALLTHROUGH * Annotate intentional fall-through between switch labels. * http://clang.llvm.org/docs/AttributeReference.html#fallthrough-clang-fallthrough * @internal */ #ifdef __cplusplus # if __has_cpp_attribute(clang::fallthrough) || \ (__has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")) # define U_FALLTHROUGH [[clang::fallthrough]] # else # define U_FALLTHROUGH # endif #else # define U_FALLTHROUGH #endif /** @} */ /*===========================================================================*/ /** @{ Character data types */ /*===========================================================================*/ /** * U_CHARSET_FAMILY is equal to this value when the platform is an ASCII based platform. * @stable ICU 2.0 */ #define U_ASCII_FAMILY 0 /** * U_CHARSET_FAMILY is equal to this value when the platform is an EBCDIC based platform. * @stable ICU 2.0 */ #define U_EBCDIC_FAMILY 1 /** * \def U_CHARSET_FAMILY * *

 * <p>These definitions allow to specify the encoding of text
 * in the char data type as defined by the platform and the compiler.
 * It is enough to determine the code point values of "invariant characters",
 * which are the ones shared by all encodings that are in use
 * on a given platform.</p>
 *
 * <p>Those "invariant characters" should be all the uppercase and lowercase
 * latin letters, the digits, the space, and "basic punctuation".
 * Also, '\\n', '\\r', '\\t' should be available.</p>
 *
 * <p>The list of "invariant characters" is:<br>
 * \code
 *    A-Z a-z 0-9 SPACE " % & ' ( ) * + , - . / : ; < = > ? _
 * \endcode
 * <br>
 * (52 letters + 10 numbers + 20 punc/sym/space = 82 total)</p>
 *
 * <p>This matches the IBM Syntactic Character Set (CS 640).</p>
 *
 * <p>In other words, all the graphic characters in 7-bit ASCII should
 * be safely accessible except the following:</p>

* * \code * '\' * '[' * ']' * '{' * '}' * '^' * '~' * '!' * '#' * '|' * '$' * '@' * '`' * \endcode * @stable ICU 2.0 */ #ifdef U_CHARSET_FAMILY /* Use the predefined value. */ #elif U_PLATFORM == U_PF_OS390 && (!defined(__CHARSET_LIB) || !__CHARSET_LIB) # define U_CHARSET_FAMILY U_EBCDIC_FAMILY #elif U_PLATFORM == U_PF_OS400 && !defined(__UTF32__) # define U_CHARSET_FAMILY U_EBCDIC_FAMILY #else # define U_CHARSET_FAMILY U_ASCII_FAMILY #endif /** * \def U_CHARSET_IS_UTF8 * * Hardcode the default charset to UTF-8. * * If this is set to 1, then * - ICU will assume that all non-invariant char*, StringPiece, std::string etc. * contain UTF-8 text, regardless of what the system API uses * - some ICU code will use fast functions like u_strFromUTF8() * rather than the more general and more heavy-weight conversion API (ucnv.h) * - ucnv_getDefaultName() always returns "UTF-8" * - ucnv_setDefaultName() is disabled and will not change the default charset * - static builds of ICU are smaller * - more functionality is available with the UCONFIG_NO_CONVERSION build-time * configuration option (see unicode/uconfig.h) * - the UCONFIG_NO_CONVERSION build option in uconfig.h is more usable * * @stable ICU 4.2 * @see UCONFIG_NO_CONVERSION */ #ifdef U_CHARSET_IS_UTF8 /* Use the predefined value. */ #elif U_PLATFORM == U_PF_ANDROID || U_PLATFORM_IS_DARWIN_BASED # define U_CHARSET_IS_UTF8 1 #else # define U_CHARSET_IS_UTF8 0 #endif /** @} */ /*===========================================================================*/ /** @{ Information about wchar support */ /*===========================================================================*/ /** * \def U_HAVE_WCHAR_H * Indicates whether is available (1) or not (0). Set to 1 by default. * * @stable ICU 2.0 */ #ifdef U_HAVE_WCHAR_H /* Use the predefined value. */ #elif U_PLATFORM == U_PF_ANDROID && __ANDROID_API__ < 9 /* * Android before Gingerbread (Android 2.3, API level 9) did not support wchar_t. 
* The type and header existed, but the library functions did not work as expected. * The size of wchar_t was 1 but L"xyz" string literals had 32-bit units anyway. */ # define U_HAVE_WCHAR_H 0 #else # define U_HAVE_WCHAR_H 1 #endif /** * \def U_SIZEOF_WCHAR_T * U_SIZEOF_WCHAR_T==sizeof(wchar_t) * * @stable ICU 2.0 */ #ifdef U_SIZEOF_WCHAR_T /* Use the predefined value. */ #elif (U_PLATFORM == U_PF_ANDROID && __ANDROID_API__ < 9) /* * Classic Mac OS and Mac OS X before 10.3 (Panther) did not support wchar_t or wstring. * Newer Mac OS X has size 4. */ # define U_SIZEOF_WCHAR_T 1 #elif U_PLATFORM_HAS_WIN32_API || U_PLATFORM == U_PF_CYGWIN # define U_SIZEOF_WCHAR_T 2 #elif U_PLATFORM == U_PF_AIX /* * AIX 6.1 information, section "Wide character data representation": * "... the wchar_t datatype is 32-bit in the 64-bit environment and * 16-bit in the 32-bit environment." * and * "All locales use Unicode for their wide character code values (process code), * except the IBM-eucTW codeset." */ # ifdef __64BIT__ # define U_SIZEOF_WCHAR_T 4 # else # define U_SIZEOF_WCHAR_T 2 # endif #elif U_PLATFORM == U_PF_OS390 /* * z/OS V1R11 information center, section "LP64 | ILP32": * "In 31-bit mode, the size of long and pointers is 4 bytes and the size of wchar_t is 2 bytes. * Under LP64, the size of long and pointer is 8 bytes and the size of wchar_t is 4 bytes." */ # ifdef _LP64 # define U_SIZEOF_WCHAR_T 4 # else # define U_SIZEOF_WCHAR_T 2 # endif #elif U_PLATFORM == U_PF_OS400 # if defined(__UTF32__) /* * LOCALETYPE(*LOCALEUTF) is specified. * Wide-character strings are in UTF-32, * narrow-character strings are in UTF-8. */ # define U_SIZEOF_WCHAR_T 4 # elif defined(__UCS2__) /* * LOCALETYPE(*LOCALEUCS2) is specified. * Wide-character strings are in UCS-2, * narrow-character strings are in EBCDIC. */ # define U_SIZEOF_WCHAR_T 2 #else /* * LOCALETYPE(*CLD) or LOCALETYPE(*LOCALE) is specified. * Wide-character strings are in 16-bit EBCDIC, * narrow-character strings are in EBCDIC. 
*/ # define U_SIZEOF_WCHAR_T 2 # endif #else # define U_SIZEOF_WCHAR_T 4 #endif #ifndef U_HAVE_WCSCPY #define U_HAVE_WCSCPY U_HAVE_WCHAR_H #endif /** @} */ /** * \def U_HAVE_CHAR16_T * Defines whether the char16_t type is available for UTF-16 * and u"abc" UTF-16 string literals are supported. * This is a new standard type and standard string literal syntax in C++0x * but has been available in some compilers before. * @internal */ #ifdef U_HAVE_CHAR16_T /* Use the predefined value. */ #else /* * Notes: * Visual Studio 10 (_MSC_VER>=1600) defines char16_t but * does not support u"abc" string literals. * gcc 4.4 defines the __CHAR16_TYPE__ macro to a usable type but * does not support u"abc" string literals. * C++11 and C11 require support for UTF-16 literals */ # if U_CPLUSPLUS_VERSION >= 11 || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) # define U_HAVE_CHAR16_T 1 # else # define U_HAVE_CHAR16_T 0 # endif #endif /** * @{ * \def U_DECLARE_UTF16 * Do not use this macro because it is not defined on all platforms. * Use the UNICODE_STRING or U_STRING_DECL macros instead. * @internal */ #ifdef U_DECLARE_UTF16 /* Use the predefined value. */ #elif U_HAVE_CHAR16_T \ || (defined(__xlC__) && defined(__IBM_UTF_LITERAL) && U_SIZEOF_WCHAR_T != 2) \ || (defined(__HP_aCC) && __HP_aCC >= 035000) \ || (defined(__HP_cc) && __HP_cc >= 111106) # define U_DECLARE_UTF16(string) u ## string #elif U_SIZEOF_WCHAR_T == 2 \ && (U_CHARSET_FAMILY == 0 || (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400 && defined(__UCS2__))) # define U_DECLARE_UTF16(string) L ## string #else /* Leave U_DECLARE_UTF16 undefined. See unistr.h. */ #endif /** @} */ /*===========================================================================*/ /** @{ Symbol import-export control */ /*===========================================================================*/ #ifdef U_EXPORT /* Use the predefined value. 
*/ #elif defined(U_STATIC_IMPLEMENTATION) # define U_EXPORT #elif defined(__GNUC__) # define U_EXPORT __attribute__((visibility("default"))) #elif (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x550) \ || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550) # define U_EXPORT __global /*#elif defined(__HP_aCC) || defined(__HP_cc) # define U_EXPORT __declspec(dllexport)*/ #elif defined(_MSC_VER) # define U_EXPORT __declspec(dllexport) #else # define U_EXPORT #endif /* U_CALLCONV is releated to U_EXPORT2 */ #ifdef U_EXPORT2 /* Use the predefined value. */ #elif defined(_MSC_VER) # define U_EXPORT2 __cdecl #else # define U_EXPORT2 #endif #ifdef U_IMPORT /* Use the predefined value. */ #elif defined(_MSC_VER) /* Windows needs to export/import data. */ # define U_IMPORT __declspec(dllimport) #else # define U_IMPORT #endif /** * \def U_CALLCONV * Similar to U_CDECL_BEGIN/U_CDECL_END, this qualifier is necessary * in callback function typedefs to make sure that the calling convention * is compatible. * * This is only used for non-ICU-API functions. * When a function is a public ICU API, * you must use the U_CAPI and U_EXPORT2 qualifiers. * @stable ICU 2.0 */ #if U_PLATFORM == U_PF_OS390 && defined(__cplusplus) # define U_CALLCONV __cdecl #else # define U_CALLCONV U_EXPORT2 #endif /* @} */ #endif // ptypes.h /* ****************************************************************************** * * Copyright (C) 1997-2012, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * * FILE NAME : ptypes.h * * Date Name Description * 05/13/98 nos Creation (content moved here from ptypes.h). * 03/02/99 stephen Added AS400 support. * 03/30/99 stephen Added Linux support. * 04/13/99 stephen Reworked for autoconf. 
* 09/18/08 srl Moved basic types back to ptypes.h from platform.h ****************************************************************************** */ /** * \file * \brief C API: Definitions of integer types of various widths */ #ifndef _PTYPES_H #define _PTYPES_H /** * \def __STDC_LIMIT_MACROS * According to the Linux stdint.h, the ISO C99 standard specifies that in C++ implementations * macros like INT32_MIN and UINTPTR_MAX should only be defined if explicitly requested. * We need to define __STDC_LIMIT_MACROS before including stdint.h in C++ code * that uses such limit macros. * @internal */ #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS #endif /* NULL, size_t, wchar_t */ #include /* * If all compilers provided all of the C99 headers and types, * we would just unconditionally #include here * and not need any of the stuff after including platform.h. */ /* Find out if we have stdint.h etc. */ /*===========================================================================*/ /* Generic data types */ /*===========================================================================*/ /* If your platform does not have the header, you may need to edit the typedefs in the #else section below. Use #if...#else...#endif with predefined compiler macros if possible. */ #if U_HAVE_STDINT_H /* * We mostly need (which defines the standard integer types) but not . * includes and adds the printf/scanf helpers PRId32, SCNx16 etc. * which we almost never use, plus stuff like imaxabs() which we never use. */ #include #if U_PLATFORM == U_PF_OS390 /* The features header is needed to get (u)int64_t sometimes. */ #include /* z/OS has , but some versions are missing uint8_t (APAR PK62248). */ #if !defined(__uint8_t) #define __uint8_t 1 typedef unsigned char uint8_t; #endif #endif /* U_PLATFORM == U_PF_OS390 */ #elif U_HAVE_INTTYPES_H # include #else /* neither U_HAVE_STDINT_H nor U_HAVE_INTTYPES_H */ #if ! U_HAVE_INT8_T typedef signed char int8_t; #endif #if ! 
U_HAVE_UINT8_T typedef unsigned char uint8_t; #endif #if ! U_HAVE_INT16_T typedef signed short int16_t; #endif #if ! U_HAVE_UINT16_T typedef unsigned short uint16_t; #endif #if ! U_HAVE_INT32_T typedef signed int int32_t; #endif #if ! U_HAVE_UINT32_T typedef unsigned int uint32_t; #endif #if ! U_HAVE_INT64_T #ifdef _MSC_VER typedef signed __int64 int64_t; #else typedef signed long long int64_t; #endif #endif #if ! U_HAVE_UINT64_T #ifdef _MSC_VER typedef unsigned __int64 uint64_t; #else typedef unsigned long long uint64_t; #endif #endif #endif /* U_HAVE_STDINT_H / U_HAVE_INTTYPES_H */ #endif /* _PTYPES_H */ // umachine.h /* ****************************************************************************** * * Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: umachine.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 1999sep13 * created by: Markus W. Scherer * * This file defines basic types and constants for ICU to be * platform-independent. umachine.h and utf.h are included into * utypes.h to provide all the general definitions for ICU. * All of these definitions used to be in utypes.h before * the UTF-handling macros made this unmaintainable. */ #ifndef __UMACHINE_H__ #define __UMACHINE_H__ /** * \file * \brief Basic types and constants for UTF * *

Basic types and constants for UTF

* This file defines basic types and constants for utf.h to be * platform-independent. umachine.h and utf.h are included into * utypes.h to provide all the general definitions for ICU. * All of these definitions used to be in utypes.h before * the UTF-handling macros made this unmaintainable. * */ /*==========================================================================*/ /* Include platform-dependent definitions */ /* which are contained in the platform-specific file platform.h */ /*==========================================================================*/ /* * ANSI C headers: * stddef.h defines wchar_t */ #include /*==========================================================================*/ /* For C wrappers, we use the symbol U_STABLE. */ /* This works properly if the includer is C or C++. */ /* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */ /*==========================================================================*/ /** * \def U_CFUNC * This is used in a declaration of a library private ICU C function. * @stable ICU 2.4 */ /** * \def U_CDECL_BEGIN * This is used to begin a declaration of a library private ICU C API. 
* @stable ICU 2.4 */ /** * \def U_CDECL_END * This is used to end a declaration of a library private ICU C API * @stable ICU 2.4 */ #ifdef __cplusplus # define U_CFUNC extern "C" # define U_CDECL_BEGIN extern "C" { # define U_CDECL_END } #else # define U_CFUNC extern # define U_CDECL_BEGIN # define U_CDECL_END #endif #ifndef U_ATTRIBUTE_DEPRECATED /** * \def U_ATTRIBUTE_DEPRECATED * This is used for GCC specific attributes * @internal */ #if U_GCC_MAJOR_MINOR >= 302 # define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated)) /** * \def U_ATTRIBUTE_DEPRECATED * This is used for Visual C++ specific attributes * @internal */ #elif defined(_MSC_VER) && (_MSC_VER >= 1400) # define U_ATTRIBUTE_DEPRECATED __declspec(deprecated) #else # define U_ATTRIBUTE_DEPRECATED #endif #endif /** This is used to declare a function as a public ICU C API @stable ICU 2.0*/ #define U_CAPI U_CFUNC U_EXPORT /** This is used to declare a function as a stable public ICU C API*/ #define U_STABLE U_CAPI /** This is used to declare a function as a draft public ICU C API */ #define U_DRAFT U_CAPI /** This is used to declare a function as a deprecated public ICU C API */ #define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED /** This is used to declare a function as an obsolete public ICU C API */ #define U_OBSOLETE U_CAPI /** This is used to declare a function as an internal ICU C API */ #define U_INTERNAL U_CAPI /** * \def U_OVERRIDE * Defined to the C++11 "override" keyword if available. * Denotes a class or member which is an override of the base class. * May result in an error if it applied to something not an override. * @internal */ /** * \def U_FINAL * Defined to the C++11 "final" keyword if available. * Denotes a class or member which may not be overridden in subclasses. * May result in an error if subclasses attempt to override. 
* @internal */ #if U_CPLUSPLUS_VERSION >= 11 /* C++11 */ #ifndef U_OVERRIDE #define U_OVERRIDE override #endif #ifndef U_FINAL #define U_FINAL final #endif #else /* not C++11 - define to nothing */ #ifndef U_OVERRIDE #define U_OVERRIDE #endif #ifndef U_FINAL #define U_FINAL #endif #endif /*==========================================================================*/ /* limits for int32_t etc., like in POSIX inttypes.h */ /*==========================================================================*/ #ifndef INT8_MIN /** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */ # define INT8_MIN ((int8_t)(-128)) #endif #ifndef INT16_MIN /** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */ # define INT16_MIN ((int16_t)(-32767-1)) #endif #ifndef INT32_MIN /** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */ # define INT32_MIN ((int32_t)(-2147483647-1)) #endif #ifndef INT8_MAX /** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */ # define INT8_MAX ((int8_t)(127)) #endif #ifndef INT16_MAX /** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */ # define INT16_MAX ((int16_t)(32767)) #endif #ifndef INT32_MAX /** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */ # define INT32_MAX ((int32_t)(2147483647)) #endif #ifndef UINT8_MAX /** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */ # define UINT8_MAX ((uint8_t)(255U)) #endif #ifndef UINT16_MAX /** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */ # define UINT16_MAX ((uint16_t)(65535U)) #endif #ifndef UINT32_MAX /** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */ # define UINT32_MAX ((uint32_t)(4294967295U)) #endif #if defined(U_INT64_T_UNAVAILABLE) # error int64_t is required for decimal format and rule-based number format. #else # ifndef INT64_C /** * Provides a platform independent way to specify a signed 64-bit integer constant. 
* note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C * @stable ICU 2.8 */ # define INT64_C(c) c ## LL # endif # ifndef UINT64_C /** * Provides a platform independent way to specify an unsigned 64-bit integer constant. * note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C * @stable ICU 2.8 */ # define UINT64_C(c) c ## ULL # endif # ifndef U_INT64_MIN /** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */ # define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1)) # endif # ifndef U_INT64_MAX /** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */ # define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807))) # endif # ifndef U_UINT64_MAX /** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */ # define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615))) # endif #endif /*==========================================================================*/ /* Boolean data type */ /*==========================================================================*/ /** The ICU boolean type @stable ICU 2.0 */ typedef int8_t UBool; #ifndef TRUE /** The TRUE value of a UBool @stable ICU 2.0 */ # define TRUE 1 #endif #ifndef FALSE /** The FALSE value of a UBool @stable ICU 2.0 */ # define FALSE 0 #endif /*==========================================================================*/ /* Unicode data types */ /*==========================================================================*/ /* wchar_t-related definitions -------------------------------------------- */ /* * \def U_WCHAR_IS_UTF16 * Defined if wchar_t uses UTF-16. * * @stable ICU 2.0 */ /* * \def U_WCHAR_IS_UTF32 * Defined if wchar_t uses UTF-32. 
* * @stable ICU 2.0 */ #if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32) # ifdef __STDC_ISO_10646__ # if (U_SIZEOF_WCHAR_T==2) # define U_WCHAR_IS_UTF16 # elif (U_SIZEOF_WCHAR_T==4) # define U_WCHAR_IS_UTF32 # endif # elif defined __UCS2__ # if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2) # define U_WCHAR_IS_UTF16 # endif # elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__)) # if (U_SIZEOF_WCHAR_T==4) # define U_WCHAR_IS_UTF32 # endif # elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED) # define U_WCHAR_IS_UTF32 # elif U_PLATFORM_HAS_WIN32_API # define U_WCHAR_IS_UTF16 # endif #endif /* UChar and UChar32 definitions -------------------------------------------- */ /** Number of bytes in a UChar. @stable ICU 2.0 */ #define U_SIZEOF_UCHAR 2 /** * \var UChar * Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), * or wchar_t if that is 16 bits wide; always assumed to be unsigned. * If neither is available, then define UChar to be uint16_t. * * This makes the definition of UChar platform-dependent * but allows direct string type compatibility with platforms with * 16-bit wchar_t types. * * @stable ICU 4.4 */ #if defined(UCHAR_TYPE) typedef UCHAR_TYPE UChar; /* Not #elif U_HAVE_CHAR16_T -- because that is type-incompatible with pre-C++11 callers typedef char16_t UChar; */ #elif U_SIZEOF_WCHAR_T==2 typedef wchar_t UChar; #elif defined(__CHAR16_TYPE__) typedef __CHAR16_TYPE__ UChar; #else typedef uint16_t UChar; #endif /** * Define UChar32 as a type for single Unicode code points. * UChar32 is a signed 32-bit integer (same as int32_t). * * The Unicode code point range is 0..0x10ffff. * All other values (negative or >=0x110000) are illegal as Unicode code points. * They may be used as sentinel values to indicate "done", "error" * or similar non-code point conditions. 
* * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned) * or else to be uint32_t. * That is, the definition of UChar32 was platform-dependent. * * @see U_SENTINEL * @stable ICU 2.4 */ typedef int32_t UChar32; /** * This value is intended for sentinel values for APIs that * (take or) return single code points (UChar32). * It is outside of the Unicode code point range 0..0x10ffff. * * For example, a "done" or "error" value in a new API * could be indicated with U_SENTINEL. * * ICU APIs designed before ICU 2.4 usually define service-specific "done" * values, mostly 0xffff. * Those may need to be distinguished from * actual U+ffff text contents by calling functions like * CharacterIterator::hasNext() or UnicodeString::length(). * * @return -1 * @see UChar32 * @stable ICU 2.4 */ #define U_SENTINEL (-1) #endif // utf.h /* ******************************************************************************* * * Copyright (C) 1999-2011, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: utf.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 1999sep09 * created by: Markus W. Scherer */ /** * \file * \brief C API: Code point macros * * This file defines macros for checking whether a code point is * a surrogate or a non-character etc. * * The UChar and UChar32 data types for Unicode code units and code points * are defined in umachine.h because they can be machine-dependent. * * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h * and itself includes utf8.h and utf16.h after some * common definitions. * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 1 then each of these headers must be * included explicitly if their definitions are used. 
* * utf8.h and utf16.h define macros for efficiently getting code points * in and out of UTF-8/16 strings. * utf16.h macros have "U16_" prefixes. * utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling. * * ICU mostly processes 16-bit Unicode strings. * Most of the time, such strings are well-formed UTF-16. * Single, unpaired surrogates must be handled as well, and are treated in ICU * like regular code points where possible. * (Pairs of surrogate code points are indistinguishable from supplementary * code points encoded as pairs of supplementary code units.) * * In fact, almost all Unicode code points in normal text (>99%) * are on the BMP (<=U+ffff) and even <=U+d7ff. * ICU functions handle supplementary code points (U+10000..U+10ffff) * but are optimized for the much more frequently occurring BMP code points. * * umachine.h defines UChar to be an unsigned 16-bit integer. * Where available, UChar is defined to be a char16_t * or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t. * * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit * Unicode code point (Unicode scalar value, 0..0x10ffff). * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as * the definition of UChar. For details see the documentation for UChar32 itself. * * utf.h defines a small number of C macros for single Unicode code points. * These are simple checks for surrogates and non-characters. * For actual Unicode character properties see uchar.h. * * By default, string operations must be done with error checking in case * a string is not well-formed UTF-16. * The macros will detect if a surrogate code unit is unpaired * (lead unit without trail unit or vice versa) and just return the unit itself * as the code point. * * The regular "safe" macros require that the initial, passed-in string index * is within bounds. They only check the index when they read more than one * code unit. 
This is usually done with code similar to the following loop: *
<pre>while(i<length) {
 *   U16_NEXT(s, i, length, c);
 *   // use c
 * }</pre>
 *
 * When it is safe to assume that text is well-formed UTF-16
 * (does not contain single, unpaired surrogates), then one can use
 * U16_..._UNSAFE macros.
 * These do not check for proper code unit sequences or truncated text and may
 * yield wrong results or even cause a crash if they are used with "malformed"
 * text.
 * In practice, U16_..._UNSAFE macros will produce slightly less code but
 * should not be faster because the processing is only different when a
 * surrogate code unit is detected, which will be rare.
 *
 * Similarly for UTF-8, there are "safe" macros without a suffix,
 * and U8_..._UNSAFE versions.
 * The performance differences are much larger here because UTF-8 provides so
 * many opportunities for malformed sequences.
 * The unsafe UTF-8 macros are entirely implemented inside the macro definitions
 * and are fast, while the safe UTF-8 macros call functions for all but the
 * trivial (ASCII) cases.
 * (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common
 * characters inline as well.)
 *
 * Unlike with UTF-16, malformed sequences cannot be expressed with distinct
 * code point values (0..U+10ffff). They are indicated with negative values instead.
 *
 * For more information see the ICU User Guide Strings chapter
 * (http://userguide.icu-project.org/strings).
 *
 * Usage:
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used  for if-else-while... 
 * bodies and all macro statements should be terminated with semicolon.
 *
 * @stable ICU 2.4
 */

#ifndef __UTF_H__
#define __UTF_H__

/* include the utfXX.h after the following definitions */

/* single-code point definitions -------------------------------------------- */

/**
 * Tests whether a code point is a Unicode noncharacter, i.e. lies in
 * U+fdd0..U+fdef or has fffe/ffff as its low 16 bits, and does not
 * exceed U+10ffff.
 * The argument is evaluated more than once.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_UNICODE_NONCHAR(c) \
    (0xfdd0<=(c) && \
     (0xfdef>=(uint32_t)(c) || 0xfffe==((c)&0xfffe)) && \
     0x10ffff>=(uint32_t)(c))

/**
 * Tests whether c is a Unicode code point (0..U+10ffff) to which a
 * character can be assigned.
 *
 * The following are rejected:
 * - surrogate code points (U+d800..U+dfff, 2048 code points)
 * - the final two code points of every plane (U+__fffe and U+__ffff, 34 code points)
 * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
 * - anything above the highest Unicode code point, U+10ffff
 *
 * Every code point below U+d800 is a character code point, so that
 * boundary is checked first for performance.
 * The argument is evaluated more than once.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_UNICODE_CHAR(c) \
    ((uint32_t)(c)<0xd800 ? 1 : \
        ((uint32_t)(c)<=0xdfff ? 0 : \
            ((uint32_t)(c)>0x10ffff ? 0 : \
                !U_IS_UNICODE_NONCHAR(c))))

/**
 * Tests whether a code point lies on the Basic Multilingual Plane
 * (U+0000..U+ffff), i.e. whether nothing is set above its low 16 bits.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) (((uint32_t)(c)>>16)==0)

/**
 * Is this code point a supplementary code point (U+10000..U+10ffff)?
 *
 * The argument is converted to uint32_t before the offset is subtracted,
 * so the subtraction is done in unsigned (wrapping) arithmetic and cannot
 * overflow a signed type, even for very negative sentinel values.
 * Values below U+10000 wrap around to a large unsigned number and fail
 * the range check.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) (((uint32_t)(c)-0x10000u)<=0xfffffu)
 
/**
 * Tests whether a code point is a lead surrogate (U+d800..U+dbff).
 * All bits above the low 10 must match those of U+d800.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_LEAD(c) (((uint32_t)(c)>>10)==(0xd800>>10))

/**
 * Tests whether a code point is a trail surrogate (U+dc00..U+dfff).
 * All bits above the low 10 must match those of U+dc00.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_TRAIL(c) (((uint32_t)(c)>>10)==(0xdc00>>10))

/**
 * Tests whether a code point is a surrogate of either kind (U+d800..U+dfff).
 * All bits above the low 11 must match those of U+d800.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((uint32_t)(c)>>11)==(0xd800>>11))

/**
 * Given that c is already known to be a surrogate code point
 * (U_IS_SURROGATE(c)), tests whether it is a lead surrogate.
 * Only bit 0x400 is inspected: it is clear for lead surrogates.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE_LEAD(c) (((c)&0x400)!=0x400)

/**
 * Given that c is already known to be a surrogate code point
 * (U_IS_SURROGATE(c)), tests whether it is a trail surrogate.
 * Only bit 0x400 is inspected: it is set for trail surrogates.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 4.2
 */
#define U_IS_SURROGATE_TRAIL(c) (((c)&0x400)==0x400)

/* include the utfXX.h ------------------------------------------------------ */


#endif  /* __UTF_H__ */

// utf8.h
/*
*******************************************************************************
*
*   Copyright (C) 1999-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf8.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep13
*   created by: Markus W. Scherer
*/

/**
 * \file
 * \brief C API: 8-bit Unicode handling macros
 * 
 * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
 *
 * For more information see utf.h and the ICU User Guide Strings chapter
 * (http://userguide.icu-project.org/strings).
 *
 * Usage:
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used  for if-else-while... 
 * bodies and all macro statements should be terminated with semicolon.
 */

#ifndef __UTF8_H__
#define __UTF8_H__

#ifndef __UTF_H__
#   include "icu/utf.h"
#endif

/* internal definitions ----------------------------------------------------- */

/**
 * \var utf8_countTrailBytes
 * Internal array with numbers of trail bytes for any given byte used in
 * lead byte position.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable,
 * and should not be hidden when other internal functions are hidden (otherwise
 * public macros would fail to compile).
 *
 * The declaration is assembled from two pieces: a conditional prefix that
 * selects the linkage/visibility qualifiers, followed by one declarator
 * shared by all three branches.
 * @internal
 */
#ifdef U_UTF8_IMPL
/* U_UTF8_IMPL is defined: the array is qualified with U_EXPORT only. */
U_EXPORT const uint8_t 
#elif defined(U_STATIC_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION)
/* U_STATIC_IMPLEMENTATION or U_COMMON_IMPLEMENTATION is defined: U_CFUNC with no import qualifier. */
U_CFUNC const uint8_t
#else
/* Every other includer: U_CFUNC plus U_IMPORT. */
U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ 
#endif
/* Common declarator: 256 entries, indexable by any 8-bit value. */
utf8_countTrailBytes[256];

/**
 * Returns the number of trail bytes that follow a given UTF-8 lead byte.
 * The result is 0 for 0..0xbf and for 0xfe and 0xff, 1 for 0xc0..0xdf,
 * 2 for 0xe0..0xef, 3 for 0xf0..0xf7, 4 for 0xf8..0xfb and 5 for 0xfc..0xfd.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * Note: Beginning with ICU 50, the implementation uses a multi-condition expression
 * which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
 * leadByte is evaluated multiple times.
 *
 * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
 * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
 * leadByte was evaluated exactly once.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) \
    ((uint8_t)(leadByte)>=0xf0 ? \
        ((uint8_t)(leadByte)>=0xfe ? 0 : \
            3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc)) : \
        ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0))

/**
 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
 * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
 * leadByte might be evaluated multiple times.
 *
 * Each comparison first converts leadByte to uint8_t, so that a lead byte
 * read through a signed char (where bytes >=0x80 are negative) is counted
 * the same way as one read through uint8_t. For arguments already in
 * 0..0xff the result is unchanged by the conversion.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    (((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))

/**
 * Strips the length-marker bits from a UTF-8 lead byte in place, keeping only
 * the low (6-countTrailBytes) bits that contribute to the code point value.
 * leadByte must be a modifiable lvalue; it is updated via compound assignment.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 * @internal
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte)&=~(~0u<<(6-(countTrailBytes))))

/**
 * Function for handling "next code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * NOTE(review): only the prototype is visible here; the parameter notes below
 * are inferred from names and types and should be confirmed against the
 * implementation.
 *
 * @param s      read-only UTF-8 byte string
 * @param pi     pointer to an int32_t string offset; being a pointer, it is
 *               presumably updated by the function -- TODO confirm
 * @param length presumably the limit for the offset -- TODO confirm
 * @param c      presumably the lead byte already read by the calling macro -- TODO confirm
 * @param strict strictness flag; its exact semantics are not visible here
 * @return a UChar32; presumably the decoded code point, or a negative value
 *         for an illegal sequence -- TODO confirm
 * @internal
 */
U_STABLE UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);

/**
 * Function for handling "append code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * NOTE(review): only the prototype is visible here; the parameter notes below
 * are inferred from names and types and should be confirmed against the
 * implementation.
 *
 * @param s        writable UTF-8 byte buffer
 * @param i        presumably the offset in s at which to write -- TODO confirm
 * @param length   presumably the capacity of s -- TODO confirm
 * @param c        code point to append
 * @param pIsError pointer to a UBool; presumably set when c cannot be
 *                 appended -- TODO confirm, including whether NULL is accepted
 * @return an int32_t; presumably the offset after the appended bytes -- TODO confirm
 * @internal
 */
U_STABLE int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);

/**
 * Function for handling "previous code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * NOTE(review): only the prototype is visible here; the parameter notes below
 * are inferred from names and types and should be confirmed against the
 * implementation.
 *
 * @param s      read-only UTF-8 byte string
 * @param start  presumably the lowest offset the function may move back to -- TODO confirm
 * @param pi     pointer to an int32_t string offset; being a pointer, it is
 *               presumably updated by the function -- TODO confirm
 * @param c      presumably the byte already read by the calling macro -- TODO confirm
 * @param strict strictness flag; its exact semantics are not visible here
 * @return a UChar32; presumably the decoded code point, or a negative value
 *         for an illegal sequence -- TODO confirm
 * @internal
 */
U_STABLE UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);

/**
 * Function for handling "skip backward one code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * NOTE(review): only the prototype is visible here; the parameter notes below
 * are inferred from names and types and should be confirmed against the
 * implementation.
 *
 * @param s     read-only UTF-8 byte string
 * @param start presumably the lowest offset the function may move back to -- TODO confirm
 * @param i     string offset, passed by value (the caller's variable is not modified)
 * @return an int32_t; presumably the new offset after skipping back -- TODO confirm
 * @internal
 */
U_STABLE int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);

/* single-code point definitions -------------------------------------------- */

/**
 * Tests whether a code unit (byte) encodes a code point all by itself,
 * i.e. is in the US-ASCII range 0..0x7f (bit 0x80 is clear).
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_SINGLE(c) (!((c)&0x80))

/**
 * Tests whether a UTF-8 code unit is a lead byte.
 * The check accepts the byte values 0xc0..0xfd: subtracting 0xc0 and
 * truncating to 8 bits maps exactly that range onto 0..0x3d.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_LEAD(c) (0x3e>(uint8_t)((c)-0xc0))

/**
 * Tests whether a UTF-8 code unit is a trail byte, i.e. whether its two
 * top bits are 10 (byte values 0x80..0xbf).
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_TRAIL(c) (0x80==((c)&0xc0))

/**
 * Number of code units (bytes) in the UTF-8 encoding of a Unicode code point.
 * The argument is compared as an unsigned 32-bit value, so negative inputs
 * fall into the "not a code point" case.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)>0x7f ? \
        ((uint32_t)(c)>0x7ff ? \
            ((uint32_t)(c)>0xd7ff ? \
                ((uint32_t)(c)>0xdfff && (uint32_t)(c)<=0x10ffff ? \
                    ((uint32_t)(c)>0xffff ? 4 : 3) \
                : 0) \
            : 3) \
        : 2) \
    : 1)

/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
 * This equals the largest value that U8_LENGTH() yields.
 * @return 4
 * @stable ICU 2.4
 */
#define U8_MAX_LENGTH 4

/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_GET
 * @stable ICU 2.4
 */
#define U8_GET_UNSAFE(s, i, c) { \
    int32_t _u8_get_unsafe_index=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
}

/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i=0x80) { \
        if((c)<0xe0) { \
            (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
        } else if((c)<0xf0) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
            (i)+=2; \
        } else { \
            (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
            (i)+=3; \
        } \
    } \
}

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i=0x80) { \
        uint8_t __t1, __t2; \
        if( /* handle U+1000..U+CFFF inline */ \
            (0xe0<(c) && (c)<=0xec) && \
            (((i)+1)<(length) || (length)<0) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
        ) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
            ((c)<0xe0 && (c)>=0xc2) && \
            ((i)!=(length)) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
        ) { \
            (c)=(((c)&0x1f)<<6)|__t1; \
            ++(i); \
        } else { \
            /* function call for "complicated" and error cases */ \
            (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \
        } \
    } \
}

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_NEXT() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i=0x80) { \
        uint8_t __t1, __t2; \
        if( /* handle U+1000..U+CFFF inline */ \
            (0xe0<(c) && (c)<=0xec) && \
            (((i)+1)<(length) || (length)<0) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
        ) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
            ((c)<0xe0 && (c)>=0xc2) && \
            ((i)!=(length)) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
        ) { \
            (c)=(((c)&0x1f)<<6)|__t1; \
            ++(i); \
        } else { \
            /* function call for "complicated" and error cases */ \
            (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \
        } \
    } \
}

/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) once per byte written.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * The encoded length is chosen from the code point value:
 * up to 0x7f one byte, up to 0x7ff two, up to 0xffff three, otherwise four.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0x7f) { \
        /* single byte */ \
        (s)[(i)++]=(uint8_t)(c); \
    } else if((uint32_t)(c)<=0x7ff) { \
        /* lead byte 110xxxxx plus one trail byte */ \
        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xffff) { \
        /* lead byte 1110xxxx plus two trail bytes */ \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        /* lead byte 11110xxx plus three trail bytes */ \
        (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } \
}

/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to TRUE.
 *
 * @param s const uint8_t * string buffer
 * @param i int32_t string offset, must be i>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        (i)=utf8_appendCharSafeBody(s, (i), (capacity), c, &(isError)); \
    } \
}

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The byte at the offset is read once; the offset is then advanced by one
 * plus the value U8_COUNT_TRAIL_BYTES_UNSAFE yields for that byte.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @stable ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) { \
    (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
}

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i(length) && (length)>=0) { \
            __count=(uint8_t)((length)-(i)); \
        } \
        while(__count>0 && U8_IS_TRAIL((s)[i])) { \
            ++(i); \
            --__count; \
        } \
    } \
}

/**
 * Move the string offset forward by n code points, i.e. from one code point
 * boundary to the n-th following one.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * n is evaluated once and copied into a local counter; a value of zero or
 * less leaves the offset unchanged.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_FWD_N
 * @stable ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    for(; __N>0; --__N) { \
        U8_FWD_1_UNSAFE(s, i); \
    } \
}

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
        U8_FWD_1(s, i, length); \
        --__N; \
    } \
}

/**
 * Snap a random-access offset back to the start of the code point it lies in.
 * While the byte at the offset is a UTF-8 trail byte, the offset is
 * decremented; an offset that is not on a trail byte is left as it is.
 * "Unsafe" macro, assumes well-formed UTF-8 (there is no lower bound check).
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @stable ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) { \
    for(; U8_IS_TRAIL((s)[i]); --(i)) {} \
}

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The byte at the offset is tested inline. Only when it is a trail byte is
 * utf8_back1SafeBody() called, and its return value becomes the new offset.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) { \
    if(U8_IS_TRAIL((s)[(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
}

/* definitions with backward iteration -------------------------------------- */

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * @stable ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
\
        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
            __b=(uint8_t)(s)[--(i)]; \
            if(__b>=0xc0) { \
                U8_MASK_LEAD_BYTE(__b, __count); \
                (c)|=(UChar32)__b<<__shift; \
                break; \
            } else { \
                (c)|=(UChar32)(__b&0x3f)<<__shift; \
                ++__count; \
                __shift+=6; \
            } \
        } \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start=0x80) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_PREV() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start=0x80) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_BACK_1
 * @stable ICU 2.4
 */
#define U8_BACK_1_UNSAFE(s, i) { \
    while(U8_IS_TRAIL((s)[--(i)])) {} \
}

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start0) { \
        U8_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param start int32_t index of the start of the string
 * @param i int32_t string offset, must be start0 && (i)>(start)) { \
        U8_BACK_1(s, start, i); \
        --__N; \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * Implemented as a step back to the start of the preceding code point
 * followed by a step forward over that whole code point.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
    U8_BACK_1_UNSAFE(s, i); \
    U8_FWD_1_UNSAFE(s, i); \
}

/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i<=length
 * @param length int32_t string length
 * @see U8_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT(s, start, i, length) { \
    if((start)<(i) && ((i)<(length) || (length)<0)) { \
        U8_BACK_1(s, start, i); \
        U8_FWD_1(s, i, length); \
    } \
}

#endif

// utf16.h
/*
*******************************************************************************
*
*   Copyright (C) 1999-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf16.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep09
*   created by: Markus W. Scherer
*/

/**
 * \file
 * \brief C API: 16-bit Unicode handling macros
 * 
 * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
 *
 * For more information see utf.h and the ICU User Guide Strings chapter
 * (http://userguide.icu-project.org/strings).
 *
 * Usage:
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies, and all macro statements should be terminated with a semicolon.
 */

#ifndef __UTF16_H__
#define __UTF16_H__

#ifndef __UTF_H__
#   include "icu/utf.h"
#endif

/* single-code point definitions -------------------------------------------- */

/**
 * Tests whether a 16-bit code unit encodes a code point on its own,
 * i.e. whether it is a BMP value that is not a surrogate.
 * This is the logical negation of U_IS_SURROGATE(c).
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))

/**
 * Tests whether a code unit is a lead surrogate (U+d800..U+dbff).
 * All bits above the low ten must match 0xd800, so values beyond 16 bits
 * are rejected as well.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (0xd800==((c)&0xfffffc00))

/**
 * Tests whether a code unit is a trail surrogate (U+dc00..U+dfff).
 * All bits above the low ten must match 0xdc00, so values beyond 16 bits
 * are rejected as well.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (0xdc00==((c)&0xfffffc00))

/**
 * Is this code unit a surrogate (U+d800..U+dfff)?
 * This is a UTF-16-named alias that expands directly to U_IS_SURROGATE(c).
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)

/**
 * Given a code unit already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tests whether it is a lead surrogate.
 * Only bit 0x400 is inspected, so the answer is meaningless for
 * non-surrogate input.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (!((c)&0x400))

/**
 * Given a code unit already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tests whether it is a trail surrogate.
 * Only bit 0x400 is inspected, so the answer is meaningless for
 * non-surrogate input.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 4.2
 */
#define U16_IS_SURROGATE_TRAIL(c) (0!=((c)&0x400))

/**
 * Helper constant for U16_GET_SUPPLEMENTARY.
 * It is the amount that U16_GET_SUPPLEMENTARY subtracts from
 * (lead<<10)+trail: the lowest lead value 0xd800 shifted left by 10,
 * plus the lowest trail value 0xdc00, minus 0x10000.
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)

/**
 * Get a supplementary code point value (U+10000..U+10ffff)
 * from its lead and trail surrogates.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * Both arguments are cast to UChar32 before the arithmetic, and each is
 * evaluated exactly once.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)


/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 *
 * The expansion is fully parenthesized so that it behaves as a single
 * primary expression in every context (for example as the operand of
 * sizeof), rather than as a bare cast expression.
 *
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))

/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 *
 * The expansion is fully parenthesized so that it behaves as a single
 * primary expression in every context (for example as the operand of
 * sizeof), rather than as a bare cast expression.
 *
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))

/**
 * Number of 16-bit code units needed to encode a Unicode code point: 1 or 2.
 * The argument is compared as an unsigned 32-bit value against 0xffff.
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define U16_LENGTH(c) ((uint32_t)(c)>0xffff ? 2 : 1)

/**
 * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
 * This equals the largest value that U16_LENGTH() yields.
 * @return 2
 * @stable ICU 2.4
 */
#define U16_MAX_LENGTH 2

/**
 * Read the code point at a random-access offset and leave the offset as it is.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may sit on either half of a surrogate pair: a lead unit is
 * combined with the unit after it, a trail unit with the unit before it.
 * A non-surrogate unit is returned unchanged.
 * The result is undefined if the offset points to a single, unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_GET
 * @stable ICU 2.4
 */
#define U16_GET_UNSAFE(s, i, c) { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        if(!U16_IS_SURROGATE_LEAD(c)) { \
            /* trail unit: its partner is the unit just before the offset */ \
            (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
        } else { \
            /* lead unit: its partner is the unit just after the offset */ \
            (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
        } \
    } \
}

/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to a single, unpaired surrogate, then that itself
 * will be returned as the code point.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } \
        } \
    } \
}

/* definitions with forward iteration --------------------------------------- */

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
}

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param i string offset, must be i>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}

/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a surrogate pair is written, checks for sufficient space in the string.
 * If the code point is not valid or a trail surrogate does not fit,
 * then isError is set to TRUE.
 *
 * @param s const UChar * string buffer
 * @param i string offset, must be i>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        (isError)=TRUE; \
    } \
}

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_FWD_1
 * @stable ICU 2.4
 */
#define U16_FWD_1_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)++])) { \
        ++(i); \
    } \
}

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const UChar * string
 * @param i string offset, must be i0) { \
        U16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const UChar * string
 * @param i int32_t string offset, must be i0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
        U16_FWD_1(s, i, length); \
        --__N; \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * Only the unit at the offset is inspected; the preceding unit is not
 * checked and there is no lower bound on the offset.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_START
 * @stable ICU 2.4
 */
#define U16_SET_CP_START_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[i])) { \
        --(i); \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U16_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_START(s, start, i) { \
    if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
}

/* definitions with backward iteration -------------------------------------- */

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_PREV
 * @stable ICU 2.4
 */
#define U16_PREV_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_BACK_1
 * @stable ICU 2.4
 */
#define U16_BACK_1_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[--(i)])) { \
        --(i); \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
}

/**
 * Move the string offset backward by n code points, i.e. from one code point
 * boundary to the n-th one before it.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * n is evaluated once and copied into a local counter; a value of zero or
 * less leaves the offset unchanged.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_BACK_N
 * @stable ICU 2.4
 */
#define U16_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    for(; __N>0; --__N) { \
        U16_BACK_1_UNSAFE(s, i); \
    } \
}

/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start start of string
 * @param i string offset, must be start0 && (i)>(start)) { \
        U16_BACK_1(s, start, i); \
        --__N; \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * Only the unit just before the offset is inspected, so the offset must be
 * at least one unit past the beginning of the storage that s points to.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)-1])) { \
        ++(i); \
    } \
}

/**
 * Snap a random-access offset forward to the boundary that follows a code point.
 * The offset is advanced by one only when it lies strictly inside the string
 * and splits a well-formed surrogate pair, i.e. the unit before it is a lead
 * surrogate and the unit at it is a trail surrogate.
 * In every other case the offset is left as it is.
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string; in that case only
 * the lower boundary (start) is checked before the code units are examined.
 *
 * @param s const UChar * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, start<=i<=length
 * @param length int32_t string length
 * @see U16_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT(s, start, i, length) { \
    if((i)>(start) && ((i)<(length) || (length)<0)) { \
        if(U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[(i)])) { \
            ++(i); \
        } \
    } \
}

#endif

// uversion.h
/*
*******************************************************************************
*   Copyright (C) 2000-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*
*   file name:  uversion.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   Created by: Vladimir Weinstein
*
*  Gets included by utypes.h and Windows .rc files
*/

/**
 * \file
 * \brief C API: API for accessing ICU version numbers. 
 */
/*===========================================================================*/
/* Main ICU version information                                              */
/*===========================================================================*/

#ifndef UVERSION_H
#define UVERSION_H


/* Actual version info lives in uvernum.h */

/** Maximum length of the copyright string.
 *  @stable ICU 2.4
 */
#define U_COPYRIGHT_STRING_LENGTH  128

/** An ICU version consists of up to 4 numbers from 0..255.
 *  This is the number of version fields and the element count of UVersionInfo.
 *  @stable ICU 2.4
 */
#define U_MAX_VERSION_LENGTH 4

/** In a string, ICU version fields are delimited by dots.
 *  This is a character constant, not a string literal.
 *  @stable ICU 2.4
 */
#define U_VERSION_DELIMITER '.'

/** The maximum length of an ICU version string.
 *  Buffers passed to u_versionToString() must be at least this large.
 *  @stable ICU 2.4
 */
#define U_MAX_VERSION_STRING_LENGTH 20

/** The binary form of a version on ICU APIs is an array of 4 uint8_t
 *  (U_MAX_VERSION_LENGTH elements, one per version field).
 *  To compare two versions, use memcmp(v1,v2,sizeof(UVersionInfo)).
 *  @stable ICU 2.4
 */
typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH];

/*===========================================================================*/
/* C++ namespace if supported. Versioned unless versioning is disabled.      */
/*===========================================================================*/

/**
 * This is used to end a declaration of a public ICU C++ API.
 * When not compiling for C++, it does nothing.
 * When compiling for C++, it ends the extern "C++" block begun by
#   define U_NAMESPACE_USE using namespace U_ICU_NAMESPACE;
#   define U_NAMESPACE_QUALIFIER U_ICU_NAMESPACE::

#   ifndef U_USING_ICU_NAMESPACE
#       define U_USING_ICU_NAMESPACE 1
#   endif
#   if U_USING_ICU_NAMESPACE
        U_NAMESPACE_USE
#   endif
#else
#   define U_NAMESPACE_USE
#   define U_NAMESPACE_QUALIFIER
#endif

/*===========================================================================*/
/* General version helper functions. Definitions in putil.c                  */
/*===========================================================================*/

/**
 * Parse a string with dotted-decimal version information and
 * fill in a UVersionInfo structure with the result.
 * Definition of this function lives in putil.c
 *
 * UVersionInfo is an array type, so the caller's array is written in place.
 *
 * @param versionArray The destination structure for the version information.
 * @param versionString A string with dotted-decimal version information,
 *                      with up to four non-negative number fields with
 *                      values of up to 255 each.
 * @see u_versionFromUString
 * @see u_versionToString
 * @stable ICU 2.4
 */
U_STABLE void U_EXPORT2
u_versionFromString(UVersionInfo versionArray, const char *versionString);

/**
 * Parse a Unicode string with dotted-decimal version information and
 * fill in a UVersionInfo structure with the result.
 * Definition of this function lives in putil.c
 *
 * This is the UChar counterpart of u_versionFromString(); UVersionInfo is an
 * array type, so the caller's array is written in place.
 *
 * @param versionArray The destination structure for the version information.
 * @param versionString A Unicode string with dotted-decimal version
 *                      information, with up to four non-negative number
 *                      fields with values of up to 255 each.
 * @see u_versionFromString
 * @stable ICU 4.2
 */
U_STABLE void U_EXPORT2
u_versionFromUString(UVersionInfo versionArray, const UChar *versionString);


/**
 * Write a string with dotted-decimal version information according
 * to the input UVersionInfo.
 * Definition of this function lives in putil.c
 *
 * No buffer capacity is passed to this function, so the caller is
 * responsible for providing a large enough buffer.
 *
 * @param versionArray The version information to be written as a string.
 * @param versionString A string buffer that will be filled in with
 *                      a string corresponding to the numeric version
 *                      information in versionArray.
 *                      The buffer size must be at least U_MAX_VERSION_STRING_LENGTH.
 * @see u_versionFromString
 * @stable ICU 2.4
 */
U_STABLE void U_EXPORT2
u_versionToString(const UVersionInfo versionArray, char *versionString);

/**
 * Gets the ICU release version.  The version array stores the version information
 * for ICU, one field per array element.  For example, release "1.3.31.2" is then
 * represented as 0x01031F02, i.e. the element values 1, 3, 31 and 2.
 * Definition of this function lives in putil.c
 *
 * @param versionArray the version # information, the result will be filled in
 * @see u_versionToString
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2
u_getVersion(UVersionInfo versionArray);
#endif

// utypes.h
/*
**********************************************************************
*   Copyright (C) 1996-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
*  FILE NAME : UTYPES.H (formerly ptypes.h)
*
*   Date        Name        Description
*   12/11/96    helena      Creation.
*   02/27/97    aliu        Added typedefs for UClassID, int8, int16, int32,
*                           uint8, uint16, and uint32.
*   04/01/97    aliu        Added XP_CPLUSPLUS and modified to work under C as
*                            well as C++.
*                           Modified to use memcpy() for uprv_arrayCopy() fns.
*   04/14/97    aliu        Added TPlatformUtilities.
*   05/07/97    aliu        Added import/export specifiers (replacing the old
*                           broken EXT_CLASS).  Added version number for our
*                           code.  Cleaned up header.
*    6/20/97    helena      Java class name change.
*   08/11/98    stephen     UErrorCode changed from typedef to enum
*   08/12/98    erm         Changed T_ANALYTIC_PACKAGE_VERSION to 3
*   08/14/98    stephen     Added uprv_arrayCopy() for int8_t, int16_t, int32_t
*   12/09/98    jfitz       Added BUFFER_OVERFLOW_ERROR (bug 1100066)
*   04/20/99    stephen     Cleaned up & reworked for autoconf.
*                           Renamed to utypes.h.
*   05/05/99    stephen     Changed to use <inttypes.h>
*   12/07/99    helena      Moved copyright notice string from ucnv_bld.h here.
*******************************************************************************
*/

#ifndef UTYPES_H
#define UTYPES_H


/* DBL_MAX, used by U_DATE_MAX below, is declared in <float.h>. */
#include <float.h>


/*!
 * \file
 * \brief Basic definitions for ICU, for both C and C++ APIs
 *
 * This file defines basic types, constants, and enumerations directly or
 * indirectly by including other header files, especially utf.h for the
 * basic character and string definitions and umachine.h for consistent
 * integer and other types.
 */


/**
 * \def U_SHOW_CPLUSPLUS_API
 * When this header is compiled as C, any existing definition of
 * U_SHOW_CPLUSPLUS_API is removed. When it is compiled as C++, the macro
 * is left exactly as it was on entry.
 * @internal
 */
#ifndef __cplusplus
#   undef U_SHOW_CPLUSPLUS_API
#endif

/** @{ API visibility control */

/**
 * \def U_HIDE_DRAFT_API
 * Define this to 1 to request that draft API be "hidden"
 * @internal
 */
/**
 * \def U_HIDE_INTERNAL_API
 * Define this to 1 to request that internal API be "hidden"
 * @internal
 */
/*
 * Both conditionals below have empty bodies in this header, so they define
 * nothing. U_HIDE_DRAFT_API, U_HIDE_INTERNAL_API and U_DEFAULT_SHOW_DRAFT are
 * set unconditionally in the configuration block at the top of this file.
 */
#if !U_DEFAULT_SHOW_DRAFT && !defined(U_SHOW_DRAFT_API)
#endif
#if !U_DEFAULT_SHOW_DRAFT && !defined(U_SHOW_INTERNAL_API)
#endif

/** @} */

/*===========================================================================*/
/* ICUDATA naming scheme                                                     */
/*===========================================================================*/

/**
 * \def U_ICUDATA_TYPE_LETTER
 *
 * Platform-dependent string holding the single letter that is part of the
 * common data file name:
 * - b for big-endian, ASCII-family platforms
 * - l for little-endian, ASCII-family platforms
 * - e for big-endian, EBCDIC-family platforms
 * A little-endian EBCDIC configuration is rejected with a compile-time error.
 * @stable ICU 2.0
 */

/**
 * \def U_ICUDATA_TYPE_LITLETTER
 * The same letter as U_ICUDATA_TYPE_LETTER, as a bare token instead of a string.
 * @stable ICU 2.0
 */
#if U_CHARSET_FAMILY && U_IS_BIG_ENDIAN
    /* Big-endian EBCDIC - the only supported EBCDIC configuration */
#   define U_ICUDATA_TYPE_LETTER "e"
#   define U_ICUDATA_TYPE_LITLETTER e
#elif U_CHARSET_FAMILY
    /* Little-endian EBCDIC - unsupported */
#   error "Don't know what to do with little endian EBCDIC!"
#   define U_ICUDATA_TYPE_LETTER "x"
#   define U_ICUDATA_TYPE_LITLETTER x
#elif U_IS_BIG_ENDIAN
    /* Big-endian ASCII */
#   define U_ICUDATA_TYPE_LETTER "b"
#   define U_ICUDATA_TYPE_LITLETTER b
#else
    /* Little-endian ASCII */
#   define U_ICUDATA_TYPE_LETTER "l"
#   define U_ICUDATA_TYPE_LITLETTER l
#endif

/**
 * A single string literal containing the icudata stub name, e.g. "icudt18e" for
 * ICU 1.8.x on EBCDIC.
 * The name is formed by adjacent string literal concatenation of "icudt",
 * U_ICU_VERSION_SHORT and U_ICUDATA_TYPE_LETTER, so both macros must expand
 * to string literals.
 * @stable ICU 2.0
 */
#define U_ICUDATA_NAME    "icudt" U_ICU_VERSION_SHORT U_ICUDATA_TYPE_LETTER

/**
 *  U_ICUDATA_ENTRY_POINT is the name of the DLL entry point to the ICU data library.
 *    Defined as a literal, not a string.
 *    Tricky Preprocessor use - ## operator replaces macro parameters with the literal string
 *                              from the corresponding macro invocation, _before_ other macro substitutions.
 *                              Nested \#defines are needed to get the actual version numbers rather than
 *                              the literal text U_ICU_VERSION_MAJOR_NUM into the name.
 *                              The net result will be something of the form
 *                                  \#define U_ICUDATA_ENTRY_POINT icudt19_dat
 * @stable ICU 2.4
 */
#define U_ICUDATA_ENTRY_POINT  U_DEF2_ICUDATA_ENTRY_POINT(U_ICU_VERSION_MAJOR_NUM,U_LIB_SUFFIX_C_NAME)


/**
 * \def NULL
 * Fallback definition of NULL, used only when no earlier header has
 * defined it: 0 for C++ and ((void *)0) for C.
 * @stable ICU 2.0
 */
#if !defined(NULL)
#   if defined(__cplusplus)
#       define NULL    0
#   else
#       define NULL    ((void *)0)
#   endif
#endif

/*===========================================================================*/
/* Calendar/TimeZone data types                                              */
/*===========================================================================*/

/**
 * Date and Time data type.
 * This is a primitive data type that holds the date and time
 * as the number of milliseconds since 1970-jan-01, 00:00 UTC.
 * UTC leap seconds are ignored.
 * The underlying type is double, so fractional milliseconds can be represented.
 * @stable ICU 2.0
 */
typedef double UDate;

/** The number of milliseconds per second @stable ICU 2.0 */
#define U_MILLIS_PER_SECOND        (1000)
/** The number of milliseconds per minute (60 * 1000) @stable ICU 2.0 */
#define U_MILLIS_PER_MINUTE       (60000)
/** The number of milliseconds per hour (60 * 60 * 1000) @stable ICU 2.0 */
#define U_MILLIS_PER_HOUR       (3600000)
/** The number of milliseconds per day (24 * 60 * 60 * 1000) @stable ICU 2.0 */
#define U_MILLIS_PER_DAY       (86400000)

/**
 * Maximum UDate value.
 * Expands to DBL_MAX, so <float.h> must have been included before use.
 * @stable ICU 4.8
 */
#define U_DATE_MAX DBL_MAX

/**
 * Minimum UDate value: the negation of U_DATE_MAX.
 * The replacement list is parenthesized so that the macro always behaves as
 * a single operand, whatever expression it is expanded into.
 * @stable ICU 4.8
 */
#define U_DATE_MIN (-U_DATE_MAX)

/*===========================================================================*/
/* Shared library/DLL import-export API control                              */
/*===========================================================================*/

/*
 * Control of symbol import/export.
 * ICU is separated into three libraries.
 */

/**
 * \def U_COMBINED_IMPLEMENTATION
 * Set to export library symbols from inside the ICU library
 * when all of ICU is in a single library.
 * This can be set as a compiler option while building ICU, and it
 * needs to be the first one tested to override U_COMMON_API, U_I18N_API, etc.
 * @stable ICU 2.0
 */

/**
 * \def U_DATA_API
 * Set to export library symbols from inside the stubdata library,
 * and to import them from outside.
 * @stable ICU 3.0
 */

/**
 * \def U_COMMON_API
 * Set to export library symbols from inside the common library,
 * and to import them from outside.
 * @stable ICU 2.0
 */

/**
 * \def U_I18N_API
 * Set to export library symbols from inside the i18n library,
 * and to import them from outside.
 * @stable ICU 2.0
 */

/**
 * \def U_LAYOUT_API
 * Set to export library symbols from inside the layout engine library,
 * and to import them from outside.
 * @stable ICU 2.0
 */

/**
 * \def U_LAYOUTEX_API
 * Set to export library symbols from inside the layout extensions library,
 * and to import them from outside.
 * @stable ICU 2.6
 */

/**
 * \def U_IO_API
 * Set to export library symbols from inside the ustdio library,
 * and to import them from outside.
 * @stable ICU 2.0
 */

/**
 * \def U_TOOLUTIL_API
 * Set to export library symbols from inside the toolutil library,
 * and to import them from outside.
 * @stable ICU 3.4
 */

/*
 * Select the linkage of each library's API macro. The branches are tested in
 * order and only the first matching one applies:
 *  - U_COMBINED_IMPLEMENTATION: every macro is U_EXPORT.
 *  - U_STATIC_IMPLEMENTATION: every macro expands to nothing.
 *  - U_<LIB>_IMPLEMENTATION: that library's macro is U_EXPORT, all others are U_IMPORT.
 *  - none of the above: every macro is U_IMPORT.
 * No branch tests for a data-library implementation macro, so U_DATA_API is
 * U_EXPORT only in the combined case.
 * U_EXPORT and U_IMPORT themselves are not defined in this section.
 */
/* All of ICU built as one library: export everything. */
#if defined(U_COMBINED_IMPLEMENTATION)
#define U_DATA_API     U_EXPORT
#define U_COMMON_API   U_EXPORT
#define U_I18N_API     U_EXPORT
#define U_LAYOUT_API   U_EXPORT
#define U_LAYOUTEX_API U_EXPORT
#define U_IO_API       U_EXPORT
#define U_TOOLUTIL_API U_EXPORT
/* Static build: no import/export decoration at all. */
#elif defined(U_STATIC_IMPLEMENTATION)
#define U_DATA_API
#define U_COMMON_API
#define U_I18N_API
#define U_LAYOUT_API
#define U_LAYOUTEX_API
#define U_IO_API
#define U_TOOLUTIL_API
/* Building the common library: export U_COMMON_API only. */
#elif defined(U_COMMON_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_EXPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
/* Building the i18n library: export U_I18N_API only. */
#elif defined(U_I18N_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_EXPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
/* Building the layout engine library: export U_LAYOUT_API only. */
#elif defined(U_LAYOUT_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_EXPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
/* Building the layout extensions library: export U_LAYOUTEX_API only. */
#elif defined(U_LAYOUTEX_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_EXPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
/* Building the ustdio library: export U_IO_API only. */
#elif defined(U_IO_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_EXPORT
#define U_TOOLUTIL_API U_IMPORT
/* Building the toolutil library: export U_TOOLUTIL_API only. */
#elif defined(U_TOOLUTIL_IMPLEMENTATION)
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_EXPORT
/* Client code outside ICU: import everything. */
#else
#define U_DATA_API     U_IMPORT
#define U_COMMON_API   U_IMPORT
#define U_I18N_API     U_IMPORT
#define U_LAYOUT_API   U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API       U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#endif

/**
 * \def U_STANDARD_CPP_NAMESPACE
 * Control of C++ Namespace.
 * Expands to nothing when compiled as C and to the global scope
 * qualifier (::) when compiled as C++.
 * @stable ICU 2.0
 */
#ifndef __cplusplus
#   define U_STANDARD_CPP_NAMESPACE
#else
#   define U_STANDARD_CPP_NAMESPACE        ::
#endif


/*===========================================================================*/
/* Global delete operator                                                    */
/*===========================================================================*/

/*
 * The ICU4C library must not use the global new and delete operators.
 * These operators here are defined to enable testing for this.
 * See Jitterbug 2581 for details of why this is necessary.
 *
 * Verification that ICU4C's memory usage is correct, i.e.,
 * that global new/delete are not used:
 *
 * a) Check for imports of global new/delete (see uobject.cpp for details)
 * b) Verify that new is never imported.
 * c) Verify that delete is only imported from object code for interface/mixin classes.
 * d) Add global delete and delete[] only for the ICU4C library itself
 *    and define them in a way that crashes or otherwise easily shows a problem.
 *
 * The conditional below is where d) would be implemented.
 * The operator implementations crash; this is intentional and used for library debugging.
 *
 * Note: This is currently only done on Windows because
 * some Linux/Unix compilers have problems with defining global new/delete.
 * On Windows, it is _MSC_VER>=1200 for MSVC 6.0 and higher.
 *
 * In this header the body of the conditional is empty, so no operators are
 * actually defined here.
 */
#if defined(__cplusplus) && U_DEBUG && U_OVERRIDE_CXX_ALLOCATION && (_MSC_VER>=1200) && !defined(U_STATIC_IMPLEMENTATION) && (defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) || defined(U_LAYOUT_IMPLEMENTATION) || defined(U_LAYOUTEX_IMPLEMENTATION))

#endif

/*===========================================================================*/
/* UErrorCode */
/*===========================================================================*/

/**
 * Error code to replace exception handling, so that the code is compatible with all C++ compilers,
 * and to use the same mechanism for C and C++.
 *
 * \par
 * ICU functions that take a reference (C++) or a pointer (C) to a UErrorCode
 * first test if(U_FAILURE(errorCode)) { return immediately; }
 * so that in a chain of such functions the first one that sets an error code
 * causes the following ones to not perform any operations.
 *
 * \par
 * Error codes should be tested using U_FAILURE() and U_SUCCESS().
 * @stable ICU 2.0
 */
typedef enum UErrorCode {
    /* The ordering of U_ERROR_WARNING_START vs. U_USING_FALLBACK_WARNING looks odd.
     * It is that way because the VC++ debugger displays the first constant it
     * encounters for a value, and U_ERROR_WARNING_START is not what the code is used for.
     */

    U_USING_FALLBACK_WARNING  = -128,   /**< A resource bundle lookup returned a fallback result (not an error) */

    U_ERROR_WARNING_START     = -128,   /**< Start of information results (semantically successful) */

    U_USING_DEFAULT_WARNING   = -127,   /**< A resource bundle lookup returned a result from the root locale (not an error) */

    U_SAFECLONE_ALLOCATED_WARNING = -126, /**< A SafeClone operation required allocating memory (informational only) */

    U_STATE_OLD_WARNING       = -125,   /**< ICU has to use compatibility layer to construct the service. Expect performance/memory usage degradation. Consider upgrading */

    U_STRING_NOT_TERMINATED_WARNING = -124,/**< An output string could not be NUL-terminated because output length==destCapacity. */

    U_SORT_KEY_TOO_SHORT_WARNING = -123, /**< Number of levels requested in getBound is higher than the number of levels in the sort key */

    U_AMBIGUOUS_ALIAS_WARNING = -122,   /**< This converter alias can go to different converter implementations */

    U_DIFFERENT_UCA_VERSION = -121,     /**< ucol_open encountered a mismatch between UCA version and collator image version, so the collator was constructed from rules. No impact to further function */
    
    U_PLUGIN_CHANGED_LEVEL_WARNING = -120, /**< A plugin caused a level change. May not be an error, but later plugins may not load. */

    U_ERROR_WARNING_LIMIT,              /**< This must always be the last warning value to indicate the limit for UErrorCode warnings (last warning code +1) */


    U_ZERO_ERROR              =  0,     /**< No error, no warning. */

    U_ILLEGAL_ARGUMENT_ERROR  =  1,     /**< Start of codes indicating failure */
    U_MISSING_RESOURCE_ERROR  =  2,     /**< The requested resource cannot be found */
    U_INVALID_FORMAT_ERROR    =  3,     /**< Data format is not what is expected */
    U_FILE_ACCESS_ERROR       =  4,     /**< The requested file cannot be found */
    U_INTERNAL_PROGRAM_ERROR  =  5,     /**< Indicates a bug in the library code */
    U_MESSAGE_PARSE_ERROR     =  6,     /**< Unable to parse a message (message format) */
    U_MEMORY_ALLOCATION_ERROR =  7,     /**< Memory allocation error */
    U_INDEX_OUTOFBOUNDS_ERROR =  8,     /**< Trying to access the index that is out of bounds */
    U_PARSE_ERROR             =  9,     /**< Equivalent to Java ParseException */
    U_INVALID_CHAR_FOUND      = 10,     /**< Character conversion: Unmappable input sequence. In other APIs: Invalid character. */
    U_TRUNCATED_CHAR_FOUND    = 11,     /**< Character conversion: Incomplete input sequence. */
    U_ILLEGAL_CHAR_FOUND      = 12,     /**< Character conversion: Illegal input sequence/combination of input units. */
    U_INVALID_TABLE_FORMAT    = 13,     /**< Conversion table file found, but corrupted */
    U_INVALID_TABLE_FILE      = 14,     /**< Conversion table file not found */
    U_BUFFER_OVERFLOW_ERROR   = 15,     /**< A result would not fit in the supplied buffer */
    U_UNSUPPORTED_ERROR       = 16,     /**< Requested operation not supported in current context */
    U_RESOURCE_TYPE_MISMATCH  = 17,     /**< an operation is requested over a resource that does not support it */
    U_ILLEGAL_ESCAPE_SEQUENCE = 18,     /**< ISO-2022 illlegal escape sequence */
    U_UNSUPPORTED_ESCAPE_SEQUENCE = 19, /**< ISO-2022 unsupported escape sequence */
    U_NO_SPACE_AVAILABLE      = 20,     /**< No space available for in-buffer expansion for Arabic shaping */
    U_CE_NOT_FOUND_ERROR      = 21,     /**< Currently used only while setting variable top, but can be used generally */
    U_PRIMARY_TOO_LONG_ERROR  = 22,     /**< User tried to set variable top to a primary that is longer than two bytes */
    U_STATE_TOO_OLD_ERROR     = 23,     /**< ICU cannot construct a service from this state, as it is no longer supported */
    U_TOO_MANY_ALIASES_ERROR  = 24,     /**< There are too many aliases in the path to the requested resource.
                                             It is very possible that a circular alias definition has occured */
    U_ENUM_OUT_OF_SYNC_ERROR  = 25,     /**< UEnumeration out of sync with underlying collection */
    U_INVARIANT_CONVERSION_ERROR = 26,  /**< Unable to convert a UChar* string to char* with the invariant converter. */
    U_INVALID_STATE_ERROR     = 27,     /**< Requested operation can not be completed with ICU in its current state */
    U_COLLATOR_VERSION_MISMATCH = 28,   /**< Collator version is not compatible with the base version */
    U_USELESS_COLLATOR_ERROR  = 29,     /**< Collator is options only and no base is specified */
    U_NO_WRITE_PERMISSION     = 30,     /**< Attempt to modify read-only or constant data. */

    U_STANDARD_ERROR_LIMIT,             /**< This must always be the last value to indicate the limit for standard errors */
    /*
     * the error code range 0x10000 0x10100 are reserved for Transliterator
     */
    U_BAD_VARIABLE_DEFINITION=0x10000,/**< Missing '$' or duplicate variable name */
    U_PARSE_ERROR_START = 0x10000,    /**< Start of Transliterator errors */
    U_MALFORMED_RULE,                 /**< Elements of a rule are misplaced */
    U_MALFORMED_SET,                  /**< A UnicodeSet pattern is invalid*/
    U_MALFORMED_SYMBOL_REFERENCE,     /**< UNUSED as of ICU 2.4 */
    U_MALFORMED_UNICODE_ESCAPE,       /**< A Unicode escape pattern is invalid*/
    U_MALFORMED_VARIABLE_DEFINITION,  /**< A variable definition is invalid */
    U_MALFORMED_VARIABLE_REFERENCE,   /**< A variable reference is invalid */
    U_MISMATCHED_SEGMENT_DELIMITERS,  /**< UNUSED as of ICU 2.4 */
    U_MISPLACED_ANCHOR_START,         /**< A start anchor appears at an illegal position */
    U_MISPLACED_CURSOR_OFFSET,        /**< A cursor offset occurs at an illegal position */
    U_MISPLACED_QUANTIFIER,           /**< A quantifier appears after a segment close delimiter */
    U_MISSING_OPERATOR,               /**< A rule contains no operator */
    U_MISSING_SEGMENT_CLOSE,          /**< UNUSED as of ICU 2.4 */
    U_MULTIPLE_ANTE_CONTEXTS,         /**< More than one ante context */
    U_MULTIPLE_CURSORS,               /**< More than one cursor */
    U_MULTIPLE_POST_CONTEXTS,         /**< More than one post context */
    U_TRAILING_BACKSLASH,             /**< A dangling backslash */
    U_UNDEFINED_SEGMENT_REFERENCE,    /**< A segment reference does not correspond to a defined segment */
    U_UNDEFINED_VARIABLE,             /**< A variable reference does not correspond to a defined variable */
    U_UNQUOTED_SPECIAL,               /**< A special character was not quoted or escaped */
    U_UNTERMINATED_QUOTE,             /**< A closing single quote is missing */
    U_RULE_MASK_ERROR,                /**< A rule is hidden by an earlier more general rule */
    U_MISPLACED_COMPOUND_FILTER,      /**< A compound filter is in an invalid location */
    U_MULTIPLE_COMPOUND_FILTERS,      /**< More than one compound filter */
    U_INVALID_RBT_SYNTAX,             /**< A "::id" rule was passed to the RuleBasedTransliterator parser */
    U_INVALID_PROPERTY_PATTERN,       /**< UNUSED as of ICU 2.4 */
    U_MALFORMED_PRAGMA,               /**< A 'use' pragma is invlalid */
    U_UNCLOSED_SEGMENT,               /**< A closing ')' is missing */
    U_ILLEGAL_CHAR_IN_SEGMENT,        /**< UNUSED as of ICU 2.4 */
    U_VARIABLE_RANGE_EXHAUSTED,       /**< Too many stand-ins generated for the given variable range */
    U_VARIABLE_RANGE_OVERLAP,         /**< The variable range overlaps characters used in rules */
    U_ILLEGAL_CHARACTER,              /**< A special character is outside its allowed context */
    U_INTERNAL_TRANSLITERATOR_ERROR,  /**< Internal transliterator system error */
    U_INVALID_ID,                     /**< A "::id" rule specifies an unknown transliterator */
    U_INVALID_FUNCTION,               /**< A "&fn()" rule specifies an unknown transliterator */
    U_PARSE_ERROR_LIMIT,              /**< The limit for Transliterator errors */

    /*
     * the error code range 0x10100 0x10200 are reserved for formatting API parsing error
     */
    U_UNEXPECTED_TOKEN=0x10100,       /**< Syntax error in format pattern */
    U_FMT_PARSE_ERROR_START=0x10100,  /**< Start of format library errors */
    U_MULTIPLE_DECIMAL_SEPARATORS,    /**< More than one decimal separator in number pattern */
    U_MULTIPLE_DECIMAL_SEPERATORS = U_MULTIPLE_DECIMAL_SEPARATORS, /**< Typo: kept for backward compatibility. Use U_MULTIPLE_DECIMAL_SEPARATORS */
    U_MULTIPLE_EXPONENTIAL_SYMBOLS,   /**< More than one exponent symbol in number pattern */
    U_MALFORMED_EXPONENTIAL_PATTERN,  /**< Grouping symbol in exponent pattern */
    U_MULTIPLE_PERCENT_SYMBOLS,       /**< More than one percent symbol in number pattern */
    U_MULTIPLE_PERMILL_SYMBOLS,       /**< More than one permill symbol in number pattern */
    U_MULTIPLE_PAD_SPECIFIERS,        /**< More than one pad symbol in number pattern */
    U_PATTERN_SYNTAX_ERROR,           /**< Syntax error in format pattern */
    U_ILLEGAL_PAD_POSITION,           /**< Pad symbol misplaced in number pattern */
    U_UNMATCHED_BRACES,               /**< Braces do not match in message pattern */
    U_UNSUPPORTED_PROPERTY,           /**< UNUSED as of ICU 2.4 */
    U_UNSUPPORTED_ATTRIBUTE,          /**< UNUSED as of ICU 2.4 */
    U_ARGUMENT_TYPE_MISMATCH,         /**< Argument name and argument index mismatch in MessageFormat functions */
    U_DUPLICATE_KEYWORD,              /**< Duplicate keyword in PluralFormat */
    U_UNDEFINED_KEYWORD,              /**< Undefined Plural keyword */
    U_DEFAULT_KEYWORD_MISSING,        /**< Missing DEFAULT rule in plural rules */
    U_DECIMAL_NUMBER_SYNTAX_ERROR,    /**< Decimal number syntax error */
    U_FORMAT_INEXACT_ERROR,           /**< Cannot format a number exactly and rounding mode is ROUND_UNNECESSARY @stable ICU 4.8 */
    U_FMT_PARSE_ERROR_LIMIT,          /**< The limit for format library errors */

    /*
     * the error code range 0x10200 0x102ff are reserved for Break Iterator related error
     */
    U_BRK_INTERNAL_ERROR=0x10200,          /**< An internal error (bug) was detected.             */
    U_BRK_ERROR_START=0x10200,             /**< Start of codes indicating Break Iterator failures */
    U_BRK_HEX_DIGITS_EXPECTED,             /**< Hex digits expected as part of a escaped char in a rule. */
    U_BRK_SEMICOLON_EXPECTED,              /**< Missing ';' at the end of a RBBI rule.            */
    U_BRK_RULE_SYNTAX,                     /**< Syntax error in RBBI rule.                        */
    U_BRK_UNCLOSED_SET,                    /**< UnicodeSet witing an RBBI rule missing a closing ']'.  */
    U_BRK_ASSIGN_ERROR,                    /**< Syntax error in RBBI rule assignment statement.   */
    U_BRK_VARIABLE_REDFINITION,            /**< RBBI rule $Variable redefined.                    */
    U_BRK_MISMATCHED_PAREN,                /**< Mis-matched parentheses in an RBBI rule.          */
    U_BRK_NEW_LINE_IN_QUOTED_STRING,       /**< Missing closing quote in an RBBI rule.            */
    U_BRK_UNDEFINED_VARIABLE,              /**< Use of an undefined $Variable in an RBBI rule.    */
    U_BRK_INIT_ERROR,                      /**< Initialization failure.  Probable missing ICU Data. */
    U_BRK_RULE_EMPTY_SET,                  /**< Rule contains an empty Unicode Set.               */
    U_BRK_UNRECOGNIZED_OPTION,             /**< !!option in RBBI rules not recognized.            */
    U_BRK_MALFORMED_RULE_TAG,              /**< The {nnn} tag on a rule is mal formed             */
    U_BRK_ERROR_LIMIT,                     /**< This must always be the last value to indicate the limit for Break Iterator failures */

    /*
     * The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
     */
    U_REGEX_INTERNAL_ERROR=0x10300,       /**< An internal error (bug) was detected.              */
    U_REGEX_ERROR_START=0x10300,          /**< Start of codes indicating Regexp failures          */
    U_REGEX_RULE_SYNTAX,                  /**< Syntax error in regexp pattern.                    */
    U_REGEX_INVALID_STATE,                /**< RegexMatcher in invalid state for requested operation */
    U_REGEX_BAD_ESCAPE_SEQUENCE,          /**< Unrecognized backslash escape sequence in pattern  */
    U_REGEX_PROPERTY_SYNTAX,              /**< Incorrect Unicode property                         */
    U_REGEX_UNIMPLEMENTED,                /**< Use of regexp feature that is not yet implemented. */
    U_REGEX_MISMATCHED_PAREN,             /**< Incorrectly nested parentheses in regexp pattern.  */
    U_REGEX_NUMBER_TOO_BIG,               /**< Decimal number is too large.                       */
    U_REGEX_BAD_INTERVAL,                 /**< Error in {min,max} interval                        */
    U_REGEX_MAX_LT_MIN,                   /**< In {min,max}, max is less than min.                */
    U_REGEX_INVALID_BACK_REF,             /**< Back-reference to a non-existent capture group.    */
    U_REGEX_INVALID_FLAG,                 /**< Invalid value for match mode flags.                */
    U_REGEX_LOOK_BEHIND_LIMIT,            /**< Look-Behind pattern matches must have a bounded maximum length.    */
    U_REGEX_SET_CONTAINS_STRING,          /**< Regexps cannot have UnicodeSets containing strings.*/
    U_REGEX_MISSING_CLOSE_BRACKET=U_REGEX_SET_CONTAINS_STRING+2, /**< Missing closing bracket on a bracket expression. */
    U_REGEX_INVALID_RANGE,                /**< In a character range [x-y], x is greater than y.   */
    U_REGEX_STACK_OVERFLOW,               /**< Regular expression backtrack stack overflow.       */
    U_REGEX_TIME_OUT,                     /**< Maximum allowed match time exceeded                */
    U_REGEX_STOPPED_BY_CALLER,            /**< Matching operation aborted by user callback fn.    */
    U_REGEX_PATTERN_TOO_BIG,              /**< Pattern exceeds limits on size or complexity. @stable ICU 55 */
    U_REGEX_INVALID_CAPTURE_GROUP_NAME,   /**< Invalid capture group name. @stable ICU 55 */
    U_REGEX_ERROR_LIMIT=U_REGEX_STOPPED_BY_CALLER+3, /**< This must always be the last value to indicate the limit for regexp errors */

    /*
     * The error codes in the range 0x10400-0x104ff are reserved for IDNA related error codes
     */
    U_IDNA_PROHIBITED_ERROR=0x10400,
    U_IDNA_ERROR_START=0x10400,
    U_IDNA_UNASSIGNED_ERROR,
    U_IDNA_CHECK_BIDI_ERROR,
    U_IDNA_STD3_ASCII_RULES_ERROR,
    U_IDNA_ACE_PREFIX_ERROR,
    U_IDNA_VERIFICATION_ERROR,
    U_IDNA_LABEL_TOO_LONG_ERROR,
    U_IDNA_ZERO_LENGTH_LABEL_ERROR,
    U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR,
    U_IDNA_ERROR_LIMIT,
    /*
     * Aliases for StringPrep
     */
    U_STRINGPREP_PROHIBITED_ERROR = U_IDNA_PROHIBITED_ERROR,
    U_STRINGPREP_UNASSIGNED_ERROR = U_IDNA_UNASSIGNED_ERROR,
    U_STRINGPREP_CHECK_BIDI_ERROR = U_IDNA_CHECK_BIDI_ERROR,
    
    /*
     * The error codes in the range 0x10500-0x105ff are reserved for Plugin related error codes
     */
    U_PLUGIN_ERROR_START=0x10500,         /**< Start of codes indicating plugin failures */
    U_PLUGIN_TOO_HIGH=0x10500,            /**< The plugin's level is too high to be loaded right now. */
    U_PLUGIN_DIDNT_SET_LEVEL,             /**< The plugin didn't call uplug_setPlugLevel in response to a QUERY */
    U_PLUGIN_ERROR_LIMIT,                 /**< This must always be the last value to indicate the limit for plugin errors */

    U_ERROR_LIMIT=U_PLUGIN_ERROR_LIMIT      /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
} UErrorCode;

/* Use the following to determine if a UErrorCode represents */
/* operational success or failure. */

#ifdef __cplusplus
    /**
     * Tests whether an error code represents success.
     * Every value less than or equal to U_ZERO_ERROR counts as success.
     * @stable ICU 2.0
     */
    static inline UBool U_SUCCESS(UErrorCode code) {
        return (UBool)(U_ZERO_ERROR>=code);
    }
    /**
     * Tests whether an error code represents a failure.
     * Every value greater than U_ZERO_ERROR counts as a failure.
     * @stable ICU 2.0
     */
    static inline UBool U_FAILURE(UErrorCode code) {
        return (UBool)(U_ZERO_ERROR<code);
    }
#else
    /**
     * Tests whether an error code represents success.
     * Every value less than or equal to U_ZERO_ERROR counts as success.
     * The argument is evaluated exactly once.
     * @stable ICU 2.0
     */
#   define U_SUCCESS(x) (U_ZERO_ERROR>=(x))
    /**
     * Tests whether an error code represents a failure.
     * Every value greater than U_ZERO_ERROR counts as a failure.
     * The argument is evaluated exactly once.
     * @stable ICU 2.0
     */
#   define U_FAILURE(x) (U_ZERO_ERROR<(x))
#endif

/**
 * Return a string for a UErrorCode value.
 * The string will be the same as the name of the error code constant
 * in the UErrorCode enum above.
 *
 * @param code The UErrorCode value whose name is requested.
 * @return The name of the matching UErrorCode enum constant, as a
 *         const char * string.
 * @stable ICU 2.0
 */
U_STABLE const char * U_EXPORT2
u_errorName(UErrorCode code);


#endif /* _UTYPES */

// utrace.h
/*
*******************************************************************************
*
*   Copyright (C) 2003-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utrace.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003aug06
*   created by: Markus W. Scherer
*
*   Definitions for ICU tracing/logging.
*
*/

#ifndef __UTRACE_H__
#define __UTRACE_H__

#include <stdarg.h>

/**
 * \file
 * \brief C API:  Definitions for ICU tracing/logging. 
 *
 * This provides API for debugging the internals of ICU without the use of
 * a traditional debugger.
 *
 * By default, tracing is disabled in ICU. If you need to debug ICU with 
 * tracing, please compile ICU with the --enable-tracing configure option.
 */
 
U_CDECL_BEGIN

/**
 * Severity levels for ICU tracing.
 * The larger the level, the more verbose the trace output becomes.
 * @see utrace_setLevel
 * @stable ICU 2.8
 */
typedef enum UTraceLevel {
    UTRACE_OFF        = -1,  /**< Disable all tracing. @stable ICU 2.8 */
    UTRACE_ERROR      =  0,  /**< Trace error conditions only. @stable ICU 2.8 */
    UTRACE_WARNING    =  3,  /**< Trace errors and warnings. @stable ICU 2.8 */
    UTRACE_OPEN_CLOSE =  5,  /**< Trace opens and closes of ICU services. @stable ICU 2.8 */
    UTRACE_INFO       =  7,  /**< Trace an intermediate number of ICU operations. @stable ICU 2.8 */
    UTRACE_VERBOSE    =  9   /**< Trace the maximum number of ICU operations. @stable ICU 2.8 */
} UTraceLevel;

/**
 *  Identifiers of the ICU functions that are traced when tracing is enabled.
 *  The values form three groups (general, conversion, collation); each group
 *  begins at its *_START value and is closed by a *_LIMIT constant that is
 *  one past the last function number of the group.
 *  @stable ICU 2.8
 */
typedef enum UTraceFunctionNumber {
    /* General functions, numbered from 0 */
    UTRACE_FUNCTION_START   = 0,
    UTRACE_U_INIT           = UTRACE_FUNCTION_START,
    UTRACE_U_CLEANUP,
    UTRACE_FUNCTION_LIMIT,

    /* Conversion functions, numbered from 0x1000 */
    UTRACE_CONVERSION_START = 0x1000,
    UTRACE_UCNV_OPEN        = UTRACE_CONVERSION_START,
    UTRACE_UCNV_OPEN_PACKAGE,
    UTRACE_UCNV_OPEN_ALGORITHMIC,
    UTRACE_UCNV_CLONE,
    UTRACE_UCNV_CLOSE,
    UTRACE_UCNV_FLUSH_CACHE,
    UTRACE_UCNV_LOAD,
    UTRACE_UCNV_UNLOAD,
    UTRACE_CONVERSION_LIMIT,

    /* Collation functions, numbered from 0x2000 */
    UTRACE_COLLATION_START  = 0x2000,
    UTRACE_UCOL_OPEN        = UTRACE_COLLATION_START,
    UTRACE_UCOL_CLOSE,
    UTRACE_UCOL_STRCOLL,
    UTRACE_UCOL_GET_SORTKEY,
    UTRACE_UCOL_GETLOCALE,
    UTRACE_UCOL_NEXTSORTKEYPART,
    UTRACE_UCOL_STRCOLLITER,
    UTRACE_UCOL_OPEN_FROM_SHORT_STRING,
    UTRACE_UCOL_STRCOLLUTF8,            /**< @stable ICU 50 */
    UTRACE_COLLATION_LIMIT
} UTraceFunctionNumber;

/**
 * Setter for the trace level.
 * @param traceLevel A UTraceLevel value, passed as an int32_t rather than
 *                   as the enum type.
 * @stable ICU 2.8
 */
U_STABLE void U_EXPORT2
utrace_setLevel(int32_t traceLevel);

/**
 * Getter for the trace level.
 * @return The UTraceLevel value being used by ICU, returned as an int32_t
 *         rather than as the enum type.
 * @stable ICU 2.8
 */
U_STABLE int32_t U_EXPORT2
utrace_getLevel(void);

/* Trace function pointers types  ----------------------------- */

/**
  *  Type signature for the trace function to be called when entering a function.
  *  This typedef names a function type (not a pointer type), so callbacks
  *  are referred to through UTraceEntry * pointers.
  *  @param context value supplied at the time the trace functions are set.
  *  @param fnNumber Enum value indicating the ICU function being entered.
  *  @stable ICU 2.8
  */
typedef void U_CALLCONV
UTraceEntry(const void *context, int32_t fnNumber);

/**
  *  Type signature for the trace function to be called when exiting from a function.
  *  This typedef names a function type (not a pointer type), so callbacks
  *  are referred to through UTraceExit * pointers.
  *  @param context value supplied at the time the trace functions are set.
  *  @param fnNumber Enum value indicating the ICU function being exited.
  *  @param fmt     A formatting string that describes the number and types
  *                 of arguments included with the variable args.  The fmt
  *                 string has the same form as the utrace_vformat format
  *                 string.
  *  @param args    A variable arguments list (va_list).  Contents are described by
  *                 the fmt parameter.
  *  @see   utrace_vformat
  *  @stable ICU 2.8
  */
typedef void U_CALLCONV
UTraceExit(const void *context, int32_t fnNumber, 
           const char *fmt, va_list args);

/**
  *  Type signature for the trace function to be called from within an ICU function
  *  to display data or messages.
  *  This typedef names a function type (not a pointer type), so callbacks
  *  are referred to through UTraceData * pointers.
  *  @param context  value supplied at the time the trace functions are set.
  *  @param fnNumber Enum value indicating the ICU function from within which
  *                  the data is being traced.
  *  @param level    The current tracing level
  *  @param fmt      A format string describing the tracing data that is supplied
  *                  as variable args
  *  @param args     The data being traced, passed as variable args (va_list).
  *  @stable ICU 2.8
  */
typedef void U_CALLCONV
UTraceData(const void *context, int32_t fnNumber, int32_t level,
           const char *fmt, va_list args);

/**
  *  Set ICU Tracing functions.  Installs application-provided tracing
  *  functions into ICU.  After doing this, subsequent ICU operations
  *  will call back to the installed functions, providing a trace
  *  of the use of ICU.  Passing a NULL pointer for a tracing function
  *  is allowed, and inhibits tracing action at points where that function
  *  would be called.
  *  

* Tracing and Threads: Tracing functions are global to a process, and * will be called in response to ICU operations performed by any * thread. If tracing of an individual thread is desired, the * tracing functions must themselves filter by checking that the * current thread is the desired thread. * * @param context an uninterpretted pointer. Whatever is passed in * here will in turn be passed to each of the tracing * functions UTraceEntry, UTraceExit and UTraceData. * ICU does not use or alter this pointer. * @param e Callback function to be called on entry to a * a traced ICU function. * @param x Callback function to be called on exit from a * traced ICU function. * @param d Callback function to be called from within a * traced ICU function, for the purpose of providing * data to the trace. * * @stable ICU 2.8 */ U_STABLE void U_EXPORT2 utrace_setFunctions(const void *context, UTraceEntry *e, UTraceExit *x, UTraceData *d); /** * Get the currently installed ICU tracing functions. Note that a null function * pointer will be returned if no trace function has been set. * * @param context The currently installed tracing context. * @param e The currently installed UTraceEntry function. * @param x The currently installed UTraceExit function. * @param d The currently installed UTraceData function. * @stable ICU 2.8 */ U_STABLE void U_EXPORT2 utrace_getFunctions(const void **context, UTraceEntry **e, UTraceExit **x, UTraceData **d); /* * * ICU trace format string syntax * * Format Strings are passed to UTraceData functions, and define the * number and types of the trace data being passed on each call. * * The UTraceData function, which is supplied by the application, * not by ICU, can either forward the trace data (passed via * varargs) and the format string back to ICU for formatting into * a displayable string, or it can interpret the format itself, * and do as it wishes with the trace data. 
* * * Goals for the format string * - basic data output * - easy to use for trace programmer * - sufficient provision for data types for trace output readability * - well-defined types and binary portable APIs * * Non-goals * - printf compatibility * - fancy formatting * - argument reordering and other internationalization features * * ICU trace format strings contain plain text with argument inserts, * much like standard printf format strings. * Each insert begins with a '%', then optionally contains a 'v', * then exactly one type character. * Two '%' in a row represent a '%' instead of an insert. * The trace format strings need not have \n at the end. * * * Types * ----- * * Type characters: * - c A char character in the default codepage. * - s A NUL-terminated char * string in the default codepage. * - S A UChar * string. Requires two params, (ptr, length). Length=-1 for nul term. * - b A byte (8-bit integer). * - h A 16-bit integer. Also a 16 bit Unicode code unit. * - d A 32-bit integer. Also a 20 bit Unicode code point value. * - l A 64-bit integer. * - p A data pointer. * * Vectors * ------- * * If the 'v' is not specified, then one item of the specified type * is passed in. * If the 'v' (for "vector") is specified, then a vector of items of the * specified type is passed in, via a pointer to the first item * and an int32_t value for the length of the vector. * Length==-1 means zero or NUL termination. Works for vectors of all types. * * Note: %vS is a vector of (UChar *) strings. The strings must * be nul terminated as there is no way to provide a * separate length parameter for each string. The length * parameter (required for all vectors) is the number of * strings, not the length of the strings. * * Examples * -------- * * These examples show the parameters that will be passed to an application's * UTraceData() function for various formats. * * - the precise formatting is up to the application! 
* - the examples use type casts for arguments only to _show_ the types of * arguments without needing variable declarations in the examples; * the type casts will not be necessary in actual code * * UTraceDataFunc(context, fnNumber, level, * "There is a character %c in the string %s.", // Format String * (char)c, (const char *)s); // varargs parameters * -> There is a character 0x42 'B' in the string "Bravo". * * UTraceDataFunc(context, fnNumber, level, * "Vector of bytes %vb vector of chars %vc", * (const uint8_t *)bytes, (int32_t)bytesLength, * (const char *)chars, (int32_t)charsLength); * -> Vector of bytes * 42 63 64 3f [4] * vector of chars * "Bcd?"[4] * * UTraceDataFunc(context, fnNumber, level, * "An int32_t %d and a whole bunch of them %vd", * (int32_t)-5, (const int32_t *)ints, (int32_t)intsLength); * -> An int32_t 0xfffffffb and a whole bunch of them * fffffffb 00000005 0000010a [3] * */ /** * Trace output Formatter. An application's UTraceData tracing functions may call * back to this function to format the trace output in a * human readable form. Note that a UTraceData function may choose * to not format the data; it could, for example, save it in * in the raw form it was received (more compact), leaving * formatting for a later trace analyis tool. * @param outBuf pointer to a buffer to receive the formatted output. Output * will be nul terminated if there is space in the buffer - * if the length of the requested output < the output buffer size. * @param capacity Length of the output buffer. * @param indent Number of spaces to indent the output. Intended to allow * data displayed from nested functions to be indented for readability. * @param fmt Format specification for the data to output * @param args Data to be formatted. * @return Length of formatted output, including the terminating NUL. * If buffer capacity is insufficient, the required capacity is returned. 
* @stable ICU 2.8 */ U_STABLE int32_t U_EXPORT2 utrace_vformat(char *outBuf, int32_t capacity, int32_t indent, const char *fmt, va_list args); /** * Trace output Formatter. An application's UTraceData tracing functions may call * this function to format any additional trace data, beyond that * provided by default, in human readable form with the same * formatting conventions used by utrace_vformat(). * @param outBuf pointer to a buffer to receive the formatted output. Output * will be nul terminated if there is space in the buffer - * if the length of the requested output < the output buffer size. * @param capacity Length of the output buffer. * @param indent Number of spaces to indent the output. Intended to allow * data displayed from nested functions to be indented for readability. * @param fmt Format specification for the data to output * @param ... Data to be formatted. * @return Length of formatted output, including the terminating NUL. * If buffer capacity is insufficient, the required capacity is returned. * @stable ICU 2.8 */ U_STABLE int32_t U_EXPORT2 utrace_format(char *outBuf, int32_t capacity, int32_t indent, const char *fmt, ...); /* Trace function numbers --------------------------------------------------- */ /** * Get the name of a function from its trace function number. * * @param fnNumber The trace number for an ICU function. * @return The name string for the function. * * @see UTraceFunctionNumber * @stable ICU 2.8 */ U_STABLE const char * U_EXPORT2 utrace_functionName(int32_t fnNumber); U_CDECL_END #endif // ustringtrie.h /* ******************************************************************************* * Copyright (C) 2010-2012, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: udicttrie.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2010dec17 * created by: Markus W. 
Scherer */ #ifndef __USTRINGTRIE_H__ #define __USTRINGTRIE_H__ /** * \file * \brief C API: Helper definitions for dictionary trie APIs. */ /** * Return values for BytesTrie::next(), UCharsTrie::next() and similar methods. * @see USTRINGTRIE_MATCHES * @see USTRINGTRIE_HAS_VALUE * @see USTRINGTRIE_HAS_NEXT * @stable ICU 4.8 */ enum UStringTrieResult { /** * The input unit(s) did not continue a matching string. * Once current()/next() return USTRINGTRIE_NO_MATCH, * all further calls to current()/next() will also return USTRINGTRIE_NO_MATCH, * until the trie is reset to its original state or to a saved state. * @stable ICU 4.8 */ USTRINGTRIE_NO_MATCH, /** * The input unit(s) continued a matching string * but there is no value for the string so far. * (It is a prefix of a longer string.) * @stable ICU 4.8 */ USTRINGTRIE_NO_VALUE, /** * The input unit(s) continued a matching string * and there is a value for the string so far. * This value will be returned by getValue(). * No further input byte/unit can continue a matching string. * @stable ICU 4.8 */ USTRINGTRIE_FINAL_VALUE, /** * The input unit(s) continued a matching string * and there is a value for the string so far. * This value will be returned by getValue(). * Another input byte/unit can continue a matching string. * @stable ICU 4.8 */ USTRINGTRIE_INTERMEDIATE_VALUE }; /** * Same as (result!=USTRINGTRIE_NO_MATCH). * @param result A result from BytesTrie::first(), UCharsTrie::next() etc. * @return true if the input bytes/units so far are part of a matching string/byte sequence. * @stable ICU 4.8 */ #define USTRINGTRIE_MATCHES(result) ((result)!=USTRINGTRIE_NO_MATCH) /** * Equivalent to (result==USTRINGTRIE_INTERMEDIATE_VALUE || result==USTRINGTRIE_FINAL_VALUE) but * this macro evaluates result exactly once. * @param result A result from BytesTrie::first(), UCharsTrie::next() etc. * @return true if there is a value for the input bytes/units so far. 
* @see BytesTrie::getValue * @see UCharsTrie::getValue * @stable ICU 4.8 */ #define USTRINGTRIE_HAS_VALUE(result) ((result)>=USTRINGTRIE_FINAL_VALUE) /** * Equivalent to (result==USTRINGTRIE_NO_VALUE || result==USTRINGTRIE_INTERMEDIATE_VALUE) but * this macro evaluates result exactly once. * @param result A result from BytesTrie::first(), UCharsTrie::next() etc. * @return true if another input byte/unit can continue a matching string. * @stable ICU 4.8 */ #define USTRINGTRIE_HAS_NEXT(result) ((result)&1) #endif /* __USTRINGTRIE_H__ */ // ushape.h /* ****************************************************************************** * * Copyright (C) 2000-2012, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: ushape.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2000jun29 * created by: Markus W. Scherer */ #ifndef __USHAPE_H__ #define __USHAPE_H__ /** * \file * \brief C API: Arabic shaping * */ /** * Shape Arabic text on a character basis. * *

This function performs basic operations for "shaping" Arabic text. It is most * useful for use with legacy data formats and legacy display technology * (simple terminals). All operations are performed on Unicode characters.

* *

Text-based shaping means that some character code points in the text are * replaced by others depending on the context. It transforms one kind of text * into another. In comparison, modern displays for Arabic text select * appropriate, context-dependent font glyphs for each text element, which means * that they transform text into a glyph vector.

* *

Text transformations are necessary when modern display technology is not * available or when text needs to be transformed to or from legacy formats that * use "shaped" characters. Since the Arabic script is cursive, connecting * adjacent letters to each other, computers select images for each letter based * on the surrounding letters. This usually results in four images per Arabic * letter: initial, middle, final, and isolated forms. In Unicode, on the other * hand, letters are normally stored abstract, and a display system is expected * to select the necessary glyphs. (This makes searching and other text * processing easier because the same letter has only one code.) It is possible * to mimic this with text transformations because there are characters in * Unicode that are rendered as letters with a specific shape * (or cursive connectivity). They were included for interoperability with * legacy systems and codepages, and for unsophisticated display systems.

* *

A second kind of text transformations is supported for Arabic digits: * For compatibility with legacy codepages that only include European digits, * it is possible to replace one set of digits by another, changing the * character code points. These operations can be performed for either * Arabic-Indic Digits (U+0660...U+0669) or Eastern (Extended) Arabic-Indic * digits (U+06f0...U+06f9).

* *

Some replacements may result in more or fewer characters (code points). * By default, this means that the destination buffer may receive text with a * length different from the source length. Some legacy systems rely on the * length of the text to be constant. They expect extra spaces to be added * or consumed either next to the affected character or at the end of the * text.

* *

For details about the available operations, see the description of the * U_SHAPE_... options.

* * @param source The input text. * * @param sourceLength The number of UChars in source. * * @param dest The destination buffer that will receive the results of the * requested operations. It may be NULL only if * destSize is 0. The source and destination must not * overlap. * * @param destSize The size (capacity) of the destination buffer in UChars. * If destSize is 0, then no output is produced, * but the necessary buffer size is returned ("preflighting"). * * @param options This is a 32-bit set of flags that specify the operations * that are performed on the input text. If no error occurs, * then the result will always be written to the destination * buffer. * * @param pErrorCode must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @return The number of UChars written to the destination buffer. * If an error occured, then no output was written, or it may be * incomplete. If U_BUFFER_OVERFLOW_ERROR is set, then * the return value indicates the necessary destination buffer size. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_shapeArabic(const UChar *source, int32_t sourceLength, UChar *dest, int32_t destSize, uint32_t options, UErrorCode *pErrorCode); /** * Memory option: allow the result to have a different length than the source. * Affects: LamAlef options * @stable ICU 2.0 */ #define U_SHAPE_LENGTH_GROW_SHRINK 0 /** * Memory option: allow the result to have a different length than the source. * Affects: LamAlef options * This option is an alias to U_SHAPE_LENGTH_GROW_SHRINK * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_RESIZE 0 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces next to modified characters. * @stable ICU 2.0 */ #define U_SHAPE_LENGTH_FIXED_SPACES_NEAR 1 /** * Memory option: the result must have the same length as the source. 
* If more room is necessary, then try to consume spaces next to modified characters. * Affects: LamAlef options * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_NEAR * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_NEAR 1 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the end of the text. * @stable ICU 2.0 */ #define U_SHAPE_LENGTH_FIXED_SPACES_AT_END 2 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the end of the text. * Affects: LamAlef options * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_END * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_END 2 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the beginning of the text. * @stable ICU 2.0 */ #define U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING 3 /** * Memory option: the result must have the same length as the source. * If more room is necessary, then try to consume spaces at the beginning of the text. * Affects: LamAlef options * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_BEGIN 3 /** * Memory option: the result must have the same length as the source. * Shaping Mode: For each LAMALEF character found, expand LAMALEF using space at end. * If there is no space at end, use spaces at beginning of the buffer. If there * is no space at beginning of the buffer, use spaces at the near (i.e. the space * after the LAMALEF character). * If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h) * will be set in pErrorCode * * Deshaping Mode: Perform the same function as the flag equals U_SHAPE_LAMALEF_END. * Affects: LamAlef options * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_AUTO 0x10000 /** Bit mask for memory options. 
@stable ICU 2.0 */ #define U_SHAPE_LENGTH_MASK 0x10003 /* Changed old value 3 */ /** * Bit mask for LamAlef memory options. * @stable ICU 4.2 */ #define U_SHAPE_LAMALEF_MASK 0x10003 /* updated */ /** Direction indicator: the source is in logical (keyboard) order. @stable ICU 2.0 */ #define U_SHAPE_TEXT_DIRECTION_LOGICAL 0 /** * Direction indicator: * the source is in visual RTL order, * the rightmost displayed character stored first. * This option is an alias to U_SHAPE_TEXT_DIRECTION_LOGICAL * @stable ICU 4.2 */ #define U_SHAPE_TEXT_DIRECTION_VISUAL_RTL 0 /** * Direction indicator: * the source is in visual LTR order, * the leftmost displayed character stored first. * @stable ICU 2.0 */ #define U_SHAPE_TEXT_DIRECTION_VISUAL_LTR 4 /** Bit mask for direction indicators. @stable ICU 2.0 */ #define U_SHAPE_TEXT_DIRECTION_MASK 4 /** Letter shaping option: do not perform letter shaping. @stable ICU 2.0 */ #define U_SHAPE_LETTERS_NOOP 0 /** Letter shaping option: replace abstract letter characters by "shaped" ones. @stable ICU 2.0 */ #define U_SHAPE_LETTERS_SHAPE 8 /** Letter shaping option: replace "shaped" letter characters by abstract ones. @stable ICU 2.0 */ #define U_SHAPE_LETTERS_UNSHAPE 0x10 /** * Letter shaping option: replace abstract letter characters by "shaped" ones. * The only difference with U_SHAPE_LETTERS_SHAPE is that Tashkeel letters * are always "shaped" into the isolated form instead of the medial form * (selecting code points from the Arabic Presentation Forms-B block). * @stable ICU 2.0 */ #define U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED 0x18 /** Bit mask for letter shaping options. @stable ICU 2.0 */ #define U_SHAPE_LETTERS_MASK 0x18 /** Digit shaping option: do not perform digit shaping. @stable ICU 2.0 */ #define U_SHAPE_DIGITS_NOOP 0 /** * Digit shaping option: * Replace European digits (U+0030...) by Arabic-Indic digits. 
* @stable ICU 2.0 */ #define U_SHAPE_DIGITS_EN2AN 0x20 /** * Digit shaping option: * Replace Arabic-Indic digits by European digits (U+0030...). * @stable ICU 2.0 */ #define U_SHAPE_DIGITS_AN2EN 0x40 /** * Digit shaping option: * Replace European digits (U+0030...) by Arabic-Indic digits if the most recent * strongly directional character is an Arabic letter * (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]).
* The direction of "preceding" depends on the direction indicator option. * For the first characters, the preceding strongly directional character * (initial state) is assumed to be not an Arabic letter * (it is U_LEFT_TO_RIGHT [L] or U_RIGHT_TO_LEFT [R]). * @stable ICU 2.0 */ #define U_SHAPE_DIGITS_ALEN2AN_INIT_LR 0x60 /** * Digit shaping option: * Replace European digits (U+0030...) by Arabic-Indic digits if the most recent * strongly directional character is an Arabic letter * (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]).
* The direction of "preceding" depends on the direction indicator option. * For the first characters, the preceding strongly directional character * (initial state) is assumed to be an Arabic letter. * @stable ICU 2.0 */ #define U_SHAPE_DIGITS_ALEN2AN_INIT_AL 0x80 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */ #define U_SHAPE_DIGITS_RESERVED 0xa0 /** Bit mask for digit shaping options. @stable ICU 2.0 */ #define U_SHAPE_DIGITS_MASK 0xe0 /** Digit type option: Use Arabic-Indic digits (U+0660...U+0669). @stable ICU 2.0 */ #define U_SHAPE_DIGIT_TYPE_AN 0 /** Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). @stable ICU 2.0 */ #define U_SHAPE_DIGIT_TYPE_AN_EXTENDED 0x100 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */ #define U_SHAPE_DIGIT_TYPE_RESERVED 0x200 /** Bit mask for digit type options. @stable ICU 2.0 */ #define U_SHAPE_DIGIT_TYPE_MASK 0x300 /* Legacy note: this mask was to be changed from 0x3f00 to 0x300; the value above is already 0x300. */ /** * Tashkeel aggregation option: * Replaces any combination of U+0651 with one of * U+064C, U+064D, U+064E, U+064F, U+0650 by * U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 respectively. * @stable ICU 3.6 */ #define U_SHAPE_AGGREGATE_TASHKEEL 0x4000 /** Tashkeel aggregation option: do not aggregate tashkeels. @stable ICU 3.6 */ #define U_SHAPE_AGGREGATE_TASHKEEL_NOOP 0 /** Bit mask for tashkeel aggregation. @stable ICU 3.6 */ #define U_SHAPE_AGGREGATE_TASHKEEL_MASK 0x4000 /** * Presentation form option: * Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B * characters with U+06xx characters before shaping. * @stable ICU 3.6 */ #define U_SHAPE_PRESERVE_PRESENTATION 0x8000 /** Presentation form option: * Replace Arabic Presentation Forms-A and Arabic Presentation Forms-B with * their unshaped counterparts in the range U+06xx before shaping. * @stable ICU 3.6 */ #define U_SHAPE_PRESERVE_PRESENTATION_NOOP 0 /** Bit mask for preserve presentation form. 
@stable ICU 3.6 */ #define U_SHAPE_PRESERVE_PRESENTATION_MASK 0x8000 /* Seen Tail option */ /** * Memory option: the result must have the same length as the source. * Shaping mode: The SEEN family character will expand into two characters using space near * the SEEN family character (i.e. the space after the character). * If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h) * will be set in pErrorCode. * * De-shaping mode: Any Seen character followed by Tail character will be * replaced by one cell Seen and a space will replace the Tail. * Affects: Seen options * @stable ICU 4.2 */ #define U_SHAPE_SEEN_TWOCELL_NEAR 0x200000 /** * Bit mask for Seen memory options. * @stable ICU 4.2 */ #define U_SHAPE_SEEN_MASK 0x700000 /* YehHamza option */ /** * Memory option: the result must have the same length as the source. * Shaping mode: The YEHHAMZA character will expand into two characters using space near it * (i.e. the space after the character). * If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h) * will be set in pErrorCode. * * De-shaping mode: Any Yeh (final or isolated) character followed by Hamza character will be * replaced by one cell YehHamza and space will replace the Hamza. * Affects: YehHamza options * @stable ICU 4.2 */ #define U_SHAPE_YEHHAMZA_TWOCELL_NEAR 0x1000000 /** * Bit mask for YehHamza memory options. * @stable ICU 4.2 */ #define U_SHAPE_YEHHAMZA_MASK 0x3800000 /* New Tashkeel options */ /** * Memory option: the result must have the same length as the source. * Shaping mode: Tashkeel characters will be replaced by spaces. * Spaces will be placed at the beginning of the buffer. * * De-shaping mode: N/A * Affects: Tashkeel options * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_BEGIN 0x40000 /** * Memory option: the result must have the same length as the source. * Shaping mode: Tashkeel characters will be replaced by spaces. 
* Spaces will be placed at the end of the buffer. * * De-shaping mode: N/A * Affects: Tashkeel options * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_END 0x60000 /** * Memory option: allow the result to have a different length than the source. * Shaping mode: Tashkeel characters will be removed, buffer length will shrink. * De-shaping mode: N/A * * Affects: Tashkeel options * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_RESIZE 0x80000 /** * Memory option: the result must have the same length as the source. * Shaping mode: Tashkeel characters will be replaced by Tatweel if it is connected to adjacent * characters (i.e. shaped on Tatweel) or replaced by space if it is not connected. * * De-shaping mode: N/A * Affects: Tashkeel options (this value lies within U_SHAPE_TASHKEEL_MASK) * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL 0xC0000 /** * Bit mask for Tashkeel replacement with Space or Tatweel memory options. * @stable ICU 4.2 */ #define U_SHAPE_TASHKEEL_MASK 0xE0000 /* Space location Control options */ /** * This option affects the meaning of the BEGIN and END options. If this option is not used, the default * for BEGIN and END will be as follows: * The Default (for both Visual LTR, Visual RTL and Logical Text) * 1. BEGIN always refers to the start address of physical memory. * 2. END always refers to the end address of physical memory. * * If this option is used it will swap the meaning of BEGIN and END only for Visual LTR text. * * The effect on BEGIN and END Memory Options will be as follows: * A. BEGIN For Visual LTR text: This will be the beginning (right side) of the visual text ( * corresponding to the physical memory address end for Visual LTR text, same as END in * default behavior). * B. BEGIN For Logical text: Same as BEGIN in default behavior. * C. END For Visual LTR text: This will be the end (left side) of the visual text (corresponding * to the physical memory address beginning for Visual LTR text, same as BEGIN in default behavior). * D. 
END For Logical text: Same as END in default behavior. * Affects: All LamAlef BEGIN, END and AUTO options. * @stable ICU 4.2 */ #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END 0x4000000 /** * Bit mask for swapping BEGIN and END for Visual LTR text * @stable ICU 4.2 */ #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_MASK 0x4000000 /** * If this option is used, shaping will use the new Unicode code point for TAIL (i.e. 0xFE73). * If this option is not specified (Default), old unofficial Unicode TAIL code point is used (i.e. 0x200B). * De-shaping will not use this option as it will always search for both the new Unicode code point for the * TAIL (i.e. 0xFE73) and the old unofficial Unicode TAIL code point (i.e. 0x200B) and de-shape the * Seen-Family letter accordingly. * * Shaping Mode: Only shaping. * De-shaping Mode: N/A. * Affects: All Seen options * @stable ICU 4.8 */ #define U_SHAPE_TAIL_NEW_UNICODE 0x8000000 /** * Bit mask for new Unicode Tail option * @stable ICU 4.8 */ #define U_SHAPE_TAIL_TYPE_MASK 0x8000000 #endif // uscript.h /* ********************************************************************** * Copyright (C) 1997-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File USCRIPT.H * * Modification History: * * Date Name Description * 07/06/2001 Ram Creation. ****************************************************************************** */ #ifndef USCRIPT_H #define USCRIPT_H /** * \file * \brief C API: Unicode Script Information */ /** * Constants for ISO 15924 script codes. * * The current set of script code constants supports at least all scripts * that are encoded in the version of Unicode which ICU currently supports. * The names of the constants are usually derived from the * Unicode script property value aliases. 
* See UAX #24 Unicode Script Property (http://www.unicode.org/reports/tr24/) * and http://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt . * * Starting with ICU 3.6, constants for most ISO 15924 script codes * are included, for use with language tags, CLDR data, and similar. * Some of those codes are not used in the Unicode Character Database (UCD). * For example, there are no characters that have a UCD script property value of * Hans or Hant. All Han ideographs have the Hani script property value in Unicode. * * Private-use codes Qaaa..Qabx are not included. * * Starting with ICU 55, script codes are only added when their scripts * have been or will certainly be encoded in Unicode, * and have been assigned Unicode script property value aliases, * to ensure that their script names are stable and match the names of the constants. * Script codes like Latf and Aran that are not subject to separate encoding * may be added at any time. * * @stable ICU 2.2 */ typedef enum UScriptCode { /* * Note: UScriptCode constants and their ISO script code comments * are parsed by preparseucd.py. 
* It matches lines like * USCRIPT_<Unicode Script value name> = <integer>, / * <ISO script code> * / */ /** @stable ICU 2.2 */ USCRIPT_INVALID_CODE = -1, /** @stable ICU 2.2 */ USCRIPT_COMMON = 0, /* Zyyy */ /** @stable ICU 2.2 */ USCRIPT_INHERITED = 1, /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */ /** @stable ICU 2.2 */ USCRIPT_ARABIC = 2, /* Arab */ /** @stable ICU 2.2 */ USCRIPT_ARMENIAN = 3, /* Armn */ /** @stable ICU 2.2 */ USCRIPT_BENGALI = 4, /* Beng */ /** @stable ICU 2.2 */ USCRIPT_BOPOMOFO = 5, /* Bopo */ /** @stable ICU 2.2 */ USCRIPT_CHEROKEE = 6, /* Cher */ /** @stable ICU 2.2 */ USCRIPT_COPTIC = 7, /* Copt */ /** @stable ICU 2.2 */ USCRIPT_CYRILLIC = 8, /* Cyrl */ /** @stable ICU 2.2 */ USCRIPT_DESERET = 9, /* Dsrt */ /** @stable ICU 2.2 */ USCRIPT_DEVANAGARI = 10, /* Deva */ /** @stable ICU 2.2 */ USCRIPT_ETHIOPIC = 11, /* Ethi */ /** @stable ICU 2.2 */ USCRIPT_GEORGIAN = 12, /* Geor */ /** @stable ICU 2.2 */ USCRIPT_GOTHIC = 13, /* Goth */ /** @stable ICU 2.2 */ USCRIPT_GREEK = 14, /* Grek */ /** @stable ICU 2.2 */ USCRIPT_GUJARATI = 15, /* Gujr */ /** @stable ICU 2.2 */ USCRIPT_GURMUKHI = 16, /* Guru */ /** @stable ICU 2.2 */ USCRIPT_HAN = 17, /* Hani */ /** @stable ICU 2.2 */ USCRIPT_HANGUL = 18, /* Hang */ /** @stable ICU 2.2 */ USCRIPT_HEBREW = 19, /* Hebr */ /** @stable ICU 2.2 */ USCRIPT_HIRAGANA = 20, /* Hira */ /** @stable ICU 2.2 */ USCRIPT_KANNADA = 21, /* Knda */ /** @stable ICU 2.2 */ USCRIPT_KATAKANA = 22, /* Kana */ /** @stable ICU 2.2 */ USCRIPT_KHMER = 23, /* Khmr */ /** @stable ICU 2.2 */ USCRIPT_LAO = 24, /* Laoo */ /** @stable ICU 2.2 */ USCRIPT_LATIN = 25, /* Latn */ /** @stable ICU 2.2 */ USCRIPT_MALAYALAM = 26, /* Mlym */ /** @stable ICU 2.2 */ USCRIPT_MONGOLIAN = 27, /* Mong */ /** @stable ICU 2.2 */ USCRIPT_MYANMAR = 28, /* Mymr */ /** @stable ICU 2.2 */ USCRIPT_OGHAM = 29, /* Ogam */ /** @stable ICU 2.2 */ USCRIPT_OLD_ITALIC = 30, /* Ital */ /** @stable ICU 2.2 */ USCRIPT_ORIYA = 31, /* Orya */ /** 
@stable ICU 2.2 */ USCRIPT_RUNIC = 
32, /* Runr */ /** @stable ICU 2.2 */ USCRIPT_SINHALA = 33, /* Sinh */ /** @stable ICU 2.2 */ USCRIPT_SYRIAC = 34, /* Syrc */ /** @stable ICU 2.2 */ USCRIPT_TAMIL = 35, /* Taml */ /** @stable ICU 2.2 */ USCRIPT_TELUGU = 36, /* Telu */ /** @stable ICU 2.2 */ USCRIPT_THAANA = 37, /* Thaa */ /** @stable ICU 2.2 */ USCRIPT_THAI = 38, /* Thai */ /** @stable ICU 2.2 */ USCRIPT_TIBETAN = 39, /* Tibt */ /** Canadian_Aboriginal script. @stable ICU 2.6 */ USCRIPT_CANADIAN_ABORIGINAL = 40, /* Cans */ /** Canadian_Aboriginal script (alias). @stable ICU 2.2 */ USCRIPT_UCAS = USCRIPT_CANADIAN_ABORIGINAL, /** @stable ICU 2.2 */ USCRIPT_YI = 41, /* Yiii */ /* New scripts in Unicode 3.2 */ /** @stable ICU 2.2 */ USCRIPT_TAGALOG = 42, /* Tglg */ /** @stable ICU 2.2 */ USCRIPT_HANUNOO = 43, /* Hano */ /** @stable ICU 2.2 */ USCRIPT_BUHID = 44, /* Buhd */ /** @stable ICU 2.2 */ USCRIPT_TAGBANWA = 45, /* Tagb */ /* New scripts in Unicode 4 */ /** @stable ICU 2.6 */ USCRIPT_BRAILLE = 46, /* Brai */ /** @stable ICU 2.6 */ USCRIPT_CYPRIOT = 47, /* Cprt */ /** @stable ICU 2.6 */ USCRIPT_LIMBU = 48, /* Limb */ /** @stable ICU 2.6 */ USCRIPT_LINEAR_B = 49, /* Linb */ /** @stable ICU 2.6 */ USCRIPT_OSMANYA = 50, /* Osma */ /** @stable ICU 2.6 */ USCRIPT_SHAVIAN = 51, /* Shaw */ /** @stable ICU 2.6 */ USCRIPT_TAI_LE = 52, /* Tale */ /** @stable ICU 2.6 */ USCRIPT_UGARITIC = 53, /* Ugar */ /** New script code in Unicode 4.0.1 @stable ICU 3.0 */ USCRIPT_KATAKANA_OR_HIRAGANA = 54,/*Hrkt */ /* New scripts in Unicode 4.1 */ /** @stable ICU 3.4 */ USCRIPT_BUGINESE = 55, /* Bugi */ /** @stable ICU 3.4 */ USCRIPT_GLAGOLITIC = 56, /* Glag */ /** @stable ICU 3.4 */ USCRIPT_KHAROSHTHI = 57, /* Khar */ /** @stable ICU 3.4 */ USCRIPT_SYLOTI_NAGRI = 58, /* Sylo */ /** @stable ICU 3.4 */ USCRIPT_NEW_TAI_LUE = 59, /* Talu */ /** @stable ICU 3.4 */ USCRIPT_TIFINAGH = 60, /* Tfng */ /** @stable ICU 3.4 */ USCRIPT_OLD_PERSIAN = 61, /* Xpeo */ /* New script codes from Unicode and ISO 15924 */ /** @stable ICU 3.6 
*/ USCRIPT_BALINESE = 62, /* Bali */ /** @stable ICU 3.6 */ USCRIPT_BATAK = 63, /* Batk */ /** @stable ICU 3.6 */ USCRIPT_BLISSYMBOLS = 64, /* Blis */ /** @stable ICU 3.6 */ USCRIPT_BRAHMI = 65, /* Brah */ /** @stable ICU 3.6 */ USCRIPT_CHAM = 66, /* Cham */ /** @stable ICU 3.6 */ USCRIPT_CIRTH = 67, /* Cirt */ /** @stable ICU 3.6 */ USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC = 68, /* Cyrs */ /** @stable ICU 3.6 */ USCRIPT_DEMOTIC_EGYPTIAN = 69, /* Egyd */ /** @stable ICU 3.6 */ USCRIPT_HIERATIC_EGYPTIAN = 70, /* Egyh */ /** @stable ICU 3.6 */ USCRIPT_EGYPTIAN_HIEROGLYPHS = 71, /* Egyp */ /** @stable ICU 3.6 */ USCRIPT_KHUTSURI = 72, /* Geok */ /** @stable ICU 3.6 */ USCRIPT_SIMPLIFIED_HAN = 73, /* Hans */ /** @stable ICU 3.6 */ USCRIPT_TRADITIONAL_HAN = 74, /* Hant */ /** @stable ICU 3.6 */ USCRIPT_PAHAWH_HMONG = 75, /* Hmng */ /** @stable ICU 3.6 */ USCRIPT_OLD_HUNGARIAN = 76, /* Hung */ /** @stable ICU 3.6 */ USCRIPT_HARAPPAN_INDUS = 77, /* Inds */ /** @stable ICU 3.6 */ USCRIPT_JAVANESE = 78, /* Java */ /** @stable ICU 3.6 */ USCRIPT_KAYAH_LI = 79, /* Kali */ /** @stable ICU 3.6 */ USCRIPT_LATIN_FRAKTUR = 80, /* Latf */ /** @stable ICU 3.6 */ USCRIPT_LATIN_GAELIC = 81, /* Latg */ /** @stable ICU 3.6 */ USCRIPT_LEPCHA = 82, /* Lepc */ /** @stable ICU 3.6 */ USCRIPT_LINEAR_A = 83, /* Lina */ /** @stable ICU 4.6 */ USCRIPT_MANDAIC = 84, /* Mand */ /** @stable ICU 3.6 */ USCRIPT_MANDAEAN = USCRIPT_MANDAIC, /** @stable ICU 3.6 */ USCRIPT_MAYAN_HIEROGLYPHS = 85, /* Maya */ /** @stable ICU 4.6 */ USCRIPT_MEROITIC_HIEROGLYPHS = 86, /* Mero */ /** @stable ICU 3.6 */ USCRIPT_MEROITIC = USCRIPT_MEROITIC_HIEROGLYPHS, /** @stable ICU 3.6 */ USCRIPT_NKO = 87, /* Nkoo */ /** @stable ICU 3.6 */ USCRIPT_ORKHON = 88, /* Orkh */ /** @stable ICU 3.6 */ USCRIPT_OLD_PERMIC = 89, /* Perm */ /** @stable ICU 3.6 */ USCRIPT_PHAGS_PA = 90, /* Phag */ /** @stable ICU 3.6 */ USCRIPT_PHOENICIAN = 91, /* Phnx */ /** @stable ICU 52 */ USCRIPT_MIAO = 92, /* Plrd */ /** @stable ICU 3.6 */ 
USCRIPT_PHONETIC_POLLARD = USCRIPT_MIAO, /** @stable ICU 3.6 */ USCRIPT_RONGORONGO = 93, /* Roro */ /** @stable ICU 3.6 */ USCRIPT_SARATI = 94, /* Sara */ /** @stable ICU 3.6 */ USCRIPT_ESTRANGELO_SYRIAC = 95, /* Syre */ /** @stable ICU 3.6 */ USCRIPT_WESTERN_SYRIAC = 96, /* Syrj */ /** @stable ICU 3.6 */ USCRIPT_EASTERN_SYRIAC = 97, /* Syrn */ /** @stable ICU 3.6 */ USCRIPT_TENGWAR = 98, /* Teng */ /** @stable ICU 3.6 */ USCRIPT_VAI = 99, /* Vaii */ /** @stable ICU 3.6 */ USCRIPT_VISIBLE_SPEECH = 100,/* Visp */ /** @stable ICU 3.6 */ USCRIPT_CUNEIFORM = 101,/* Xsux */ /** @stable ICU 3.6 */ USCRIPT_UNWRITTEN_LANGUAGES = 102,/* Zxxx */ /** @stable ICU 3.6 */ USCRIPT_UNKNOWN = 103,/* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */ /** @stable ICU 3.8 */ USCRIPT_CARIAN = 104,/* Cari */ /** @stable ICU 3.8 */ USCRIPT_JAPANESE = 105,/* Jpan */ /** @stable ICU 3.8 */ USCRIPT_LANNA = 106,/* Lana */ /** @stable ICU 3.8 */ USCRIPT_LYCIAN = 107,/* Lyci */ /** @stable ICU 3.8 */ USCRIPT_LYDIAN = 108,/* Lydi */ /** @stable ICU 3.8 */ USCRIPT_OL_CHIKI = 109,/* Olck */ /** @stable ICU 3.8 */ USCRIPT_REJANG = 110,/* Rjng */ /** @stable ICU 3.8 */ USCRIPT_SAURASHTRA = 111,/* Saur */ /** Sutton SignWriting @stable ICU 3.8 */ USCRIPT_SIGN_WRITING = 112,/* Sgnw */ /** @stable ICU 3.8 */ USCRIPT_SUNDANESE = 113,/* Sund */ /** @stable ICU 3.8 */ USCRIPT_MOON = 114,/* Moon */ /** @stable ICU 3.8 */ USCRIPT_MEITEI_MAYEK = 115,/* Mtei */ /** @stable ICU 4.0 */ USCRIPT_IMPERIAL_ARAMAIC = 116,/* Armi */ /** @stable ICU 4.0 */ USCRIPT_AVESTAN = 117,/* Avst */ /** @stable ICU 4.0 */ USCRIPT_CHAKMA = 118,/* Cakm */ /** @stable ICU 4.0 */ USCRIPT_KOREAN = 119,/* Kore */ /** @stable ICU 4.0 */ USCRIPT_KAITHI = 120,/* Kthi */ /** @stable ICU 4.0 */ USCRIPT_MANICHAEAN = 121,/* Mani */ /** @stable ICU 4.0 */ USCRIPT_INSCRIPTIONAL_PAHLAVI = 122,/* Phli */ /** @stable ICU 4.0 */ USCRIPT_PSALTER_PAHLAVI = 123,/* Phlp */ /** @stable ICU 4.0 */ USCRIPT_BOOK_PAHLAVI = 124,/* 
Phlv */ /** @stable ICU 4.0 */ USCRIPT_INSCRIPTIONAL_PARTHIAN = 125,/* Prti */ /** @stable ICU 4.0 */ USCRIPT_SAMARITAN = 126,/* Samr */ /** @stable ICU 4.0 */ USCRIPT_TAI_VIET = 127,/* Tavt */ /** @stable ICU 4.0 */ USCRIPT_MATHEMATICAL_NOTATION = 128,/* Zmth */ /** @stable ICU 4.0 */ USCRIPT_SYMBOLS = 129,/* Zsym */ /** @stable ICU 4.4 */ USCRIPT_BAMUM = 130,/* Bamu */ /** @stable ICU 4.4 */ USCRIPT_LISU = 131,/* Lisu */ /** @stable ICU 4.4 */ USCRIPT_NAKHI_GEBA = 132,/* Nkgb */ /** @stable ICU 4.4 */ USCRIPT_OLD_SOUTH_ARABIAN = 133,/* Sarb */ /** @stable ICU 4.6 */ USCRIPT_BASSA_VAH = 134,/* Bass */ /** @stable ICU 54 */ USCRIPT_DUPLOYAN = 135,/* Dupl */ /** @stable ICU 4.6 */ USCRIPT_ELBASAN = 136,/* Elba */ /** @stable ICU 4.6 */ USCRIPT_GRANTHA = 137,/* Gran */ /** @stable ICU 4.6 */ USCRIPT_KPELLE = 138,/* Kpel */ /** @stable ICU 4.6 */ USCRIPT_LOMA = 139,/* Loma */ /** Mende Kikakui @stable ICU 4.6 */ USCRIPT_MENDE = 140,/* Mend */ /** @stable ICU 4.6 */ USCRIPT_MEROITIC_CURSIVE = 141,/* Merc */ /** @stable ICU 4.6 */ USCRIPT_OLD_NORTH_ARABIAN = 142,/* Narb */ /** @stable ICU 4.6 */ USCRIPT_NABATAEAN = 143,/* Nbat */ /** @stable ICU 4.6 */ USCRIPT_PALMYRENE = 144,/* Palm */ /** @stable ICU 54 */ USCRIPT_KHUDAWADI = 145,/* Sind */ /** @stable ICU 4.6 */ USCRIPT_SINDHI = USCRIPT_KHUDAWADI, /** @stable ICU 4.6 */ USCRIPT_WARANG_CITI = 146,/* Wara */ /** @stable ICU 4.8 */ USCRIPT_AFAKA = 147,/* Afak */ /** @stable ICU 4.8 */ USCRIPT_JURCHEN = 148,/* Jurc */ /** @stable ICU 4.8 */ USCRIPT_MRO = 149,/* Mroo */ /** @stable ICU 4.8 */ USCRIPT_NUSHU = 150,/* Nshu */ /** @stable ICU 4.8 */ USCRIPT_SHARADA = 151,/* Shrd */ /** @stable ICU 4.8 */ USCRIPT_SORA_SOMPENG = 152,/* Sora */ /** @stable ICU 4.8 */ USCRIPT_TAKRI = 153,/* Takr */ /** @stable ICU 4.8 */ USCRIPT_TANGUT = 154,/* Tang */ /** @stable ICU 4.8 */ USCRIPT_WOLEAI = 155,/* Wole */ /** @stable ICU 49 */ USCRIPT_ANATOLIAN_HIEROGLYPHS = 156,/* Hluw */ /** @stable ICU 49 */ USCRIPT_KHOJKI = 157,/* Khoj */ 
/** @stable ICU 49 */ USCRIPT_TIRHUTA = 158,/* Tirh */ /** @stable ICU 52 */ USCRIPT_CAUCASIAN_ALBANIAN = 159,/* Aghb */ /** @stable ICU 52 */ USCRIPT_MAHAJANI = 160,/* Mahj */ /** @stable ICU 54 */ USCRIPT_AHOM = 161,/* Ahom */ /** @stable ICU 54 */ USCRIPT_HATRAN = 162,/* Hatr */ /** @stable ICU 54 */ USCRIPT_MODI = 163,/* Modi */ /** @stable ICU 54 */ USCRIPT_MULTANI = 164,/* Mult */ /** @stable ICU 54 */ USCRIPT_PAU_CIN_HAU = 165,/* Pauc */ /** @stable ICU 54 */ USCRIPT_SIDDHAM = 166,/* Sidd */ /** * One higher than the last script code constant. * This value increases as constants for script codes are added. * * There are constants for Unicode 7 script property values. * There are constants for ISO 15924 script codes assigned on or before 2013-10-12. * There are no constants for private use codes from Qaaa - Qabx * except as used in the UCD. * * @stable ICU 2.2 */ USCRIPT_CODE_LIMIT = 167 } UScriptCode; /** * Gets the script codes associated with the given locale or ISO 15924 abbreviation or name. * Fills in USCRIPT_MALAYALAM given "Malayalam" OR "Mlym". * Fills in USCRIPT_LATIN given "en" OR "en_US" * If the required capacity is greater than the capacity of the destination buffer, * then the error code is set to U_BUFFER_OVERFLOW_ERROR and the required capacity is returned. * *

Note: To search by short or long script alias only, use * u_getPropertyValueEnum(UCHAR_SCRIPT, alias) instead. That does * a fast lookup with no access of the locale data. * * @param nameOrAbbrOrLocale name of the script, as given in * PropertyValueAliases.txt, or ISO 15924 code or locale * @param fillIn the UScriptCode buffer to fill in the script code * @param capacity the capacity (size) of the UScriptCode buffer passed in. * @param err the error status code. * @return The number of script codes filled in the buffer passed in * @stable ICU 2.4 */ U_STABLE int32_t U_EXPORT2 uscript_getCode(const char* nameOrAbbrOrLocale,UScriptCode* fillIn,int32_t capacity,UErrorCode *err); /** * Returns the long Unicode script name, if there is one. * Otherwise returns the 4-letter ISO 15924 script code. * Returns "Malayalam" given USCRIPT_MALAYALAM. * * @param scriptCode UScriptCode enum * @return long script name as given in PropertyValueAliases.txt, or the 4-letter code, * or NULL if scriptCode is invalid * @stable ICU 2.4 */ U_STABLE const char* U_EXPORT2 uscript_getName(UScriptCode scriptCode); /** * Returns the 4-letter ISO 15924 script code, * which is the same as the short Unicode script name if Unicode has names for the script. * Returns "Mlym" given USCRIPT_MALAYALAM. * * @param scriptCode UScriptCode enum * @return short script name (4-letter code), or NULL if scriptCode is invalid * @stable ICU 2.4 */ U_STABLE const char* U_EXPORT2 uscript_getShortName(UScriptCode scriptCode); /** * Gets the script code associated with the given codepoint. * Returns USCRIPT_MALAYALAM given 0x0D02 * @param codepoint UChar32 codepoint * @param err the error status code. * @return The UScriptCode, or 0 if codepoint is invalid * (note that 0 is also the numeric value of USCRIPT_COMMON) * @stable ICU 2.4 */ U_STABLE UScriptCode U_EXPORT2 uscript_getScript(UChar32 codepoint, UErrorCode *err); /** * Do the Script_Extensions of code point c contain script sc? 
* If c does not have explicit Script_Extensions, then this tests whether * c has the Script property value sc. * * Some characters are commonly used in multiple scripts. * For more information, see UAX #24: http://www.unicode.org/reports/tr24/. * * The Script_Extensions property is provisional. It may be modified or removed * in future versions of the Unicode Standard, and thus in ICU. * @param c code point * @param sc script code * @return TRUE if sc is in Script_Extensions(c) * @stable ICU 49 */ U_STABLE UBool U_EXPORT2 uscript_hasScript(UChar32 c, UScriptCode sc); /** * Writes code point c's Script_Extensions as a list of UScriptCode values * to the output scripts array and returns the number of script codes. * - If c does have Script_Extensions, then the Script property value * (normally Common or Inherited) is not included. * - If c does not have Script_Extensions, then the one Script code is written to the output array. * - If c is not a valid code point, then the one USCRIPT_UNKNOWN code is written. * In other words, if the return value is 1, * then the output array contains exactly c's single Script code. * If the return value is n>=2, then the output array contains c's n Script_Extensions script codes. * * Some characters are commonly used in multiple scripts. * For more information, see UAX #24: http://www.unicode.org/reports/tr24/. * * If there are more than capacity script codes to be written, then * U_BUFFER_OVERFLOW_ERROR is set and the number of Script_Extensions is returned. * (Usual ICU buffer handling behavior.) * * The Script_Extensions property is provisional. It may be modified or removed * in future versions of the Unicode Standard, and thus in ICU. * @param c code point * @param scripts output script code array * @param capacity capacity of the scripts array * @param errorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. 
Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return number of script codes in c's Script_Extensions, or 1 for the single Script value, * written to scripts unless U_BUFFER_OVERFLOW_ERROR indicates insufficient capacity * @stable ICU 49 */ U_STABLE int32_t U_EXPORT2 uscript_getScriptExtensions(UChar32 c, UScriptCode *scripts, int32_t capacity, UErrorCode *errorCode); /** * Script usage constants. * See UAX #31 Unicode Identifier and Pattern Syntax. * http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers * * @stable ICU 51 */ typedef enum UScriptUsage { /** Not encoded in Unicode. @stable ICU 51 */ USCRIPT_USAGE_NOT_ENCODED, /** Unknown script usage. @stable ICU 51 */ USCRIPT_USAGE_UNKNOWN, /** Candidate for Exclusion from Identifiers. @stable ICU 51 */ USCRIPT_USAGE_EXCLUDED, /** Limited Use script. @stable ICU 51 */ USCRIPT_USAGE_LIMITED_USE, /** Aspirational Use script. @stable ICU 51 */ USCRIPT_USAGE_ASPIRATIONAL, /** Recommended script. @stable ICU 51 */ USCRIPT_USAGE_RECOMMENDED } UScriptUsage; /** * Writes the script sample character string. * This string normally consists of one code point but might be longer. * The string is empty if the script is not encoded. * * @param script script code * @param dest output string array * @param capacity number of UChars in the dest array * @param pErrorCode standard ICU in/out error code, must pass U_SUCCESS() on input * @return the string length, even if U_BUFFER_OVERFLOW_ERROR * @stable ICU 51 */ U_STABLE int32_t U_EXPORT2 uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode); /** * Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax. * Returns USCRIPT_USAGE_NOT_ENCODED if the script is not encoded in Unicode. 
* * @param script script code * @return script usage * @see UScriptUsage * @stable ICU 51 */ U_STABLE UScriptUsage U_EXPORT2 uscript_getUsage(UScriptCode script); /** * Returns TRUE if the script is written right-to-left. * For example, Arab and Hebr. * * @param script script code * @return TRUE if the script is right-to-left * @stable ICU 51 */ U_STABLE UBool U_EXPORT2 uscript_isRightToLeft(UScriptCode script); /** * Returns TRUE if the script allows line breaks between letters (excluding hyphenation). * Such a script typically requires dictionary-based line breaking. * For example, Hani and Thai. * * @param script script code * @return TRUE if the script allows line breaks between letters * @stable ICU 51 */ U_STABLE UBool U_EXPORT2 uscript_breaksBetweenLetters(UScriptCode script); /** * Returns TRUE if in modern (or most recent) usage of the script case distinctions are customary. * For example, Latn and Cyrl. * * @param script script code * @return TRUE if the script is cased * @stable ICU 51 */ U_STABLE UBool U_EXPORT2 uscript_isCased(UScriptCode script); #endif // urep.h /* ****************************************************************************** * Copyright (C) 1997-2010, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * Date Name Description * 06/23/00 aliu Creation. 
****************************************************************************** */ #ifndef __UREP_H #define __UREP_H U_CDECL_BEGIN /******************************************************************** * General Notes ******************************************************************** * TODO * Add usage scenario * Add test code * Talk about pinning * Talk about "can truncate result if out of memory" */ /******************************************************************** * Data Structures ********************************************************************/ /** * \file * \brief C API: Callbacks for UReplaceable */ /** * An opaque replaceable text object. This will be manipulated only * through the caller-supplied UReplaceableCallbacks struct. Related * to the C++ class Replaceable. * This is currently only used in the Transliterator C API, see utrans.h . * @stable ICU 2.0 */ typedef void* UReplaceable; /** * A set of function pointers that transliterators use to manipulate a * UReplaceable. The caller should supply the required functions to * manipulate their text appropriately. Related to the C++ class * Replaceable. * @stable ICU 2.0 */ typedef struct UReplaceableCallbacks { /** * Function pointer that returns the number of UChar code units in * this text. * * @param rep A pointer to "this" UReplaceable object. * @return The length of the text. * @stable ICU 2.0 */ int32_t (*length)(const UReplaceable* rep); /** * Function pointer that returns a UChar code unit at the given * offset into this text; 0 <= offset < n, where n is the value * returned by (*length)(rep). See unistr.h for a description of * charAt() vs. char32At(). * * @param rep A pointer to "this" UReplaceable object. * @param offset The index at which to fetch the UChar (code unit). * @return The UChar (code unit) at offset, or U+FFFF if the offset is out of bounds. 
* @stable ICU 2.0 */ UChar (*charAt)(const UReplaceable* rep, int32_t offset); /** * Function pointer that returns a UChar32 code point at the given * offset into this text. See unistr.h for a description of * charAt() vs. char32At(). * * @param rep A pointer to "this" UReplaceable object. * @param offset The index at which to fetch the UChar32 (code point). * @return The UChar32 (code point) at offset, or U+FFFF if the offset is out of bounds. * @stable ICU 2.0 */ UChar32 (*char32At)(const UReplaceable* rep, int32_t offset); /** * Function pointer that replaces text between start and limit in * this text with the given text. Attributes (out of band info) * should be retained. * * @param rep A pointer to "this" UReplaceable object. * @param start the starting index of the text to be replaced, * inclusive. * @param limit the ending index of the text to be replaced, * exclusive. * @param text the new text to replace the UChars from * start..limit-1. * @param textLength the number of UChars at text, or -1 if text * is null-terminated. * @stable ICU 2.0 */ void (*replace)(UReplaceable* rep, int32_t start, int32_t limit, const UChar* text, int32_t textLength); /** * Function pointer that copies the characters in the range * [start, limit) into the array dst. * * @param rep A pointer to "this" UReplaceable object. * @param start offset of first character which will be copied * into the array * @param limit offset immediately following the last character to * be copied * @param dst array in which to copy characters. The length of * dst must be at least (limit - start). * @stable ICU 2.1 */ void (*extract)(UReplaceable* rep, int32_t start, int32_t limit, UChar* dst); /** * Function pointer that copies text between start and limit in * this text to another index in the text. Attributes (out of * band info) should be retained. After this call, there will be * (at least) two copies of the characters originally located at * start..limit-1. 
* * @param rep A pointer to "this" UReplaceable object. * @param start the starting index of the text to be copied, * inclusive. * @param limit the ending index of the text to be copied, * exclusive. * @param dest the index at which the copy of the UChars should be * inserted. * @stable ICU 2.0 */ void (*copy)(UReplaceable* rep, int32_t start, int32_t limit, int32_t dest); } UReplaceableCallbacks; U_CDECL_END #endif // uobject.h /* ****************************************************************************** * * Copyright (C) 2002-2012, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: uobject.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2002jun26 * created by: Markus W. Scherer */ #ifndef __UOBJECT_H__ #define __UOBJECT_H__ /** * \file * \brief C++ API: Common ICU base class UObject. */ /** * @{ * \def U_NO_THROW * Define this to define the throw() specification so * certain functions do not throw any exceptions * * UMemory operator new methods should have the throw() specification * appended to them, so that the compiler adds the additional NULL check * before calling constructors. Without, if operator new returns NULL the * constructor is still called, and if the constructor references member * data, (which it typically does), the result is a segmentation violation. * * @stable ICU 4.2 */ #ifndef U_NO_THROW #define U_NO_THROW throw() #endif /** @} */ /*===========================================================================*/ /* UClassID-based RTTI */ /*===========================================================================*/ /** * UClassID is used to identify classes without using the compiler's RTTI. * This was used before C++ compilers consistently supported RTTI. * ICU 4.6 requires compiler RTTI to be turned on. 
* * Each class hierarchy which needs * to implement polymorphic clone() or operator==() defines two methods, * described in detail below. UClassID values can be compared using * operator==(). Nothing else should be done with them. * * \par * In class hierarchies that implement "poor man's RTTI", * each concrete subclass implements getDynamicClassID() in the same way: * * \code * class Derived { * public: * virtual UClassID getDynamicClassID() const * { return Derived::getStaticClassID(); } * }; * \endcode * * Each concrete class implements getStaticClassID() as well, which allows * clients to test for a specific type. * * \code * class Derived { * public: * static UClassID U_EXPORT2 getStaticClassID(); * private: * static char fgClassID; * }; * * // In Derived.cpp: * UClassID Derived::getStaticClassID() * { return (UClassID)&Derived::fgClassID; } * char Derived::fgClassID = 0; // Value is irrelevant * \endcode * @stable ICU 2.0 */ typedef void* UClassID; #endif // umisc.h /* ********************************************************************** * Copyright (C) 1999-2006, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: umisc.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 1999oct15 * created by: Markus W. Scherer */ #ifndef UMISC_H #define UMISC_H /** * \file * \brief C API: misc definitions * * This file contains miscellaneous definitions for the C APIs. 
*/ U_CDECL_BEGIN /** A struct representing a range of text containing a specific field * @stable ICU 2.0 */ typedef struct UFieldPosition { /** * The field * @stable ICU 2.0 */ int32_t field; /** * The start of the text range containing field * @stable ICU 2.0 */ int32_t beginIndex; /** * The limit of the text range containing field * @stable ICU 2.0 */ int32_t endIndex; } UFieldPosition; #if !UCONFIG_NO_SERVICE /** * Opaque type returned by registerInstance, registerFactory and unregister for service registration. * @stable ICU 2.6 */ typedef const void* URegistryKey; #endif U_CDECL_END #endif // ulistformatter.h /* ***************************************************************************************** * Copyright (C) 2015-2016, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ #ifndef ULISTFORMATTER_H #define ULISTFORMATTER_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Format a list in a locale-appropriate way. * * A UListFormatter is used to format a list of items in a locale-appropriate way, * using data from CLDR. * Example: Input data ["Alice", "Bob", "Charlie", "Delta"] will be formatted * as "Alice, Bob, Charlie, and Delta" in English. */ /** * Opaque UListFormatter object for use in C * @stable ICU 55 */ struct UListFormatter; typedef struct UListFormatter UListFormatter; /**< C typedef for struct UListFormatter. @stable ICU 55 */ /** * Open a new UListFormatter object using the rules for a given locale. * @param locale * The locale whose rules should be used; may be NULL for * default locale. * @param status * A pointer to a standard ICU UErrorCode (input/output parameter). * Its input value must pass the U_SUCCESS() test, or else the * function returns immediately. The caller should check its output * value with U_FAILURE(), or use with function chaining (see User * Guide for details). 
* @return * A pointer to a UListFormatter object for the specified locale, * or NULL if an error occurred. * @stable ICU 55 */ U_STABLE UListFormatter* U_EXPORT2 ulistfmt_open(const char* locale, UErrorCode* status); /** * Close a UListFormatter object. Once closed it may no longer be used. * @param listfmt * The UListFormatter object to close. * @stable ICU 55 */ U_STABLE void U_EXPORT2 ulistfmt_close(UListFormatter *listfmt); /** * Formats a list of strings using the conventions established for the * UListFormatter object. * @param listfmt * The UListFormatter object specifying the list conventions. * @param strings * An array of pointers to UChar strings; the array length is * specified by stringCount. Must be non-NULL if stringCount > 0. * @param stringLengths * An array of string lengths corresponding to the strings[] * parameter; any individual length value may be negative to indicate * that the corresponding strings[] entry is 0-terminated, or * stringLengths itself may be NULL if all of the strings are * 0-terminated. If non-NULL, the stringLengths array must have * stringCount entries. * @param stringCount * the number of entries in strings[], and the number of entries * in the stringLengths array if it is not NULL. Must be >= 0. * @param result * A pointer to a buffer to receive the formatted list. * @param resultCapacity * The maximum size of result. * @param status * A pointer to a standard ICU UErrorCode (input/output parameter). * Its input value must pass the U_SUCCESS() test, or else the * function returns immediately. The caller should check its output * value with U_FAILURE(), or use with function chaining (see User * Guide for details). * @return * The total buffer size needed; if greater than resultCapacity, the * output was truncated. May be <=0 if unable to determine the * total buffer size needed (e.g. for illegal arguments).
* @stable ICU 55 */ U_STABLE int32_t U_EXPORT2 ulistfmt_format(const UListFormatter* listfmt, const UChar* const strings[], const int32_t * stringLengths, int32_t stringCount, UChar* result, int32_t resultCapacity, UErrorCode* status); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // uiter.h /* ******************************************************************************* * * Copyright (C) 2002-2011 International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uiter.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2002jan18 * created by: Markus W. Scherer */ #ifndef __UITER_H__ #define __UITER_H__ /** * \file * \brief C API: Unicode Character Iteration * * @see UCharIterator */ U_CDECL_BEGIN struct UCharIterator; typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */ /** * Origin constants for UCharIterator.getIndex() and UCharIterator.move(). * @see UCharIteratorMove * @see UCharIterator * @stable ICU 2.1 */ typedef enum UCharIteratorOrigin { UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH } UCharIteratorOrigin; /** Constants for UCharIterator. @stable ICU 2.6 */ enum { /** * Constant value that may be returned by UCharIteratorMove * indicating that the final UTF-16 index is not known, but that the move succeeded. * This can occur when moving relative to limit or length, or * when moving relative to the current index after a setState() * when the current UTF-16 index is not known. * * It would be very inefficient to have to count from the beginning of the text * just to get the current/limit/length index after moving relative to it. * The actual index can be determined with getIndex(UITER_CURRENT) * which will count the UChars if necessary.
* * @stable ICU 2.6 */ UITER_UNKNOWN_INDEX=-2 }; /** * Constant for UCharIterator getState() indicating an error or * an unknown state. * Returned by uiter_getState()/UCharIteratorGetState * when an error occurs. * Also, some UCharIterator implementations may not be able to return * a valid state for each position. This will be clearly documented * for each such iterator (none of the public ones here). * * @stable ICU 2.6 */ #define UITER_NO_STATE ((uint32_t)0xffffffff) /** * Function type declaration for UCharIterator.getIndex(). * * Gets the current position, or the start or limit of the * iteration range. * * This function may perform slowly for UITER_CURRENT after setState() was called, * or for UITER_LENGTH, because an iterator implementation may have to count * UChars if the underlying storage is not UTF-16. * * @param iter the UCharIterator structure ("this pointer") * @param origin get the 0, start, limit, length, or current index * @return the requested index, or U_SENTINEL in an error condition * * @see UCharIteratorOrigin * @see UCharIterator * @stable ICU 2.1 */ typedef int32_t U_CALLCONV UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin); /** * Function type declaration for UCharIterator.move(). * * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index). * * Moves the current position relative to the start or limit of the * iteration range, or relative to the current position itself. * The movement is expressed in numbers of code units forward * or backward by specifying a positive or negative delta. * Out of bounds movement will be pinned to the start or limit. * * This function may perform slowly for moving relative to UITER_LENGTH * because an iterator implementation may have to count the rest of the * UChars if the native storage is not UTF-16. 
* * When moving relative to the limit or length, or * relative to the current position after setState() was called, * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient * determination of the actual UTF-16 index. * The actual index can be determined with getIndex(UITER_CURRENT) * which will count the UChars if necessary. * See UITER_UNKNOWN_INDEX for details. * * @param iter the UCharIterator structure ("this pointer") * @param delta can be positive, zero, or negative * @param origin move relative to the 0, start, limit, length, or current index * @return the new index, or U_SENTINEL on an error condition, * or UITER_UNKNOWN_INDEX when the index is not known. * * @see UCharIteratorOrigin * @see UCharIterator * @see UITER_UNKNOWN_INDEX * @stable ICU 2.1 */ typedef int32_t U_CALLCONV UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin); /** * Function type declaration for UCharIterator.hasNext(). * * Check if current() and next() can still * return another code unit. * * @param iter the UCharIterator structure ("this pointer") * @return boolean value for whether current() and next() can still return another code unit * * @see UCharIterator * @stable ICU 2.1 */ typedef UBool U_CALLCONV UCharIteratorHasNext(UCharIterator *iter); /** * Function type declaration for UCharIterator.hasPrevious(). * * Check if previous() can still return another code unit. * * @param iter the UCharIterator structure ("this pointer") * @return boolean value for whether previous() can still return another code unit * * @see UCharIterator * @stable ICU 2.1 */ typedef UBool U_CALLCONV UCharIteratorHasPrevious(UCharIterator *iter); /** * Function type declaration for UCharIterator.current(). * * Return the code unit at the current position, * or U_SENTINEL if there is none (index is at the limit). 
* * @param iter the UCharIterator structure ("this pointer") * @return the current code unit * * @see UCharIterator * @stable ICU 2.1 */ typedef UChar32 U_CALLCONV UCharIteratorCurrent(UCharIterator *iter); /** * Function type declaration for UCharIterator.next(). * * Return the code unit at the current index and increment * the index (post-increment, like s[i++]), * or return U_SENTINEL if there is none (index is at the limit). * * @param iter the UCharIterator structure ("this pointer") * @return the current code unit (and post-increment the current index) * * @see UCharIterator * @stable ICU 2.1 */ typedef UChar32 U_CALLCONV UCharIteratorNext(UCharIterator *iter); /** * Function type declaration for UCharIterator.previous(). * * Decrement the index and return the code unit from there * (pre-decrement, like s[--i]), * or return U_SENTINEL if there is none (index is at the start). * * @param iter the UCharIterator structure ("this pointer") * @return the previous code unit (after pre-decrementing the current index) * * @see UCharIterator * @stable ICU 2.1 */ typedef UChar32 U_CALLCONV UCharIteratorPrevious(UCharIterator *iter); /** * Function type declaration for UCharIterator.reservedFn(). * Reserved for future use. * * @param iter the UCharIterator structure ("this pointer") * @param something some integer argument * @return some integer * * @see UCharIterator * @stable ICU 2.1 */ typedef int32_t U_CALLCONV UCharIteratorReserved(UCharIterator *iter, int32_t something); /** * Function type declaration for UCharIterator.getState(). * * Get the "state" of the iterator in the form of a single 32-bit word. * It is recommended that the state value be calculated to be as small as * is feasible. For strings with limited lengths, fewer than 32 bits may * be sufficient. * * This is used together with setState()/UCharIteratorSetState * to save and restore the iterator position more efficiently than with * getIndex()/move(). 
* * The iterator state is defined as a uint32_t value because it is designed * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state * of the character iterator. * * With some UCharIterator implementations (e.g., UTF-8), * getting and setting the UTF-16 index with existing functions * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but * relatively slow because the iterator has to "walk" from a known index * to the requested one. * This takes more time the farther it needs to go. * * An opaque state value allows an iterator implementation to provide * an internal index (UTF-8: the source byte array index) for * fast, constant-time restoration. * * After calling setState(), a getIndex(UITER_CURRENT) may be slow because * the UTF-16 index may not be restored as well, but the iterator can deliver * the correct text contents and move relative to the current position * without performance degradation. * * Some UCharIterator implementations may not be able to return * a valid state for each position, in which case they return UITER_NO_STATE instead. * This will be clearly documented for each such iterator (none of the public ones here). * * @param iter the UCharIterator structure ("this pointer") * @return the state word * * @see UCharIterator * @see UCharIteratorSetState * @see UITER_NO_STATE * @stable ICU 2.6 */ typedef uint32_t U_CALLCONV UCharIteratorGetState(const UCharIterator *iter); /** * Function type declaration for UCharIterator.setState(). * * Restore the "state" of the iterator using a state word from a getState() call. * The iterator object need not be the same one as for which getState() was called, * but it must be of the same type (set up using the same uiter_setXYZ function) * and it must iterate over the same string * (binary identical regardless of memory address). * For more about the state word see UCharIteratorGetState. 
* * After calling setState(), a getIndex(UITER_CURRENT) may be slow because * the UTF-16 index may not be restored as well, but the iterator can deliver * the correct text contents and move relative to the current position * without performance degradation. * * @param iter the UCharIterator structure ("this pointer") * @param state the state word from a getState() call * on a same-type, same-string iterator * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see UCharIterator * @see UCharIteratorGetState * @stable ICU 2.6 */ typedef void U_CALLCONV UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); /** * C API for code unit iteration. * This can be used as a C wrapper around * CharacterIterator, Replaceable, or implemented using simple strings, etc. * * There are two roles for using UCharIterator: * * A "provider" sets the necessary function pointers and controls the "protected" * fields of the UCharIterator structure. A "provider" passes a UCharIterator * into C APIs that need a UCharIterator as an abstract, flexible string interface. * * Implementations of such C APIs are "callers" of UCharIterator functions; * they only use the "public" function pointers and never access the "protected" * fields directly. * * The current() and next() functions only check the current index against the * limit, and previous() only checks the current index against the start, * to see if the iterator already reached the end of the iteration range. * * The assumption - in all iterators - is that the index is moved via the API, * which means it won't go out of bounds, or the index is modified by * user code that knows enough about the iterator implementation to set valid * index values. * * UCharIterator functions return code unit values 0..0xffff, * or U_SENTINEL if the iteration bounds are reached. 
* * @stable ICU 2.1 */ struct UCharIterator { /** * (protected) Pointer to string or wrapped object or similar. * Not used by caller. * @stable ICU 2.1 */ const void *context; /** * (protected) Length of string or similar. * Not used by caller. * @stable ICU 2.1 */ int32_t length; /** * (protected) Start index or similar. * Not used by caller. * @stable ICU 2.1 */ int32_t start; /** * (protected) Current index or similar. * Not used by caller. * @stable ICU 2.1 */ int32_t index; /** * (protected) Limit index or similar. * Not used by caller. * @stable ICU 2.1 */ int32_t limit; /** * (protected) Used by UTF-8 iterators and possibly others. * @stable ICU 2.1 */ int32_t reservedField; /** * (public) Returns the current position or the * start or limit index of the iteration range. * * @see UCharIteratorGetIndex * @stable ICU 2.1 */ UCharIteratorGetIndex *getIndex; /** * (public) Moves the current position relative to the start or limit of the * iteration range, or relative to the current position itself. * The movement is expressed in numbers of code units forward * or backward by specifying a positive or negative delta. * * @see UCharIteratorMove * @stable ICU 2.1 */ UCharIteratorMove *move; /** * (public) Check if current() and next() can still * return another code unit. * * @see UCharIteratorHasNext * @stable ICU 2.1 */ UCharIteratorHasNext *hasNext; /** * (public) Check if previous() can still return another code unit. * * @see UCharIteratorHasPrevious * @stable ICU 2.1 */ UCharIteratorHasPrevious *hasPrevious; /** * (public) Return the code unit at the current position, * or U_SENTINEL if there is none (index is at the limit). * * @see UCharIteratorCurrent * @stable ICU 2.1 */ UCharIteratorCurrent *current; /** * (public) Return the code unit at the current index and increment * the index (post-increment, like s[i++]), * or return U_SENTINEL if there is none (index is at the limit). 
* * @see UCharIteratorNext * @stable ICU 2.1 */ UCharIteratorNext *next; /** * (public) Decrement the index and return the code unit from there * (pre-decrement, like s[--i]), * or return U_SENTINEL if there is none (index is at the start). * * @see UCharIteratorPrevious * @stable ICU 2.1 */ UCharIteratorPrevious *previous; /** * (public) Reserved for future use. Currently NULL. * * @see UCharIteratorReserved * @stable ICU 2.1 */ UCharIteratorReserved *reservedFn; /** * (public) Return the state of the iterator, to be restored later with setState(). * This function pointer is NULL if the iterator does not implement it. * * @see UCharIteratorGetState * @stable ICU 2.6 */ UCharIteratorGetState *getState; /** * (public) Restore the iterator state from the state word from a call * to getState(). * This function pointer is NULL if the iterator does not implement it. * * @see UCharIteratorSetState * @stable ICU 2.6 */ UCharIteratorSetState *setState; }; /** * Helper function for UCharIterator to get the code point * at the current index. * * Return the code point that includes the code unit at the current position, * or U_SENTINEL if there is none (index is at the limit). * If the current code unit is a lead or trail surrogate, * then the following or preceding surrogate is used to form * the code point value. * * @param iter the UCharIterator structure ("this pointer") * @return the current code point * * @see UCharIterator * @see U16_GET * @see UnicodeString::char32At() * @stable ICU 2.1 */ U_STABLE UChar32 U_EXPORT2 uiter_current32(UCharIterator *iter); /** * Helper function for UCharIterator to get the next code point. * * Return the code point at the current index and increment * the index (post-increment, like s[i++]), * or return U_SENTINEL if there is none (index is at the limit).
* * @param iter the UCharIterator structure ("this pointer") * @return the current code point (and post-increment the current index) * * @see UCharIterator * @see U16_NEXT * @stable ICU 2.1 */ U_STABLE UChar32 U_EXPORT2 uiter_next32(UCharIterator *iter); /** * Helper function for UCharIterator to get the previous code point. * * Decrement the index and return the code point from there * (pre-decrement, like s[--i]), * or return U_SENTINEL if there is none (index is at the start). * * @param iter the UCharIterator structure ("this pointer") * @return the previous code point (after pre-decrementing the current index) * * @see UCharIterator * @see U16_PREV * @stable ICU 2.1 */ U_STABLE UChar32 U_EXPORT2 uiter_previous32(UCharIterator *iter); /** * Get the "state" of the iterator in the form of a single 32-bit word. * This is a convenience function that calls iter->getState(iter) * if iter->getState is not NULL; * if it is NULL or any other error occurs, then UITER_NO_STATE is returned. * * Some UCharIterator implementations may not be able to return * a valid state for each position, in which case they return UITER_NO_STATE instead. * This will be clearly documented for each such iterator (none of the public ones here). * * @param iter the UCharIterator structure ("this pointer") * @return the state word * * @see UCharIterator * @see UCharIteratorGetState * @see UITER_NO_STATE * @stable ICU 2.6 */ U_STABLE uint32_t U_EXPORT2 uiter_getState(const UCharIterator *iter); /** * Restore the "state" of the iterator using a state word from a getState() call. * This is a convenience function that calls iter->setState(iter, state, pErrorCode) * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set. 
* * @param iter the UCharIterator structure ("this pointer") * @param state the state word from a getState() call * on a same-type, same-string iterator * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see UCharIterator * @see UCharIteratorSetState * @stable ICU 2.6 */ U_STABLE void U_EXPORT2 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); /** * Set up a UCharIterator to iterate over a string. * * Sets the UCharIterator function pointers for iteration over the string s * with iteration boundaries start=index=0 and length=limit=string length. * The "provider" may set the start, index, and limit values at any time * within the range 0..length. * The length field will be ignored. * * The string pointer s is set into UCharIterator.context without copying * or reallocating the string contents. * * getState() simply returns the current index. * move() will always return the final index. * * @param iter UCharIterator structure to be set for iteration * @param s String to iterate over * @param length Length of s, or -1 if NUL-terminated * * @see UCharIterator * @stable ICU 2.1 */ U_STABLE void U_EXPORT2 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length); /** * Set up a UCharIterator to iterate over a UTF-16BE string * (byte vector with a big-endian pair of bytes per UChar). * * Everything works just like with a normal UChar iterator (uiter_setString), * except that UChars are assembled from byte pairs, * and that the length argument here indicates an even number of bytes. * * getState() simply returns the current index. * move() will always return the final index. 
* * @param iter UCharIterator structure to be set for iteration * @param s UTF-16BE string to iterate over * @param length Length of s as an even number of bytes, or -1 if NUL-terminated * (NUL means pair of 0 bytes at even index from s) * * @see UCharIterator * @see uiter_setString * @stable ICU 2.6 */ U_STABLE void U_EXPORT2 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length); /** * Set up a UCharIterator to iterate over a UTF-8 string. * * Sets the UCharIterator function pointers for iteration over the UTF-8 string s * with UTF-8 iteration boundaries 0 and length. * The implementation counts the UTF-16 index on the fly and * lazily evaluates the UTF-16 length of the text. * * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length. * When the reservedField is not 0, then it contains a supplementary code point * and the UTF-16 index is between the two corresponding surrogates. * At that point, the UTF-8 index is behind that code point. * * The UTF-8 string pointer s is set into UCharIterator.context without copying * or reallocating the string contents. * * getState() returns a state value consisting of * - the current UTF-8 source byte index (bits 31..1) * - a flag (bit 0) that indicates whether the UChar position is in the middle * of a surrogate pair * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point) * * getState() cannot also encode the UTF-16 index in the state value. * move(relative to limit or length), or * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX. 
* * @param iter UCharIterator structure to be set for iteration * @param s UTF-8 string to iterate over * @param length Length of s in bytes, or -1 if NUL-terminated * * @see UCharIterator * @stable ICU 2.6 */ U_STABLE void U_EXPORT2 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length); U_CDECL_END #endif // uenum.h /* ******************************************************************************* * * Copyright (C) 2002-2013, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uenum.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:2 * * created on: 2002jul08 * created by: Vladimir Weinstein */ #ifndef __UENUM_H #define __UENUM_H /** * \file * \brief C API: String Enumeration */ /** * An enumeration object. * For usage in C programs. * @stable ICU 2.2 */ struct UEnumeration; /** structure representing an enumeration object instance @stable ICU 2.2 */ typedef struct UEnumeration UEnumeration; /** * Disposes of resources in use by the iterator. If en is NULL, * does nothing. After this call, any char* or UChar* pointer * returned by uenum_unext() or uenum_next() is invalid. * @param en UEnumeration structure pointer * @stable ICU 2.2 */ U_STABLE void U_EXPORT2 uenum_close(UEnumeration* en); /** * Returns the number of elements that the iterator traverses. If * the iterator is out-of-sync with its service, status is set to * U_ENUM_OUT_OF_SYNC_ERROR. * This is a convenience function. It can end up being very * expensive as all the items might have to be pre-fetched (depending * on the type of data being traversed). Use with caution and only * when necessary. * @param en UEnumeration structure pointer * @param status error code, can be U_ENUM_OUT_OF_SYNC_ERROR if the * iterator is out of sync. 
* @return number of elements in the iterator * @stable ICU 2.2 */ U_STABLE int32_t U_EXPORT2 uenum_count(UEnumeration* en, UErrorCode* status); /** * Returns the next element in the iterator's list. If there are * no more elements, returns NULL. If the iterator is out-of-sync * with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and * NULL is returned. If the native service string is a char* string, * it is converted to UChar* with the invariant converter. * The result is terminated by (UChar)0. * @param en the iterator object * @param resultLength pointer to receive the length of the result * (not including the terminating \\0). * If the pointer is NULL it is ignored. * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if * the iterator is out of sync with its service. * @return a pointer to the string. The string will be * zero-terminated. The return pointer is owned by this iterator * and must not be deleted by the caller. The pointer is valid * until the next call to any uenum_... method, including * uenum_next() or uenum_unext(). When all strings have been * traversed, returns NULL. * @stable ICU 2.2 */ U_STABLE const UChar* U_EXPORT2 uenum_unext(UEnumeration* en, int32_t* resultLength, UErrorCode* status); /** * Returns the next element in the iterator's list. If there are * no more elements, returns NULL. If the iterator is out-of-sync * with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and * NULL is returned. If the native service string is a UChar* * string, it is converted to char* with the invariant converter. * The result is terminated by (char)0. If the conversion fails * (because a character cannot be converted) then status is set to * U_INVARIANT_CONVERSION_ERROR and the return value is undefined * (but non-NULL). * @param en the iterator object * @param resultLength pointer to receive the length of the result * (not including the terminating \\0). * If the pointer is NULL it is ignored. 
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if * the iterator is out of sync with its service. Set to * U_INVARIANT_CONVERSION_ERROR if the underlying native string is * UChar* and conversion to char* with the invariant converter * fails. This error pertains only to current string, so iteration * might be able to continue successfully. * @return a pointer to the string. The string will be * zero-terminated. The return pointer is owned by this iterator * and must not be deleted by the caller. The pointer is valid * until the next call to any uenum_... method, including * uenum_next() or uenum_unext(). When all strings have been * traversed, returns NULL. * @stable ICU 2.2 */ U_STABLE const char* U_EXPORT2 uenum_next(UEnumeration* en, int32_t* resultLength, UErrorCode* status); /** * Resets the iterator to the current list of service IDs. This * re-establishes sync with the service and rewinds the iterator * to start at the first element. * @param en the iterator object * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if * the iterator is out of sync with its service. * @stable ICU 2.2 */ U_STABLE void U_EXPORT2 uenum_reset(UEnumeration* en, UErrorCode* status); /** * Given an array of const UChar* strings, return a UEnumeration. String pointers from 0..count-1 must not be null. * Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close. * \snippet test/cintltst/uenumtst.c uenum_openUCharStringsEnumeration * @param strings array of const UChar* strings (each null terminated). All storage is owned by the caller. * @param count length of the array * @param ec error code * @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory. 
* @see uenum_close * @stable ICU 50 */ U_STABLE UEnumeration* U_EXPORT2 uenum_openUCharStringsEnumeration(const UChar* const strings[], int32_t count, UErrorCode* ec); /* Note: next function is not hidden as draft, as it is used internally (it was formerly an internal function). */ /** * Given an array of const char* strings (invariant chars only), return a UEnumeration. String pointers from 0..count-1 must not be null. * Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close. * \snippet test/cintltst/uenumtst.c uenum_openCharStringsEnumeration * @param strings array of char* strings (each null terminated). All storage is owned by the caller. * @param count length of the array * @param ec error code * @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory * @see uenum_close * @stable ICU 50 */ U_STABLE UEnumeration* U_EXPORT2 uenum_openCharStringsEnumeration(const char* const strings[], int32_t count, UErrorCode* ec); #endif // uloc.h /* ********************************************************************** * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File ULOC.H * * Modification History: * * Date Name Description * 04/01/97 aliu Creation. * 08/22/98 stephen JDK 1.2 sync. * 12/08/98 rtg New C API for Locale * 03/30/99 damiba overhaul * 03/31/99 helena Javadoc for uloc functions. * 04/15/99 Madhu Updated Javadoc ******************************************************************************** */ #ifndef ULOC_H #define ULOC_H /** * \file * \brief C API: Locale * *

ULoc C API for Locale

* A Locale represents a specific geographical, political, * or cultural region. An operation that requires a Locale to perform * its task is called locale-sensitive and uses the Locale * to tailor information for the user. For example, displaying a number * is a locale-sensitive operation--the number should be formatted * according to the customs/conventions of the user's native country, * region, or culture. In the C APIs, a locale is simply a const char string. * *

* You create a Locale with one of the three options listed below. * Each of the components is separated by '_' in the locale string. * \htmlonly

\endhtmlonly *
 * \code
 *       newLanguage
 * 
 *       newLanguage + newCountry
 * 
 *       newLanguage + newCountry + newVariant
 * \endcode
 * 
* \htmlonly
\endhtmlonly * The first option is a valid ISO * Language Code. These codes are the lower-case two-letter * codes as defined by ISO-639. * You can find a full list of these codes at a number of sites, such as: *
* http://www.ics.uci.edu/pub/ietf/http/related/iso639.txt * *

* The second option includes an additional ISO Country * Code. These codes are the upper-case two-letter codes * as defined by ISO-3166. * You can find a full list of these codes at a number of sites, such as: *
* http://www.chemie.fu-berlin.de/diverse/doc/ISO_3166.html * *

* The third option requires one additional piece of information--the * Variant. * The Variant codes are vendor and browser-specific. * For example, use WIN for Windows, MAC for Macintosh, and POSIX for POSIX. * Where there are two variants, separate them with an underscore, and * put the most important one first. For * example, a Traditional Spanish collation might be referenced, with * "ES", "ES", "Traditional_WIN". * *

* Because a Locale is just an identifier for a region, * no validity check is performed when you specify a Locale. * If you want to see whether particular resources are available for the * Locale you asked for, you must query those resources. For * example, ask the UNumberFormat for the locales it supports * using its getAvailable method. *
Note: When you ask for a resource for a particular * locale, you get back the best available match, not necessarily * precisely what you asked for. For more information, look at * UResourceBundle. * *

* The Locale provides a number of convenient constants * that you can use to specify the commonly used * locales. For example, the following refers to a locale * for the United States: * \htmlonly

\endhtmlonly *
 * \code
 *       ULOC_US
 * \endcode
 * 
* \htmlonly
\endhtmlonly * *

* Once you've specified a locale you can query it for information about * itself. Use uloc_getCountry to get the ISO Country Code and * uloc_getLanguage to get the ISO Language Code. You can * use uloc_getDisplayCountry to get the * name of the country suitable for displaying to the user. Similarly, * you can use uloc_getDisplayLanguage to get the name of * the language suitable for displaying to the user. Interestingly, * the uloc_getDisplayXXX methods are themselves locale-sensitive * and have two versions: one that uses the default locale and one * that takes a locale as an argument and displays the name or country in * a language appropriate to that locale. * *

* The ICU provides a number of services that perform locale-sensitive * operations. For example, the unum_xxx functions format * numbers, currency, or percentages in a locale-sensitive manner. *

* \htmlonly
\endhtmlonly *
 * \code
 *     UErrorCode success = U_ZERO_ERROR;
 *     UNumberFormat *nf;
 *     const char* myLocale = "fr_FR";
 * 
 *     nf = unum_open( UNUM_DEFAULT, NULL, 0, NULL, NULL, &success );
 *     unum_close(nf);
 *     nf = unum_open( UNUM_CURRENCY, NULL, 0, NULL, NULL, &success );
 *     unum_close(nf);
 *     nf = unum_open( UNUM_PERCENT, NULL, 0, NULL, NULL, &success );
 *     unum_close(nf);
 * \endcode
 * 
* \htmlonly
\endhtmlonly * Each of these methods has two variants; one with an explicit locale * and one without; the latter using the default locale. * \htmlonly
\endhtmlonly *
 * \code 
 * 
 *     nf = unum_open( UNUM_DEFAULT, NULL, 0, myLocale, NULL, &success );
 *     unum_close(nf);
 *     nf = unum_open( UNUM_CURRENCY, NULL, 0, myLocale, NULL, &success );
 *     unum_close(nf);
 *     nf = unum_open( UNUM_PERCENT, NULL, 0, myLocale, NULL, &success );
 *     unum_close(nf);
 * \endcode
 * 
* \htmlonly
\endhtmlonly * A Locale is the mechanism for identifying the kind of services * (UNumberFormat) that you would like to get. The locale is * just a mechanism for identifying these services. * *

* Each international service that performs locale-sensitive operations * allows you * to get all the available objects of that type. You can sift * through these objects by language, country, or variant, * and use the display names to present a menu to the user. * For example, you can create a menu of all the collation objects * suitable for a given language. Such classes implement these * three class methods: * \htmlonly

\endhtmlonly *
 * \code
 *       const char* uloc_getAvailable(int32_t index);
 *       int32_t uloc_countAvailable();
 *       int32_t
 *       uloc_getDisplayName(const char* localeID,
 *                 const char* inLocaleID, 
 *                 UChar* result,
 *                 int32_t maxResultSize,
 *                  UErrorCode* err);
 * 
 * \endcode
 * 
* \htmlonly
\endhtmlonly *

* Concerning POSIX/RFC1766 Locale IDs, * the getLanguage/getCountry/getVariant/getName functions do understand * the POSIX type form of language_COUNTRY.ENCODING\@VARIANT * and if there is not an ICU-stype variant, uloc_getVariant() for example * will return the one listed after the \@at sign. As well, the hyphen * "-" is recognized as a country/variant separator similarly to RFC1766. * So for example, "en-us" will be interpreted as en_US. * As a result, uloc_getName() is far from a no-op, and will have the * effect of converting POSIX/RFC1766 IDs into ICU form, although it does * NOT map any of the actual codes (i.e. russian->ru) in any way. * Applications should call uloc_getName() at the point where a locale ID * is coming from an external source (user entry, OS, web browser) * and pass the resulting string to other ICU functions. For example, * don't use de-de\@EURO as an argument to resourcebundle. * * @see UResourceBundle */ /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_CHINESE "zh" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_ENGLISH "en" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_FRENCH "fr" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_GERMAN "de" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_ITALIAN "it" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_JAPANESE "ja" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_KOREAN "ko" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_SIMPLIFIED_CHINESE "zh_CN" /** Useful constant for this language. @stable ICU 2.0 */ #define ULOC_TRADITIONAL_CHINESE "zh_TW" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_CANADA "en_CA" /** Useful constant for this country/region. @stable ICU 2.0 */ #define ULOC_CANADA_FRENCH "fr_CA" /** Useful constant for this country/region. 
@stable ICU 2.0 */
#define ULOC_CHINA "zh_CN"
/** Locale ID for the People's Republic of China (Chinese); expands to "zh_CN". @stable ICU 2.0 */
#define ULOC_PRC "zh_CN"
/** Locale ID for France (French). @stable ICU 2.0 */
#define ULOC_FRANCE "fr_FR"
/** Locale ID for Germany (German). @stable ICU 2.0 */
#define ULOC_GERMANY "de_DE"
/** Locale ID for Italy (Italian). @stable ICU 2.0 */
#define ULOC_ITALY "it_IT"
/** Locale ID for Japan (Japanese). @stable ICU 2.0 */
#define ULOC_JAPAN "ja_JP"
/** Locale ID for Korea (Korean). @stable ICU 2.0 */
#define ULOC_KOREA "ko_KR"
/** Locale ID for Taiwan (Chinese). @stable ICU 2.0 */
#define ULOC_TAIWAN "zh_TW"
/** Locale ID for the United Kingdom (English). @stable ICU 2.0 */
#define ULOC_UK "en_GB"
/** Locale ID for the United States (English). @stable ICU 2.0 */
#define ULOC_US "en_US"

/**
 * Maximum size of the language part of a locale ID,
 * including the terminating NUL.
 * @stable ICU 2.0
 */
#define ULOC_LANG_CAPACITY 12

/**
 * Maximum size of the country part of a locale ID,
 * including the terminating NUL.
 * @stable ICU 2.0
 */
#define ULOC_COUNTRY_CAPACITY 4

/**
 * Maximum size of the whole locale ID,
 * including the terminating NUL and all keywords.
 * @stable ICU 2.0
 */
#define ULOC_FULLNAME_CAPACITY 157

/**
 * Useful constant for the maximum size of the script part of a locale ID
 * (including the terminating NULL).
* @stable ICU 2.8
 */
#define ULOC_SCRIPT_CAPACITY 6

/**
 * Maximum size of the keywords in a locale ID.
 * @stable ICU 2.8
 */
#define ULOC_KEYWORDS_CAPACITY 96

/**
 * Maximum total size of the keywords and their values in a locale ID.
 * @stable ICU 2.8
 */
#define ULOC_KEYWORD_AND_VALUES_CAPACITY 100

/**
 * Invariant character ('@') that separates the keywords from the rest of
 * the locale string.
 * @stable ICU 2.8
 */
#define ULOC_KEYWORD_SEPARATOR '@'

/**
 * Unicode code point (U+0040) for '@', which separates the keywords from
 * the rest of the locale string.
 * @see ULOC_KEYWORD_SEPARATOR
 * @stable ICU 4.6
 */
#define ULOC_KEYWORD_SEPARATOR_UNICODE 0x40

/**
 * Invariant character ('=') that assigns a value to a keyword.
 * @stable ICU 2.8
 */
#define ULOC_KEYWORD_ASSIGN '='

/**
 * Unicode code point (U+003D) for '=', which assigns a value to a keyword.
 * @see ULOC_KEYWORD_ASSIGN
 * @stable ICU 4.6
 */
#define ULOC_KEYWORD_ASSIGN_UNICODE 0x3D

/**
 * Invariant character (';') that separates one keyword=value item from the next.
 * @stable ICU 2.8
 */
#define ULOC_KEYWORD_ITEM_SEPARATOR ';'

/**
 * Unicode code point (U+003B) for ';', which separates one keyword=value
 * item from the next.
 * @see ULOC_KEYWORD_ITEM_SEPARATOR
 * @stable ICU 4.6
 */
#define ULOC_KEYWORD_ITEM_SEPARATOR_UNICODE 0x3B

/**
 * Constants for *_getLocale()
 * Allow user to select whether she wants information on
 * requested, valid or actual locale.
 * For example, a collator for "en_US_CALIFORNIA" was
 * requested. In the current state of ICU (2.0),
 * the requested locale is "en_US_CALIFORNIA",
 * the valid locale is "en_US" (most specific locale supported by ICU)
 * and the actual locale is "root" (the collation data comes unmodified
 * from the UCA)
 * The locale is considered supported by ICU if there is a core ICU bundle
 * for that locale (although it may be empty).
* @stable ICU 2.1
 */
typedef enum {
    /** This is the locale the data actually comes from.
     * @stable ICU 2.1
     */
    ULOC_ACTUAL_LOCALE = 0,
    /** This is the most specific locale supported by ICU.
     * @stable ICU 2.1
     */
    ULOC_VALID_LOCALE = 1,
    /* NOTE(review): value 2 is not declared in this header, yet the limit
     * stays at 3. Presumably the "requested" locale type described in the
     * enum's comment was removed from this SDK header -- confirm before
     * relying on the gap. */
    ULOC_DATA_LOCALE_TYPE_LIMIT = 3
} ULocDataLocaleType ;

#ifndef U_HIDE_SYSTEM_API
/**
 * Gets ICU's default locale.
 * The returned string is a snapshot in time, and will remain valid
 * and unchanged even when uloc_setDefault() is called.
 * The returned storage is owned by ICU, and must not be altered or deleted
 * by the caller.
 *
 * @return the ICU default locale
 * @system
 * @stable ICU 2.0
 */
U_STABLE const char* U_EXPORT2
uloc_getDefault(void);

/**
 * Sets ICU's default locale.
 * By default (without calling this function), ICU's default locale will be based
 * on information obtained from the underlying system environment.
 *
 * Changes to ICU's default locale do not propagate back to the
 * system environment.
 *
 * Changes to ICU's default locale do not affect any ICU services that
 * may already be open based on the previous default locale value.
 *
 * @param localeID the new ICU default locale. A value of NULL will try to get
 *                 the system's default locale.
 * @param status the error information if the setting of default locale fails
 * @system
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2
uloc_setDefault(const char* localeID,
                UErrorCode* status);
#endif /* U_HIDE_SYSTEM_API */

/**
 * Gets the language code for the specified locale.
 *
 * @param localeID the locale to get the ISO language code with
 * @param language the language code for localeID
 * @param languageCapacity the size of the language buffer to store the
 * language code with
 * @param err error information if retrieving the language code failed
 * @return the actual buffer size needed for the language code. If it's greater
 * than languageCapacity, the returned language code will be truncated.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_getLanguage(const char* localeID,
                 char* language,
                 int32_t languageCapacity,
                 UErrorCode* err);

/**
 * Gets the script code for the specified locale.
 *
 * @param localeID the locale to get the script code with
 * @param script the script code for localeID
 * @param scriptCapacity the size of the script buffer to store the
 * script code with
 * @param err error information if retrieving the script code failed
 * @return the actual buffer size needed for the script code. If it's greater
 * than scriptCapacity, the returned script code will be truncated.
 * @stable ICU 2.8
 */
U_STABLE int32_t U_EXPORT2
uloc_getScript(const char* localeID,
               char* script,
               int32_t scriptCapacity,
               UErrorCode* err);

/**
 * Gets the country code for the specified locale.
*
 * @param localeID the locale to get the country code with
 * @param country the country code for localeID
 * @param countryCapacity the size of the country buffer to store the
 * country code with
 * @param err error information if retrieving the country code failed
 * @return the actual buffer size needed for the country code. If it's greater
 * than countryCapacity, the returned country code will be truncated.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_getCountry(const char* localeID,
                char* country,
                int32_t countryCapacity,
                UErrorCode* err);

/**
 * Gets the variant code for the specified locale.
 *
 * @param localeID the locale to get the variant code with
 * @param variant the variant code for localeID
 * @param variantCapacity the size of the variant buffer to store the
 * variant code with
 * @param err error information if retrieving the variant code failed
 * @return the actual buffer size needed for the variant code. If it's greater
 * than variantCapacity, the returned variant code will be truncated.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_getVariant(const char* localeID,
                char* variant,
                int32_t variantCapacity,
                UErrorCode* err);

/**
 * Gets the full name for the specified locale.
 * Note: This has the effect of 'canonicalizing' the ICU locale ID to
 * a certain extent. Upper and lower case are set as needed.
 * It does NOT map aliased names in any way.
 * See the top of this header file.
 * This API supports preflighting.
 *
 * @param localeID the locale to get the full name with
 * @param name fill in buffer for the full name. Unlike uloc_getBaseName(),
 * which is documented as "like uloc_getName(), but without keywords",
 * the keywords are not stripped here.
 * @param nameCapacity capacity of the fill in buffer.
 * @param err error information if retrieving the full name failed
 * @return the actual buffer size needed for the full name. If it's greater
 * than nameCapacity, the returned full name will be truncated.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_getName(const char* localeID,
             char* name,
             int32_t nameCapacity,
             UErrorCode* err);

/**
 * Gets the full name for the specified locale.
 * Note: This has the effect of 'canonicalizing' the string to
 * a certain extent. Upper and lower case are set as needed,
 * and if the components were in 'POSIX' format they are changed to
 * ICU format. It does NOT map aliased names in any way.
 * See the top of this header file.
 *
 * @param localeID the locale to get the full name with
 * @param name the full name for localeID
 * @param nameCapacity the size of the name buffer to store the
 * full name with
 * @param err error information if retrieving the full name failed
 * @return the actual buffer size needed for the full name. If it's greater
 * than nameCapacity, the returned full name will be truncated.
 * @stable ICU 2.8
 */
U_STABLE int32_t U_EXPORT2
uloc_canonicalize(const char* localeID,
                  char* name,
                  int32_t nameCapacity,
                  UErrorCode* err);

/**
 * Gets the ISO language code for the specified locale.
 *
 * @param localeID the locale to get the ISO language code with
 * @return the ISO language code for localeID
 * @stable ICU 2.0
 */
U_STABLE const char* U_EXPORT2
uloc_getISO3Language(const char* localeID);

/**
 * Gets the ISO country code for the specified locale.
 *
 * @param localeID the locale to get the ISO country code with
 * @return the ISO country code for localeID
 * @stable ICU 2.0
 */
U_STABLE const char* U_EXPORT2
uloc_getISO3Country(const char* localeID);

/**
 * Gets the Win32 LCID value for the specified locale.
 * If the ICU locale is not recognized by Windows, 0 will be returned.
 *
 * LCIDs were deprecated with Windows Vista and Microsoft recommends
 * developers to use BCP47 style tags instead (uloc_toLanguageTag.)
*
 * @param localeID the locale to get the Win32 LCID value with
 * @return country the Win32 LCID for localeID
 * @stable ICU 2.0
 */
U_STABLE uint32_t U_EXPORT2
uloc_getLCID(const char* localeID);

/**
 * Gets the language name suitable for display for the specified locale.
 *
 * @param locale the locale whose language name is to be displayed
 * @param displayLocale Specifies the locale to be used to display the name. In other words,
 * if the locale's language code is "en", passing Locale::getFrench() for
 * displayLocale would result in "Anglais", while passing Locale::getGerman()
 * for displayLocale would result in "Englisch".
 * @param language the displayable language name for locale
 * @param languageCapacity the size of the language buffer to store the
 * displayable language name with
 * @param status error information if retrieving the displayable language name failed
 * @return the actual buffer size needed for the displayable language name. If it's greater
 * than languageCapacity, the returned language name will be truncated.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_getDisplayLanguage(const char* locale,
                        const char* displayLocale,
                        UChar* language,
                        int32_t languageCapacity,
                        UErrorCode* status);

/**
 * Gets the script name suitable for display for the specified locale.
 *
 * @param locale the locale whose script name is to be displayed. NULL may be used to specify the default.
 * @param displayLocale Specifies the locale to be used to display the name, i.e. the
 * script name is written for users of displayLocale.
 * NULL may be used to specify the default.
 * @param script the displayable script name for locale
 * @param scriptCapacity the size of the script buffer to store the
 * displayable script name with
 * @param status error information if retrieving the displayable script name failed
 * @return the actual buffer size needed for the displayable script name. If it's greater
 * than scriptCapacity, the returned displayable script name will be truncated.
 * @stable ICU 2.8
 */
U_STABLE int32_t U_EXPORT2
uloc_getDisplayScript(const char* locale,
                      const char* displayLocale,
                      UChar* script,
                      int32_t scriptCapacity,
                      UErrorCode* status);

/**
 * Gets the country name suitable for display for the specified locale.
 * Warning: this is for the region part of a valid locale ID; it cannot just be the region code (like "FR").
 * To get the display name for a region alone, or for other options, use ULocaleDisplayNames instead.
 *
 * @param locale the locale whose country name is to be displayed. NULL may be used to specify the default.
 * @param displayLocale Specifies the locale to be used to display the name. In other words,
 * if the locale's language code is "en", passing Locale::getFrench() for
 * displayLocale would result in "Anglais", while passing Locale::getGerman()
 * for displayLocale would result in "Englisch". NULL may be used to specify the default.
 * @param country the displayable country name for locale
 * @param countryCapacity the size of the country buffer to store the
 * displayable country name with
 * @param status error information if retrieving the displayable country name failed
 * @return the actual buffer size needed for the displayable country name. If it's greater
 * than countryCapacity, the returned displayable country name will be truncated.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_getDisplayCountry(const char* locale,
                       const char* displayLocale,
                       UChar* country,
                       int32_t countryCapacity,
                       UErrorCode* status);

/**
 * Gets the variant name suitable for display for the specified locale.
*
 * @param locale the locale to get the displayable variant code with. NULL may be used to specify the default.
 * @param displayLocale Specifies the locale to be used to display the name. In other words,
 * if the locale's language code is "en", passing Locale::getFrench() for
 * inLocale would result in "Anglais", while passing Locale::getGerman()
 * for inLocale would result in "Englisch". NULL may be used to specify the default.
 * @param variant the displayable variant code for localeID
 * @param variantCapacity the size of the variant buffer to store the
 * displayable variant code with
 * @param status error information if retrieving the displayable variant code failed
 * @return the actual buffer size needed for the displayable variant code. If it's greater
 * than variantCapacity, the returned displayable variant code will be truncated.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_getDisplayVariant(const char* locale,
                       const char* displayLocale,
                       UChar* variant,
                       int32_t variantCapacity,
                       UErrorCode* status);

/**
 * Gets the keyword name suitable for display for the specified locale.
 * E.g: for the locale string de_DE\@collation=PHONEBOOK, this API gets the display
 * string for the keyword collation.
 * Usage:
 * \code
 *     UErrorCode status = U_ZERO_ERROR;
 *     const char* keyword = NULL;
 *     int32_t keywordLen = 0;
 *     int32_t keywordCount = 0;
 *     UChar displayKeyword[256];
 *     int32_t displayKeywordLen = 0;
 *     UEnumeration* keywordEnum = uloc_openKeywords("de_DE@collation=PHONEBOOK;calendar=TRADITIONAL", &status);
 *     for(keywordCount = uenum_count(keywordEnum, &status); keywordCount > 0 ; keywordCount--){
 *         if(U_FAILURE(status)){
 *             ...something went wrong so handle the error...
 *             break;
 *         }
 *         // the uenum_next returns NUL terminated string
 *         keyword = uenum_next(keywordEnum, &keywordLen, &status);
 *         displayKeywordLen = uloc_getDisplayKeyword(keyword, "en_US", displayKeyword, 256, &status);
 *         ... do something interesting .....
 *     }
 *     uenum_close(keywordEnum);
 * \endcode
 *
 * @param keyword The keyword whose display string needs to be returned.
 * @param displayLocale Specifies the locale to be used to display the name. In other words,
 * if the locale's language code is "en", passing Locale::getFrench() for
 * displayLocale would result in "Anglais", while passing Locale::getGerman()
 * for displayLocale would result in "Englisch". NULL may be used to specify the default.
 * @param dest the buffer to which the displayable keyword should be written.
 * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
 * dest may be NULL and the function will only return the length of the
 * result without writing any of the result string (pre-flighting).
 * @param status error information if retrieving the displayable string failed.
 * Should not be NULL and should not indicate failure on entry.
 * @return the actual buffer size needed for the displayable keyword.
 * @see #uloc_openKeywords
 * @stable ICU 2.8
 */
U_STABLE int32_t U_EXPORT2
uloc_getDisplayKeyword(const char* keyword,
                       const char* displayLocale,
                       UChar* dest,
                       int32_t destCapacity,
                       UErrorCode* status);

/**
 * Gets the value of the keyword suitable for display for the specified locale.
 * E.g: for the locale string de_DE\@collation=PHONEBOOK, this API gets the display
 * string for PHONEBOOK, in the display locale, when "collation" is specified as the keyword.
 *
 * @param locale The locale to get the displayable variant code with. NULL may be used to specify the default.
 * @param keyword The keyword for whose value should be used.
 * @param displayLocale Specifies the locale to be used to display the name. In other words,
 * if the locale's language code is "en", passing Locale::getFrench() for
 * inLocale would result in "Anglais", while passing Locale::getGerman()
 * for inLocale would result in "Englisch". NULL may be used to specify the default.
 * @param dest the buffer to which the displayable keyword should be written.
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
 * dest may be NULL and the function will only return the length of the
 * result without writing any of the result string (pre-flighting).
 * @param status error information if retrieving the displayable string failed.
 * Should not be NULL and must not indicate failure on entry.
 * @return the actual buffer size needed for the displayable variant code.
 * @stable ICU 2.8
 */
U_STABLE int32_t U_EXPORT2
uloc_getDisplayKeywordValue( const char* locale,
                             const char* keyword,
                             const char* displayLocale,
                             UChar* dest,
                             int32_t destCapacity,
                             UErrorCode* status);

/**
 * Gets the full name suitable for display for the specified locale.
 *
 * @param localeID the locale to get the displayable name with. NULL may be used to specify the default.
 * @param inLocaleID Specifies the locale to be used to display the name. In other words,
 * if the locale's language code is "en", passing Locale::getFrench() for
 * inLocaleID would result in "Anglais", while passing Locale::getGerman()
 * for inLocaleID would result in "Englisch". NULL may be used to specify the default.
 * @param result the displayable name for localeID
 * @param maxResultSize the size of the name buffer to store the
 * displayable full name with
 * @param err error information if retrieving the displayable name failed
 * @return the actual buffer size needed for the displayable name. If it's greater
 * than maxResultSize, the returned displayable name will be truncated.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_getDisplayName(const char* localeID,
                    const char* inLocaleID,
                    UChar* result,
                    int32_t maxResultSize,
                    UErrorCode* err);

/**
 * Gets the specified locale from a list of all available locales.
 * The return value is a pointer to an item of
 * a locale name array. Both this array and the pointers
 * it contains are owned by ICU and should not be deleted or written through
 * by the caller. The locale name is a NUL-terminated string.
 * @param n the specific locale name index of the available locale list
 * @return a specified locale name of all available locales
 * @stable ICU 2.0
 */
U_STABLE const char* U_EXPORT2
uloc_getAvailable(int32_t n);

/**
 * Gets the number of entries in the list of all available locales.
 *
 * @return the size of the locale list
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_countAvailable(void);

/**
 * Gets a list of all available 2-letter language codes defined in ISO 639,
 * plus additional 3-letter codes determined to be useful for locale generation as
 * defined by Unicode CLDR. This is a pointer
 * to an array of pointers to arrays of char. All of these pointers are owned
 * by ICU-- do not delete them, and do not write through them. The array is
 * terminated with a null pointer.
 * @return a list of all available language codes
 * @stable ICU 2.0
 */
U_STABLE const char* const* U_EXPORT2
uloc_getISOLanguages(void);

/**
 * Gets a list of all available 2-letter country codes defined in ISO 3166. This is a
 * pointer to an array of pointers to arrays of char. All of these pointers are
 * owned by ICU-- do not delete them, and do not write through them. The array is
 * terminated with a null pointer.
 * @return a list of all available country codes
 * @stable ICU 2.0
 */
U_STABLE const char* const* U_EXPORT2
uloc_getISOCountries(void);

/**
 * Truncate the locale ID string to get the parent locale ID.
 * Copies the part of the string before the last underscore.
 * The parent locale ID will be an empty string if there is no
 * underscore, or if there is only one underscore at localeID[0].
 *
 * @param localeID Input locale ID string.
 * @param parent Output string buffer for the parent locale ID.
 * @param parentCapacity Size of the output buffer.
 * @param err A UErrorCode value.
 * @return The length of the parent locale ID.
* @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
uloc_getParent(const char* localeID,
               char* parent,
               int32_t parentCapacity,
               UErrorCode* err);

/**
 * Gets the full name for the specified locale, like uloc_getName(),
 * but without keywords.
 *
 * Note: This has the effect of 'canonicalizing' the string to
 * a certain extent. Upper and lower case are set as needed,
 * and if the components were in 'POSIX' format they are changed to
 * ICU format. It does NOT map aliased names in any way.
 * See the top of this header file.
 *
 * This API strips off the keyword part, so "de_DE\@collation=phonebook"
 * will become "de_DE".
 * This API supports preflighting.
 *
 * @param localeID the locale to get the full name with
 * @param name fill in buffer for the name without keywords.
 * @param nameCapacity capacity of the fill in buffer.
 * @param err error information if retrieving the full name failed
 * @return the actual buffer size needed for the full name. If it's greater
 * than nameCapacity, the returned full name will be truncated.
 * @stable ICU 2.8
 */
U_STABLE int32_t U_EXPORT2
uloc_getBaseName(const char* localeID,
                 char* name,
                 int32_t nameCapacity,
                 UErrorCode* err);

/**
 * Gets an enumeration of keywords for the specified locale. The enumeration
 * must be disposed of by the client using the uenum_close function.
 *
 * @param localeID the locale to get the keywords from
 * @param status error information if retrieving the keywords failed
 * @return enumeration of keywords or NULL if there are no keywords.
 * @stable ICU 2.8
 */
U_STABLE UEnumeration* U_EXPORT2
uloc_openKeywords(const char* localeID,
                  UErrorCode* status);

/**
 * Get the value for a keyword. Locale name does not need to be normalized.
 *
 * @param localeID locale name containing the keyword ("de_DE@currency=EURO;collation=PHONEBOOK")
 * @param keywordName name of the keyword for which we want the value. Case insensitive.
* @param buffer receiving buffer
 * @param bufferCapacity capacity of receiving buffer
 * @param status containing error code - buffer not big enough.
 * @return the length of keyword value
 * @stable ICU 2.8
 */
U_STABLE int32_t U_EXPORT2
uloc_getKeywordValue(const char* localeID,
                     const char* keywordName,
                     char* buffer,
                     int32_t bufferCapacity,
                     UErrorCode* status);

/**
 * Sets or removes the value of the specified keyword.
 *
 * For removing all keywords, use uloc_getBaseName().
 *
 * NOTE: Unlike almost every other ICU function which takes a
 * buffer, this function will NOT truncate the output text. If a
 * U_BUFFER_OVERFLOW_ERROR is received, it means that the original
 * buffer is untouched. This is done to prevent incorrect or possibly
 * even malformed locales from being generated and used.
 *
 * @param keywordName name of the keyword to be set. Case insensitive.
 * @param keywordValue value of the keyword to be set. If 0-length or
 * NULL, will result in the keyword being removed. No error is given if
 * that keyword does not exist.
 * @param buffer input buffer containing the locale to be modified; the
 * modified locale is written back into this same buffer.
 * @param bufferCapacity capacity of receiving buffer
 * @param status containing error code - buffer not big enough.
 * @return the length needed for the buffer
 * @see uloc_getKeywordValue
 * @stable ICU 3.2
 */
U_STABLE int32_t U_EXPORT2
uloc_setKeywordValue(const char* keywordName,
                     const char* keywordValue,
                     char* buffer,
                     int32_t bufferCapacity,
                     UErrorCode* status);

/**
 * Returns whether the locale's script is written right-to-left.
 * If there is no script subtag, then the likely script is used, see uloc_addLikelySubtags().
 * If no likely script is known, then FALSE is returned.
 *
 * A script is right-to-left according to the CLDR script metadata
 * which corresponds to whether the script's letters have Bidi_Class=R or AL.
 *
 * Returns TRUE for "ar" and "en-Hebr", FALSE for "zh" and "fa-Cyrl".
*
 * @param locale input locale ID
 * @return TRUE if the locale's script is written right-to-left
 * @stable ICU 54
 */
U_STABLE UBool U_EXPORT2
uloc_isRightToLeft(const char *locale);

/**
 * Return values of the character and line orientation functions
 * (uloc_getCharacterOrientation() and uloc_getLineOrientation()).
 * @stable ICU 4.0
 */
typedef enum {
    ULOC_LAYOUT_LTR = 0, /* left-to-right. */
    ULOC_LAYOUT_RTL = 1, /* right-to-left. */
    ULOC_LAYOUT_TTB = 2, /* top-to-bottom. */
    ULOC_LAYOUT_BTT = 3, /* bottom-to-top. */
    /* No explicit initializer: takes the next value after ULOC_LAYOUT_BTT, i.e. 4. */
    ULOC_LAYOUT_UNKNOWN
} ULayoutType;

/**
 * Get the layout character orientation for the specified locale.
 *
 * @param localeId locale name
 * @param status Error status
 * @return an enum indicating the layout orientation for characters.
 * @stable ICU 4.0
 */
U_STABLE ULayoutType U_EXPORT2
uloc_getCharacterOrientation(const char* localeId,
                             UErrorCode *status);

/**
 * Get the layout line orientation for the specified locale.
 *
 * @param localeId locale name
 * @param status Error status
 * @return an enum indicating the layout orientation for lines.
 * @stable ICU 4.0
 */
U_STABLE ULayoutType U_EXPORT2
uloc_getLineOrientation(const char* localeId,
                        UErrorCode *status);

/**
 * Values reported through the 'outResult' out-parameter of the
 * accept-language functions.
 * @see uloc_acceptLanguageFromHTTP
 * @see uloc_acceptLanguage
 * @stable ICU 3.2
 */
typedef enum {
    ULOC_ACCEPT_FAILED = 0, /* No exact match was found. */
    ULOC_ACCEPT_VALID = 1, /* An exact match was found. */
    ULOC_ACCEPT_FALLBACK = 2 /* A fallback was found, for example,
                                Accept list contained 'ja_JP'
                                which matched available locale 'ja'. */
} UAcceptResult;

/**
 * Based on a HTTP header from a web browser and a list of available locales,
 * determine an acceptable locale for the user.
 * @param result - buffer to accept the result locale
 * @param resultAvailable the size of the result buffer.
 * @param outResult - An out parameter that contains the fallback status
 * @param httpAcceptLanguage - "Accept-Language:" header as per HTTP.
* @param availableLocales - list of available locales to match
 * @param status Error status, may be BUFFER_OVERFLOW_ERROR
 * @return length needed for the locale.
 * @stable ICU 3.2
 */
U_STABLE int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable,
                            UAcceptResult *outResult,
                            const char *httpAcceptLanguage,
                            UEnumeration* availableLocales,
                            UErrorCode *status);

/**
 * Based on a list of available locales,
 * determine an acceptable locale for the user.
 * @param result - buffer to accept the result locale
 * @param resultAvailable the size of the result buffer.
 * @param outResult - An out parameter that contains the fallback status
 * (one of the UAcceptResult values)
 * @param acceptList - list of acceptable languages
 * @param acceptListCount - count of acceptList items
 * @param availableLocales - list of available locales to match
 * @param status Error status, may be U_BUFFER_OVERFLOW_ERROR
 * @return length needed for the locale.
 * @stable ICU 3.2
 */
U_STABLE int32_t U_EXPORT2
uloc_acceptLanguage(char *result, int32_t resultAvailable,
                    UAcceptResult *outResult,
                    const char **acceptList, int32_t acceptListCount,
                    UEnumeration* availableLocales,
                    UErrorCode *status);

/**
 * Gets the ICU locale ID for the specified Win32 LCID value.
 *
 * @param hostID the Win32 LCID to translate
 * @param locale the output buffer for the ICU locale ID, which will be NUL-terminated
 * if there is room.
* @param localeCapacity the size of the output buffer
 * @param status an error is returned if the LCID is unrecognized or the output buffer
 * is too small
 * @return actual the actual size of the locale ID, not including NUL-termination
 * @stable ICU 3.8
 */
U_STABLE int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostID, char *locale, int32_t localeCapacity,
                      UErrorCode *status);

/**
 * Add the likely subtags for a provided locale ID, per the algorithm described
 * in the following CLDR technical report:
 *
 *   http://www.unicode.org/reports/tr35/#Likely_Subtags
 *
 * If localeID is already in the maximal form, or there is no data available
 * for maximization, it will be copied to the output buffer. For example,
 * "und-Zzzz" cannot be maximized, since there is no reasonable maximization.
 *
 * Examples:
 *
 * "en" maximizes to "en_Latn_US"
 *
 * "de" maximizes to "de_Latn_DE"
 *
 * "sr" maximizes to "sr_Cyrl_RS"
 *
 * "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.)
 *
 * "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.)
 *
 * @param localeID The locale to maximize
 * @param maximizedLocaleID The maximized locale
 * @param maximizedLocaleIDCapacity The capacity of the maximizedLocaleID buffer
 * @param err Error information if maximizing the locale failed. If the length
 * of the localeID and the null-terminator is greater than the maximum allowed size,
 * or the localeID is not well-formed, the error code is U_ILLEGAL_ARGUMENT_ERROR.
 * @return The actual buffer size needed for the maximized locale. If it's
 * greater than maximizedLocaleIDCapacity, the returned ID will be truncated.
 * On error, the return value is -1.
 * @stable ICU 4.0
 */
U_STABLE int32_t U_EXPORT2
uloc_addLikelySubtags(const char* localeID,
                      char* maximizedLocaleID,
                      int32_t maximizedLocaleIDCapacity,
                      UErrorCode* err);

/**
 * Minimize the subtags for a provided locale ID, per the algorithm described
 * in the following CLDR technical report:
 *
 *   http://www.unicode.org/reports/tr35/#Likely_Subtags
 *
 * If localeID is already in the minimal form, or there is no data available
 * for minimization, it will be copied to the output buffer. Since the
 * minimization algorithm relies on proper maximization, see the comments
 * for uloc_addLikelySubtags for reasons why there might not be any data.
 *
 * Examples:
 *
 * "en_Latn_US" minimizes to "en"
 *
 * "de_Latn_DE" minimizes to "de"
 *
 * "sr_Cyrl_RS" minimizes to "sr"
 *
 * "zh_Hant_TW" minimizes to "zh_TW" (The region is preferred to the
 * script, and minimizing to "zh" would imply "zh_Hans_CN".)
 *
 * @param localeID The locale to minimize
 * @param minimizedLocaleID The minimized locale
 * @param minimizedLocaleIDCapacity The capacity of the minimizedLocaleID buffer
 * @param err Error information if minimizing the locale failed. If the length
 * of the localeID and the null-terminator is greater than the maximum allowed size,
 * or the localeID is not well-formed, the error code is U_ILLEGAL_ARGUMENT_ERROR.
 * @return The actual buffer size needed for the minimized locale. If it's
 * greater than minimizedLocaleIDCapacity, the returned ID will be truncated.
 * On error, the return value is -1.
 * @stable ICU 4.0
 */
U_STABLE int32_t U_EXPORT2
uloc_minimizeSubtags(const char* localeID,
                     char* minimizedLocaleID,
                     int32_t minimizedLocaleIDCapacity,
                     UErrorCode* err);

/**
 * Returns a locale ID for the specified BCP47 language tag string.
 * If the specified language tag contains any ill-formed subtags,
 * the first such subtag and all following subtags are ignored.
 *
 * This implements the 'Language-Tag' production of BCP47, and so
 * supports grandfathered (regular and irregular) as well as private
 * use language tags. Private use tags are represented as 'x-whatever',
 * and grandfathered tags are converted to their canonical replacements
 * where they exist. Note that a few grandfathered tags have no modern
 * replacement, these will be converted using the fallback described in
 * the first paragraph, so some information might be lost.
 * @param langtag the input BCP47 language tag.
 * @param localeID the output buffer receiving a locale ID for the
 * specified BCP47 language tag.
 * @param localeIDCapacity the size of the locale ID output buffer.
 * @param parsedLength if not NULL, successfully parsed length
 * for the input language tag is set.
 * @param err error information if receiving the locale ID
 * failed.
 * @return the length of the locale ID.
 * @stable ICU 4.2
 */
U_STABLE int32_t U_EXPORT2
uloc_forLanguageTag(const char* langtag,
                    char* localeID,
                    int32_t localeIDCapacity,
                    int32_t* parsedLength,
                    UErrorCode* err);

/**
 * Returns a well-formed language tag for this locale ID.
 *

* Note: When strict is FALSE, any locale * fields which do not satisfy the BCP47 syntax requirement will * be omitted from the result. When strict is * TRUE, this function sets U_ILLEGAL_ARGUMENT_ERROR to the * err if any locale fields do not satisfy the * BCP47 syntax requirement. * @param localeID the input locale ID * @param langtag the output buffer receiving BCP47 language * tag for the locale ID. * @param langtagCapacity the size of the BCP47 language tag * output buffer. * @param strict boolean value indicating if the function returns * an error for an ill-formed input locale ID. * @param err error information if receiving the language * tag failed. * @return The length of the BCP47 language tag. * @stable ICU 4.2 */ U_STABLE int32_t U_EXPORT2 uloc_toLanguageTag(const char* localeID, char* langtag, int32_t langtagCapacity, UBool strict, UErrorCode* err); /** * Converts the specified keyword (legacy key, or BCP 47 Unicode locale * extension key) to the equivalent BCP 47 Unicode locale extension key. * For example, BCP 47 Unicode locale extension key "co" is returned for * the input keyword "collation". *

* When the specified keyword is unknown, but satisfies the BCP syntax, * then the pointer to the input keyword itself will be returned. * For example, * uloc_toUnicodeLocaleKey("ZZ") returns "ZZ". * * @param keyword the input locale keyword (either legacy key * such as "collation" or BCP 47 Unicode locale extension * key such as "co"). * @return the well-formed BCP 47 Unicode locale extension key, * or NULL if the specified locale keyword cannot be * mapped to a well-formed BCP 47 Unicode locale extension * key. * @see uloc_toLegacyKey * @stable ICU 54 */ U_STABLE const char* U_EXPORT2 uloc_toUnicodeLocaleKey(const char* keyword); /** * Converts the specified keyword value (legacy type, or BCP 47 * Unicode locale extension type) to the well-formed BCP 47 Unicode locale * extension type for the specified keyword (category). For example, BCP 47 * Unicode locale extension type "phonebk" is returned for the input * keyword value "phonebook", with the keyword "collation" (or "co"). *

* When the specified keyword is not recognized, but the specified value * satisfies the syntax of the BCP 47 Unicode locale extension type, * or when the specified keyword allows 'variable' type and the specified * value satisfies the syntax, then the pointer to the input type value itself * will be returned. * For example, * uloc_toUnicodeLocaleType("Foo", "Bar") returns "Bar", * uloc_toUnicodeLocaleType("variableTop", "00A4") returns "00A4". * * @param keyword the locale keyword (either legacy key such as * "collation" or BCP 47 Unicode locale extension * key such as "co"). * @param value the locale keyword value (either legacy type * such as "phonebook" or BCP 47 Unicode locale extension * type such as "phonebk"). * @return the well-formed BCP47 Unicode locale extension type, * or NULL if the locale keyword value cannot be mapped to * a well-formed BCP 47 Unicode locale extension type. * @see uloc_toLegacyType * @stable ICU 54 */ U_STABLE const char* U_EXPORT2 uloc_toUnicodeLocaleType(const char* keyword, const char* value); /** * Converts the specified keyword (BCP 47 Unicode locale extension key, or * legacy key) to the legacy key. For example, legacy key "collation" is * returned for the input BCP 47 Unicode locale extension key "co". * * @param keyword the input locale keyword (either BCP 47 Unicode locale * extension key or legacy key). * @return the well-formed legacy key, or NULL if the specified * keyword cannot be mapped to a well-formed legacy key. * @see uloc_toUnicodeLocaleKey * @stable ICU 54 */ U_STABLE const char* U_EXPORT2 uloc_toLegacyKey(const char* keyword); /** * Converts the specified keyword value (BCP 47 Unicode locale extension type, * or legacy type or type alias) to the canonical legacy type. For example, * the legacy type "phonebook" is returned for the input BCP 47 Unicode * locale extension type "phonebk" with the keyword "collation" (or "co"). *

* When the specified keyword is not recognized, but the specified value * satisfies the syntax of legacy key, or when the specified keyword * allows 'variable' type and the specified value satisfies the syntax, * then the pointer to the input type value itself will be returned. * For example, * uloc_toLegacyType("Foo", "Bar") returns "Bar", * uloc_toLegacyType("vt", "00A4") returns "00A4". * * @param keyword the locale keyword (either legacy keyword such as * "collation" or BCP 47 Unicode locale extension * key such as "co"). * @param value the locale keyword value (either BCP 47 Unicode locale * extension type such as "phonebk" or legacy keyword value * such as "phonebook"). * @return the well-formed legacy type, or NULL if the specified * keyword value cannot be mapped to a well-formed legacy * type. * @see uloc_toUnicodeLocaleType * @stable ICU 54 */ U_STABLE const char* U_EXPORT2 uloc_toLegacyType(const char* keyword, const char* value); #endif /*_ULOC*/ // ures.h /* ********************************************************************** * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File URES.H (formerly CRESBUND.H) * * Modification History: * * Date Name Description * 04/01/97 aliu Creation. * 02/22/99 damiba overhaul. * 04/04/99 helena Fixed internal header inclusion. * 04/15/99 Madhu Updated Javadoc * 06/14/99 stephen Removed functions taking a filename suffix. * 07/20/99 stephen Language-independent typedef to void* * 11/09/99 weiv Added ures_getLocale() * 06/24/02 weiv Added support for resource sharing ****************************************************************************** */ #ifndef URES_H #define URES_H /** * \file * \brief C API: Resource Bundle * *

C API: Resource Bundle

* * C API representing a collection of resource information pertaining to a given * locale. A resource bundle provides a way of accessing locale- specific information in * a data file. You create a resource bundle that manages the resources for a given * locale and then ask it for individual resources. *

* Resource bundles in ICU4C are currently defined using text files which conform to the following * BNF definition. * More on resource bundle concepts and syntax can be found in the * Users Guide. *

*/ /** * UResourceBundle is an opaque type for handles for resource bundles in C APIs. * @stable ICU 2.0 */ struct UResourceBundle; /** * @stable ICU 2.0 */ typedef struct UResourceBundle UResourceBundle; /** * Numeric constants for types of resource items. * @see ures_getType * @stable ICU 2.0 */ typedef enum { /** Resource type constant for "no resource". @stable ICU 2.6 */ URES_NONE=-1, /** Resource type constant for 16-bit Unicode strings. @stable ICU 2.6 */ URES_STRING=0, /** Resource type constant for binary data. @stable ICU 2.6 */ URES_BINARY=1, /** Resource type constant for tables of key-value pairs. @stable ICU 2.6 */ URES_TABLE=2, /** * Resource type constant for aliases; * internally stores a string which identifies the actual resource * storing the data (can be in a different resource bundle). * Resolved internally before delivering the actual resource through the API. * @stable ICU 2.6 */ URES_ALIAS=3, /** * Resource type constant for a single 28-bit integer, interpreted as * signed or unsigned by the ures_getInt() or ures_getUInt() function. * @see ures_getInt * @see ures_getUInt * @stable ICU 2.6 */ URES_INT=7, /** Resource type constant for arrays of resources. @stable ICU 2.6 */ URES_ARRAY=8, /** * Resource type constant for vectors of 32-bit integers. * @see ures_getIntVector * @stable ICU 2.6 */ URES_INT_VECTOR = 14, URES_LIMIT = 16 } UResType; /* * Functions to create and destroy resource bundles. */ /** * Opens a UResourceBundle, from which users can extract strings by using * their corresponding keys. * Note that the caller is responsible for calling ures_close on each successfully * opened resource bundle. * @param packageName The packageName and locale together point to an ICU udata object, * as defined by udata_open( packageName, "res", locale, err) * or equivalent. Typically, packageName will refer to a (.dat) file, or to * a package registered with udata_setAppData(). 
Using a full file or directory * pathname for packageName is deprecated. If NULL, ICU data will be used. * @param locale specifies the locale for which we want to open the resource * if NULL, the default locale will be used. If strlen(locale) == 0 * root locale will be used. * * @param status fills in the outgoing error code. * The UErrorCode err parameter is used to return status information to the user. To * check whether the construction succeeded or not, you should check the value of * U_SUCCESS(err). If you wish more detailed information, you can check for * informational status results which still indicate success. U_USING_FALLBACK_WARNING * indicates that a fall back locale was used. For example, 'de_CH' was requested, * but nothing was found there, so 'de' was used. U_USING_DEFAULT_WARNING indicates that * the default locale data or root locale data was used; neither the requested locale * nor any of its fall back locales could be found. Please see the users guide for more * information on this topic. * @return a newly allocated resource bundle. * @see ures_close * @stable ICU 2.0 */ U_STABLE UResourceBundle* U_EXPORT2 ures_open(const char* packageName, const char* locale, UErrorCode* status); /** This function does not care what kind of localeID is passed in. It simply opens a bundle with * that name. Fallback mechanism is disabled for the new bundle. If the requested bundle contains * an %%ALIAS directive, the results are undefined. * @param packageName The packageName and locale together point to an ICU udata object, * as defined by udata_open( packageName, "res", locale, err) * or equivalent. Typically, packageName will refer to a (.dat) file, or to * a package registered with udata_setAppData(). Using a full file or directory * pathname for packageName is deprecated. If NULL, ICU data will be used. * @param locale specifies the locale for which we want to open the resource * if NULL, the default locale will be used. 
If strlen(locale) == 0 * root locale will be used. * * @param status fills in the outgoing error code. Either U_ZERO_ERROR or U_MISSING_RESOURCE_ERROR * @return a newly allocated resource bundle or NULL if it doesn't exist. * @see ures_close * @stable ICU 2.0 */ U_STABLE UResourceBundle* U_EXPORT2 ures_openDirect(const char* packageName, const char* locale, UErrorCode* status); /** * Same as ures_open() but takes a const UChar *path. * This path will be converted to char * using the default converter, * then ures_open() is called. * * @param packageName The packageName and locale together point to an ICU udata object, * as defined by udata_open( packageName, "res", locale, err) * or equivalent. Typically, packageName will refer to a (.dat) file, or to * a package registered with udata_setAppData(). Using a full file or directory * pathname for packageName is deprecated. If NULL, ICU data will be used. * @param locale specifies the locale for which we want to open the resource * if NULL, the default locale will be used. If strlen(locale) == 0 * root locale will be used. * @param status fills in the outgoing error code. * @return a newly allocated resource bundle. * @see ures_open * @stable ICU 2.0 */ U_STABLE UResourceBundle* U_EXPORT2 ures_openU(const UChar* packageName, const char* locale, UErrorCode* status); /** * Close a resource bundle, all pointers returned from the various ures_getXXX calls * on this particular bundle should be considered invalid henceforth. * * @param resourceBundle a pointer to a resourceBundle struct. Can be NULL. * @see ures_open * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ures_close(UResourceBundle* resourceBundle); /** * Return the version number associated with this ResourceBundle as an * UVersionInfo array. * * @param resB The resource bundle for which the version is checked. * @param versionInfo A UVersionInfo array that is filled with the version number * as specified in the resource bundle or its parent. 
* @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ures_getVersion(const UResourceBundle* resB, UVersionInfo versionInfo); /** * Return the name of the Locale associated with this ResourceBundle. * You can choose between requested, valid and real locale. * * @param resourceBundle resource bundle in question * @param type You can choose between requested, valid and actual * locale. For description see the definition of * ULocDataLocaleType in uloc.h * @param status just for catching illegal arguments * @return A Locale name * @stable ICU 2.8 */ U_STABLE const char* U_EXPORT2 ures_getLocaleByType(const UResourceBundle* resourceBundle, ULocDataLocaleType type, UErrorCode* status); /** * Returns a string from a string resource type * * @param resourceBundle a string resource * @param len fills in the length of resulting string * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * Always check the value of status. Don't count on returning NULL. * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file. * @see ures_getBinary * @see ures_getIntVector * @see ures_getInt * @see ures_getUInt * @stable ICU 2.0 */ U_STABLE const UChar* U_EXPORT2 ures_getString(const UResourceBundle* resourceBundle, int32_t* len, UErrorCode* status); /** * Returns a UTF-8 string from a string resource. * The UTF-8 string may be returnable directly as a pointer, or * it may need to be copied, or transformed from UTF-16 using u_strToUTF8() * or equivalent. * * If forceCopy==TRUE, then the string is always written to the dest buffer * and dest is returned. * * If forceCopy==FALSE, then the string is returned as a pointer if possible, * without needing a dest buffer (it can be NULL). If the string needs to be * copied or transformed, then it may be placed into dest at an arbitrary offset. 
* * If the string is to be written to dest, then U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING are set if appropriate, as usual. * * If the string is transformed from UTF-16, then a conversion error may occur * if an unpaired surrogate is encountered. If the function is successful, then * the output UTF-8 string is always well-formed. * * @param resB Resource bundle. * @param dest Destination buffer. Can be NULL only if capacity=*length==0. * @param length Input: Capacity of destination buffer. * Output: Actual length of the UTF-8 string, not counting the * terminating NUL, even in case of U_BUFFER_OVERFLOW_ERROR. * Can be NULL, meaning capacity=0 and the string length is not * returned to the caller. * @param forceCopy If TRUE, then the output string will always be written to * dest, with U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING set if appropriate. * If FALSE, then the dest buffer may or may not contain a * copy of the string. dest may or may not be modified. * If a copy needs to be written, then the UErrorCode parameter * indicates overflow etc. as usual. * @param status Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to the UTF-8 string. It may be dest, or at some offset * from dest (only if !forceCopy), or in unrelated memory. * Always NUL-terminated unless the string was written to dest and * length==capacity (in which case U_STRING_NOT_TERMINATED_WARNING is set). * * @see ures_getString * @see u_strToUTF8 * @stable ICU 3.6 */ U_STABLE const char * U_EXPORT2 ures_getUTF8String(const UResourceBundle *resB, char *dest, int32_t *length, UBool forceCopy, UErrorCode *status); /** * Returns a binary data from a binary resource. 
* * @param resourceBundle a binary resource * @param len fills in the length of resulting byte chunk * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * Always check the value of status. Don't count on returning NULL. * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return a pointer to a chunk of unsigned bytes which live in a memory mapped/DLL file. * @see ures_getString * @see ures_getIntVector * @see ures_getInt * @see ures_getUInt * @stable ICU 2.0 */ U_STABLE const uint8_t* U_EXPORT2 ures_getBinary(const UResourceBundle* resourceBundle, int32_t* len, UErrorCode* status); /** * Returns a 32 bit integer array from a resource. * * @param resourceBundle an int vector resource * @param len fills in the length of the resulting integer vector * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * Always check the value of status. Don't count on returning NULL. * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return a pointer to a chunk of integers which live in a memory mapped/DLL file. * @see ures_getBinary * @see ures_getString * @see ures_getInt * @see ures_getUInt * @stable ICU 2.0 */ U_STABLE const int32_t* U_EXPORT2 ures_getIntVector(const UResourceBundle* resourceBundle, int32_t* len, UErrorCode* status); /** * Returns an unsigned integer from a resource. * This integer is originally 28 bits. 
* * @param resourceBundle an integer resource * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return an integer value * @see ures_getInt * @see ures_getIntVector * @see ures_getBinary * @see ures_getString * @stable ICU 2.0 */ U_STABLE uint32_t U_EXPORT2 ures_getUInt(const UResourceBundle* resourceBundle, UErrorCode *status); /** * Returns a signed integer from a resource. * This integer is originally 28 bit and the sign gets propagated. * * @param resourceBundle an integer resource * @param status fills in the outgoing error code * could be U_MISSING_RESOURCE_ERROR if the key is not found * could be a non-failing error * e.g.: U_USING_FALLBACK_WARNING,U_USING_DEFAULT_WARNING * @return an integer value * @see ures_getUInt * @see ures_getIntVector * @see ures_getBinary * @see ures_getString * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ures_getInt(const UResourceBundle* resourceBundle, UErrorCode *status); /** * Returns the size of a resource. Size for scalar types is always 1, * and for vector/table types is the number of child resources. * @warning Integer array is treated as a scalar type. There are no * APIs to access individual members of an integer array. It * is always returned as a whole. * @param resourceBundle a resource * @return number of resources in a given resource. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ures_getSize(const UResourceBundle *resourceBundle); /** * Returns the type of a resource. Available types are defined in enum UResType * * @param resourceBundle a resource * @return type of the given resource. * @see UResType * @stable ICU 2.0 */ U_STABLE UResType U_EXPORT2 ures_getType(const UResourceBundle *resourceBundle); /** * Returns the key associated with a given resource. Not all the resources have a key - only * those that are members of a table. 
* * @param resourceBundle a resource * @return a key associated to this resource, or NULL if it doesn't have a key * @stable ICU 2.0 */ U_STABLE const char * U_EXPORT2 ures_getKey(const UResourceBundle *resourceBundle); /* ITERATION API This API provides means for iterating through a resource */ /** * Resets the internal context of a resource so that iteration starts from the first element. * * @param resourceBundle a resource * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ures_resetIterator(UResourceBundle *resourceBundle); /** * Checks whether the given resource has another element to iterate over. * * @param resourceBundle a resource * @return TRUE if there are more elements, FALSE if there are no more elements * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 ures_hasNext(const UResourceBundle *resourceBundle); /** * Returns the next resource in a given resource or NULL if there are no more resources * to iterate over. Features a fill-in parameter. * * @param resourceBundle a resource * @param fillIn if NULL a new UResourceBundle struct is allocated and must be closed by the caller. * Alternatively, you can supply a struct to be filled by this function. * @param status fills in the outgoing error code. You may still get a non NULL result even if an * error occurred. Check status instead. * @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must close it * @stable ICU 2.0 */ U_STABLE UResourceBundle* U_EXPORT2 ures_getNextResource(UResourceBundle *resourceBundle, UResourceBundle *fillIn, UErrorCode *status); /** * Returns the next string in a given resource or NULL if there are no more resources * to iterate over. * * @param resourceBundle a resource * @param len fill in length of the string * @param key fill in for key associated with this string. NULL if no key * @param status fills in the outgoing error code. If an error occurred, we may return NULL, but don't * count on it. Check status instead! 
* @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file. * @stable ICU 2.0 */ U_STABLE const UChar* U_EXPORT2 ures_getNextString(UResourceBundle *resourceBundle, int32_t* len, const char ** key, UErrorCode *status); /** * Returns the resource in a given resource at the specified index. Features a fill-in parameter. * * @param resourceBundle the resource bundle from which to get a sub-resource * @param indexR an index to the wanted resource. * @param fillIn if NULL a new UResourceBundle struct is allocated and must be closed by the caller. * Alternatively, you can supply a struct to be filled by this function. * @param status fills in the outgoing error code. Don't count on NULL being returned if an error has * occurred. Check status instead. * @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must close it * @stable ICU 2.0 */ U_STABLE UResourceBundle* U_EXPORT2 ures_getByIndex(const UResourceBundle *resourceBundle, int32_t indexR, UResourceBundle *fillIn, UErrorCode *status); /** * Returns the string in a given resource at the specified index. * * @param resourceBundle a resource * @param indexS an index to the wanted string. * @param len fill in length of the string * @param status fills in the outgoing error code. If an error occurred, we may return NULL, but don't * count on it. Check status instead! * @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file. * @stable ICU 2.0 */ U_STABLE const UChar* U_EXPORT2 ures_getStringByIndex(const UResourceBundle *resourceBundle, int32_t indexS, int32_t* len, UErrorCode *status); /** * Returns a UTF-8 string from a resource at the specified index. * The UTF-8 string may be returnable directly as a pointer, or * it may need to be copied, or transformed from UTF-16 using u_strToUTF8() * or equivalent. * * If forceCopy==TRUE, then the string is always written to the dest buffer * and dest is returned. 
* * If forceCopy==FALSE, then the string is returned as a pointer if possible, * without needing a dest buffer (it can be NULL). If the string needs to be * copied or transformed, then it may be placed into dest at an arbitrary offset. * * If the string is to be written to dest, then U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING are set if appropriate, as usual. * * If the string is transformed from UTF-16, then a conversion error may occur * if an unpaired surrogate is encountered. If the function is successful, then * the output UTF-8 string is always well-formed. * * @param resB Resource bundle. * @param stringIndex An index to the wanted string. * @param dest Destination buffer. Can be NULL only if capacity=*length==0. * @param pLength Input: Capacity of destination buffer. * Output: Actual length of the UTF-8 string, not counting the * terminating NUL, even in case of U_BUFFER_OVERFLOW_ERROR. * Can be NULL, meaning capacity=0 and the string length is not * returned to the caller. * @param forceCopy If TRUE, then the output string will always be written to * dest, with U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING set if appropriate. * If FALSE, then the dest buffer may or may not contain a * copy of the string. dest may or may not be modified. * If a copy needs to be written, then the UErrorCode parameter * indicates overflow etc. as usual. * @param status Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to the UTF-8 string. It may be dest, or at some offset * from dest (only if !forceCopy), or in unrelated memory. * Always NUL-terminated unless the string was written to dest and * length==capacity (in which case U_STRING_NOT_TERMINATED_WARNING is set). 
* * @see ures_getStringByIndex * @see u_strToUTF8 * @stable ICU 3.6 */ U_STABLE const char * U_EXPORT2 ures_getUTF8StringByIndex(const UResourceBundle *resB, int32_t stringIndex, char *dest, int32_t *pLength, UBool forceCopy, UErrorCode *status); /** * Returns a resource in a given resource that has a given key. This procedure works only with table * resources. Features a fill-in parameter. * * @param resourceBundle a resource * @param key a key associated with the wanted resource * @param fillIn if NULL a new UResourceBundle struct is allocated and must be closed by the caller. * Alternatively, you can supply a struct to be filled by this function. * @param status fills in the outgoing error code. * @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must close it * @stable ICU 2.0 */ U_STABLE UResourceBundle* U_EXPORT2 ures_getByKey(const UResourceBundle *resourceBundle, const char* key, UResourceBundle *fillIn, UErrorCode *status); /** * Returns a string in a given resource that has a given key. This procedure works only with table * resources. * * @param resB a resource * @param key a key associated with the wanted string * @param len fill in length of the string * @param status fills in the outgoing error code. If an error occurred, we may return NULL, but don't * count on it. Check status instead! * @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file. * @stable ICU 2.0 */ U_STABLE const UChar* U_EXPORT2 ures_getStringByKey(const UResourceBundle *resB, const char* key, int32_t* len, UErrorCode *status); /** * Returns a UTF-8 string from a resource and a key. * This function works only with table resources. * * The UTF-8 string may be returnable directly as a pointer, or * it may need to be copied, or transformed from UTF-16 using u_strToUTF8() * or equivalent. * * If forceCopy==TRUE, then the string is always written to the dest buffer * and dest is returned. 
* * If forceCopy==FALSE, then the string is returned as a pointer if possible, * without needing a dest buffer (it can be NULL). If the string needs to be * copied or transformed, then it may be placed into dest at an arbitrary offset. * * If the string is to be written to dest, then U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING are set if appropriate, as usual. * * If the string is transformed from UTF-16, then a conversion error may occur * if an unpaired surrogate is encountered. If the function is successful, then * the output UTF-8 string is always well-formed. * * @param resB Resource bundle. * @param key A key associated with the wanted resource * @param dest Destination buffer. Can be NULL only if capacity=*length==0. * @param pLength Input: Capacity of destination buffer. * Output: Actual length of the UTF-8 string, not counting the * terminating NUL, even in case of U_BUFFER_OVERFLOW_ERROR. * Can be NULL, meaning capacity=0 and the string length is not * returned to the caller. * @param forceCopy If TRUE, then the output string will always be written to * dest, with U_BUFFER_OVERFLOW_ERROR and * U_STRING_NOT_TERMINATED_WARNING set if appropriate. * If FALSE, then the dest buffer may or may not contain a * copy of the string. dest may or may not be modified. * If a copy needs to be written, then the UErrorCode parameter * indicates overflow etc. as usual. * @param status Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to the UTF-8 string. It may be dest, or at some offset * from dest (only if !forceCopy), or in unrelated memory. * Always NUL-terminated unless the string was written to dest and * length==capacity (in which case U_STRING_NOT_TERMINATED_WARNING is set). 
* * @see ures_getStringByKey * @see u_strToUTF8 * @stable ICU 3.6 */ U_STABLE const char * U_EXPORT2 ures_getUTF8StringByKey(const UResourceBundle *resB, const char *key, char *dest, int32_t *pLength, UBool forceCopy, UErrorCode *status); /** * Create a string enumerator, owned by the caller, of all locales located within * the specified resource tree. * @param packageName name of the tree, such as (NULL) or U_ICUDATA_ALIAS or "ICUDATA-coll" * This call is similar to uloc_getAvailable(). * @param status error code * @stable ICU 3.2 */ U_STABLE UEnumeration* U_EXPORT2 ures_openAvailableLocales(const char *packageName, UErrorCode *status); #endif /*_URES*/ /*eof*/ // udisplaycontext.h /* ***************************************************************************************** * Copyright (C) 2014-2016, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ #ifndef UDISPLAYCONTEXT_H #define UDISPLAYCONTEXT_H #if !UCONFIG_NO_FORMATTING /** * \file * \brief C API: Display context types (enum values) */ /** * Display context types, for getting values of a particular setting. * Note, the specific numeric values are internal and may change. * @stable ICU 51 */ enum UDisplayContextType { /** * Type to retrieve the dialect handling setting, e.g. * UDISPCTX_STANDARD_NAMES or UDISPCTX_DIALECT_NAMES. * @stable ICU 51 */ UDISPCTX_TYPE_DIALECT_HANDLING = 0, /** * Type to retrieve the capitalization context setting, e.g. * UDISPCTX_CAPITALIZATION_NONE, UDISPCTX_CAPITALIZATION_FOR_MIDDLE_OF_SENTENCE, * UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE, etc. * @stable ICU 51 */ UDISPCTX_TYPE_CAPITALIZATION = 1, /** * Type to retrieve the display length setting, e.g. * UDISPCTX_LENGTH_FULL, UDISPCTX_LENGTH_SHORT. 
* @stable ICU 54 */ UDISPCTX_TYPE_DISPLAY_LENGTH = 2 }; /** * C typedef for enum UDisplayContextType. * @stable ICU 51 */ typedef enum UDisplayContextType UDisplayContextType; /** * Display context settings. * Note, the specific numeric values are internal and may change. * As currently defined, each value is (UDisplayContextType value << 8) plus a * small index, so settings of different types never share a value. * @stable ICU 51 */ enum UDisplayContext { /** * ================================ * DIALECT_HANDLING can be set to one of UDISPCTX_STANDARD_NAMES or * UDISPCTX_DIALECT_NAMES. Use UDisplayContextType UDISPCTX_TYPE_DIALECT_HANDLING * to get the value. */ /** * A possible setting for DIALECT_HANDLING: * use standard names when generating a locale name, * e.g. en_GB displays as 'English (United Kingdom)'. * @stable ICU 51 */ UDISPCTX_STANDARD_NAMES = (UDISPCTX_TYPE_DIALECT_HANDLING<<8) + 0, /** * A possible setting for DIALECT_HANDLING: * use dialect names, when generating a locale name, * e.g. en_GB displays as 'British English'. * @stable ICU 51 */ UDISPCTX_DIALECT_NAMES = (UDISPCTX_TYPE_DIALECT_HANDLING<<8) + 1, /** * ================================ * CAPITALIZATION can be set to one of UDISPCTX_CAPITALIZATION_NONE, * UDISPCTX_CAPITALIZATION_FOR_MIDDLE_OF_SENTENCE, * UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE, * UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU, or * UDISPCTX_CAPITALIZATION_FOR_STANDALONE. * Use UDisplayContextType UDISPCTX_TYPE_CAPITALIZATION to get the value. */ /** * The capitalization context to be used is unknown (this is the default value). * @stable ICU 51 */ UDISPCTX_CAPITALIZATION_NONE = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 0, /** * The capitalization context if a date, date symbol or display name is to be * formatted with capitalization appropriate for the middle of a sentence. * @stable ICU 51 */ UDISPCTX_CAPITALIZATION_FOR_MIDDLE_OF_SENTENCE = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 1, /** * The capitalization context if a date, date symbol or display name is to be * formatted with capitalization appropriate for the beginning of a sentence.
* @stable ICU 51 */ UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 2, /** * The capitalization context if a date, date symbol or display name is to be * formatted with capitalization appropriate for a user-interface list or menu item. * @stable ICU 51 */ UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 3, /** * The capitalization context if a date, date symbol or display name is to be * formatted with capitalization appropriate for stand-alone usage such as an * isolated name on a calendar page. * @stable ICU 51 */ UDISPCTX_CAPITALIZATION_FOR_STANDALONE = (UDISPCTX_TYPE_CAPITALIZATION<<8) + 4, /** * ================================ * DISPLAY_LENGTH can be set to one of UDISPCTX_LENGTH_FULL or * UDISPCTX_LENGTH_SHORT. Use UDisplayContextType UDISPCTX_TYPE_DISPLAY_LENGTH * to get the value. */ /** * A possible setting for DISPLAY_LENGTH: * use full names when generating a locale name, * e.g. "United States" for US. * @stable ICU 54 */ UDISPCTX_LENGTH_FULL = (UDISPCTX_TYPE_DISPLAY_LENGTH<<8) + 0, /** * A possible setting for DISPLAY_LENGTH: * use short names when generating a locale name, * e.g. "U.S." for US. * @stable ICU 54 */ UDISPCTX_LENGTH_SHORT = (UDISPCTX_TYPE_DISPLAY_LENGTH<<8) + 1 }; /** * C typedef for enum UDisplayContext. * @stable ICU 51 */ typedef enum UDisplayContext UDisplayContext; #endif /* #if !UCONFIG_NO_FORMATTING */ #endif /* UDISPLAYCONTEXT_H */ // uldnames.h /* ******************************************************************************* * Copyright (C) 2010-2016, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #ifndef __ULDNAMES_H__ #define __ULDNAMES_H__ /** * \file * \brief C API: Provides display names of Locale ids and their components. */ /** * Enum used in LocaleDisplayNames::createInstance. * @stable ICU 4.4 */ typedef enum { /** * Use standard names when generating a locale name, * e.g.
en_GB displays as 'English (United Kingdom)'. * @stable ICU 4.4 */ ULDN_STANDARD_NAMES = 0, /** * Use dialect names, when generating a locale name, * e.g. en_GB displays as 'British English'. * @stable ICU 4.4 */ ULDN_DIALECT_NAMES } UDialectHandling; /** * Opaque C service object type for the locale display names API. * @stable ICU 4.4 */ struct ULocaleDisplayNames; /** * C typedef for struct ULocaleDisplayNames. * @stable ICU 4.4 */ typedef struct ULocaleDisplayNames ULocaleDisplayNames; #if !UCONFIG_NO_FORMATTING /** * Returns an instance of LocaleDisplayNames that returns names * formatted for the provided locale, using the provided * dialectHandling. The usual value for dialectHandling is * ULDN_STANDARD_NAMES. * * @param locale the display locale * @param dialectHandling how to select names for locales * @param pErrorCode the status code * @return a ULocaleDisplayNames instance; close it with uldn_close() * @stable ICU 4.4 */ U_STABLE ULocaleDisplayNames * U_EXPORT2 uldn_open(const char * locale, UDialectHandling dialectHandling, UErrorCode *pErrorCode); /** * Closes a ULocaleDisplayNames instance obtained from uldn_open(). * @param ldn the ULocaleDisplayNames instance to be closed * @stable ICU 4.4 */ U_STABLE void U_EXPORT2 uldn_close(ULocaleDisplayNames *ldn); /* getters for state */ /** * Returns the locale used to determine the display names. This is * not necessarily the same locale passed to {@link #uldn_open}. * @param ldn the LocaleDisplayNames instance * @return the display locale * @stable ICU 4.4 */ U_STABLE const char * U_EXPORT2 uldn_getLocale(const ULocaleDisplayNames *ldn); /** * Returns the dialect handling used in the display names. * @param ldn the LocaleDisplayNames instance * @return the dialect handling enum * @stable ICU 4.4 */ U_STABLE UDialectHandling U_EXPORT2 uldn_getDialectHandling(const ULocaleDisplayNames *ldn); /* names for entire locales */ /** * Returns the display name of the provided locale.
* @param ldn the LocaleDisplayNames instance * @param locale the locale whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 uldn_localeDisplayName(const ULocaleDisplayNames *ldn, const char *locale, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /* names for components of a locale */ /** * Returns the display name of the provided language code. * @param ldn the LocaleDisplayNames instance * @param lang the language code whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 uldn_languageDisplayName(const ULocaleDisplayNames *ldn, const char *lang, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided script. * The script is given as a string; use uldn_scriptCodeDisplayName for a UScriptCode value. * @param ldn the LocaleDisplayNames instance * @param script the script whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @see uldn_scriptCodeDisplayName * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 uldn_scriptDisplayName(const ULocaleDisplayNames *ldn, const char *script, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided script code.
* @param ldn the LocaleDisplayNames instance * @param scriptCode the script code (a UScriptCode value) whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @see uldn_scriptDisplayName * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 uldn_scriptCodeDisplayName(const ULocaleDisplayNames *ldn, UScriptCode scriptCode, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided region code. * @param ldn the LocaleDisplayNames instance * @param region the region code whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 uldn_regionDisplayName(const ULocaleDisplayNames *ldn, const char *region, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided variant * @param ldn the LocaleDisplayNames instance * @param variant the variant whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated.
* @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 uldn_variantDisplayName(const ULocaleDisplayNames *ldn, const char *variant, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided locale key. * @param ldn the LocaleDisplayNames instance * @param key the locale key whose display name to return * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 uldn_keyDisplayName(const ULocaleDisplayNames *ldn, const char *key, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns the display name of the provided value (used with the provided key). * @param ldn the LocaleDisplayNames instance * @param key the locale key * @param value the locale key's value * @param result receives the display name * @param maxResultSize the size of the result buffer * @param pErrorCode the status code * @return the actual buffer size needed for the display name. If it's * greater than maxResultSize, the returned name will be truncated. * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 uldn_keyValueDisplayName(const ULocaleDisplayNames *ldn, const char *key, const char *value, UChar *result, int32_t maxResultSize, UErrorCode *pErrorCode); /** * Returns an instance of LocaleDisplayNames that returns names formatted * for the provided locale, using the provided UDisplayContext settings. * * @param locale The display locale * @param contexts List of one or more context settings (e.g. for dialect * handling, capitalization, etc.) * @param length Number of items in the contexts list * @param pErrorCode Pointer to UErrorCode input/output status.
If at entry this indicates * a failure status, the function will do nothing; otherwise this will be * updated with any new status from the function. * @return a ULocaleDisplayNames instance * @stable ICU 51 */ U_STABLE ULocaleDisplayNames * U_EXPORT2 uldn_openForContext(const char * locale, UDisplayContext *contexts, int32_t length, UErrorCode *pErrorCode); /** * Returns the UDisplayContext value for the specified UDisplayContextType. * @param ldn the ULocaleDisplayNames instance * @param type the UDisplayContextType whose value to return * @param pErrorCode Pointer to UErrorCode input/output status. If at entry this indicates * a failure status, the function will do nothing; otherwise this will be * updated with any new status from the function. * @return the UDisplayContext value for the specified type. * @stable ICU 51 */ U_STABLE UDisplayContext U_EXPORT2 uldn_getContext(const ULocaleDisplayNames *ldn, UDisplayContextType type, UErrorCode *pErrorCode); #endif /* !UCONFIG_NO_FORMATTING */ #endif /* __ULDNAMES_H__ */ // ucurr.h /* ********************************************************************** * Copyright (c) 2002-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #ifndef _UCURR_H_ #define _UCURR_H_ /** * \file * \brief C API: Encapsulates information about a currency. * * The ucurr API encapsulates information about a currency, as defined by * ISO 4217. A currency is represented by a 3-character string * containing its ISO 4217 code. This API can return various data * necessary for the proper display of a currency: * *

  • A display symbol, for a specific locale *
  • The number of fraction digits to display *
  • A rounding increment *
* * The DecimalFormat class uses these data to display * currencies. * @author Alan Liu * @since ICU 2.2 */ #if !UCONFIG_NO_FORMATTING /** * Currency Usage used for Decimal Format * @stable ICU 54 */ enum UCurrencyUsage { /** * A setting to specify currency usage which determines currency digit * and rounding for standard usage, for example: "50.00 NT$". * Used as the DEFAULT value. * @stable ICU 54 */ UCURR_USAGE_STANDARD=0, /** * A setting to specify currency usage which determines currency digit * and rounding for cash usage, for example: "50 NT$". * @stable ICU 54 */ UCURR_USAGE_CASH=1, /** * One higher than the last enum UCurrencyUsage constant. * @stable ICU 54 */ UCURR_USAGE_COUNT=2 }; /** * C typedef for enum UCurrencyUsage. */ typedef enum UCurrencyUsage UCurrencyUsage; /** * Finds a currency code for the given locale. * @param locale the locale for which to retrieve a currency code. * Currency can be specified by the "currency" keyword * in which case it overrides the default currency code * @param buff fill in buffer. Can be NULL for preflighting. * @param buffCapacity capacity of the fill in buffer. Can be 0 for * preflighting. If it is non-zero, the buff parameter * must not be NULL. * @param ec error code * @return length of the currency string. It should always be 3. If 0, * currency couldn't be found or the input values are * invalid. * @stable ICU 2.8 */ U_STABLE int32_t U_EXPORT2 ucurr_forLocale(const char* locale, UChar* buff, int32_t buffCapacity, UErrorCode* ec); /** * Selector constants for ucurr_getName(). * * @see ucurr_getName * @stable ICU 2.6 */ typedef enum UCurrNameStyle { /** * Selector for ucurr_getName indicating a symbolic name for a * currency, such as "$" for USD. * @stable ICU 2.6 */ UCURR_SYMBOL_NAME, /** * Selector for ucurr_getName indicating the long name for a * currency, such as "US Dollar" for USD.
* @stable ICU 2.6 */ UCURR_LONG_NAME } UCurrNameStyle; #if !UCONFIG_NO_SERVICE /** * Opaque registry key returned by ucurr_register() and passed to ucurr_unregister(). * @stable ICU 2.6 */ typedef const void* UCurrRegistryKey; /** * Register an (existing) ISO 4217 currency code for the given locale. * Only the country code and the two variants EURO and PRE_EURO are * recognized. * @param isoCode the three-letter ISO 4217 currency code * @param locale the locale for which to register this currency code * @param status the in/out status code * @return a registry key that can be used to unregister this currency code, or NULL * if there was an error. * @stable ICU 2.6 */ U_STABLE UCurrRegistryKey U_EXPORT2 ucurr_register(const UChar* isoCode, const char* locale, UErrorCode* status); /** * Unregister the previously-registered currency definitions using the * UCurrRegistryKey returned from ucurr_register. Key becomes invalid after * a successful call and should not be used again. Any currency * that might have been hidden by the original ucurr_register call is * restored. * @param key the registry key returned by a previous call to ucurr_register * @param status the in/out status code, no special meanings are assigned * @return TRUE if the currency for this key was successfully unregistered * @stable ICU 2.6 */ U_STABLE UBool U_EXPORT2 ucurr_unregister(UCurrRegistryKey key, UErrorCode* status); #endif /* !UCONFIG_NO_SERVICE */ /** * Returns the display name for the given currency in the * given locale. For example, the display name for the USD * currency object in the en_US locale is "$". * @param currency null-terminated 3-letter ISO 4217 code * @param locale locale in which to display currency * @param nameStyle selector for which kind of name to return * @param isChoiceFormat fill-in set to TRUE if the returned value * is a ChoiceFormat pattern; otherwise it is a static string * @param len fill-in parameter to receive length of result * @param ec error code * @return pointer to display string of 'len' UChars.
If the resource * data contains no entry for 'currency', then 'currency' itself is * returned. If *isChoiceFormat is TRUE, then the result is a * ChoiceFormat pattern. Otherwise it is a static string. * @stable ICU 2.6 */ U_STABLE const UChar* U_EXPORT2 ucurr_getName(const UChar* currency, const char* locale, UCurrNameStyle nameStyle, UBool* isChoiceFormat, int32_t* len, UErrorCode* ec); /** * Returns the plural name for the given currency in the * given locale. For example, the plural name for the USD * currency object in the en_US locale is "US dollar" or "US dollars". * @param currency null-terminated 3-letter ISO 4217 code * @param locale locale in which to display currency * @param isChoiceFormat fill-in set to TRUE if the returned value * is a ChoiceFormat pattern; otherwise it is a static string * @param pluralCount plural count * @param len fill-in parameter to receive length of result * @param ec error code * @return pointer to display string of 'len' UChars. If the resource * data contains no entry for 'currency', then 'currency' itself is * returned. * @stable ICU 4.2 */ U_STABLE const UChar* U_EXPORT2 ucurr_getPluralName(const UChar* currency, const char* locale, UBool* isChoiceFormat, const char* pluralCount, int32_t* len, UErrorCode* ec); /** * Returns the number of fraction digits that should * be displayed for the given currency. * This is equivalent to ucurr_getDefaultFractionDigitsForUsage(currency,UCURR_USAGE_STANDARD,ec); * @param currency null-terminated 3-letter ISO 4217 code * @param ec input-output error code * @return a non-negative number of fraction digits to be * displayed, or 0 if there is an error * @stable ICU 3.0 */ U_STABLE int32_t U_EXPORT2 ucurr_getDefaultFractionDigits(const UChar* currency, UErrorCode* ec); /** * Returns the number of fraction digits that should * be displayed for the given currency with usage.
* @param currency null-terminated 3-letter ISO 4217 code * @param usage enum usage for the currency * @param ec input-output error code * @return a non-negative number of fraction digits to be * displayed, or 0 if there is an error * @stable ICU 54 */ U_STABLE int32_t U_EXPORT2 ucurr_getDefaultFractionDigitsForUsage(const UChar* currency, const UCurrencyUsage usage, UErrorCode* ec); /** * Returns the rounding increment for the given currency, or 0.0 if no * rounding is done by the currency. * This is equivalent to ucurr_getRoundingIncrementForUsage(currency,UCURR_USAGE_STANDARD,ec); * @param currency null-terminated 3-letter ISO 4217 code * @param ec input-output error code * @return the non-negative rounding increment, or 0.0 if none, * or 0.0 if there is an error * @stable ICU 3.0 */ U_STABLE double U_EXPORT2 ucurr_getRoundingIncrement(const UChar* currency, UErrorCode* ec); /** * Returns the rounding increment for the given currency, or 0.0 if no * rounding is done by the currency for the given usage. * @param currency null-terminated 3-letter ISO 4217 code * @param usage enum usage for the currency * @param ec input-output error code * @return the non-negative rounding increment, or 0.0 if none, * or 0.0 if there is an error * @stable ICU 54 */ U_STABLE double U_EXPORT2 ucurr_getRoundingIncrementForUsage(const UChar* currency, const UCurrencyUsage usage, UErrorCode* ec); /** * Selector constants for ucurr_openISOCurrencies(). * * @see ucurr_openISOCurrencies * @stable ICU 3.2 */ typedef enum UCurrCurrencyType { /** * Select all ISO-4217 currency codes. * @stable ICU 3.2 */ UCURR_ALL = INT32_MAX, /** * Select only ISO-4217 commonly used currency codes. * These currencies can be found in common use, and they usually have * bank notes or coins associated with the currency code. * This does not include fund codes, precious metals and other * various ISO-4217 codes limited to special financial products.
* @stable ICU 3.2 */ UCURR_COMMON = 1, /** * Select ISO-4217 uncommon currency codes. * These codes represent fund codes, precious metals and other * various ISO-4217 codes limited to special financial products. * A fund code is a monetary resource associated with a currency. * @stable ICU 3.2 */ UCURR_UNCOMMON = 2, /** * Select only deprecated ISO-4217 codes. * These codes are no longer in general public use. * @stable ICU 3.2 */ UCURR_DEPRECATED = 4, /** * Select only non-deprecated ISO-4217 codes. * These codes are in general public use. * @stable ICU 3.2 */ UCURR_NON_DEPRECATED = 8 } UCurrCurrencyType; /** * Provides a UEnumeration object for listing ISO-4217 codes. * @param currType You can use one of several UCurrCurrencyType values for this * variable. You can also | (or) them together to get a specific list of * currencies. Most people will want to use the (UCURR_COMMON|UCURR_NON_DEPRECATED) value to * get a list of current currencies. * @param pErrorCode Error code * @return a UEnumeration over the selected ISO-4217 codes * @stable ICU 3.2 */ U_STABLE UEnumeration * U_EXPORT2 ucurr_openISOCurrencies(uint32_t currType, UErrorCode *pErrorCode); /** * Queries if the given ISO 4217 3-letter code is available on the specified date range. * * Note: For checking availability of a currency on a specific date, specify the date on both 'from' and 'to' * * When 'from' is U_DATE_MIN and 'to' is U_DATE_MAX, this method checks if the specified currency is available any time. * If 'from' and 'to' are same UDate value, this method checks if the specified currency is available on that date. * * @param isoCode * The ISO 4217 3-letter code. * * @param from * The lower bound of the date range, inclusive. When 'from' is U_DATE_MIN, check the availability * of the currency any date before 'to' * * @param to * The upper bound of the date range, inclusive.
When 'to' is U_DATE_MAX, check the availability of * the currency any date after 'from' * * @param errorCode * ICU error code * * @return TRUE if the given ISO 4217 3-letter code is supported on the specified date range. * * @stable ICU 4.8 */ U_STABLE UBool U_EXPORT2 ucurr_isAvailable(const UChar* isoCode, UDate from, UDate to, UErrorCode* errorCode); /** * Finds the number of valid currency codes for the * given locale and date. * @param locale the locale for which to retrieve the * currency count. * @param date the date for which to retrieve the * currency count for the given locale. * @param ec error code * @return the number of currency codes for the * given locale and date. If 0, currency * codes couldn't be found or the input * values are invalid. * @stable ICU 4.0 */ U_STABLE int32_t U_EXPORT2 ucurr_countCurrencies(const char* locale, UDate date, UErrorCode* ec); /** * Finds a currency code for the given locale and date. * @param locale the locale for which to retrieve a currency code. * Currency can be specified by the "currency" keyword * in which case it overrides the default currency code * @param date the date for which to retrieve a currency code for * the given locale. * @param index the index within the available list of currency codes * for the given locale on the given date. * @param buff fill in buffer. Can be NULL for preflighting. * @param buffCapacity capacity of the fill in buffer. Can be 0 for * preflighting. If it is non-zero, the buff parameter * must not be NULL. * @param ec error code * @return length of the currency string. It should always be 3. * If 0, currency couldn't be found or the input values are * invalid. * @stable ICU 4.0 */ U_STABLE int32_t U_EXPORT2 ucurr_forLocaleAndDate(const char* locale, UDate date, int32_t index, UChar* buff, int32_t buffCapacity, UErrorCode* ec); /** * Given a key and a locale, returns an array of string values in a preferred * order that would make a difference.
These are all and only those values where * the open (creation) of the service with the locale formed from the input locale * plus input keyword and that value has different behavior than creation with the * input locale alone. * @param key one of the keys supported by this service. For now, only * "currency" is supported. * @param locale the locale * @param commonlyUsed if set to true it will return only commonly used values * with the given locale in preferred order. Otherwise, * it will return all the available values for the locale. * @param status error status * @return a string enumeration over keyword values for the given key and the locale. * @stable ICU 4.2 */ U_STABLE UEnumeration* U_EXPORT2 ucurr_getKeywordValuesForLocale(const char* key, const char* locale, UBool commonlyUsed, UErrorCode* status); /** * Returns the ISO 4217 numeric code for the currency. *

Note: If the ISO 4217 numeric code is not assigned for the currency or * the currency is unknown, this function returns 0. * * @param currency null-terminated 3-letter ISO 4217 code * @return The ISO 4217 numeric code of the currency, or 0 if none is assigned or the currency is unknown * @stable ICU 49 */ U_STABLE int32_t U_EXPORT2 ucurr_getNumericCode(const UChar* currency); #endif /* #if !UCONFIG_NO_FORMATTING */ #endif /* _UCURR_H_ */ // ucnv_err.h /* ********************************************************************** * Copyright (C) 1999-2009, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * * ucnv_err.h: */ /** * \file * \brief C UConverter predefined error callbacks * *

Error Behaviour Functions

* Defines some error behaviour functions called by ucnv_{from,to}Unicode. * These are provided as part of ICU and many are stable, but they * can also be considered only as an example of what can be done with * callbacks. You may of course write your own. * * If you want to write your own, you may also find the functions from * ucnv_cb.h useful when writing your own callbacks. * * These functions, although public, should NEVER be called directly. * They should be used as parameters to the ucnv_setFromUCallBack * and ucnv_setToUCallBack functions, to set the behaviour of a converter * when it encounters ILLEGAL/UNMAPPED/INVALID sequences. * * Usage example: 'STOP' doesn't need any context, but newContext * could be set to something other than 'NULL' if needed. The available * contexts in this header can modify the default behavior of the callback. * * \code * UErrorCode err = U_ZERO_ERROR; * UConverter *myConverter = ucnv_open("ibm-949", &err); * const void *oldContext; * UConverterFromUCallback oldAction; * * * if (U_SUCCESS(err)) * { * ucnv_setFromUCallBack(myConverter, * UCNV_FROM_U_CALLBACK_STOP, * NULL, * &oldAction, * &oldContext, * &err); * } * \endcode * * The code above tells "myConverter" to stop when it encounters * ILLEGAL/TRUNCATED/INVALID sequences when it is used to convert from * Unicode -> Codepage. The behavior from Codepage to Unicode is not changed, * and ucnv_setToUCallBack would need to be called in order to change * that behavior too.
* * Here is an example with a context: * * \code * UErrorCode err = U_ZERO_ERROR; * UConverter *myConverter = ucnv_open("ibm-949", &err); * const void *oldContext; * UConverterFromUCallback oldAction; * * * if (U_SUCCESS(err)) * { * ucnv_setToUCallBack(myConverter, * UCNV_TO_U_CALLBACK_SUBSTITUTE, * UCNV_SUB_STOP_ON_ILLEGAL, * &oldAction, * &oldContext, * &status); * } * \endcode * * The code above tells "myConverter" to stop when it encounters an * ILLEGAL/TRUNCATED/INVALID sequences when it is used to convert from * Codepage -> Unicode. Any unmapped and legal characters will be * substituted to be the default substitution character. */ #ifndef UCNV_ERR_H #define UCNV_ERR_H #if !UCONFIG_NO_CONVERSION /** Forward declaring the UConverter structure. @stable ICU 2.0 */ struct UConverter; /** C typedef for struct UConverter. @stable ICU 2.0 */ typedef struct UConverter UConverter; /** * FROM_U, TO_U context options for sub callback * @stable ICU 2.0 */ #define UCNV_SUB_STOP_ON_ILLEGAL "i" /** * FROM_U, TO_U context options for skip callback * @stable ICU 2.0 */ #define UCNV_SKIP_STOP_ON_ILLEGAL "i" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX) * @stable ICU 2.0 */ #define UCNV_ESCAPE_ICU NULL /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX) * @stable ICU 2.0 */ #define UCNV_ESCAPE_JAVA "J" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX) * TO_U_CALLBACK_ESCAPE option to escape the character value according to C (\\xXXXX) * @stable ICU 2.0 */ #define UCNV_ESCAPE_C "C" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly * @stable ICU 2.0 */ #define UCNV_ESCAPE_XML_DEC "D" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according
to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly * @stable ICU 2.0 */ #define UCNV_ESCAPE_XML_HEX "X" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX) * @stable ICU 2.0 */ #define UCNV_ESCAPE_UNICODE "U" /** * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to CSS2 conventions (\\HH..H, that is, * a backslash, 1..6 hex digits, and a space) * @stable ICU 4.0 */ #define UCNV_ESCAPE_CSS2 "S" /** * The process condition code to be used with the callbacks. * Codes which are greater than UCNV_IRREGULAR should be * passed on to any chained callbacks. * @stable ICU 2.0 */ typedef enum { UCNV_UNASSIGNED = 0, /**< The code point is unassigned. The error code U_INVALID_CHAR_FOUND will be set. */ UCNV_ILLEGAL = 1, /**< The code point is illegal. For example, \\x81\\x2E is illegal in SJIS because \\x2E is not a valid trail byte for the \\x81 lead byte. Also, starting with Unicode 3.0.1, non-shortest byte sequences in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061) are also illegal, not just irregular. The error code U_ILLEGAL_CHAR_FOUND will be set. */ UCNV_IRREGULAR = 2, /**< The codepoint is not a regular sequence in the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF are irregular UTF-8 byte sequences for single surrogate code points. The error code U_INVALID_CHAR_FOUND will be set. */ UCNV_RESET = 3, /**< The callback is called with this reason when a 'reset' has occurred. Callback should reset all state. */ UCNV_CLOSE = 4, /**< Called when the converter is closed. The callback should release any allocated memory.*/ UCNV_CLONE = 5 /**< Called when ucnv_safeClone() is called on the converter. the pointer available as the 'context' is an alias to the original converters' context pointer.
If the context must be owned by the new converter, the callback must clone the data and call ucnv_setFromUCallback (or setToUCallback) with the correct pointer. @stable ICU 2.2 */ } UConverterCallbackReason; /** * The structure for the fromUnicode callback function parameter. * @stable ICU 2.0 */ typedef struct { uint16_t size; /**< The size of this struct. @stable ICU 2.0 */ UBool flush; /**< The internal state of converter will be reset and data flushed if set to TRUE. @stable ICU 2.0 */ UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */ const UChar *source; /**< Pointer to the source buffer. @stable ICU 2.0 */ const UChar *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */ char *target; /**< Pointer to the target buffer. @stable ICU 2.0 */ const char *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */ int32_t *offsets; /**< Pointer to the buffer that receives the offsets. *offset = blah ; offset++;. @stable ICU 2.0 */ } UConverterFromUnicodeArgs; /** * The structure for the toUnicode callback function parameter. * @stable ICU 2.0 */ typedef struct { uint16_t size; /**< The size of this struct @stable ICU 2.0 */ UBool flush; /**< The internal state of converter will be reset and data flushed if set to TRUE. @stable ICU 2.0 */ UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */ const char *source; /**< Pointer to the source buffer. @stable ICU 2.0 */ const char *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */ UChar *target; /**< Pointer to the target buffer. @stable ICU 2.0 */ const UChar *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */ int32_t *offsets; /**< Pointer to the buffer that receives the offsets. *offset = blah ; offset++;.
@stable ICU 2.0 */ } UConverterToUnicodeArgs; /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This From Unicode callback STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * * @param context Pointer to the callback's private data * @param fromUArgs Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in bytes) of the concerned codepage sequence * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param err This should always be set to a failure status prior to calling. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP ( const void *context, UConverterFromUnicodeArgs *fromUArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This To Unicode callback STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * * @param context Pointer to the callback's private data * @param toUArgs Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param err This should always be set to a failure status prior to calling. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP ( const void *context, UConverterToUnicodeArgs *toUArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This From Unicode callback skips any ILLEGAL_SEQUENCE, or * skips only UNASSIGNED_SEQUENCE depending on the context parameter * simply ignoring those characters.
* * @param context The function currently recognizes the callback options: * UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * NULL: Skips any ILLEGAL_SEQUENCE * @param fromUArgs Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in bytes) of the concerned codepage sequence * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP ( const void *context, UConverterFromUnicodeArgs *fromUArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This From Unicode callback will Substitute the ILLEGAL SEQUENCE, or * UNASSIGNED_SEQUENCE depending on context parameter, with the * current substitution string for the converter. This is the default * callback. * * @param context The function currently recognizes the callback options: * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * NULL: Substitutes any ILLEGAL_SEQUENCE * @param fromUArgs Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in bytes) of the concerned codepage sequence * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status.
* @see ucnv_setSubstChars * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE ( const void *context, UConverterFromUnicodeArgs *fromUArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the * hexadecimal representation of the illegal codepoints * * @param context The function currently recognizes the callback options: *
    *
  • UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal * representation in the format %UXXXX, e.g. "%uFFFE%u00AC%uC8FE"). * In the Event the converter doesn't support the characters {%,U}[A-F][0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * %UD84D%UDC56
  • *
  • UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal * representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE"). * In the Event the converter doesn't support the characters {\,u}[A-F][0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * \\uD84D\\uDC56
  • *
  • UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal * representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE"). * In the Event the converter doesn't support the characters {\,u,U}[A-F][0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * \\U00023456
  • *
  • UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal * representation in the format \htmlonly&#DDDDDDDD;, e.g. "&#65534;&#172;&#51454;")\endhtmlonly. * In the Event the converter doesn't support the characters {&,#}[0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * &#144470; and Zero padding is ignored.
  • *
  • UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal * representation in the format \htmlonly&#xXXXX; e.g. "&#xFFFE;&#x00AC;&#xC8FE;")\endhtmlonly. * In the Event the converter doesn't support the characters {&,#,x}[0-9], * it will substitute the illegal sequence with the substitution characters. * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as * \htmlonly&#x23456;\endhtmlonly
  • *
* @param fromUArgs Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in bytes) of the concerned codepage sequence * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE ( const void *context, UConverterFromUnicodeArgs *fromUArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This To Unicode callback skips any ILLEGAL_SEQUENCE, or * skips only UNASSIGNED_SEQUENCE depending on the context parameter * simply ignoring those characters. * * @param context The function currently recognizes the callback options: * UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * NULL: Skips any ILLEGAL_SEQUENCE * @param toUArgs Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP ( const void *context, UConverterToUnicodeArgs *toUArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY!
* This To Unicode callback will Substitute the ILLEGAL SEQUENCE, or * UNASSIGNED_SEQUENCE depending on context parameter, with the * Unicode substitution character, U+FFFD. * * @param context The function currently recognizes the callback options: * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. * NULL: Substitutes any ILLEGAL_SEQUENCE * @param toUArgs Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE ( const void *context, UConverterToUnicodeArgs *toUArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode * err); /** * DO NOT CALL THIS FUNCTION DIRECTLY! * This To Unicode callback will Substitute the ILLEGAL SEQUENCE with the * hexadecimal representation of the illegal bytes * (in the format %XNN, e.g. "%XFF%X0A%XC8%X03"). * * @param context This function currently recognizes the callback options: * UCNV_ESCAPE_ICU, UCNV_ESCAPE_JAVA, UCNV_ESCAPE_C, UCNV_ESCAPE_XML_DEC, * UCNV_ESCAPE_XML_HEX and UCNV_ESCAPE_UNICODE. * @param toUArgs Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status.
* @stable ICU 2.0 */ U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_ESCAPE ( const void *context, UConverterToUnicodeArgs *toUArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode * err); #endif #endif /*UCNV_ERR_H*/ // ucnv.h /* ********************************************************************** * Copyright (C) 1999-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * ucnv.h: * External APIs for the ICU's codeset conversion library * Bertrand A. Damiba * * Modification History: * * Date Name Description * 04/04/99 helena Fixed internal header inclusion. * 05/11/00 helena Added setFallback and usesFallback APIs. * 06/29/2000 helena Major rewrite of the callback APIs. * 12/07/2000 srl Update of documentation */ /** * \file * \brief C API: Character conversion * *

Character Conversion C API

* *

This API is used to convert codepage or character encoded data to and * from UTF-16. You can open a converter with {@link ucnv_open() }. With that * converter, you can get its properties, set options, convert your data and * close the converter.

* *

Since many software programs recognize different converter names for * different types of converters, there are other functions in this API to * iterate over the converter aliases. The functions {@link ucnv_getAvailableName() }, * {@link ucnv_getAlias() } and {@link ucnv_getStandardName() } are some of the * more frequently used alias functions to get this information.

* *

When a converter encounters an illegal, irregular, invalid or unmappable character * its default behavior is to use a substitution character to replace the * bad byte sequence. This behavior can be changed by using {@link ucnv_setFromUCallBack() } * or {@link ucnv_setToUCallBack() } on the converter. The header ucnv_err.h defines * many other callback actions that can be used instead of a character substitution.

* *

More information about this API can be found in our * User's * Guide.

*/ #ifndef UCNV_H #define UCNV_H #ifndef __USET_H__ /** * USet is the C API type for Unicode sets. * It is forward-declared here to avoid including the header file if related * conversion APIs are not used. * See unicode/uset.h * * @see ucnv_getUnicodeSet * @stable ICU 2.6 */ struct USet; /** @stable ICU 2.6 */ typedef struct USet USet; #endif #if !UCONFIG_NO_CONVERSION U_CDECL_BEGIN /** Maximum length of a converter name including the terminating NULL @stable ICU 2.0 */ #define UCNV_MAX_CONVERTER_NAME_LENGTH 60 /** Maximum length of a converter name including path and terminating NULL @stable ICU 2.0 */ #define UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH) /** Shift in for EBCDIC_STATEFUL and iso2022 states @stable ICU 2.0 */ #define UCNV_SI 0x0F /** Shift out for EBCDIC_STATEFUL and iso2022 states @stable ICU 2.0 */ #define UCNV_SO 0x0E /** * Enum for specifying basic types of converters * @see ucnv_getType * @stable ICU 2.0 */ typedef enum { /** @stable ICU 2.0 */ UCNV_UNSUPPORTED_CONVERTER = -1, /** @stable ICU 2.0 */ UCNV_SBCS = 0, /** @stable ICU 2.0 */ UCNV_DBCS = 1, /** @stable ICU 2.0 */ UCNV_MBCS = 2, /** @stable ICU 2.0 */ UCNV_LATIN_1 = 3, /** @stable ICU 2.0 */ UCNV_UTF8 = 4, /** @stable ICU 2.0 */ UCNV_UTF16_BigEndian = 5, /** @stable ICU 2.0 */ UCNV_UTF16_LittleEndian = 6, /** @stable ICU 2.0 */ UCNV_UTF32_BigEndian = 7, /** @stable ICU 2.0 */ UCNV_UTF32_LittleEndian = 8, /** @stable ICU 2.0 */ UCNV_EBCDIC_STATEFUL = 9, /** @stable ICU 2.0 */ UCNV_ISO_2022 = 10, /** @stable ICU 2.0 */ UCNV_LMBCS_1 = 11, /** @stable ICU 2.0 */ UCNV_LMBCS_2, /** @stable ICU 2.0 */ UCNV_LMBCS_3, /** @stable ICU 2.0 */ UCNV_LMBCS_4, /** @stable ICU 2.0 */ UCNV_LMBCS_5, /** @stable ICU 2.0 */ UCNV_LMBCS_6, /** @stable ICU 2.0 */ UCNV_LMBCS_8, /** @stable ICU 2.0 */ UCNV_LMBCS_11, /** @stable ICU 2.0 */ UCNV_LMBCS_16, /** @stable ICU 2.0 */ UCNV_LMBCS_17, /** @stable ICU 2.0 */ UCNV_LMBCS_18, /** @stable ICU 2.0 */ UCNV_LMBCS_19, /** @stable ICU 2.0
*/ UCNV_LMBCS_LAST = UCNV_LMBCS_19, /** @stable ICU 2.0 */ UCNV_HZ, /** @stable ICU 2.0 */ UCNV_SCSU, /** @stable ICU 2.0 */ UCNV_ISCII, /** @stable ICU 2.0 */ UCNV_US_ASCII, /** @stable ICU 2.0 */ UCNV_UTF7, /** @stable ICU 2.2 */ UCNV_BOCU1, /** @stable ICU 2.2 */ UCNV_UTF16, /** @stable ICU 2.2 */ UCNV_UTF32, /** @stable ICU 2.2 */ UCNV_CESU8, /** @stable ICU 2.4 */ UCNV_IMAP_MAILBOX, /** @stable ICU 4.8 */ UCNV_COMPOUND_TEXT, /* Number of converter types for which we have conversion routines. */ UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES } UConverterType; /** * Enum for specifying which platform a converter ID refers to. * The use of platform/CCSID is not recommended. See ucnv_openCCSID(). * * @see ucnv_getPlatform * @see ucnv_openCCSID * @see ucnv_getCCSID * @stable ICU 2.0 */ typedef enum { UCNV_UNKNOWN = -1, UCNV_IBM = 0 } UConverterPlatform; /** * Function pointer for error callback in the codepage to unicode direction. * Called when an error has occurred in conversion to unicode, or on open/close of the callback (see reason). * @param context Pointer to the callback's private data * @param args Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param pErrorCode ICU error code in/out parameter. * For converter callback functions, set to a conversion error * before the call, and the callback may reset it to U_ZERO_ERROR. * @see ucnv_setToUCallBack * @see UConverterToUnicodeArgs * @stable ICU 2.0 */ typedef void (U_EXPORT2 *UConverterToUCallback) ( const void* context, UConverterToUnicodeArgs *args, const char *codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode *pErrorCode); /** * Function pointer for error callback in the unicode to codepage direction.
* Called when an error has occurred in conversion from unicode, or on open/close of the callback (see reason). * @param context Pointer to the callback's private data * @param args Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Number of UChars in the concerned Unicode sequence * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param pErrorCode ICU error code in/out parameter. * For converter callback functions, set to a conversion error * before the call, and the callback may reset it to U_ZERO_ERROR. * @see ucnv_setFromUCallBack * @stable ICU 2.0 */ typedef void (U_EXPORT2 *UConverterFromUCallback) ( const void* context, UConverterFromUnicodeArgs *args, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *pErrorCode); U_CDECL_END /** * Character that separates converter names from options and options from each other. * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_OPTION_SEP_CHAR ',' /** * String version of UCNV_OPTION_SEP_CHAR. * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_OPTION_SEP_STRING "," /** * Character that separates a converter option from its value. * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_VALUE_SEP_CHAR '=' /** * String version of UCNV_VALUE_SEP_CHAR. * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_VALUE_SEP_STRING "=" /** * Converter option for specifying a locale. * For example, ucnv_open("SCSU,locale=ja", &errorCode); * See convrtrs.txt. * * @see ucnv_open * @stable ICU 2.0 */ #define UCNV_LOCALE_OPTION_STRING ",locale=" /** * Converter option for specifying a version selector (0..9) for some converters. * For example, * \code * ucnv_open("UTF-7,version=1", &errorCode); * \endcode * See convrtrs.txt.
* * @see ucnv_open * @stable ICU 2.4 */ #define UCNV_VERSION_OPTION_STRING ",version=" /** * Converter option for EBCDIC SBCS or mixed-SBCS/DBCS (stateful) codepages. * Swaps Unicode mappings for EBCDIC LF and NL codes, as used on * S/390 (z/OS) Unix System Services (Open Edition). * For example, ucnv_open("ibm-1047,swaplfnl", &errorCode); * See convrtrs.txt. * * @see ucnv_open * @stable ICU 2.4 */ #define UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl" /** * Do a fuzzy compare of two converter/alias names. * The comparison is case-insensitive, ignores leading zeroes if they are not * followed by further digits, and ignores all but letters and digits. * Thus the strings "UTF-8", "utf_8", "u*T@f08" and "Utf 8" are exactly equivalent. * See section 1.4, Charset Alias Matching in Unicode Technical Standard #22 * at http://www.unicode.org/reports/tr22/ * * @param name1 a converter name or alias, zero-terminated * @param name2 a converter name or alias, zero-terminated * @return 0 if the names match, or a negative value if the name1 * lexically precedes name2, or a positive value if the name1 * lexically follows name2. * @stable ICU 2.0 */ U_STABLE int U_EXPORT2 ucnv_compareNames(const char *name1, const char *name2); /** * Creates a UConverter object with the name of a coded character set specified as a C string. * The actual name will be resolved with the alias file * using a case-insensitive string comparison that ignores * leading zeroes and all non-alphanumeric characters. * E.g., the names "UTF8", "utf-8", "u*T@f08" and "Utf 8" are all equivalent. * (See also ucnv_compareNames().) * If NULL is passed for the converter name, it will create one with the * getDefaultName return value. * *

A converter name for ICU 1.5 and above may contain options * like a locale specification to control the specific behavior of * the newly instantiated converter. * The meaning of the options depends on the particular converter. * If an option is not defined for or recognized by a given converter, then it is ignored.

* *

Options are appended to the converter name string, with a * UCNV_OPTION_SEP_CHAR between the name and the first option and * also between adjacent options.

* *

If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING.

* *

The conversion behavior and names can vary between platforms. ICU may * convert some characters differently from other platforms. Details on this topic * are in the User's * Guide. Aliases starting with a "cp" prefix have no specific meaning * other than it's an alias starting with the letters "cp". Please do not * associate any meaning to these aliases.

* * \snippet samples/ucnv/convsamp.cpp ucnv_open * * @param converterName Name of the coded character set table. * This may have options appended to the string. * IANA alias character set names, IBM CCSIDs starting with "ibm-", * Windows codepage numbers starting with "windows-" are frequently * used for this parameter. See ucnv_getAvailableName and * ucnv_getAlias for a complete list that is available. * If this parameter is NULL, the default converter will be used. * @param err outgoing error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR * @return the created Unicode converter object, or NULL if an error occurred * @see ucnv_openU * @see ucnv_openCCSID * @see ucnv_getAvailableName * @see ucnv_getAlias * @see ucnv_getDefaultName * @see ucnv_close * @see ucnv_compareNames * @stable ICU 2.0 */ U_STABLE UConverter* U_EXPORT2 ucnv_open(const char *converterName, UErrorCode *err); /** * Creates a Unicode converter with the names specified as unicode string. * The name should be limited to the ASCII-7 alphanumerics range. * The actual name will be resolved with the alias file * using a case-insensitive string comparison that ignores * leading zeroes and all non-alphanumeric characters. * E.g., the names "UTF8", "utf-8", "u*T@f08" and "Utf 8" are all equivalent. * (See also ucnv_compareNames().) * If NULL is passed for the converter name, it will create * one with the ucnv_getDefaultName() return value. * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. * *

See ucnv_open for the complete details

* @param name Name of the UConverter table in a zero terminated * Unicode string * @param err outgoing error status U_MEMORY_ALLOCATION_ERROR, * U_FILE_ACCESS_ERROR * @return the created Unicode converter object, or NULL if an * error occurred * @see ucnv_open * @see ucnv_openCCSID * @see ucnv_close * @see ucnv_compareNames * @stable ICU 2.0 */ U_STABLE UConverter* U_EXPORT2 ucnv_openU(const UChar *name, UErrorCode *err); /** * Creates a UConverter object from a CCSID number and platform pair. * Note that the usefulness of this function is limited to platforms with numeric * encoding IDs. Only IBM and Microsoft platforms use numeric (16-bit) identifiers for * encodings. * * In addition, IBM CCSIDs and Unicode conversion tables are not 1:1 related. * For many IBM CCSIDs there are multiple (up to six) Unicode conversion tables, and * for some Unicode conversion tables there are multiple CCSIDs. * Some "alternate" Unicode conversion tables are provided by the * IBM CDRA conversion table registry. * The most prominent example of a systematic modification of conversion tables that is * not provided in the form of conversion table files in the repository is * that S/390 Unix System Services swaps the codes for Line Feed and New Line in all * EBCDIC codepages, which requires such a swap in the Unicode conversion tables as well. * * Only IBM default conversion tables are accessible with ucnv_openCCSID(). * ucnv_getCCSID() will return the same CCSID for all conversion tables that are associated * with that CCSID. * * Currently, the only "platform" supported in the ICU converter API is UCNV_IBM. * * In summary, the use of CCSIDs and the associated API functions is not recommended.
* * In order to open a converter with the default IBM CDRA Unicode conversion table, * you can use this function or use the prefix "ibm-": * \code * char name[20]; * sprintf(name, "ibm-%hu", ccsid); * cnv=ucnv_open(name, &errorCode); * \endcode * * In order to open a converter with the IBM S/390 Unix System Services variant * of a Unicode/EBCDIC conversion table, * you can use the prefix "ibm-" together with the option string UCNV_SWAP_LFNL_OPTION_STRING: * \code * char name[20]; * sprintf(name, "ibm-%hu" UCNV_SWAP_LFNL_OPTION_STRING, ccsid); * cnv=ucnv_open(name, &errorCode); * \endcode * * In order to open a converter from a Microsoft codepage number, use the prefix "cp": * \code * char name[20]; * sprintf(name, "cp%hu", codepageID); * cnv=ucnv_open(name, &errorCode); * \endcode * * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. * * @param codepage codepage number to create * @param platform the platform in which the codepage number exists * @param err error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR * @return the created Unicode converter object, or NULL if an error * occurred. * @see ucnv_open * @see ucnv_openU * @see ucnv_close * @see ucnv_getCCSID * @see ucnv_getPlatform * @see UConverterPlatform * @stable ICU 2.0 */ U_STABLE UConverter* U_EXPORT2 ucnv_openCCSID(int32_t codepage, UConverterPlatform platform, UErrorCode * err); /** *

Creates a UConverter object specified from a packageName and a converterName.

* *

The packageName and converterName must point to an ICU udata object, as defined by * udata_open( packageName, "cnv", converterName, err) or equivalent. * Typically, packageName will refer to a (.dat) file, or to a package registered with * udata_setAppData(). Using a full file or directory pathname for packageName is deprecated.

* *

The name will NOT be looked up in the alias mechanism, nor will the converter be * stored in the converter cache or the alias table. The only way to open further converters * is to call this function multiple times, or use the ucnv_safeClone() function to clone a * 'master' converter.

* *

A future version of ICU may add alias table lookups and/or caching * to this function.

* *

Example Use: * cnv = ucnv_openPackage("myapp", "myconverter", &err); *

* * @param packageName name of the package (equivalent to 'path' in udata_open() call) * @param converterName name of the data item to be used, without suffix. * @param err outgoing error status U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR * @return the created Unicode converter object, or NULL if an error occured * @see udata_open * @see ucnv_open * @see ucnv_safeClone * @see ucnv_close * @stable ICU 2.2 */ U_STABLE UConverter* U_EXPORT2 ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode *err); /** * Thread safe converter cloning operation. * For most efficient operation, pass in a stackBuffer (and a *pBufferSize) * with at least U_CNV_SAFECLONE_BUFFERSIZE bytes of space. * If the buffer size is sufficient, then the clone will use the stack buffer; * otherwise, it will be allocated, and *pBufferSize will indicate * the actual size. (This should not occur with U_CNV_SAFECLONE_BUFFERSIZE.) * * You must ucnv_close() the clone in any case. * * If *pBufferSize==0, (regardless of whether stackBuffer==NULL or not) * then *pBufferSize will be changed to a sufficient size * for cloning this converter, * without actually cloning the converter ("pure pre-flighting"). * * If *pBufferSize is greater than zero but not large enough for a stack-based * clone, then the converter is cloned using newly allocated memory * and *pBufferSize is changed to the necessary size. * * If the converter clone fits into the stack buffer but the stack buffer is not * sufficiently aligned for the clone, then the clone will use an * adjusted pointer and use an accordingly smaller buffer size. * * @param cnv converter to be cloned * @param stackBuffer Deprecated functionality as of ICU 52, use NULL.
* user allocated space for the new clone. If NULL new memory will be allocated. * If buffer is not large enough, new memory will be allocated. * Clients can use the U_CNV_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations. * @param pBufferSize Deprecated functionality as of ICU 52, use NULL or 1.
* pointer to size of allocated space. * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_WARNING, * is used if any allocations were necessary. * However, it is better to check if *pBufferSize grew for checking for * allocations because warning codes can be overridden by subsequent * function calls. * @return pointer to the new clone * @stable ICU 2.0 */ U_STABLE UConverter * U_EXPORT2 ucnv_safeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); /** * Deletes the unicode converter and releases resources associated * with just this instance. * Does not free up shared converter tables. * * @param converter the converter object to be deleted * @see ucnv_open * @see ucnv_openU * @see ucnv_openCCSID * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_close(UConverter * converter); /** * Fills in the output parameter, subChars, with the substitution characters * as multiple bytes. * If ucnv_setSubstString() set a Unicode string because the converter is * stateful, then subChars will be an empty string. * * @param converter the Unicode converter * @param subChars the subsitution characters * @param len on input the capacity of subChars, on output the number * of bytes copied to it * @param err the outgoing error status code. * If the substitution character array is too small, an * U_INDEX_OUTOFBOUNDS_ERROR will be returned. * @see ucnv_setSubstString * @see ucnv_setSubstChars * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_getSubstChars(const UConverter *converter, char *subChars, int8_t *len, UErrorCode *err); /** * Sets the substitution chars when converting from unicode to a codepage. The * substitution is specified as a string of 1-4 bytes, and may contain * NULL bytes. * The subChars must represent a single character. The caller needs to know the * byte sequence of a valid character in the converter's charset. 
* For some converters, for example some ISO 2022 variants, only single-byte * substitution characters may be supported. * The newer ucnv_setSubstString() function relaxes these limitations. * * @param converter the Unicode converter * @param subChars the substitution character byte sequence we want set * @param len the number of bytes in subChars * @param err the error status code. U_INDEX_OUTOFBOUNDS_ERROR if * len is bigger than the maximum number of bytes allowed in subchars * @see ucnv_setSubstString * @see ucnv_getSubstChars * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_setSubstChars(UConverter *converter, const char *subChars, int8_t len, UErrorCode *err); /** * Set a substitution string for converting from Unicode to a charset. * The caller need not know the charset byte sequence for each charset. * * Unlike ucnv_setSubstChars() which is designed to set a charset byte sequence * for a single character, this function takes a Unicode string with * zero, one or more characters, and immediately verifies that the string can be * converted to the charset. * If not, or if the result is too long (more than 32 bytes as of ICU 3.6), * then the function returns with an error accordingly. * * Also unlike ucnv_setSubstChars(), this function works for stateful charsets * by converting on the fly at the point of substitution rather than setting * a fixed byte sequence. * * @param cnv The UConverter object. * @param s The Unicode string. * @param length The number of UChars in s, or -1 for a NUL-terminated string. * @param err Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) 
*
 * @see ucnv_setSubstChars
 * @see ucnv_getSubstChars
 * @stable ICU 3.6
 */
U_STABLE void U_EXPORT2 ucnv_setSubstString(UConverter *cnv, const UChar *s, int32_t length, UErrorCode *err);

/**
 * Fills in the output parameter, errBytes, with the error characters from the
 * last failing conversion.
 *
 * @param converter the Unicode converter
 * @param errBytes the codepage bytes which were in error
 * @param len on input the capacity of errBytes, on output the number of
 *  bytes which were copied to it
 * @param err the error status code.
 *  If the errBytes array is too small, an
 *  U_INDEX_OUTOFBOUNDS_ERROR will be returned.
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_getInvalidChars(const UConverter *converter, char *errBytes, int8_t *len, UErrorCode *err);

/**
 * Fills in the output parameter, errUChars, with the error characters from the
 * last failing conversion.
 *
 * @param converter the Unicode converter
 * @param errUChars the UChars which were in error
 * @param len on input the capacity of errUChars, on output the number of
 *  UChars which were copied to it
 * @param err the error status code.
 *  If the errUChars array is too small, an
 *  U_INDEX_OUTOFBOUNDS_ERROR will be returned.
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_getInvalidUChars(const UConverter *converter, UChar *errUChars, int8_t *len, UErrorCode *err);

/**
 * Resets the state of a converter to the default state. This is used
 * in the case of an error, to restart a conversion from a known default state.
 * It will also empty the internal output buffers.
 * @param converter the Unicode converter
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_reset(UConverter *converter);

/**
 * Resets the to-Unicode part of a converter state to the default state.
 * This is used in the case of an error to restart a conversion to
 * Unicode to a known default state. It will also empty the internal
 * output buffers used for the conversion to Unicode codepoints.
* @param converter the Unicode converter
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_resetToUnicode(UConverter *converter);

/**
 * Resets the from-Unicode part of a converter state to the default state.
 * This is used in the case of an error to restart a conversion from
 * Unicode to a known default state. It will also empty the internal output
 * buffers used for the conversion from Unicode codepoints.
 * @param converter the Unicode converter
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_resetFromUnicode(UConverter *converter);

/**
 * Returns the maximum number of bytes that are output per UChar in conversion
 * from Unicode using this converter.
 * The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING
 * to calculate the size of a target buffer for conversion from Unicode.
 *
 * Note: Before ICU 2.8, this function did not return reliable numbers for
 * some stateful converters (EBCDIC_STATEFUL, ISO-2022) and LMBCS.
 *
 * This number may not be the same as the maximum number of bytes per
 * "conversion unit". In other words, it may not be the intuitively expected
 * number of bytes per character that would be published for a charset,
 * and may not fulfill any other purpose than the allocation of an output
 * buffer of guaranteed sufficient size for a given input length and converter.
 *
 * Examples for special cases that are taken into account:
 * - Supplementary code points may convert to more bytes than BMP code points.
 *   This function returns bytes per UChar (UTF-16 code unit), not per
 *   Unicode code point, for efficient buffer allocation.
 * - State-shifting output (SI/SO, escapes, etc.) from stateful converters.
 * - When m input UChars are converted to n output bytes, then the maximum n/m
 *   (bytes per UChar) is taken into account.
*
 * The number returned here does not take into account
 * (see UCNV_GET_MAX_BYTES_FOR_STRING):
 * - callbacks which output more than one charset character sequence per call,
 *   like escape callbacks
 * - initial and final non-character bytes that are output by some converters
 *   (automatic BOMs, initial escape sequence, final SI, etc.)
 *
 * Examples for returned values:
 * - SBCS charsets: 1
 * - Shift-JIS: 2
 * - UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted)
 * - UTF-8: 3 (3 per BMP, 4 per surrogate _pair_)
 * - EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS)
 * - ISO-2022: 3 (always outputs UTF-8)
 * - ISO-2022-JP: 6 (4-byte escape sequences + DBCS)
 * - ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3 + DBCS)
 *
 * @param converter The Unicode converter.
 * @return The maximum number of bytes per UChar (16 bit code unit)
 *    that are output by ucnv_fromUnicode(),
 *    to be used together with UCNV_GET_MAX_BYTES_FOR_STRING
 *    for buffer allocation.
 *
 * @see UCNV_GET_MAX_BYTES_FOR_STRING
 * @see ucnv_getMinCharSize
 * @stable ICU 2.0
 */
U_STABLE int8_t U_EXPORT2 ucnv_getMaxCharSize(const UConverter *converter);

/**
 * Calculates the size of a buffer for conversion from Unicode to a charset.
 * The calculated size is guaranteed to be sufficient for this conversion.
 *
 * It takes into account initial and final non-character bytes that are output
 * by some converters.
 * It does not take into account callbacks which output more than one charset
 * character sequence per call, like escape callbacks.
 * The default (substitution) callback only outputs one charset character sequence.
 *
 * @param length Number of UChars to be converted.
 * @param maxCharSize Return value from ucnv_getMaxCharSize() for the converter
 *                    that will be used.
 * @return Size of a buffer that will be large enough to hold the output bytes of
 *         converting length UChars with the converter that returned the maxCharSize.
* * @see ucnv_getMaxCharSize * @stable ICU 2.8 */ #define UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) \ (((int32_t)(length)+10)*(int32_t)(maxCharSize)) /** * Returns the minimum byte length (per codepoint) for characters in this codepage. * This is usually either 1 or 2. * @param converter the Unicode converter * @return the minimum number of bytes per codepoint allowed by this particular converter * @see ucnv_getMaxCharSize * @stable ICU 2.0 */ U_STABLE int8_t U_EXPORT2 ucnv_getMinCharSize(const UConverter *converter); /** * Returns the display name of the converter passed in based on the Locale * passed in. If the locale contains no display name, the internal ASCII * name will be filled in. * * @param converter the Unicode converter. * @param displayLocale is the specific Locale we want to localised for * @param displayName user provided buffer to be filled in * @param displayNameCapacity size of displayName Buffer * @param err error status code * @return displayNameLength number of UChar needed in displayName * @see ucnv_getName * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ucnv_getDisplayName(const UConverter *converter, const char *displayLocale, UChar *displayName, int32_t displayNameCapacity, UErrorCode *err); /** * Gets the internal, canonical name of the converter (zero-terminated). * The lifetime of the returned string will be that of the converter * passed to this function. * @param converter the Unicode converter * @param err UErrorCode status * @return the internal name of the converter * @see ucnv_getDisplayName * @stable ICU 2.0 */ U_STABLE const char * U_EXPORT2 ucnv_getName(const UConverter *converter, UErrorCode *err); /** * Gets a codepage number associated with the converter. This is not guaranteed * to be the one used to create the converter. Some converters do not represent * platform registered codepages and return zero for the codepage number. * The error code fill-in parameter indicates if the codepage number * is available. 
* Does not check if the converter is NULL or if converter's data
 * table is NULL.
 *
 * Important: The use of CCSIDs is not recommended because it is limited
 * to only two platforms in principle and only one (UCNV_IBM) in the current
 * ICU converter API.
 * Also, CCSIDs are insufficient to identify IBM Unicode conversion tables precisely.
 * For more details see ucnv_openCCSID().
 *
 * @param converter the Unicode converter
 * @param err the error status code.
 * @return If any error occurs, -1 will be returned; otherwise, the codepage number
 * will be returned
 * @see ucnv_openCCSID
 * @see ucnv_getPlatform
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2 ucnv_getCCSID(const UConverter *converter, UErrorCode *err);

/**
 * Gets a codepage platform associated with the converter. Currently,
 * only UCNV_IBM will be returned.
 * Does not test if the converter is NULL or if converter's data
 * table is NULL.
 * @param converter the Unicode converter
 * @param err the error status code.
 * @return The codepage platform
 * @stable ICU 2.0
 */
U_STABLE UConverterPlatform U_EXPORT2 ucnv_getPlatform(const UConverter *converter, UErrorCode *err);

/**
 * Gets the type of the converter
 * e.g. SBCS, MBCS, DBCS, UTF8, UTF16_BE, UTF16_LE, ISO_2022,
 * EBCDIC_STATEFUL, LATIN_1
 * @param converter a valid, opened converter
 * @return the type of the converter
 * @stable ICU 2.0
 */
U_STABLE UConverterType U_EXPORT2 ucnv_getType(const UConverter * converter);

/**
 * Gets the "starter" (lead) bytes for converters of type MBCS.
 * Will fill in an U_ILLEGAL_ARGUMENT_ERROR if converter passed in
 * is not MBCS. Fills in an array of type UBool, with the value of the byte
 * as offset to the array. For example, if (starters[0x20] == TRUE) at return,
 * it means that the byte 0x20 is a starter byte in this converter.
 * Context pointers are always owned by the caller.
*
 * @param converter a valid, opened converter of type MBCS
 * @param starters an array of size 256 to be filled in
 * @param err error status, U_ILLEGAL_ARGUMENT_ERROR if the
 * converter is not a type which can return starters.
 * @see ucnv_getType
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_getStarters(const UConverter* converter, UBool starters[256], UErrorCode* err);

/**
 * Selectors for Unicode sets that can be returned by ucnv_getUnicodeSet().
 * @see ucnv_getUnicodeSet
 * @stable ICU 2.6
 */
typedef enum UConverterUnicodeSet {
    /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */
    UCNV_ROUNDTRIP_SET,
    /** Select the set of Unicode code points with roundtrip or fallback mappings. @stable ICU 4.0 */
    UCNV_ROUNDTRIP_AND_FALLBACK_SET,
    /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */
    UCNV_SET_COUNT
} UConverterUnicodeSet;

/**
 * Returns the set of Unicode code points that can be converted by an ICU converter.
 *
 * Returns one of several kinds of set:
 *
 * 1. UCNV_ROUNDTRIP_SET
 *
 * The set of all Unicode code points that can be roundtrip-converted
 * (converted without any data loss) with the converter (ucnv_fromUnicode()).
 * This set will not include code points that have fallback mappings
 * or are only the result of reverse fallback mappings.
 * This set will also not include PUA code points with fallbacks, although
 * ucnv_fromUnicode() will always use those mappings despite ucnv_setFallback().
 * See UTR #22 "Character Mapping Markup Language"
 * at http://www.unicode.org/reports/tr22/
 *
 * This is useful for example for
 * - checking that a string or document can be roundtrip-converted with a converter,
 *   without/before actually performing the conversion
 * - testing if a converter can be used for typical text for a certain locale,
 *   by comparing its roundtrip set with the set of ExemplarCharacters from
 *   ICU's locale data or other sources
 *
 * 2.
UCNV_ROUNDTRIP_AND_FALLBACK_SET
 *
 * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode())
 * when fallbacks are turned on (see ucnv_setFallback()).
 * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks).
 *
 * In the future, there may be more UConverterUnicodeSet choices to select
 * sets with different properties.
 *
 * @param cnv The converter for which a set is requested.
 * @param setFillIn A valid USet *. It will be cleared by this function before
 *            the converter's specific set is filled into the USet.
 * @param whichSet A UConverterUnicodeSet selector:
 *              UCNV_ROUNDTRIP_SET or UCNV_ROUNDTRIP_AND_FALLBACK_SET,
 *              as described in this comment.
 * @param pErrorCode ICU error code in/out parameter.
 *                   Must fulfill U_SUCCESS before the function call.
 *
 * @see UConverterUnicodeSet
 * @see uset_open
 * @see uset_close
 * @stable ICU 2.6
 */
U_STABLE void U_EXPORT2 ucnv_getUnicodeSet(const UConverter *cnv, USet *setFillIn, UConverterUnicodeSet whichSet, UErrorCode *pErrorCode);

/**
 * Gets the current callback function used by the converter when an illegal
 * or invalid codepage sequence is found.
 * Context pointers are always owned by the caller.
 *
 * @param converter the unicode converter
 * @param action fillin: returns the callback function pointer
 * @param context fillin: returns the callback's private void* context
 * @see ucnv_setToUCallBack
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_getToUCallBack (const UConverter * converter, UConverterToUCallback *action, const void **context);

/**
 * Gets the current callback function used by the converter when illegal
 * or invalid Unicode sequence is found.
 * Context pointers are always owned by the caller.
*
 * @param converter the unicode converter
 * @param action fillin: returns the callback function pointer
 * @param context fillin: returns the callback's private void* context
 * @see ucnv_setFromUCallBack
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_getFromUCallBack (const UConverter * converter, UConverterFromUCallback *action, const void **context);

/**
 * Changes the callback function used by the converter when
 * an illegal or invalid sequence is found.
 * Context pointers are always owned by the caller.
 * Predefined actions and contexts can be found in the ucnv_err.h header.
 *
 * @param converter the unicode converter
 * @param newAction the new callback function
 * @param newContext the new toUnicode callback context pointer. This can be NULL.
 * @param oldAction fillin: returns the old callback function pointer. This can be NULL.
 * @param oldContext fillin: returns the old callback's private void* context. This can be NULL.
 * @param err The error code status
 * @see ucnv_getToUCallBack
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_setToUCallBack (UConverter * converter, UConverterToUCallback newAction, const void* newContext, UConverterToUCallback *oldAction, const void** oldContext, UErrorCode * err);

/**
 * Changes the current callback function used by the converter when
 * an illegal or invalid sequence is found.
 * Context pointers are always owned by the caller.
 * Predefined actions and contexts can be found in the ucnv_err.h header.
 *
 * @param converter the unicode converter
 * @param newAction the new callback function
 * @param newContext the new fromUnicode callback context pointer. This can be NULL.
 * @param oldAction fillin: returns the old callback function pointer. This can be NULL.
 * @param oldContext fillin: returns the old callback's private void* context. This can be NULL.
* @param err The error code status
 * @see ucnv_getFromUCallBack
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_setFromUCallBack (UConverter * converter, UConverterFromUCallback newAction, const void *newContext, UConverterFromUCallback *oldAction, const void **oldContext, UErrorCode * err);

/**
 * Converts an array of unicode characters to an array of codepage
 * characters. This function is optimized for converting a continuous
 * stream of data in buffer-sized chunks, where the entire source and
 * target do not fit in available buffers.
 *
 * The source pointer is an in/out parameter. It starts out pointing where the
 * conversion is to begin, and ends up pointing after the last UChar consumed.
 *
 * Target similarly starts out pointing at the first available byte in the output
 * buffer, and ends up pointing after the last byte written to the output.
 *
 * The converter always attempts to consume the entire source buffer, unless
 * (1.) the target buffer is full, or (2.) a failing error is returned from the
 * current callback function. When a successful error status has been
 * returned, it means that all of the source buffer has been
 * consumed. At that point, the caller should reset the source and
 * sourceLimit pointers to point to the next chunk.
 *
 * At the end of the stream (flush==TRUE), the input is completely consumed
 * when *source==sourceLimit and no error code is set.
 * The converter object is then automatically reset by this function.
 * (This means that a converter need not be reset explicitly between data
 * streams if it finishes the previous stream without errors.)
 *
 * This is a stateful conversion. Additionally, even when all source data has
 * been consumed, some data may be in the converter's internal state.
* Call this function repeatedly, updating the target pointers with * the next empty chunk of target in case of a * U_BUFFER_OVERFLOW_ERROR, and updating the source pointers * with the next chunk of source when a successful error status is * returned, until there are no more chunks of source data. * @param converter the Unicode converter * @param target I/O parameter. Input : Points to the beginning of the buffer to copy * codepage characters to. Output : points to after the last codepage character copied * to target. * @param targetLimit the pointer just after last of the target buffer * @param source I/O parameter, pointer to pointer to the source Unicode character buffer. * @param sourceLimit the pointer just after the last of the source buffer * @param offsets if NULL is passed, nothing will happen to it, otherwise it needs to have the same number * of allocated cells as target. Will fill in offsets from target to source pointer * e.g: offsets[3] is equal to 6, it means that the target[3] was a result of transcoding source[6] * For output data carried across calls, and other data without a specific source character * (such as from escape sequences or callbacks) -1 will be placed for offsets. * @param flush set to TRUE if the current source buffer is the last available * chunk of the source, FALSE otherwise. Note that if a failing status is returned, * this function may have to be called multiple times with flush set to TRUE until * the source buffer is consumed. * @param err the error status. U_ILLEGAL_ARGUMENT_ERROR will be set if the * converter is NULL. * U_BUFFER_OVERFLOW_ERROR will be set if the target is full and there is * still data to be written to the target. 
* @see ucnv_fromUChars
 * @see ucnv_convert
 * @see ucnv_getMinCharSize
 * @see ucnv_setFromUCallBack
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_fromUnicode (UConverter * converter, char **target, const char *targetLimit, const UChar ** source, const UChar * sourceLimit, int32_t* offsets, UBool flush, UErrorCode * err);

/**
 * Converts a buffer of codepage bytes into an array of Unicode UChars.
 * This function is optimized for converting a continuous
 * stream of data in buffer-sized chunks, where the entire source and
 * target do not fit in available buffers.
 *
 * The source pointer is an in/out parameter. It starts out pointing where the
 * conversion is to begin, and ends up pointing after the last byte of source consumed.
 *
 * Target similarly starts out pointing at the first available UChar in the output
 * buffer, and ends up pointing after the last UChar written to the output.
 * It does NOT necessarily keep UChar sequences together.
 *
 * The converter always attempts to consume the entire source buffer, unless
 * (1.) the target buffer is full, or (2.) a failing error is returned from the
 * current callback function. When a successful error status has been
 * returned, it means that all of the source buffer has been
 * consumed. At that point, the caller should reset the source and
 * sourceLimit pointers to point to the next chunk.
 *
 * At the end of the stream (flush==TRUE), the input is completely consumed
 * when *source==sourceLimit and no error code is set.
 * The converter object is then automatically reset by this function.
 * (This means that a converter need not be reset explicitly between data
 * streams if it finishes the previous stream without errors.)
 *
 * This is a stateful conversion. Additionally, even when all source data has
 * been consumed, some data may be in the converter's internal state.
* Call this function repeatedly, updating the target pointers with * the next empty chunk of target in case of a * U_BUFFER_OVERFLOW_ERROR, and updating the source pointers * with the next chunk of source when a successful error status is * returned, until there are no more chunks of source data. * @param converter the Unicode converter * @param target I/O parameter. Input : Points to the beginning of the buffer to copy * UChars into. Output : points to after the last UChar copied. * @param targetLimit the pointer just after the end of the target buffer * @param source I/O parameter, pointer to pointer to the source codepage buffer. * @param sourceLimit the pointer to the byte after the end of the source buffer * @param offsets if NULL is passed, nothing will happen to it, otherwise it needs to have the same number * of allocated cells as target. Will fill in offsets from target to source pointer * e.g: offsets[3] is equal to 6, it means that the target[3] was a result of transcoding source[6] * For output data carried across calls, and other data without a specific source character * (such as from escape sequences or callbacks) -1 will be placed for offsets. * @param flush set to TRUE if the current source buffer is the last available * chunk of the source, FALSE otherwise. Note that if a failing status is returned, * this function may have to be called multiple times with flush set to TRUE until * the source buffer is consumed. * @param err the error status. U_ILLEGAL_ARGUMENT_ERROR will be set if the * converter is NULL. * U_BUFFER_OVERFLOW_ERROR will be set if the target is full and there is * still data to be written to the target. 
* @see ucnv_toUChars
 * @see ucnv_convert
 * @see ucnv_getMinCharSize
 * @see ucnv_setToUCallBack
 * @see ucnv_getNextUChar
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2 ucnv_toUnicode(UConverter *converter, UChar **target, const UChar *targetLimit, const char **source, const char *sourceLimit, int32_t *offsets, UBool flush, UErrorCode *err);

/**
 * Convert the Unicode string into a codepage string using an existing UConverter.
 * The output string is NUL-terminated if possible.
 *
 * This function is a more convenient but less powerful version of ucnv_fromUnicode().
 * It is only useful for whole strings, not for streaming conversion.
 *
 * The maximum output buffer capacity required (barring output from callbacks) will be
 * UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(cnv)).
 *
 * @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called)
 * @param dest destination string buffer, can be NULL if destCapacity==0
 * @param destCapacity the number of chars available at dest
 * @param src the input Unicode string
 * @param srcLength the input string length, or -1 if NUL-terminated
 * @param pErrorCode normal ICU error code;
 *                  common error codes that may be set by this function include
 *                  U_BUFFER_OVERFLOW_ERROR, U_STRING_NOT_TERMINATED_WARNING,
 *                  U_ILLEGAL_ARGUMENT_ERROR, and conversion errors
 * @return the length of the output string, not counting the terminating NUL;
 *         if the length is greater than destCapacity, then the string will not fit
 *         and a buffer of the indicated length would need to be passed in
 * @see ucnv_fromUnicode
 * @see ucnv_convert
 * @see UCNV_GET_MAX_BYTES_FOR_STRING
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2 ucnv_fromUChars(UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode);

/**
 * Convert the codepage string into a Unicode string using an existing UConverter.
 * The output string is NUL-terminated if possible.
*
 * This function is a more convenient but less powerful version of ucnv_toUnicode().
 * It is only useful for whole strings, not for streaming conversion.
 *
 * The maximum output buffer capacity required (barring output from callbacks) will be
 * 2*srcLength (each char may be converted into a surrogate pair).
 *
 * @param cnv the converter object to be used (ucnv_resetToUnicode() will be called)
 * @param dest destination string buffer, can be NULL if destCapacity==0
 * @param destCapacity the number of UChars available at dest
 * @param src the input codepage string
 * @param srcLength the input string length, or -1 if NUL-terminated
 * @param pErrorCode normal ICU error code;
 *                  common error codes that may be set by this function include
 *                  U_BUFFER_OVERFLOW_ERROR, U_STRING_NOT_TERMINATED_WARNING,
 *                  U_ILLEGAL_ARGUMENT_ERROR, and conversion errors
 * @return the length of the output string, not counting the terminating NUL;
 *         if the length is greater than destCapacity, then the string will not fit
 *         and a buffer of the indicated length would need to be passed in
 * @see ucnv_toUnicode
 * @see ucnv_convert
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2 ucnv_toUChars(UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode);

/**
 * Convert a codepage buffer into Unicode one character at a time.
 * The input is completely consumed when the U_INDEX_OUTOFBOUNDS_ERROR is set.
 *
 * Advantage compared to ucnv_toUnicode() or ucnv_toUChars():
 * - Faster for small amounts of data, for most converters, e.g.,
 *   US-ASCII, ISO-8859-1, UTF-8/16/32, and most "normal" charsets.
 *   (For complex converters, e.g., SCSU, UTF-7 and ISO 2022 variants,
 *   it uses ucnv_toUnicode() internally.)
 * - Convenient.
 *
 * Limitations compared to ucnv_toUnicode():
 * - Always assumes flush=TRUE.
* This makes ucnv_getNextUChar() unsuitable for "streaming" conversion, * that is, for where the input is supplied in multiple buffers, * because ucnv_getNextUChar() will assume the end of the input at the end * of the first buffer. * - Does not provide offset output. * * It is possible to "mix" ucnv_getNextUChar() and ucnv_toUnicode() because * ucnv_getNextUChar() uses the current state of the converter * (unlike ucnv_toUChars() which always resets first). * However, if ucnv_getNextUChar() is called after ucnv_toUnicode() * stopped in the middle of a character sequence (with flush=FALSE), * then ucnv_getNextUChar() will always use the slower ucnv_toUnicode() * internally until the next character boundary. * (This is new in ICU 2.6. In earlier releases, ucnv_getNextUChar() had to * start at a character boundary.) * * Instead of using ucnv_getNextUChar(), it is recommended * to convert using ucnv_toUnicode() or ucnv_toUChars() * and then iterate over the text using U16_NEXT() or a UCharIterator (uiter.h) * or a C++ CharacterIterator or similar. * This allows streaming conversion and offset output, for example. * *

 * <p>Handling of surrogate pairs and supplementary-plane code points:<br>
 * There are two different kinds of codepages that provide mappings for surrogate characters:
 * <ul>
 *   <li>Codepages like UTF-8, UTF-32, and GB 18030 provide direct representations for Unicode
 *       code points U+10000-U+10ffff as well as for single surrogates U+d800-U+dfff.
 *       Each valid sequence will result in exactly one returned code point.
 *       If a sequence results in a single surrogate, then that will be returned
 *       by itself, even if a neighboring sequence encodes the matching surrogate.</li>
 *   <li>Codepages like SCSU and LMBCS (and UTF-16) provide direct representations only for BMP code points
 *       including surrogates. Code points in supplementary planes are represented with
 *       two sequences, each encoding a surrogate.
 *       For these codepages, matching pairs of surrogates will be combined into single
 *       code points for returning from this function.
 *       (Note that SCSU is actually a mix of these codepage types.)</li>
 * </ul></p>

*
 * @param converter an open UConverter
 * @param source the address of a pointer to the codepage buffer, will be
 *  updated to point after the bytes consumed in the conversion call.
 * @param sourceLimit points to the end of the input buffer
 * @param err fills in error status (see ucnv_toUnicode)
 * U_INDEX_OUTOFBOUNDS_ERROR will be set if the input
 * is empty or does not convert to any output (e.g.: pure state-change
 * codes SI/SO, escape sequences for ISO 2022,
 * or if the callback did not output anything, ...).
 * This function will not set a U_BUFFER_OVERFLOW_ERROR because
 *  the "buffer" is the return code. However, there might be subsequent output
 *  stored in the converter object
 * that will be returned in following calls to this function.
 * @return a UChar32 resulting from the partial conversion of source
 * @see ucnv_toUnicode
 * @see ucnv_toUChars
 * @see ucnv_convert
 * @stable ICU 2.0
 */
U_STABLE UChar32 U_EXPORT2 ucnv_getNextUChar(UConverter * converter, const char **source, const char * sourceLimit, UErrorCode * err);

/**
 * Convert from one external charset to another using two existing UConverters.
 * Internally, two conversions - ucnv_toUnicode() and ucnv_fromUnicode() -
 * are used, "pivoting" through 16-bit Unicode.
 *
 * Important: For streaming conversion (multiple function calls for successive
 * parts of a text stream), the caller must provide a pivot buffer explicitly,
 * and must preserve the pivot buffer and associated pointers from one
 * call to another. (The buffer may be moved if its contents and the relative
 * pointer positions are preserved.)
* * There is a similar function, ucnv_convert(), * which has the following limitations: * - it takes charset names, not converter objects, so that * - two converters are opened for each call * - only single-string conversion is possible, not streaming operation * - it does not provide enough information to find out, * in case of failure, whether the toUnicode or * the fromUnicode conversion failed * * By contrast, ucnv_convertEx() * - takes UConverter parameters instead of charset names * - fully exposes the pivot buffer for streaming conversion and complete error handling * * ucnv_convertEx() also provides further convenience: * - an option to reset the converters at the beginning * (if reset==TRUE, see parameters; * also sets *pivotTarget=*pivotSource=pivotStart) * - allow NUL-terminated input * (only a single NUL byte, will not work for charsets with multi-byte NULs) * (if sourceLimit==NULL, see parameters) * - terminate with a NUL on output * (only a single NUL byte, not useful for charsets with multi-byte NULs), * or set U_STRING_NOT_TERMINATED_WARNING if the output exactly fills * the target buffer * - the pivot buffer can be provided internally; * possible only for whole-string conversion, not streaming conversion; * in this case, the caller will not be able to get details about where an * error occurred * (if pivotStart==NULL, see below) * * The function returns when one of the following is true: * - the entire source text has been converted successfully to the target buffer * - a target buffer overflow occurred (U_BUFFER_OVERFLOW_ERROR) * - a conversion error occurred * (other U_FAILURE(), see description of pErrorCode) * * Limitation compared to the direct use of * ucnv_fromUnicode() and ucnv_toUnicode(): * ucnv_convertEx() does not provide offset information. * * Limitation compared to ucnv_fromUChars() and ucnv_toUChars(): * ucnv_convertEx() does not support preflighting directly. 
* * Sample code for converting a single string from * one external charset to UTF-8, ignoring the location of errors: * * \code * int32_t * myToUTF8(UConverter *cnv, * const char *s, int32_t length, * char *u8, int32_t capacity, * UErrorCode *pErrorCode) { * UConverter *utf8Cnv; * char *target; * * if(U_FAILURE(*pErrorCode)) { * return 0; * } * * utf8Cnv=myGetCachedUTF8Converter(pErrorCode); * if(U_FAILURE(*pErrorCode)) { * return 0; * } * * if(length<0) { * length=strlen(s); * } * target=u8; * ucnv_convertEx(utf8Cnv, cnv, * &target, u8+capacity, * &s, s+length, * NULL, NULL, NULL, NULL, * TRUE, TRUE, * pErrorCode); * * myReleaseCachedUTF8Converter(utf8Cnv); * * // return the output string length, but without preflighting * return (int32_t)(target-u8); * } * \endcode * * @param targetCnv Output converter, used to convert from the UTF-16 pivot * to the target using ucnv_fromUnicode(). * @param sourceCnv Input converter, used to convert from the source to * the UTF-16 pivot using ucnv_toUnicode(). * @param target I/O parameter, same as for ucnv_fromUChars(). * Input: *target points to the beginning of the target buffer. * Output: *target points to the first unit after the last char written. * @param targetLimit Pointer to the first unit after the target buffer. * @param source I/O parameter, same as for ucnv_toUChars(). * Input: *source points to the beginning of the source buffer. * Output: *source points to the first unit after the last char read. * @param sourceLimit Pointer to the first unit after the source buffer. * @param pivotStart Pointer to the UTF-16 pivot buffer. If pivotStart==NULL, * then an internal buffer is used and the other pivot * arguments are ignored and can be NULL as well. * @param pivotSource I/O parameter, same as source in ucnv_fromUChars() for * conversion from the pivot buffer to the target buffer. * @param pivotTarget I/O parameter, same as target in ucnv_toUChars() for * conversion from the source buffer to the pivot buffer. 
* It must be pivotStart<=*pivotSource<=*pivotTarget<=pivotLimit * and pivotStart[0..ucnv_countAvaiable()]) * @return a pointer a string (library owned), or NULL if the index is out of bounds. * @see ucnv_countAvailable * @stable ICU 2.0 */ U_STABLE const char* U_EXPORT2 ucnv_getAvailableName(int32_t n); /** * Returns a UEnumeration to enumerate all of the canonical converter * names, as per the alias file, regardless of the ability to open each * converter. * * @param pErrorCode error status * @return A UEnumeration object for getting all the recognized canonical * converter names. * @see ucnv_getAvailableName * @see uenum_close * @see uenum_next * @stable ICU 2.4 */ U_STABLE UEnumeration * U_EXPORT2 ucnv_openAllNames(UErrorCode *pErrorCode); /** * Gives the number of aliases for a given converter or alias name. * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. * This method only enumerates the listed entries in the alias file. * @param alias alias name * @param pErrorCode error status * @return number of names on alias list for given alias * @stable ICU 2.0 */ U_STABLE uint16_t U_EXPORT2 ucnv_countAliases(const char *alias, UErrorCode *pErrorCode); /** * Gives the name of the alias at given index of alias list. * This method only enumerates the listed entries in the alias file. * If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. * @param alias alias name * @param n index in alias list * @param pErrorCode result of operation * @return returns the name of the alias at given index * @see ucnv_countAliases * @stable ICU 2.0 */ U_STABLE const char * U_EXPORT2 ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode); /** * Fills in the list of alias names for the given alias. * This method only enumerates the listed entries in the alias file.
* If the alias is ambiguous, then the preferred converter is used * and the status is set to U_AMBIGUOUS_ALIAS_WARNING. * @param alias alias name * @param aliases fill-in list, aliases is a pointer to an array of * ucnv_countAliases() string-pointers * (const char *) that will be filled in. * The strings themselves are owned by the library. * @param pErrorCode result of operation * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode); /** * Return a new UEnumeration object for enumerating all the * alias names for a given converter that are recognized by a standard. * This method only enumerates the listed entries in the alias file. * The convrtrs.txt file can be modified to change the results of * this function. * The first result in this list is the same result given by * ucnv_getStandardName, which is the default alias for * the specified standard name. The returned object must be closed with * uenum_close when you are done with the object. * * @param convName original converter name * @param standard name of the standard governing the names; MIME and IANA * are such standards * @param pErrorCode The error code * @return A UEnumeration object for getting all aliases that are recognized * by a standard. If any of the parameters are invalid, NULL * is returned. * @see ucnv_getStandardName * @see uenum_close * @see uenum_next * @stable ICU 2.2 */ U_STABLE UEnumeration * U_EXPORT2 ucnv_openStandardNames(const char *convName, const char *standard, UErrorCode *pErrorCode); /** * Gives the number of standards associated with converter names. * @return number of standards * @stable ICU 2.0 */ U_STABLE uint16_t U_EXPORT2 ucnv_countStandards(void); /** * Gives the name of the standard at given index of standard list. * @param n index in standard list * @param pErrorCode result of operation * @return returns the name of the standard at given index. Owned by the library.
* @stable ICU 2.0 */ U_STABLE const char * U_EXPORT2 ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode); /** * Returns a standard name for a given converter name. *

* Example alias table:
* conv alias1 { STANDARD1 } alias2 { STANDARD1* } *

* Result of ucnv_getStandardName("conv", "STANDARD1") from example * alias table:
* "alias2" * * @param name original converter name * @param standard name of the standard governing the names; MIME and IANA * are such standards * @param pErrorCode result of operation * @return returns the standard converter name; * if a standard converter name cannot be determined, * then NULL is returned. Owned by the library. * @stable ICU 2.0 */ U_STABLE const char * U_EXPORT2 ucnv_getStandardName(const char *name, const char *standard, UErrorCode *pErrorCode); /** * This function will return the internal canonical converter name of the * tagged alias. This is the opposite of ucnv_openStandardNames, which * returns the tagged alias given the canonical name. *

* Example alias table:
* conv alias1 { STANDARD1 } alias2 { STANDARD1* } *

* Result of ucnv_getCanonicalName("alias1", "STANDARD1") from example * alias table:
* "conv" * * @param alias alias name * @param standard name of the standard governing the names; MIME and IANA * are such standards * @param pErrorCode result of operation * @return returns the canonical converter name; * if a standard or alias name cannot be determined, * then NULL is returned. The returned string is * owned by the library. * @see ucnv_getStandardName * @stable ICU 2.4 */ U_STABLE const char * U_EXPORT2 ucnv_getCanonicalName(const char *alias, const char *standard, UErrorCode *pErrorCode); /** * Returns the current default converter name. If you want to open * a default converter, you do not need to use this function. * It is faster to pass a NULL argument to ucnv_open to open the * default converter. * * If U_CHARSET_IS_UTF8 is defined to 1 in utypes.h then this function * always returns "UTF-8". * * @return returns the current default converter name. * Storage owned by the library * @see ucnv_setDefaultName * @stable ICU 2.0 */ U_STABLE const char * U_EXPORT2 ucnv_getDefaultName(void); #ifndef U_HIDE_SYSTEM_API /** * This function is not thread safe. DO NOT call this function when ANY ICU * function is being used from more than one thread! This function sets the * current default converter name. If this function needs to be called, it * should be called during application initialization. Most of the time, the * results from ucnv_getDefaultName() or ucnv_open with a NULL string argument * are sufficient for your application. * * If U_CHARSET_IS_UTF8 is defined to 1 in utypes.h then this function * does nothing. * * @param name the converter name to be the default (must be known by ICU). * @see ucnv_getDefaultName * @system * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_setDefaultName(const char *name); #endif /* U_HIDE_SYSTEM_API */ /** * Fixes the backslash character mismapping. For example, in SJIS, the backslash * character in the ASCII portion is also used to represent the yen currency sign. * When mapping from Unicode character 0x005C, it's unclear whether to map the * character back to yen or backslash in SJIS.
This function will take the input * buffer and replace all the yen sign characters with backslash. This is necessary * when the user tries to open a file with the input buffer on Windows. * This function will test the converter to see whether such mapping is * required. You can sometimes avoid using this function by using the correct version * of Shift-JIS. * * @param cnv The converter representing the target codepage. * @param source the input buffer to be fixed * @param sourceLen the length of the input buffer * @see ucnv_isAmbiguous * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_fixFileSeparator(const UConverter *cnv, UChar *source, int32_t sourceLen); /** * Determines if the converter contains ambiguous mappings of the same * character or not. * @param cnv the converter to be tested * @return TRUE if the converter contains ambiguous mappings of the same * character, FALSE otherwise. * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 ucnv_isAmbiguous(const UConverter *cnv); /** * Sets the converter to use fallback mappings or not. * Regardless of this flag, the converter will always use * fallbacks from Unicode Private Use code points, as well as * reverse fallbacks (to Unicode). * For details see ".ucm File Format" * in the Conversion Data chapter of the ICU User Guide: * http://www.icu-project.org/userguide/conversion-data.html#ucmformat * * @param cnv The converter to set the fallback mapping usage on. * @param usesFallback TRUE if the user wants the converter to take advantage of the fallback * mapping, FALSE otherwise. * @stable ICU 2.0 * @see ucnv_usesFallback */ U_STABLE void U_EXPORT2 ucnv_setFallback(UConverter *cnv, UBool usesFallback); /** * Determines if the converter uses fallback mappings or not. * This flag has restrictions, see ucnv_setFallback(). * * @param cnv The converter to be tested * @return TRUE if the converter uses fallback, FALSE otherwise.
* @stable ICU 2.0 * @see ucnv_setFallback */ U_STABLE UBool U_EXPORT2 ucnv_usesFallback(const UConverter *cnv); /** * Detects Unicode signature byte sequences at the start of the byte stream * and returns the charset name of the indicated Unicode charset. * NULL is returned when no Unicode signature is recognized. * The number of bytes in the signature is output as well. * * The caller can ucnv_open() a converter using the charset name. * The first code unit (UChar) from the start of the stream will be U+FEFF * (the Unicode BOM/signature character) and can usually be ignored. * * For most Unicode charsets it is also possible to ignore the indicated * number of initial stream bytes and start converting after them. * However, there are stateful Unicode charsets (UTF-7 and BOCU-1) for which * this will not work. Therefore, it is best to ignore the first output UChar * instead of the input signature bytes. *

* Usage: * \snippet samples/ucnv/convsamp.cpp ucnv_detectUnicodeSignature * * @param source The source string in which the signature should be detected. * @param sourceLength Length of the input string, or -1 if terminated with a NUL byte. * @param signatureLength A pointer to int32_t to receive the number of bytes that make up the signature * of the detected UTF. 0 if not detected. * Can be a NULL pointer. * @param pErrorCode ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. * @return The name of the encoding detected. NULL if encoding is not detected. * @stable ICU 2.4 */ U_STABLE const char* U_EXPORT2 ucnv_detectUnicodeSignature(const char* source, int32_t sourceLength, int32_t *signatureLength, UErrorCode *pErrorCode); /** * Returns the number of UChars held in the converter's internal state * because more input is needed for completing the conversion. This function is * useful for mapping semantics of ICU's converter interface to those of iconv, * and this information is not needed for normal conversion. * @param cnv The converter in which the input is held * @param status ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. * @return The number of UChars in the state. -1 if an error is encountered. * @stable ICU 3.4 */ U_STABLE int32_t U_EXPORT2 ucnv_fromUCountPending(const UConverter* cnv, UErrorCode* status); /** * Returns the number of chars held in the converter's internal state * because more input is needed for completing the conversion. This function is * useful for mapping semantics of ICU's converter interface to those of iconv, * and this information is not needed for normal conversion. * @param cnv The converter in which the input is held as internal state * @param status ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. * @return The number of chars in the state. -1 if an error is encountered.
* @stable ICU 3.4 */ U_STABLE int32_t U_EXPORT2 ucnv_toUCountPending(const UConverter* cnv, UErrorCode* status); /** * Returns whether or not the charset of the converter has a fixed number of bytes * per charset character. * Examples of this are converters of the type UCNV_SBCS or UCNV_DBCS. * Another example is UTF-32 which is always 4 bytes per character. * A Unicode code point may be represented by more than one UTF-8 or UTF-16 code unit * but a UTF-32 converter encodes each code point with 4 bytes. * Note: This method is not intended to be used to determine whether the charset has a * fixed ratio of bytes to Unicode code units for any particular Unicode encoding form. * FALSE is returned with the UErrorCode if an error occurs or cnv is NULL. * @param cnv The converter to be tested * @param status ICU error code in/out parameter * @return TRUE if the converter is fixed-width * @stable ICU 4.8 */ U_STABLE UBool U_EXPORT2 ucnv_isFixedWidth(UConverter *cnv, UErrorCode *status); #endif #endif /*_UCNV*/ // ucnv_cb.h /* ********************************************************************** * Copyright (C) 2000-2004, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * ucnv_cb.h: * External APIs for the ICU's codeset conversion library * Helena Shih * * Modification History: * * Date Name Description */ /** * \file * \brief C UConverter functions to aid the writers of callbacks * *

Callback API for UConverter

* * These functions are provided here for the convenience of the callback * writer. If you are just looking for callback functions to use, please * see ucnv_err.h. DO NOT call these functions directly when you are * working with converters, unless your code has been called as a callback * via ucnv_setFromUCallback or ucnv_setToUCallback !! * * A note about error codes and overflow. Unlike other ICU functions, * these functions do not expect the error status to be U_ZERO_ERROR. * Callbacks must be much more careful about their error codes. * The error codes used here are in/out parameters, which should be passed * back in the callback's error parameter. * * For example, if you call ucnv_cbFromUWriteBytes to write data out * to the output codepage, it may return U_BUFFER_OVERFLOW_ERROR if * the data did not fit in the target. But this isn't a failing error; * in fact, ucnv_cbFromUWriteBytes may be called AGAIN with the error * status still U_BUFFER_OVERFLOW_ERROR to attempt to write further bytes, * which will also go into the internal overflow buffers. * * Concerning offsets, the 'offset' parameters here are relative to the start * of SOURCE. For example, suppose the string "ABCD" was being converted * from Unicode into a codepage which doesn't have a mapping for 'B'. * 'A' will be written out correctly, but * the FromU callback will be called on an unassigned character for 'B'. * At this point, this is the state of the world: * Target: A [..] [points after A] * Source: A B [C] D [points to C - B has been consumed] * 0 1 2 3 * codePoint = "B" [the unassigned codepoint] * * Now, suppose a callback wants to write the substitution character '?' to * the target. It calls ucnv_cbFromUWriteBytes() to write the ?. * It should pass ZERO as the offset, because the offset as far as the * callback is concerned is relative to the SOURCE pointer [which points * before 'C'.]
If the callback goes into the args and consumes 'C' also, * it would call FromUWriteBytes with an offset of 1 (and advance the source * pointer). * */ #ifndef UCNV_CB_H #define UCNV_CB_H #if !UCONFIG_NO_CONVERSION /** * ONLY used by FromU callback functions. * Writes out the specified output bytes to the target byte buffer or to converter internal buffers. * * @param args callback fromUnicode arguments * @param source source bytes to write * @param length length of bytes to write * @param offsetIndex the relative offset index from callback. * @param err error status. If U_BUFFER_OVERFLOW_ERROR is returned, then U_BUFFER_OVERFLOW_ERROR must * be returned to the user, because it means that not all data could be written into the target buffer, and some is * in the converter error buffer. * @see ucnv_cbFromUWriteSub * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_cbFromUWriteBytes (UConverterFromUnicodeArgs *args, const char* source, int32_t length, int32_t offsetIndex, UErrorCode * err); /** * ONLY used by FromU callback functions. * This function will write out the correct substitution character sequence * to the target. * * @param args callback fromUnicode arguments * @param offsetIndex the relative offset index from the current source pointer to be used * @param err error status. If U_BUFFER_OVERFLOW_ERROR is returned, then U_BUFFER_OVERFLOW_ERROR must * be returned to the user, because it means that not all data could be written into the target buffer, and some is * in the converter error buffer. * @see ucnv_cbFromUWriteBytes * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_cbFromUWriteSub (UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode * err); /** * ONLY used by fromU callback functions. * This function will write out the error character(s) to the target UChar buffer.
* * @param args callback fromUnicode arguments * @param source pointer to pointer to first UChar to write [on exit: 1 after last UChar processed] * @param sourceLimit pointer after last UChar to write * @param offsetIndex the relative offset index from callback which will be set * @param err error status U_BUFFER_OVERFLOW_ERROR * @see ucnv_cbToUWriteSub * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_cbFromUWriteUChars(UConverterFromUnicodeArgs *args, const UChar** source, const UChar* sourceLimit, int32_t offsetIndex, UErrorCode * err); /** * ONLY used by ToU callback functions. * This function will write out the specified characters to the target * UChar buffer. * * @param args callback toUnicode arguments * @param source source string to write * @param length the length of source string * @param offsetIndex the relative offset index which will be written. * @param err error status U_BUFFER_OVERFLOW_ERROR * @see ucnv_cbToUWriteSub * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_cbToUWriteUChars (UConverterToUnicodeArgs *args, const UChar* source, int32_t length, int32_t offsetIndex, UErrorCode * err); /** * ONLY used by ToU callback functions. * This function will write out the Unicode substitution character (U+FFFD). * * @param args callback toUnicode arguments * @param offsetIndex the relative offset index from callback. * @param err error status U_BUFFER_OVERFLOW_ERROR * @see ucnv_cbToUWriteUChars * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ucnv_cbToUWriteSub (UConverterToUnicodeArgs *args, int32_t offsetIndex, UErrorCode * err); #endif #endif // uclean.h /* ****************************************************************************** * Copyright (C) 2001-2014, International Business Machines * Corporation and others. All Rights Reserved.
****************************************************************************** * file name: uclean.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2001July05 * created by: George Rhoten */ #ifndef __UCLEAN_H__ #define __UCLEAN_H__ /** * \file * \brief C API: Initialize and clean up ICU */ /** * Initialize ICU. * * Use of this function is optional. It is OK to simply use ICU * services and functions without first having initialized * ICU by calling u_init(). * * u_init() will attempt to load some part of ICU's data, and is * useful as a test for configuration or installation problems that * leave the ICU data inaccessible. A successful invocation of u_init() * does not, however, guarantee that all ICU data is accessible. * * Multiple calls to u_init() cause no harm, aside from the small amount * of time required. * * In old versions of ICU, u_init() was required in multi-threaded applications * to ensure the thread safety of ICU. u_init() is no longer needed for this purpose. * * @param status An ICU UErrorCode parameter. It must not be NULL. * An error will be returned if some required part of ICU data cannot * be loaded or initialized. * The function returns immediately if the input error code indicates a * failure, as usual. * * @stable ICU 2.6 */ U_STABLE void U_EXPORT2 u_init(UErrorCode *status); #ifndef U_HIDE_SYSTEM_API /** * Clean up the system resources, such as allocated memory or open files, * used in all ICU libraries. This will free/delete all memory owned by the * ICU libraries, and return them to their original load state. All open ICU * items (collators, resource bundles, converters, etc.) must be closed before * calling this function, otherwise ICU may not free its allocated memory * (e.g. close your converters and resource bundles before calling this * function). Generally, this function should be called once just before * an application exits.
For applications that dynamically load and unload * the ICU libraries (relatively uncommon), u_cleanup() should be called * just before the library unload. *

* u_cleanup() also clears any ICU heap functions, mutex functions or * trace functions that may have been set for the process. * This has the effect of restoring ICU to its initial condition, before * any of these override functions were installed. Refer to * u_setMemoryFunctions(), u_setMutexFunctions and * utrace_setFunctions(). If ICU is to be reinitialized after * calling u_cleanup(), these runtime override functions will need to * be set up again if they are still required. *

* u_cleanup() is not thread safe. All other threads should stop using ICU * before calling this function. *

* Any open ICU items will be left in an undefined state by u_cleanup(), * and any subsequent attempt to use such an item will give unpredictable * results. *

* After calling u_cleanup(), an application may continue to use ICU by * calling u_init(). An application must invoke u_init() first from one single * thread before allowing other threads to call u_init(). All threads existing * at the time of the first thread's call to u_init() must also call * u_init() themselves before continuing with other ICU operations. *

* The use of u_cleanup() just before an application terminates is optional, * but it should be called only once for performance reasons. The primary * benefit is to eliminate reports of memory or resource leaks originating * in ICU code from the results generated by heap analysis tools. *

* Use this function with great care! *

* * @stable ICU 2.0 * @system */ U_STABLE void U_EXPORT2 u_cleanup(void); /** * Pointer type for a user supplied memory allocation function. * @param context user supplied value, obtained from u_setMemoryFunctions(). * @param size The number of bytes to be allocated * @return Pointer to the newly allocated memory, or NULL if the allocation failed. * @stable ICU 2.8 * @system */ typedef void *U_CALLCONV UMemAllocFn(const void *context, size_t size); /** * Pointer type for a user supplied memory re-allocation function. * @param context user supplied value, obtained from u_setMemoryFunctions(). * @param mem Pointer to the memory block to be resized * @param size The new size for the block * @return Pointer to the resized memory block, or NULL if the resizing failed. * @stable ICU 2.8 * @system */ typedef void *U_CALLCONV UMemReallocFn(const void *context, void *mem, size_t size); /** * Pointer type for a user supplied memory free function. Behavior should be * similar to the standard C library free(). * @param context user supplied value, obtained from u_setMemoryFunctions(). * @param mem Pointer to the memory block to be freed * @stable ICU 2.8 * @system */ typedef void U_CALLCONV UMemFreeFn (const void *context, void *mem); /** * Set the functions that ICU will use for memory allocation. * Use of this function is optional; by default (without this function), ICU will * use the standard C library malloc() and free() functions. * This function can only be used when ICU is in an initial, unused state, before * u_init() has been called. * @param context This pointer value will be saved, and then (later) passed as * a parameter to the memory functions each time they * are called. * @param a Pointer to a user-supplied malloc function. * @param r Pointer to a user-supplied realloc function. * @param f Pointer to a user-supplied free function. * @param status Receives error values.
* @stable ICU 2.8 * @system */ U_STABLE void U_EXPORT2 u_setMemoryFunctions(const void *context, UMemAllocFn *a, UMemReallocFn *r, UMemFreeFn *f, UErrorCode *status); #endif /* U_HIDE_SYSTEM_API */ #endif // uchar.h /* ********************************************************************** * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File UCHAR.H * * Modification History: * * Date Name Description * 04/02/97 aliu Creation. * 03/29/99 helena Updated for C APIs. * 4/15/99 Madhu Updated for C Implementation and Javadoc * 5/20/99 Madhu Added the function u_getVersion() * 8/19/1999 srl Upgraded scripts to Unicode 3.0 * 8/27/1999 schererm UCharDirection constants: U_... * 11/11/1999 weiv added u_isalnum(), cleaned comments * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion(). ****************************************************************************** */ #ifndef UCHAR_H #define UCHAR_H U_CDECL_BEGIN /*==========================================================================*/ /* Unicode version number */ /*==========================================================================*/ /** * Unicode version number, default for the current ICU version. * The actual Unicode Character Database (UCD) data is stored in uprops.dat * and may be generated from UCD files from a different Unicode version. * Call u_getUnicodeVersion to get the actual Unicode version of the data. * * @see u_getUnicodeVersion * @stable ICU 2.0 */ #define U_UNICODE_VERSION "8.0" /** * \file * \brief C API: Unicode Properties * * This C API provides low-level access to the Unicode Character Database. * In addition to raw property values, some convenience functions calculate * derived properties, for example for Java-style programming. * * Unicode assigns each code point (not just assigned character) values for * many properties.
* Most of them are simple boolean flags, or constants from a small enumerated list. * For some properties, values are strings or other relatively more complex types. * * For more information see * "About the Unicode Character Database" (http://www.unicode.org/ucd/) * and the ICU User Guide chapter on Properties (http://icu-project.org/userguide/properties.html). * * Many functions are designed to match java.lang.Character functions. * See the individual function documentation, * and see the JDK 1.4 java.lang.Character documentation * at http://java.sun.com/j2se/1.4/docs/api/java/lang/Character.html * * There are also functions that provide easy migration from C/POSIX functions * like isblank(). Their use is generally discouraged because the C/POSIX * standards do not define their semantics beyond the ASCII range, which means * that different implementations exhibit very different behavior. * Instead, Unicode properties should be used directly. * * There are also only a few, broad C/POSIX character classes, and they tend * to be used for conflicting purposes. For example, the "isalpha()" class * is sometimes used to determine word boundaries, while a more sophisticated * approach would at least distinguish initial letters from continuation * characters (the latter including combining marks). * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) * Another example: There is no "istitle()" class for titlecase characters. * * ICU 3.4 and later provides API access for all twelve C/POSIX character classes. * ICU implements them according to the Standard Recommendations in * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). 
* * API access for C/POSIX character classes is as follows: * - alpha: u_isUAlphabetic(c) or u_hasBinaryProperty(c, UCHAR_ALPHABETIC) * - lower: u_isULowercase(c) or u_hasBinaryProperty(c, UCHAR_LOWERCASE) * - upper: u_isUUppercase(c) or u_hasBinaryProperty(c, UCHAR_UPPERCASE) * - punct: u_ispunct(c) * - digit: u_isdigit(c) or u_charType(c)==U_DECIMAL_DIGIT_NUMBER * - xdigit: u_isxdigit(c) or u_hasBinaryProperty(c, UCHAR_POSIX_XDIGIT) * - alnum: u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM) * - space: u_isUWhiteSpace(c) or u_hasBinaryProperty(c, UCHAR_WHITE_SPACE) * - blank: u_isblank(c) or u_hasBinaryProperty(c, UCHAR_POSIX_BLANK) * - cntrl: u_charType(c)==U_CONTROL_CHAR * - graph: u_hasBinaryProperty(c, UCHAR_POSIX_GRAPH) * - print: u_hasBinaryProperty(c, UCHAR_POSIX_PRINT) * * Note: Some of the u_isxyz() functions in uchar.h predate, and do not match, * the Standard Recommendations in UTS #18. Instead, they match Java * functions according to their API documentation. * * \htmlonly * The C/POSIX character classes are also available in UnicodeSet patterns, * using patterns like [:graph:] or \p{graph}. * \endhtmlonly * * Note: There are several ICU whitespace functions. * Comparison: * - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; * most of general categories "Z" (separators) + most whitespace ISO controls * (including no-break spaces, but excluding IS1..IS4 and ZWSP) * - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces * - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces) * - u_isspace: Z + whitespace ISO controls (including no-break spaces) * - u_isblank: "horizontal spaces" = TAB + Zs - ZWSP */ /** * Constants. */ /** The lowest Unicode code point value. Code points are non-negative. @stable ICU 2.0 */ #define UCHAR_MIN_VALUE 0 /** * The highest Unicode code point value (scalar value) according to * The Unicode Standard. This is a 21-bit value (20.1 bits, rounded up).
* For a single character, UChar32 is a simple type that can hold any code point value. * * @see UChar32 * @stable ICU 2.0 */ #define UCHAR_MAX_VALUE 0x10ffff /** * Get a single-bit bit set (a flag) from a bit number 0..31. * @stable ICU 2.1 */ #define U_MASK(x) ((uint32_t)1<<(x)) /** * Selection constants for Unicode properties. * These constants are used in functions like u_hasBinaryProperty to select * one of the Unicode properties. * * The properties APIs are intended to reflect Unicode properties as defined * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). * For details about the properties see http://www.unicode.org/ucd/ . * For names of Unicode properties see the UCD file PropertyAliases.txt. * * Important: If ICU is built with UCD files from Unicode versions below, e.g., 3.2, * then properties marked with "new in Unicode 3.2" are not or not fully available. * Check u_getUnicodeVersion to be sure. * * @see u_hasBinaryProperty * @see u_getIntPropertyValue * @see u_getUnicodeVersion * @stable ICU 2.1 */ typedef enum UProperty { /* * Note: UProperty constants are parsed by preparseucd.py. * It matches lines like * UCHAR_<Unicode property name>=<integer>, */ /* Note: Place UCHAR_ALPHABETIC before UCHAR_BINARY_START so that debuggers display UCHAR_ALPHABETIC as the symbolic name for 0, rather than UCHAR_BINARY_START. Likewise for other *_START identifiers. */ /** Binary property Alphabetic. Same as u_isUAlphabetic, different from u_isalpha. Lu+Ll+Lt+Lm+Lo+Nl+Other_Alphabetic @stable ICU 2.1 */ UCHAR_ALPHABETIC=0, /** First constant for binary Unicode properties. @stable ICU 2.1 */ UCHAR_BINARY_START=UCHAR_ALPHABETIC, /** Binary property ASCII_Hex_Digit. 0-9 A-F a-f @stable ICU 2.1 */ UCHAR_ASCII_HEX_DIGIT=1, /** Binary property Bidi_Control. Format controls which have specific functions in the Bidi Algorithm. @stable ICU 2.1 */ UCHAR_BIDI_CONTROL=2, /** Binary property Bidi_Mirrored. Characters that may change display in RTL text. Same as u_isMirrored.
See Bidi Algorithm, UTR 9. @stable ICU 2.1 */ UCHAR_BIDI_MIRRORED=3, /** Binary property Dash. Variations of dashes. @stable ICU 2.1 */ UCHAR_DASH=4, /** Binary property Default_Ignorable_Code_Point (new in Unicode 3.2). Ignorable in most processing. <2060..206F, FFF0..FFFB, E0000..E0FFF>+Other_Default_Ignorable_Code_Point+(Cf+Cc+Cs-White_Space) @stable ICU 2.1 */ UCHAR_DEFAULT_IGNORABLE_CODE_POINT=5, /** Binary property Deprecated (new in Unicode 3.2). The usage of deprecated characters is strongly discouraged. @stable ICU 2.1 */ UCHAR_DEPRECATED=6, /** Binary property Diacritic. Characters that linguistically modify the meaning of another character to which they apply. @stable ICU 2.1 */ UCHAR_DIACRITIC=7, /** Binary property Extender. Extend the value or shape of a preceding alphabetic character, e.g., length and iteration marks. @stable ICU 2.1 */ UCHAR_EXTENDER=8, /** Binary property Full_Composition_Exclusion. CompositionExclusions.txt+Singleton Decompositions+ Non-Starter Decompositions. @stable ICU 2.1 */ UCHAR_FULL_COMPOSITION_EXCLUSION=9, /** Binary property Grapheme_Base (new in Unicode 3.2). For programmatic determination of grapheme cluster boundaries. [0..10FFFF]-Cc-Cf-Cs-Co-Cn-Zl-Zp-Grapheme_Link-Grapheme_Extend-CGJ @stable ICU 2.1 */ UCHAR_GRAPHEME_BASE=10, /** Binary property Grapheme_Extend (new in Unicode 3.2). For programmatic determination of grapheme cluster boundaries. Me+Mn+Mc+Other_Grapheme_Extend-Grapheme_Link-CGJ @stable ICU 2.1 */ UCHAR_GRAPHEME_EXTEND=11, /** Binary property Grapheme_Link (new in Unicode 3.2). For programmatic determination of grapheme cluster boundaries. @stable ICU 2.1 */ UCHAR_GRAPHEME_LINK=12, /** Binary property Hex_Digit. Characters commonly used for hexadecimal numbers. @stable ICU 2.1 */ UCHAR_HEX_DIGIT=13, /** Binary property Hyphen. Dashes used to mark connections between pieces of words, plus the Katakana middle dot. @stable ICU 2.1 */ UCHAR_HYPHEN=14, /** Binary property ID_Continue. 
Characters that can continue an identifier. DerivedCoreProperties.txt also says "NOTE: Cf characters should be filtered out." ID_Start+Mn+Mc+Nd+Pc @stable ICU 2.1 */ UCHAR_ID_CONTINUE=15, /** Binary property ID_Start. Characters that can start an identifier. Lu+Ll+Lt+Lm+Lo+Nl @stable ICU 2.1 */ UCHAR_ID_START=16, /** Binary property Ideographic. CJKV ideographs. @stable ICU 2.1 */ UCHAR_IDEOGRAPHIC=17, /** Binary property IDS_Binary_Operator (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ UCHAR_IDS_BINARY_OPERATOR=18, /** Binary property IDS_Trinary_Operator (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ UCHAR_IDS_TRINARY_OPERATOR=19, /** Binary property Join_Control. Format controls for cursive joining and ligation. @stable ICU 2.1 */ UCHAR_JOIN_CONTROL=20, /** Binary property Logical_Order_Exception (new in Unicode 3.2). Characters that do not use logical order and require special handling in most processing. @stable ICU 2.1 */ UCHAR_LOGICAL_ORDER_EXCEPTION=21, /** Binary property Lowercase. Same as u_isULowercase, different from u_islower. Ll+Other_Lowercase @stable ICU 2.1 */ UCHAR_LOWERCASE=22, /** Binary property Math. Sm+Other_Math @stable ICU 2.1 */ UCHAR_MATH=23, /** Binary property Noncharacter_Code_Point. Code points that are explicitly defined as illegal for the encoding of characters. @stable ICU 2.1 */ UCHAR_NONCHARACTER_CODE_POINT=24, /** Binary property Quotation_Mark. @stable ICU 2.1 */ UCHAR_QUOTATION_MARK=25, /** Binary property Radical (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ UCHAR_RADICAL=26, /** Binary property Soft_Dotted (new in Unicode 3.2). Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear. @stable ICU 2.1 */ UCHAR_SOFT_DOTTED=27, /** Binary property Terminal_Punctuation. 
Punctuation characters that generally mark the end of textual units. @stable ICU 2.1 */ UCHAR_TERMINAL_PUNCTUATION=28, /** Binary property Unified_Ideograph (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ UCHAR_UNIFIED_IDEOGRAPH=29, /** Binary property Uppercase. Same as u_isUUppercase, different from u_isupper. Lu+Other_Uppercase @stable ICU 2.1 */ UCHAR_UPPERCASE=30, /** Binary property White_Space. Same as u_isUWhiteSpace, different from u_isspace and u_isWhitespace. Space characters+TAB+CR+LF-ZWSP-ZWNBSP @stable ICU 2.1 */ UCHAR_WHITE_SPACE=31, /** Binary property XID_Continue. ID_Continue modified to allow closure under normalization forms NFKC and NFKD. @stable ICU 2.1 */ UCHAR_XID_CONTINUE=32, /** Binary property XID_Start. ID_Start modified to allow closure under normalization forms NFKC and NFKD. @stable ICU 2.1 */ UCHAR_XID_START=33, /** Binary property Case_Sensitive. Either the source of a case mapping or _in_ the target of a case mapping. Not the same as the general category Cased_Letter. @stable ICU 2.6 */ UCHAR_CASE_SENSITIVE=34, /** Binary property STerm (new in Unicode 4.0.1). Sentence Terminal. Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) @stable ICU 3.0 */ UCHAR_S_TERM=35, /** Binary property Variation_Selector (new in Unicode 4.0.1). Indicates all those characters that qualify as Variation Selectors. For details on the behavior of these characters, see StandardizedVariants.html and 15.6 Variation Selectors. @stable ICU 3.0 */ UCHAR_VARIATION_SELECTOR=36, /** Binary property NFD_Inert. ICU-specific property for characters that are inert under NFD, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ UCHAR_NFD_INERT=37, /** Binary property NFKD_Inert. 
ICU-specific property for characters that are inert under NFKD, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ UCHAR_NFKD_INERT=38, /** Binary property NFC_Inert. ICU-specific property for characters that are inert under NFC, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ UCHAR_NFC_INERT=39, /** Binary property NFKC_Inert. ICU-specific property for characters that are inert under NFKC, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ UCHAR_NFKC_INERT=40, /** Binary Property Segment_Starter. ICU-specific property for characters that are starters in terms of Unicode normalization and combining character sequences. They have ccc=0 and do not occur in non-initial position of the canonical decomposition of any character (like a-umlaut in NFD and a Jamo T in an NFD(Hangul LVT)). ICU uses this property for segmenting a string for generating a set of canonically equivalent strings, e.g. for canonical closure while processing collation tailoring rules. @stable ICU 3.0 */ UCHAR_SEGMENT_STARTER=41, /** Binary property Pattern_Syntax (new in Unicode 4.1). See UAX #31 Identifier and Pattern Syntax (http://www.unicode.org/reports/tr31/) @stable ICU 3.4 */ UCHAR_PATTERN_SYNTAX=42, /** Binary property Pattern_White_Space (new in Unicode 4.1). See UAX #31 Identifier and Pattern Syntax (http://www.unicode.org/reports/tr31/) @stable ICU 3.4 */ UCHAR_PATTERN_WHITE_SPACE=43, /** Binary property alnum (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_ALNUM=44, /** Binary property blank (a C/POSIX character class). 
Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_BLANK=45, /** Binary property graph (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_GRAPH=46, /** Binary property print (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_PRINT=47, /** Binary property xdigit (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ UCHAR_POSIX_XDIGIT=48, /** Binary property Cased. For Lowercase, Uppercase and Titlecase characters. @stable ICU 4.4 */ UCHAR_CASED=49, /** Binary property Case_Ignorable. Used in context-sensitive case mappings. @stable ICU 4.4 */ UCHAR_CASE_IGNORABLE=50, /** Binary property Changes_When_Lowercased. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_LOWERCASED=51, /** Binary property Changes_When_Uppercased. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_UPPERCASED=52, /** Binary property Changes_When_Titlecased. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_TITLECASED=53, /** Binary property Changes_When_Casefolded. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_CASEFOLDED=54, /** Binary property Changes_When_Casemapped. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_CASEMAPPED=55, /** Binary property Changes_When_NFKC_Casefolded. @stable ICU 4.4 */ UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED=56, /** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */ UCHAR_BINARY_LIMIT=61, /** Enumerated property Bidi_Class. Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */ UCHAR_BIDI_CLASS=0x1000, /** First constant for enumerated/integer Unicode properties. @stable ICU 2.2 */ UCHAR_INT_START=UCHAR_BIDI_CLASS, /** Enumerated property Block. 
Same as ublock_getCode, returns UBlockCode values. @stable ICU 2.2 */ UCHAR_BLOCK=0x1001, /** Enumerated property Canonical_Combining_Class. Same as u_getCombiningClass, returns 8-bit numeric values. @stable ICU 2.2 */ UCHAR_CANONICAL_COMBINING_CLASS=0x1002, /** Enumerated property Decomposition_Type. Returns UDecompositionType values. @stable ICU 2.2 */ UCHAR_DECOMPOSITION_TYPE=0x1003, /** Enumerated property East_Asian_Width. See http://www.unicode.org/reports/tr11/ Returns UEastAsianWidth values. @stable ICU 2.2 */ UCHAR_EAST_ASIAN_WIDTH=0x1004, /** Enumerated property General_Category. Same as u_charType, returns UCharCategory values. @stable ICU 2.2 */ UCHAR_GENERAL_CATEGORY=0x1005, /** Enumerated property Joining_Group. Returns UJoiningGroup values. @stable ICU 2.2 */ UCHAR_JOINING_GROUP=0x1006, /** Enumerated property Joining_Type. Returns UJoiningType values. @stable ICU 2.2 */ UCHAR_JOINING_TYPE=0x1007, /** Enumerated property Line_Break. Returns ULineBreak values. @stable ICU 2.2 */ UCHAR_LINE_BREAK=0x1008, /** Enumerated property Numeric_Type. Returns UNumericType values. @stable ICU 2.2 */ UCHAR_NUMERIC_TYPE=0x1009, /** Enumerated property Script. Same as uscript_getScript, returns UScriptCode values. @stable ICU 2.2 */ UCHAR_SCRIPT=0x100A, /** Enumerated property Hangul_Syllable_Type, new in Unicode 4. Returns UHangulSyllableType values. @stable ICU 2.6 */ UCHAR_HANGUL_SYLLABLE_TYPE=0x100B, /** Enumerated property NFD_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ UCHAR_NFD_QUICK_CHECK=0x100C, /** Enumerated property NFKD_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ UCHAR_NFKD_QUICK_CHECK=0x100D, /** Enumerated property NFC_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ UCHAR_NFC_QUICK_CHECK=0x100E, /** Enumerated property NFKC_Quick_Check. Returns UNormalizationCheckResult values. 
@stable ICU 3.0 */ UCHAR_NFKC_QUICK_CHECK=0x100F, /** Enumerated property Lead_Canonical_Combining_Class. ICU-specific property for the ccc of the first code point of the decomposition, or lccc(c)=ccc(NFD(c)[0]). Useful for checking for canonically ordered text; see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ UCHAR_LEAD_CANONICAL_COMBINING_CLASS=0x1010, /** Enumerated property Trail_Canonical_Combining_Class. ICU-specific property for the ccc of the last code point of the decomposition, or tccc(c)=ccc(NFD(c)[last]). Useful for checking for canonically ordered text; see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ UCHAR_TRAIL_CANONICAL_COMBINING_CLASS=0x1011, /** Enumerated property Grapheme_Cluster_Break (new in Unicode 4.1). Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) Returns UGraphemeClusterBreak values. @stable ICU 3.4 */ UCHAR_GRAPHEME_CLUSTER_BREAK=0x1012, /** Enumerated property Sentence_Break (new in Unicode 4.1). Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) Returns USentenceBreak values. @stable ICU 3.4 */ UCHAR_SENTENCE_BREAK=0x1013, /** Enumerated property Word_Break (new in Unicode 4.1). Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) Returns UWordBreakValues values. @stable ICU 3.4 */ UCHAR_WORD_BREAK=0x1014, /** Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). Used in UAX #9: Unicode Bidirectional Algorithm (http://www.unicode.org/reports/tr9/) Returns UBidiPairedBracketType values. @stable ICU 52 */ UCHAR_BIDI_PAIRED_BRACKET_TYPE=0x1015, /** One more than the last constant for enumerated/integer Unicode properties. @stable ICU 2.2 */ UCHAR_INT_LIMIT=0x1016, /** Bitmask property General_Category_Mask. This is the General_Category property returned as a bit mask. 
When used in u_getIntPropertyValue(c), same as U_MASK(u_charType(c)), returns bit masks for UCharCategory values where exactly one bit is set. When used with u_getPropertyValueName() and u_getPropertyValueEnum(), a multi-bit mask is used for sets of categories like "Letters". Mask values should be cast to uint32_t. @stable ICU 2.4 */ UCHAR_GENERAL_CATEGORY_MASK=0x2000, /** First constant for bit-mask Unicode properties. @stable ICU 2.4 */ UCHAR_MASK_START=UCHAR_GENERAL_CATEGORY_MASK, /** One more than the last constant for bit-mask Unicode properties. @stable ICU 2.4 */ UCHAR_MASK_LIMIT=0x2001, /** Double property Numeric_Value. Corresponds to u_getNumericValue. @stable ICU 2.4 */ UCHAR_NUMERIC_VALUE=0x3000, /** First constant for double Unicode properties. @stable ICU 2.4 */ UCHAR_DOUBLE_START=UCHAR_NUMERIC_VALUE, /** One more than the last constant for double Unicode properties. @stable ICU 2.4 */ UCHAR_DOUBLE_LIMIT=0x3001, /** String property Age. Corresponds to u_charAge. @stable ICU 2.4 */ UCHAR_AGE=0x4000, /** First constant for string Unicode properties. @stable ICU 2.4 */ UCHAR_STRING_START=UCHAR_AGE, /** String property Bidi_Mirroring_Glyph. Corresponds to u_charMirror. @stable ICU 2.4 */ UCHAR_BIDI_MIRRORING_GLYPH=0x4001, /** String property Case_Folding. Corresponds to u_strFoldCase in ustring.h. @stable ICU 2.4 */ UCHAR_CASE_FOLDING=0x4002, /** String property Lowercase_Mapping. Corresponds to u_strToLower in ustring.h. @stable ICU 2.4 */ UCHAR_LOWERCASE_MAPPING=0x4004, /** String property Name. Corresponds to u_charName. @stable ICU 2.4 */ UCHAR_NAME=0x4005, /** String property Simple_Case_Folding. Corresponds to u_foldCase. @stable ICU 2.4 */ UCHAR_SIMPLE_CASE_FOLDING=0x4006, /** String property Simple_Lowercase_Mapping. Corresponds to u_tolower. @stable ICU 2.4 */ UCHAR_SIMPLE_LOWERCASE_MAPPING=0x4007, /** String property Simple_Titlecase_Mapping. Corresponds to u_totitle. 
@stable ICU 2.4 */ UCHAR_SIMPLE_TITLECASE_MAPPING=0x4008, /** String property Simple_Uppercase_Mapping. Corresponds to u_toupper. @stable ICU 2.4 */ UCHAR_SIMPLE_UPPERCASE_MAPPING=0x4009, /** String property Titlecase_Mapping. Corresponds to u_strToTitle in ustring.h. @stable ICU 2.4 */ UCHAR_TITLECASE_MAPPING=0x400A, /** String property Uppercase_Mapping. Corresponds to u_strToUpper in ustring.h. @stable ICU 2.4 */ UCHAR_UPPERCASE_MAPPING=0x400C, /** String property Bidi_Paired_Bracket (new in Unicode 6.3). Corresponds to u_getBidiPairedBracket. @stable ICU 52 */ UCHAR_BIDI_PAIRED_BRACKET=0x400D, /** One more than the last constant for string Unicode properties. @stable ICU 2.4 */ UCHAR_STRING_LIMIT=0x400E, /** Miscellaneous property Script_Extensions (new in Unicode 6.0). Some characters are commonly used in multiple scripts. For more information, see UAX #24: http://www.unicode.org/reports/tr24/. Corresponds to uscript_hasScript and uscript_getScriptExtensions in uscript.h. @stable ICU 4.6 */ UCHAR_SCRIPT_EXTENSIONS=0x7000, /** First constant for Unicode properties with unusual value types. @stable ICU 4.6 */ UCHAR_OTHER_PROPERTY_START=UCHAR_SCRIPT_EXTENSIONS, /** One more than the last constant for Unicode properties with unusual value types. * @stable ICU 4.6 */ UCHAR_OTHER_PROPERTY_LIMIT=0x7001, /** Represents a nonexistent or invalid property or property value. @stable ICU 2.4 */ UCHAR_INVALID_CODE = -1 } UProperty; /** * Data for enumerated Unicode general category types. * See http://www.unicode.org/Public/UNIDATA/UnicodeData.html . * @stable ICU 2.0 */ typedef enum UCharCategory { /* * Note: UCharCategory constants and their API comments are parsed by preparseucd.py. * It matches pairs of lines like * / ** comment... * / * U_<[A-Z_]+> = , */ /** Non-category for unassigned and non-character code points. @stable ICU 2.0 */ U_UNASSIGNED = 0, /** Cn "Other, Not Assigned (no characters in [UnicodeData.txt] have this property)" (same as U_UNASSIGNED!) 
@stable ICU 2.0 */
    U_GENERAL_OTHER_TYPES = 0,
    /* The two-letter codes in the API comments below (Lu, Ll, ...) are the
       general category abbreviations; the same letters appear in the names of
       the U_GC_xx_MASK macros that are built from these constants. */
    /** Lu @stable ICU 2.0 */
    U_UPPERCASE_LETTER = 1,
    /** Ll @stable ICU 2.0 */
    U_LOWERCASE_LETTER = 2,
    /** Lt @stable ICU 2.0 */
    U_TITLECASE_LETTER = 3,
    /** Lm @stable ICU 2.0 */
    U_MODIFIER_LETTER = 4,
    /** Lo @stable ICU 2.0 */
    U_OTHER_LETTER = 5,
    /** Mn @stable ICU 2.0 */
    U_NON_SPACING_MARK = 6,
    /** Me @stable ICU 2.0 */
    U_ENCLOSING_MARK = 7,
    /** Mc @stable ICU 2.0 */
    U_COMBINING_SPACING_MARK = 8,
    /** Nd @stable ICU 2.0 */
    U_DECIMAL_DIGIT_NUMBER = 9,
    /** Nl @stable ICU 2.0 */
    U_LETTER_NUMBER = 10,
    /** No @stable ICU 2.0 */
    U_OTHER_NUMBER = 11,
    /** Zs @stable ICU 2.0 */
    U_SPACE_SEPARATOR = 12,
    /** Zl @stable ICU 2.0 */
    U_LINE_SEPARATOR = 13,
    /** Zp @stable ICU 2.0 */
    U_PARAGRAPH_SEPARATOR = 14,
    /** Cc @stable ICU 2.0 */
    U_CONTROL_CHAR = 15,
    /** Cf @stable ICU 2.0 */
    U_FORMAT_CHAR = 16,
    /** Co @stable ICU 2.0 */
    U_PRIVATE_USE_CHAR = 17,
    /** Cs @stable ICU 2.0 */
    U_SURROGATE = 18,
    /** Pd @stable ICU 2.0 */
    U_DASH_PUNCTUATION = 19,
    /** Ps @stable ICU 2.0 */
    U_START_PUNCTUATION = 20,
    /** Pe @stable ICU 2.0 */
    U_END_PUNCTUATION = 21,
    /** Pc @stable ICU 2.0 */
    U_CONNECTOR_PUNCTUATION = 22,
    /** Po @stable ICU 2.0 */
    U_OTHER_PUNCTUATION = 23,
    /** Sm @stable ICU 2.0 */
    U_MATH_SYMBOL = 24,
    /** Sc @stable ICU 2.0 */
    U_CURRENCY_SYMBOL = 25,
    /** Sk @stable ICU 2.0 */
    U_MODIFIER_SYMBOL = 26,
    /** So @stable ICU 2.0 */
    U_OTHER_SYMBOL = 27,
    /** Pi @stable ICU 2.0 */
    U_INITIAL_PUNCTUATION = 28,
    /** Pf @stable ICU 2.0 */
    U_FINAL_PUNCTUATION = 29,
    /* No explicit value: this enumerator follows U_FINAL_PUNCTUATION = 29 and
       therefore evaluates to 30. Every category value is thus below 32, which
       keeps each of them inside the 0..31 bit-number range of U_MASK. */
    /** One higher than the last enum UCharCategory constant. @stable ICU 2.0 */
    U_CHAR_CATEGORY_COUNT
} UCharCategory;

/**
 * U_GC_XX_MASK constants are bit flags corresponding to Unicode
 * general category values.
 * For each category, the nth bit is set if the numeric value of the
 * corresponding UCharCategory constant is n.
 *
 * There are also some U_GC_Y_MASK constants for groups of general categories
 * like L for all letter categories.
* * @see u_charType * @see U_GET_GC_MASK * @see UCharCategory * @stable ICU 2.1 */ #define U_GC_CN_MASK U_MASK(U_GENERAL_OTHER_TYPES) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LU_MASK U_MASK(U_UPPERCASE_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LL_MASK U_MASK(U_LOWERCASE_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LT_MASK U_MASK(U_TITLECASE_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LM_MASK U_MASK(U_MODIFIER_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LO_MASK U_MASK(U_OTHER_LETTER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_MN_MASK U_MASK(U_NON_SPACING_MARK) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ME_MASK U_MASK(U_ENCLOSING_MARK) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_MC_MASK U_MASK(U_COMBINING_SPACING_MARK) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ND_MASK U_MASK(U_DECIMAL_DIGIT_NUMBER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_NL_MASK U_MASK(U_LETTER_NUMBER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_NO_MASK U_MASK(U_OTHER_NUMBER) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ZS_MASK U_MASK(U_SPACE_SEPARATOR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ZL_MASK U_MASK(U_LINE_SEPARATOR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ZP_MASK U_MASK(U_PARAGRAPH_SEPARATOR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_CC_MASK U_MASK(U_CONTROL_CHAR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_CF_MASK U_MASK(U_FORMAT_CHAR) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_CO_MASK U_MASK(U_PRIVATE_USE_CHAR) /** Mask constant for a UCharCategory. 
@stable ICU 2.1 */ #define U_GC_CS_MASK U_MASK(U_SURROGATE) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PD_MASK U_MASK(U_DASH_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PS_MASK U_MASK(U_START_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PE_MASK U_MASK(U_END_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PC_MASK U_MASK(U_CONNECTOR_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PO_MASK U_MASK(U_OTHER_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SM_MASK U_MASK(U_MATH_SYMBOL) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SC_MASK U_MASK(U_CURRENCY_SYMBOL) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SK_MASK U_MASK(U_MODIFIER_SYMBOL) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SO_MASK U_MASK(U_OTHER_SYMBOL) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PI_MASK U_MASK(U_INITIAL_PUNCTUATION) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PF_MASK U_MASK(U_FINAL_PUNCTUATION) /** Mask constant for multiple UCharCategory bits (L Letters). @stable ICU 2.1 */ #define U_GC_L_MASK \ (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK|U_GC_LM_MASK|U_GC_LO_MASK) /** Mask constant for multiple UCharCategory bits (LC Cased Letters). @stable ICU 2.1 */ #define U_GC_LC_MASK \ (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK) /** Mask constant for multiple UCharCategory bits (M Marks). @stable ICU 2.1 */ #define U_GC_M_MASK (U_GC_MN_MASK|U_GC_ME_MASK|U_GC_MC_MASK) /** Mask constant for multiple UCharCategory bits (N Numbers). @stable ICU 2.1 */ #define U_GC_N_MASK (U_GC_ND_MASK|U_GC_NL_MASK|U_GC_NO_MASK) /** Mask constant for multiple UCharCategory bits (Z Separators). 
@stable ICU 2.1 */
#define U_GC_Z_MASK (U_GC_ZS_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK)

/** Mask constant for multiple UCharCategory bits (C Others). @stable ICU 2.1 */
#define U_GC_C_MASK \
            (U_GC_CN_MASK|U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CO_MASK|U_GC_CS_MASK)

/**
 * Mask constant for multiple UCharCategory bits (P Punctuation).
 * Combines all seven punctuation masks (PD, PS, PE, PC, PO, PI, PF).
 * @stable ICU 2.1
 */
#define U_GC_P_MASK \
            (U_GC_PD_MASK|U_GC_PS_MASK|U_GC_PE_MASK|U_GC_PC_MASK|U_GC_PO_MASK| \
             U_GC_PI_MASK|U_GC_PF_MASK)

/** Mask constant for multiple UCharCategory bits (S Symbols). @stable ICU 2.1 */
#define U_GC_S_MASK (U_GC_SM_MASK|U_GC_SC_MASK|U_GC_SK_MASK|U_GC_SO_MASK)

/**
 * This specifies the language directional property of a character set.
 *
 * Every enumerator except the trailing count carries an explicit value;
 * the values run contiguously from 0 (U_LEFT_TO_RIGHT) to 22
 * (U_POP_DIRECTIONAL_ISOLATE). The abbreviation that opens each
 * enumerator's comment (L, R, EN, ...) is presumably the Unicode
 * Bidi_Class short name -- TODO confirm against the Unicode data files.
 * @stable ICU 2.0
 */
typedef enum UCharDirection {
    /*
     * Note: UCharDirection constants and their API comments are parsed by preparseucd.py.
     * It matches pairs of lines like
     *     / ** <Unicode property value name> comment... * /
     *     U_<[A-Z_]+> = <integer>,
     *
     * Keep each "comment line + constant line" pair in exactly that shape.
     */

    /** L @stable ICU 2.0 */
    U_LEFT_TO_RIGHT = 0,
    /** R @stable ICU 2.0 */
    U_RIGHT_TO_LEFT = 1,
    /** EN @stable ICU 2.0 */
    U_EUROPEAN_NUMBER = 2,
    /** ES @stable ICU 2.0 */
    U_EUROPEAN_NUMBER_SEPARATOR = 3,
    /** ET @stable ICU 2.0 */
    U_EUROPEAN_NUMBER_TERMINATOR = 4,
    /** AN @stable ICU 2.0 */
    U_ARABIC_NUMBER = 5,
    /** CS @stable ICU 2.0 */
    U_COMMON_NUMBER_SEPARATOR = 6,
    /** B @stable ICU 2.0 */
    U_BLOCK_SEPARATOR = 7,
    /** S @stable ICU 2.0 */
    U_SEGMENT_SEPARATOR = 8,
    /** WS @stable ICU 2.0 */
    U_WHITE_SPACE_NEUTRAL = 9,
    /** ON @stable ICU 2.0 */
    U_OTHER_NEUTRAL = 10,
    /** LRE @stable ICU 2.0 */
    U_LEFT_TO_RIGHT_EMBEDDING = 11,
    /** LRO @stable ICU 2.0 */
    U_LEFT_TO_RIGHT_OVERRIDE = 12,
    /** AL @stable ICU 2.0 */
    U_RIGHT_TO_LEFT_ARABIC = 13,
    /** RLE @stable ICU 2.0 */
    U_RIGHT_TO_LEFT_EMBEDDING = 14,
    /** RLO @stable ICU 2.0 */
    U_RIGHT_TO_LEFT_OVERRIDE = 15,
    /** PDF @stable ICU 2.0 */
    U_POP_DIRECTIONAL_FORMAT = 16,
    /** NSM @stable ICU 2.0 */
    U_DIR_NON_SPACING_MARK = 17,
    /** BN @stable ICU 2.0 */
    U_BOUNDARY_NEUTRAL = 18,
    /** FSI @stable ICU 52 */
    U_FIRST_STRONG_ISOLATE = 19,
    /** LRI @stable ICU 52 */
    U_LEFT_TO_RIGHT_ISOLATE = 20,
    /** RLI @stable ICU 52 */
    U_RIGHT_TO_LEFT_ISOLATE = 21,
    /** PDI @stable ICU 52 */
    U_POP_DIRECTIONAL_ISOLATE = 22,
    /* No explicit value: evaluates to 23, one more than the last constant above. */
    /** @stable ICU 2.0 */
    U_CHAR_DIRECTION_COUNT
} UCharDirection;

/**
 * Bidi Paired Bracket Type constants.
 *
 * The enumerators have no explicit values, so they number 0, 1, 2 in
 * declaration order, followed by the count.
 *
 * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE
 * @stable ICU 52
 */
typedef enum UBidiPairedBracketType {
    /*
     * Note: UBidiPairedBracketType constants are parsed by preparseucd.py.
     * It matches lines like
     *     U_BPT_<Unicode Bidi_Paired_Bracket_Type value name>
     */

    /** Not a paired bracket. @stable ICU 52 */
    U_BPT_NONE,
    /** Open paired bracket. @stable ICU 52 */
    U_BPT_OPEN,
    /** Close paired bracket. @stable ICU 52 */
    U_BPT_CLOSE,
    /** @stable ICU 52 */
    U_BPT_COUNT /* 3 */
} UBidiPairedBracketType;

/**
 * Constants for Unicode blocks, see the Unicode Data file Blocks.txt
 *
 * Values are assigned in the order the constants were added to this list
 * (see the "New blocks in Unicode x.y" markers below), not in code point
 * order. The bracketed hexadecimal tag after each constant is presumably
 * the first code point of the block -- TODO confirm against Blocks.txt.
 *
 * Two names are aliases and do not introduce new values:
 * UBLOCK_PRIVATE_USE (== UBLOCK_PRIVATE_USE_AREA) and
 * UBLOCK_CYRILLIC_SUPPLEMENTARY (== UBLOCK_CYRILLIC_SUPPLEMENT).
 * @stable ICU 2.0
 */
enum UBlockCode {
    /*
     * Note: UBlockCode constants are parsed by preparseucd.py.
     * It matches lines like
     *     UBLOCK_<Unicode Blocks.txt property value name> = <integer>,
     */

    /** New No_Block value in Unicode 4. @stable ICU 2.6 */
    UBLOCK_NO_BLOCK = 0, /*[none]*/ /* Special range indicating No_Block */

    /** @stable ICU 2.0 */
    UBLOCK_BASIC_LATIN = 1, /*[0000]*/

    /** @stable ICU 2.0 */
    UBLOCK_LATIN_1_SUPPLEMENT=2, /*[0080]*/

    /** @stable ICU 2.0 */
    UBLOCK_LATIN_EXTENDED_A =3, /*[0100]*/

    /** @stable ICU 2.0 */
    UBLOCK_LATIN_EXTENDED_B =4, /*[0180]*/

    /** @stable ICU 2.0 */
    UBLOCK_IPA_EXTENSIONS =5, /*[0250]*/

    /** @stable ICU 2.0 */
    UBLOCK_SPACING_MODIFIER_LETTERS =6, /*[02B0]*/

    /** @stable ICU 2.0 */
    UBLOCK_COMBINING_DIACRITICAL_MARKS =7, /*[0300]*/

    /**
     * Unicode 3.2 renames this block to "Greek and Coptic".
     * @stable ICU 2.0
     */
    UBLOCK_GREEK =8, /*[0370]*/

    /** @stable ICU 2.0 */
    UBLOCK_CYRILLIC =9, /*[0400]*/

    /** @stable ICU 2.0 */
    UBLOCK_ARMENIAN =10, /*[0530]*/

    /** @stable ICU 2.0 */
    UBLOCK_HEBREW =11, /*[0590]*/

    /** @stable ICU 2.0 */
    UBLOCK_ARABIC =12, /*[0600]*/

    /** @stable ICU 2.0 */
    UBLOCK_SYRIAC =13, /*[0700]*/

    /** @stable ICU 2.0 */
    UBLOCK_THAANA =14, /*[0780]*/

    /** @stable ICU 2.0 */
    UBLOCK_DEVANAGARI =15, /*[0900]*/

    /** @stable ICU 2.0 */
    UBLOCK_BENGALI =16, /*[0980]*/

    /** @stable ICU 2.0 */
    UBLOCK_GURMUKHI =17, /*[0A00]*/

    /** @stable ICU 2.0 */
    UBLOCK_GUJARATI =18, /*[0A80]*/

    /** @stable ICU 2.0 */
    UBLOCK_ORIYA =19, /*[0B00]*/

    /** @stable ICU 2.0 */
    UBLOCK_TAMIL =20, /*[0B80]*/

    /** @stable ICU 2.0 */
    UBLOCK_TELUGU =21, /*[0C00]*/

    /** @stable ICU 2.0 */
    UBLOCK_KANNADA =22, /*[0C80]*/

    /** @stable ICU 2.0 */
    UBLOCK_MALAYALAM =23, /*[0D00]*/

    /** @stable ICU 2.0 */
    UBLOCK_SINHALA =24, /*[0D80]*/

    /** @stable ICU 2.0 */
    UBLOCK_THAI =25, /*[0E00]*/

    /** @stable ICU 2.0 */
    UBLOCK_LAO =26, /*[0E80]*/

    /** @stable ICU 2.0 */
    UBLOCK_TIBETAN =27, /*[0F00]*/

    /** @stable ICU 2.0 */
    UBLOCK_MYANMAR =28, /*[1000]*/

    /** @stable ICU 2.0 */
    UBLOCK_GEORGIAN =29, /*[10A0]*/

    /** @stable ICU 2.0 */
    UBLOCK_HANGUL_JAMO =30, /*[1100]*/

    /** @stable ICU 2.0 */
    UBLOCK_ETHIOPIC =31, /*[1200]*/

    /** @stable ICU 2.0 */
    UBLOCK_CHEROKEE =32, /*[13A0]*/

    /** @stable ICU 2.0 */
    UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS =33, /*[1400]*/

    /** @stable ICU 2.0 */
    UBLOCK_OGHAM =34, /*[1680]*/

    /** @stable ICU 2.0 */
    UBLOCK_RUNIC =35, /*[16A0]*/

    /** @stable ICU 2.0 */
    UBLOCK_KHMER =36, /*[1780]*/

    /** @stable ICU 2.0 */
    UBLOCK_MONGOLIAN =37, /*[1800]*/

    /** @stable ICU 2.0 */
    UBLOCK_LATIN_EXTENDED_ADDITIONAL =38, /*[1E00]*/

    /** @stable ICU 2.0 */
    UBLOCK_GREEK_EXTENDED =39, /*[1F00]*/

    /** @stable ICU 2.0 */
    UBLOCK_GENERAL_PUNCTUATION =40, /*[2000]*/

    /** @stable ICU 2.0 */
    UBLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS =41, /*[2070]*/

    /** @stable ICU 2.0 */
    UBLOCK_CURRENCY_SYMBOLS =42, /*[20A0]*/

    /**
     * Unicode 3.2 renames this block to "Combining Diacritical Marks for Symbols".
     * @stable ICU 2.0
     */
    UBLOCK_COMBINING_MARKS_FOR_SYMBOLS =43, /*[20D0]*/

    /** @stable ICU 2.0 */
    UBLOCK_LETTERLIKE_SYMBOLS =44, /*[2100]*/

    /** @stable ICU 2.0 */
    UBLOCK_NUMBER_FORMS =45, /*[2150]*/

    /** @stable ICU 2.0 */
    UBLOCK_ARROWS =46, /*[2190]*/

    /** @stable ICU 2.0 */
    UBLOCK_MATHEMATICAL_OPERATORS =47, /*[2200]*/

    /** @stable ICU 2.0 */
    UBLOCK_MISCELLANEOUS_TECHNICAL =48, /*[2300]*/

    /** @stable ICU 2.0 */
    UBLOCK_CONTROL_PICTURES =49, /*[2400]*/

    /** @stable ICU 2.0 */
    UBLOCK_OPTICAL_CHARACTER_RECOGNITION =50, /*[2440]*/

    /** @stable ICU 2.0 */
    UBLOCK_ENCLOSED_ALPHANUMERICS =51, /*[2460]*/

    /** @stable ICU 2.0 */
    UBLOCK_BOX_DRAWING =52, /*[2500]*/

    /** @stable ICU 2.0 */
    UBLOCK_BLOCK_ELEMENTS =53, /*[2580]*/

    /** @stable ICU 2.0 */
    UBLOCK_GEOMETRIC_SHAPES =54, /*[25A0]*/

    /** @stable ICU 2.0 */
    UBLOCK_MISCELLANEOUS_SYMBOLS =55, /*[2600]*/

    /** @stable ICU 2.0 */
    UBLOCK_DINGBATS =56, /*[2700]*/

    /** @stable ICU 2.0 */
    UBLOCK_BRAILLE_PATTERNS =57, /*[2800]*/

    /** @stable ICU 2.0 */
    UBLOCK_CJK_RADICALS_SUPPLEMENT =58, /*[2E80]*/

    /** @stable ICU 2.0 */
    UBLOCK_KANGXI_RADICALS =59, /*[2F00]*/

    /** @stable ICU 2.0 */
    UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS =60, /*[2FF0]*/

    /** @stable ICU 2.0 */
    UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION =61, /*[3000]*/

    /** @stable ICU 2.0 */
    UBLOCK_HIRAGANA =62, /*[3040]*/

    /** @stable ICU 2.0 */
    UBLOCK_KATAKANA =63, /*[30A0]*/

    /** @stable ICU 2.0 */
    UBLOCK_BOPOMOFO =64, /*[3100]*/

    /** @stable ICU 2.0 */
    UBLOCK_HANGUL_COMPATIBILITY_JAMO =65, /*[3130]*/

    /** @stable ICU 2.0 */
    UBLOCK_KANBUN =66, /*[3190]*/

    /** @stable ICU 2.0 */
    UBLOCK_BOPOMOFO_EXTENDED =67, /*[31A0]*/

    /** @stable ICU 2.0 */
    UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS =68, /*[3200]*/

    /** @stable ICU 2.0 */
    UBLOCK_CJK_COMPATIBILITY =69, /*[3300]*/

    /** @stable ICU 2.0 */
    UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A =70, /*[3400]*/

    /** @stable ICU 2.0 */
    UBLOCK_CJK_UNIFIED_IDEOGRAPHS =71, /*[4E00]*/

    /** @stable ICU 2.0 */
    UBLOCK_YI_SYLLABLES =72, /*[A000]*/

    /** @stable ICU 2.0 */
    UBLOCK_YI_RADICALS =73, /*[A490]*/

    /** @stable ICU 2.0 */
    UBLOCK_HANGUL_SYLLABLES =74, /*[AC00]*/

    /** @stable ICU 2.0 */
    UBLOCK_HIGH_SURROGATES =75, /*[D800]*/

    /** @stable ICU 2.0 */
    UBLOCK_HIGH_PRIVATE_USE_SURROGATES =76, /*[DB80]*/

    /** @stable ICU 2.0 */
    UBLOCK_LOW_SURROGATES =77, /*[DC00]*/

    /**
     * Same as UBLOCK_PRIVATE_USE.
     * Until Unicode 3.1.1, the corresponding block name was "Private Use",
     * and multiple code point ranges had this block.
     * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area" and
     * adds separate blocks for the supplementary PUAs.
     *
     * @stable ICU 2.0
     */
    UBLOCK_PRIVATE_USE_AREA =78, /*[E000]*/
    /**
     * Same as UBLOCK_PRIVATE_USE_AREA.
     * Until Unicode 3.1.1, the corresponding block name was "Private Use",
     * and multiple code point ranges had this block.
     * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area" and
     * adds separate blocks for the supplementary PUAs.
     *
     * @stable ICU 2.0
     */
    UBLOCK_PRIVATE_USE = UBLOCK_PRIVATE_USE_AREA,

    /** @stable ICU 2.0 */
    UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS =79, /*[F900]*/

    /** @stable ICU 2.0 */
    UBLOCK_ALPHABETIC_PRESENTATION_FORMS =80, /*[FB00]*/

    /** @stable ICU 2.0 */
    UBLOCK_ARABIC_PRESENTATION_FORMS_A =81, /*[FB50]*/

    /** @stable ICU 2.0 */
    UBLOCK_COMBINING_HALF_MARKS =82, /*[FE20]*/

    /** @stable ICU 2.0 */
    UBLOCK_CJK_COMPATIBILITY_FORMS =83, /*[FE30]*/

    /** @stable ICU 2.0 */
    UBLOCK_SMALL_FORM_VARIANTS =84, /*[FE50]*/

    /** @stable ICU 2.0 */
    UBLOCK_ARABIC_PRESENTATION_FORMS_B =85, /*[FE70]*/

    /* The next two tags ([FFF0] before [FF00]) are not in ascending order; values are as found. */
    /** @stable ICU 2.0 */
    UBLOCK_SPECIALS =86, /*[FFF0]*/

    /** @stable ICU 2.0 */
    UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS =87, /*[FF00]*/

    /* New blocks in Unicode 3.1 */

    /** @stable ICU 2.0 */
    UBLOCK_OLD_ITALIC = 88, /*[10300]*/
    /** @stable ICU 2.0 */
    UBLOCK_GOTHIC = 89, /*[10330]*/
    /** @stable ICU 2.0 */
    UBLOCK_DESERET = 90, /*[10400]*/
    /** @stable ICU 2.0 */
    UBLOCK_BYZANTINE_MUSICAL_SYMBOLS = 91, /*[1D000]*/
    /** @stable ICU 2.0 */
    UBLOCK_MUSICAL_SYMBOLS = 92, /*[1D100]*/
    /** @stable ICU 2.0 */
    UBLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93, /*[1D400]*/
    /** @stable ICU 2.0 */
    UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94, /*[20000]*/
    /** @stable ICU 2.0 */
    UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95, /*[2F800]*/
    /** @stable ICU 2.0 */
    UBLOCK_TAGS = 96, /*[E0000]*/

    /* New blocks in Unicode 3.2 */

    /** @stable ICU 3.0 */
    UBLOCK_CYRILLIC_SUPPLEMENT = 97, /*[0500]*/
    /**
     * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
     * @stable ICU 2.2
     */
    UBLOCK_CYRILLIC_SUPPLEMENTARY = UBLOCK_CYRILLIC_SUPPLEMENT,
    /** @stable ICU 2.2 */
    UBLOCK_TAGALOG = 98, /*[1700]*/
    /** @stable ICU 2.2 */
    UBLOCK_HANUNOO = 99, /*[1720]*/
    /** @stable ICU 2.2 */
    UBLOCK_BUHID = 100, /*[1740]*/
    /** @stable ICU 2.2 */
    UBLOCK_TAGBANWA = 101, /*[1760]*/
    /** @stable ICU 2.2 */
    UBLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 102, /*[27C0]*/
    /** @stable ICU 2.2 */
    UBLOCK_SUPPLEMENTAL_ARROWS_A = 103, /*[27F0]*/
    /** @stable ICU 2.2 */
    UBLOCK_SUPPLEMENTAL_ARROWS_B = 104, /*[2900]*/
    /** @stable ICU 2.2 */
    UBLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 105, /*[2980]*/
    /** @stable ICU 2.2 */
    UBLOCK_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 106, /*[2A00]*/
    /** @stable ICU 2.2 */
    UBLOCK_KATAKANA_PHONETIC_EXTENSIONS = 107, /*[31F0]*/
    /** @stable ICU 2.2 */
    UBLOCK_VARIATION_SELECTORS = 108, /*[FE00]*/
    /** @stable ICU 2.2 */
    UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 109, /*[F0000]*/
    /** @stable ICU 2.2 */
    UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 110, /*[100000]*/

    /* New blocks in Unicode 4 */

    /** @stable ICU 2.6 */
    UBLOCK_LIMBU = 111, /*[1900]*/
    /** @stable ICU 2.6 */
    UBLOCK_TAI_LE = 112, /*[1950]*/
    /** @stable ICU 2.6 */
    UBLOCK_KHMER_SYMBOLS = 113, /*[19E0]*/
    /** @stable ICU 2.6 */
    UBLOCK_PHONETIC_EXTENSIONS = 114, /*[1D00]*/
    /** @stable ICU 2.6 */
    UBLOCK_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 115, /*[2B00]*/
    /** @stable ICU 2.6 */
    UBLOCK_YIJING_HEXAGRAM_SYMBOLS = 116, /*[4DC0]*/
    /** @stable ICU 2.6 */
    UBLOCK_LINEAR_B_SYLLABARY = 117, /*[10000]*/
    /** @stable ICU 2.6 */
    UBLOCK_LINEAR_B_IDEOGRAMS = 118, /*[10080]*/
    /** @stable ICU 2.6 */
    UBLOCK_AEGEAN_NUMBERS = 119, /*[10100]*/
    /** @stable ICU 2.6 */
    UBLOCK_UGARITIC = 120, /*[10380]*/
    /** @stable ICU 2.6 */
    UBLOCK_SHAVIAN = 121, /*[10450]*/
    /** @stable ICU 2.6 */
    UBLOCK_OSMANYA = 122, /*[10480]*/
    /** @stable ICU 2.6 */
    UBLOCK_CYPRIOT_SYLLABARY = 123, /*[10800]*/
    /** @stable ICU 2.6 */
    UBLOCK_TAI_XUAN_JING_SYMBOLS = 124, /*[1D300]*/
    /** @stable ICU 2.6 */
    UBLOCK_VARIATION_SELECTORS_SUPPLEMENT = 125, /*[E0100]*/

    /* New blocks in Unicode 4.1 */

    /** @stable ICU 3.4 */
    UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION = 126, /*[1D200]*/
    /** @stable ICU 3.4 */
    UBLOCK_ANCIENT_GREEK_NUMBERS = 127, /*[10140]*/
    /** @stable ICU 3.4 */
    UBLOCK_ARABIC_SUPPLEMENT = 128, /*[0750]*/
    /** @stable ICU 3.4 */
    UBLOCK_BUGINESE = 129, /*[1A00]*/
    /** @stable ICU 3.4 */
    UBLOCK_CJK_STROKES = 130, /*[31C0]*/
    /** @stable ICU 3.4 */
    UBLOCK_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 131, /*[1DC0]*/
    /** @stable ICU 3.4 */
    UBLOCK_COPTIC = 132, /*[2C80]*/
    /** @stable ICU 3.4 */
    UBLOCK_ETHIOPIC_EXTENDED = 133, /*[2D80]*/
    /** @stable ICU 3.4 */
    UBLOCK_ETHIOPIC_SUPPLEMENT = 134, /*[1380]*/
    /** @stable ICU 3.4 */
    UBLOCK_GEORGIAN_SUPPLEMENT = 135, /*[2D00]*/
    /** @stable ICU 3.4 */
    UBLOCK_GLAGOLITIC = 136, /*[2C00]*/
    /** @stable ICU 3.4 */
    UBLOCK_KHAROSHTHI = 137, /*[10A00]*/
    /** @stable ICU 3.4 */
    UBLOCK_MODIFIER_TONE_LETTERS = 138, /*[A700]*/
    /** @stable ICU 3.4 */
    UBLOCK_NEW_TAI_LUE = 139, /*[1980]*/
    /** @stable ICU 3.4 */
    UBLOCK_OLD_PERSIAN = 140, /*[103A0]*/
    /** @stable ICU 3.4 */
    UBLOCK_PHONETIC_EXTENSIONS_SUPPLEMENT = 141, /*[1D80]*/
    /** @stable ICU 3.4 */
    UBLOCK_SUPPLEMENTAL_PUNCTUATION = 142, /*[2E00]*/
    /** @stable ICU 3.4 */
    UBLOCK_SYLOTI_NAGRI = 143, /*[A800]*/
    /** @stable ICU 3.4 */
    UBLOCK_TIFINAGH = 144, /*[2D30]*/
    /** @stable ICU 3.4 */
    UBLOCK_VERTICAL_FORMS = 145, /*[FE10]*/

    /* New blocks in Unicode 5.0 */

    /** @stable ICU 3.6 */
    UBLOCK_NKO = 146, /*[07C0]*/
    /** @stable ICU 3.6 */
    UBLOCK_BALINESE = 147, /*[1B00]*/
    /** @stable ICU 3.6 */
    UBLOCK_LATIN_EXTENDED_C = 148, /*[2C60]*/
    /** @stable ICU 3.6 */
    UBLOCK_LATIN_EXTENDED_D = 149, /*[A720]*/
    /** @stable ICU 3.6 */
    UBLOCK_PHAGS_PA = 150, /*[A840]*/
    /** @stable ICU 3.6 */
    UBLOCK_PHOENICIAN = 151, /*[10900]*/
    /** @stable ICU 3.6 */
    UBLOCK_CUNEIFORM = 152, /*[12000]*/
    /** @stable ICU 3.6 */
    UBLOCK_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 153, /*[12400]*/
    /** @stable ICU 3.6 */
    UBLOCK_COUNTING_ROD_NUMERALS = 154, /*[1D360]*/

    /* New blocks in Unicode 5.1 */

    /** @stable ICU 4.0 */
    UBLOCK_SUNDANESE = 155, /*[1B80]*/
    /** @stable ICU 4.0 */
    UBLOCK_LEPCHA = 156, /*[1C00]*/
    /** @stable ICU 4.0 */
    UBLOCK_OL_CHIKI = 157, /*[1C50]*/
    /** @stable ICU 4.0 */
    UBLOCK_CYRILLIC_EXTENDED_A = 158, /*[2DE0]*/
    /** @stable ICU 4.0 */
    UBLOCK_VAI = 159, /*[A500]*/
    /** @stable ICU 4.0 */
    UBLOCK_CYRILLIC_EXTENDED_B = 160, /*[A640]*/
    /** @stable ICU 4.0 */
    UBLOCK_SAURASHTRA = 161, /*[A880]*/
    /** @stable ICU 4.0 */
    UBLOCK_KAYAH_LI = 162, /*[A900]*/
    /** @stable ICU 4.0 */
    UBLOCK_REJANG = 163, /*[A930]*/
    /** @stable ICU 4.0 */
    UBLOCK_CHAM = 164, /*[AA00]*/
    /** @stable ICU 4.0 */
    UBLOCK_ANCIENT_SYMBOLS = 165, /*[10190]*/
    /** @stable ICU 4.0 */
    UBLOCK_PHAISTOS_DISC = 166, /*[101D0]*/
    /** @stable ICU 4.0 */
    UBLOCK_LYCIAN = 167, /*[10280]*/
    /** @stable ICU 4.0 */
    UBLOCK_CARIAN = 168, /*[102A0]*/
    /** @stable ICU 4.0 */
    UBLOCK_LYDIAN = 169, /*[10920]*/
    /** @stable ICU 4.0 */
    UBLOCK_MAHJONG_TILES = 170, /*[1F000]*/
    /** @stable ICU 4.0 */
    UBLOCK_DOMINO_TILES = 171, /*[1F030]*/

    /* New blocks in Unicode 5.2 */

    /** @stable ICU 4.4 */
    UBLOCK_SAMARITAN = 172, /*[0800]*/
    /** @stable ICU 4.4 */
    UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 173, /*[18B0]*/
    /** @stable ICU 4.4 */
    UBLOCK_TAI_THAM = 174, /*[1A20]*/
    /** @stable ICU 4.4 */
    UBLOCK_VEDIC_EXTENSIONS = 175, /*[1CD0]*/
    /** @stable ICU 4.4 */
    UBLOCK_LISU = 176, /*[A4D0]*/
    /** @stable ICU 4.4 */
    UBLOCK_BAMUM = 177, /*[A6A0]*/
    /** @stable ICU 4.4 */
    UBLOCK_COMMON_INDIC_NUMBER_FORMS = 178, /*[A830]*/
    /** @stable ICU 4.4 */
    UBLOCK_DEVANAGARI_EXTENDED = 179, /*[A8E0]*/
    /** @stable ICU 4.4 */
    UBLOCK_HANGUL_JAMO_EXTENDED_A = 180, /*[A960]*/
    /** @stable ICU 4.4 */
    UBLOCK_JAVANESE = 181, /*[A980]*/
    /** @stable ICU 4.4 */
    UBLOCK_MYANMAR_EXTENDED_A = 182, /*[AA60]*/
    /** @stable ICU 4.4 */
    UBLOCK_TAI_VIET = 183, /*[AA80]*/
    /** @stable ICU 4.4 */
    UBLOCK_MEETEI_MAYEK = 184, /*[ABC0]*/
    /** @stable ICU 4.4 */
    UBLOCK_HANGUL_JAMO_EXTENDED_B = 185, /*[D7B0]*/
    /** @stable ICU 4.4 */
    UBLOCK_IMPERIAL_ARAMAIC = 186, /*[10840]*/
    /** @stable ICU 4.4 */
    UBLOCK_OLD_SOUTH_ARABIAN = 187, /*[10A60]*/
    /** @stable ICU 4.4 */
    UBLOCK_AVESTAN = 188, /*[10B00]*/
    /** @stable ICU 4.4 */
    UBLOCK_INSCRIPTIONAL_PARTHIAN = 189, /*[10B40]*/
    /** @stable ICU 4.4 */
    UBLOCK_INSCRIPTIONAL_PAHLAVI = 190, /*[10B60]*/
    /** @stable ICU 4.4 */
    UBLOCK_OLD_TURKIC = 191, /*[10C00]*/
    /** @stable ICU 4.4 */
    UBLOCK_RUMI_NUMERAL_SYMBOLS = 192, /*[10E60]*/
    /** @stable ICU 4.4 */
    UBLOCK_KAITHI = 193, /*[11080]*/
    /** @stable ICU 4.4 */
    UBLOCK_EGYPTIAN_HIEROGLYPHS = 194, /*[13000]*/
    /** @stable ICU 4.4 */
    UBLOCK_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 195, /*[1F100]*/
    /** @stable ICU 4.4 */
    UBLOCK_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 196, /*[1F200]*/
    /** @stable ICU 4.4 */
    UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 197, /*[2A700]*/

    /* New blocks in Unicode 6.0 */

    /** @stable ICU 4.6 */
    UBLOCK_MANDAIC = 198, /*[0840]*/
    /** @stable ICU 4.6 */
    UBLOCK_BATAK = 199, /*[1BC0]*/
    /** @stable ICU 4.6 */
    UBLOCK_ETHIOPIC_EXTENDED_A = 200, /*[AB00]*/
    /** @stable ICU 4.6 */
    UBLOCK_BRAHMI = 201, /*[11000]*/
    /** @stable ICU 4.6 */
    UBLOCK_BAMUM_SUPPLEMENT = 202, /*[16800]*/
    /** @stable ICU 4.6 */
    UBLOCK_KANA_SUPPLEMENT = 203, /*[1B000]*/
    /** @stable ICU 4.6 */
    UBLOCK_PLAYING_CARDS = 204, /*[1F0A0]*/
    /** @stable ICU 4.6 */
    UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 205, /*[1F300]*/
    /** @stable ICU 4.6 */
    UBLOCK_EMOTICONS = 206, /*[1F600]*/
    /** @stable ICU 4.6 */
    UBLOCK_TRANSPORT_AND_MAP_SYMBOLS = 207, /*[1F680]*/
    /** @stable ICU 4.6 */
    UBLOCK_ALCHEMICAL_SYMBOLS = 208, /*[1F700]*/
    /** @stable ICU 4.6 */
    UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 209, /*[2B740]*/

    /* New blocks in Unicode 6.1 */

    /** @stable ICU 49 */
    UBLOCK_ARABIC_EXTENDED_A = 210, /*[08A0]*/
    /** @stable ICU 49 */
    UBLOCK_ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS = 211, /*[1EE00]*/
    /** @stable ICU 49 */
    UBLOCK_CHAKMA = 212, /*[11100]*/
    /** @stable ICU 49 */
    UBLOCK_MEETEI_MAYEK_EXTENSIONS = 213, /*[AAE0]*/
    /** @stable ICU 49 */
    UBLOCK_MEROITIC_CURSIVE = 214, /*[109A0]*/
    /** @stable ICU 49 */
    UBLOCK_MEROITIC_HIEROGLYPHS = 215, /*[10980]*/
    /** @stable ICU 49 */
    UBLOCK_MIAO = 216, /*[16F00]*/
    /** @stable ICU 49 */
    UBLOCK_SHARADA = 217, /*[11180]*/
    /** @stable ICU 49 */
    UBLOCK_SORA_SOMPENG = 218, /*[110D0]*/
    /** @stable ICU 49 */
    UBLOCK_SUNDANESE_SUPPLEMENT = 219, /*[1CC0]*/
    /** @stable ICU 49 */
    UBLOCK_TAKRI = 220, /*[11680]*/

    /* New blocks in Unicode 7.0 */

    /** @stable ICU 54 */
    UBLOCK_BASSA_VAH = 221, /*[16AD0]*/
    /** @stable ICU 54 */
    UBLOCK_CAUCASIAN_ALBANIAN = 222, /*[10530]*/
    /** @stable ICU 54 */
    UBLOCK_COPTIC_EPACT_NUMBERS = 223, /*[102E0]*/
    /** @stable ICU 54 */
    UBLOCK_COMBINING_DIACRITICAL_MARKS_EXTENDED = 224, /*[1AB0]*/
    /** @stable ICU 54 */
    UBLOCK_DUPLOYAN = 225, /*[1BC00]*/
    /** @stable ICU 54 */
    UBLOCK_ELBASAN = 226, /*[10500]*/
    /** @stable ICU 54 */
    UBLOCK_GEOMETRIC_SHAPES_EXTENDED = 227, /*[1F780]*/
    /** @stable ICU 54 */
    UBLOCK_GRANTHA = 228, /*[11300]*/
    /** @stable ICU 54 */
    UBLOCK_KHOJKI = 229, /*[11200]*/
    /** @stable ICU 54 */
    UBLOCK_KHUDAWADI = 230, /*[112B0]*/
    /** @stable ICU 54 */
    UBLOCK_LATIN_EXTENDED_E = 231, /*[AB30]*/
    /** @stable ICU 54 */
    UBLOCK_LINEAR_A = 232, /*[10600]*/
    /** @stable ICU 54 */
    UBLOCK_MAHAJANI = 233, /*[11150]*/
    /** @stable ICU 54 */
    UBLOCK_MANICHAEAN = 234, /*[10AC0]*/
    /** @stable ICU 54 */
    UBLOCK_MENDE_KIKAKUI = 235, /*[1E800]*/
    /** @stable ICU 54 */
    UBLOCK_MODI = 236, /*[11600]*/
    /** @stable ICU 54 */
    UBLOCK_MRO = 237, /*[16A40]*/
    /** @stable ICU 54 */
    UBLOCK_MYANMAR_EXTENDED_B = 238, /*[A9E0]*/
    /** @stable ICU 54 */
    UBLOCK_NABATAEAN = 239, /*[10880]*/
    /** @stable ICU 54 */
    UBLOCK_OLD_NORTH_ARABIAN = 240, /*[10A80]*/
    /** @stable ICU 54 */
    UBLOCK_OLD_PERMIC = 241, /*[10350]*/
    /** @stable ICU 54 */
    UBLOCK_ORNAMENTAL_DINGBATS = 242, /*[1F650]*/
    /** @stable ICU 54 */
    UBLOCK_PAHAWH_HMONG = 243, /*[16B00]*/
    /** @stable ICU 54 */
    UBLOCK_PALMYRENE = 244, /*[10860]*/
    /** @stable ICU 54 */
    UBLOCK_PAU_CIN_HAU = 245, /*[11AC0]*/
    /** @stable ICU 54 */
    UBLOCK_PSALTER_PAHLAVI = 246, /*[10B80]*/
    /** @stable ICU 54 */
    UBLOCK_SHORTHAND_FORMAT_CONTROLS = 247, /*[1BCA0]*/
    /** @stable ICU 54 */
    UBLOCK_SIDDHAM = 248, /*[11580]*/
    /** @stable ICU 54 */
    UBLOCK_SINHALA_ARCHAIC_NUMBERS = 249, /*[111E0]*/
    /** @stable ICU 54 */
    UBLOCK_SUPPLEMENTAL_ARROWS_C = 250, /*[1F800]*/
    /** @stable ICU 54 */
    UBLOCK_TIRHUTA = 251, /*[11480]*/
    /** @stable ICU 54 */
    UBLOCK_WARANG_CITI = 252, /*[118A0]*/

    /* New blocks in Unicode 8.0 */

    /** @stable ICU 56 */
    UBLOCK_AHOM = 253, /*[11700]*/
    /** @stable ICU 56 */
    UBLOCK_ANATOLIAN_HIEROGLYPHS = 254, /*[14400]*/
    /** @stable ICU 56 */
    UBLOCK_CHEROKEE_SUPPLEMENT = 255, /*[AB70]*/
    /** @stable ICU 56 */
    UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E = 256, /*[2B820]*/
    /** @stable ICU 56 */
    UBLOCK_EARLY_DYNASTIC_CUNEIFORM = 257, /*[12480]*/
    /** @stable ICU 56 */
    UBLOCK_HATRAN = 258, /*[108E0]*/
    /** @stable ICU 56 */
    UBLOCK_MULTANI = 259, /*[11280]*/
    /** @stable ICU 56 */
    UBLOCK_OLD_HUNGARIAN = 260, /*[10C80]*/
    /** @stable ICU 56 */
    UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS = 261, /*[1F900]*/
    /** @stable ICU 56 */
    UBLOCK_SUTTON_SIGNWRITING = 262, /*[1D800]*/

    /* One more than the highest block value listed above (262). */
    /** @stable ICU 2.0 */
    UBLOCK_COUNT = 263,

    /* The only negative enumerator; it lies outside the 0..UBLOCK_COUNT-1 range. */
    /** @stable ICU 2.0 */
    UBLOCK_INVALID_CODE=-1
};

/**
 * Lets the enum above be written as plain UBlockCode, without the enum keyword.
 * @stable ICU 2.0
 */
typedef enum UBlockCode UBlockCode;

/**
 * East Asian Width constants.
*
 * @see UCHAR_EAST_ASIAN_WIDTH
 * @see u_getIntPropertyValue
 * @stable ICU 2.2
 */
typedef enum UEastAsianWidth {
    /*
     * Note: UEastAsianWidth constants are parsed by preparseucd.py.
     * It matches lines like
     *     U_EA_<Unicode East_Asian_Width value name>
     *
     * No enumerator has an explicit value, so they number 0..5 in
     * declaration order and U_EA_COUNT evaluates to 6.
     */
    U_EA_NEUTRAL,   /*[N]*/
    U_EA_AMBIGUOUS, /*[A]*/
    U_EA_HALFWIDTH, /*[H]*/
    U_EA_FULLWIDTH, /*[F]*/
    U_EA_NARROW,    /*[Na]*/
    U_EA_WIDE,      /*[W]*/
    U_EA_COUNT
} UEastAsianWidth;

/**
 * Selector constants for u_charName().
 * u_charName() returns the "modern" name of a
 * Unicode character; or the name that was defined in
 * Unicode version 1.0, before the Unicode standard merged
 * with ISO-10646; or an "extended" name that gives each
 * Unicode code point a unique name.
 *
 * Values in this header: U_UNICODE_CHAR_NAME is 0,
 * U_EXTENDED_CHAR_NAME is 2, U_CHAR_NAME_ALIAS is 3 and
 * U_CHAR_NAME_CHOICE_COUNT is 4. Value 1 has no enumerator here.
 * NOTE(review): the Unicode 1.0 name selector described above is
 * presumably the omitted value 1 -- confirm against the full ICU header.
 *
 * @see u_charName
 * @stable ICU 2.0
 */
typedef enum UCharNameChoice {
    /** Unicode character name (Name property). @stable ICU 2.0 */
    U_UNICODE_CHAR_NAME,
    /* Pinned two above U_UNICODE_CHAR_NAME, leaving value 1 unnamed. */
    /** Standard or synthetic character name. @stable ICU 2.0 */
    U_EXTENDED_CHAR_NAME = U_UNICODE_CHAR_NAME+2,
    /** Corrected name from NameAliases.txt. @stable ICU 4.4 */
    U_CHAR_NAME_ALIAS,
    /** @stable ICU 2.0 */
    U_CHAR_NAME_CHOICE_COUNT
} UCharNameChoice;

/**
 * Selector constants for u_getPropertyName() and
 * u_getPropertyValueName().  These selectors are used to choose which
 * name is returned for a given property or value.  All properties and
 * values have a long name.  Most have a short name, but some do not.
 * Unicode allows for additional names, beyond the long and short
 * name, which would be indicated by U_LONG_PROPERTY_NAME + i, where
 * i=1, 2,...
 *
 * @see u_getPropertyName()
 * @see u_getPropertyValueName()
 * @stable ICU 2.4
 */
typedef enum UPropertyNameChoice {
    U_SHORT_PROPERTY_NAME,          /* 0 */
    U_LONG_PROPERTY_NAME,           /* 1 */
    U_PROPERTY_NAME_CHOICE_COUNT    /* 2 */
} UPropertyNameChoice;

/**
 * Decomposition Type constants.
 *
 * Enumerators are implicitly numbered from 0 in declaration order.
 *
 * @see UCHAR_DECOMPOSITION_TYPE
 * @stable ICU 2.2
 */
typedef enum UDecompositionType {
    /*
     * Note: UDecompositionType constants are parsed by preparseucd.py.
     * It matches lines like
     *     U_DT_<Unicode Decomposition_Type value name>
     */
    U_DT_NONE,      /*[none]*/
    U_DT_CANONICAL, /*[can]*/
    U_DT_COMPAT,    /*[com]*/
    U_DT_CIRCLE,    /*[enc]*/
    U_DT_FINAL,     /*[fin]*/
    U_DT_FONT,      /*[font]*/
    U_DT_FRACTION,  /*[fra]*/
    U_DT_INITIAL,   /*[init]*/
    U_DT_ISOLATED,  /*[iso]*/
    U_DT_MEDIAL,    /*[med]*/
    U_DT_NARROW,    /*[nar]*/
    U_DT_NOBREAK,   /*[nb]*/
    U_DT_SMALL,     /*[sml]*/
    U_DT_SQUARE,    /*[sqr]*/
    U_DT_SUB,       /*[sub]*/
    U_DT_SUPER,     /*[sup]*/
    U_DT_VERTICAL,  /*[vert]*/
    U_DT_WIDE,      /*[wide]*/
    U_DT_COUNT /* 18 */
} UDecompositionType;

/**
 * Joining Type constants.
 *
 * Enumerators are implicitly numbered from 0 in declaration order.
 *
 * @see UCHAR_JOINING_TYPE
 * @stable ICU 2.2
 */
typedef enum UJoiningType {
    /*
     * Note: UJoiningType constants are parsed by preparseucd.py.
     * It matches lines like
     *     U_JT_<Unicode Joining_Type value name>
     */
    U_JT_NON_JOINING,   /*[U]*/
    U_JT_JOIN_CAUSING,  /*[C]*/
    U_JT_DUAL_JOINING,  /*[D]*/
    U_JT_LEFT_JOINING,  /*[L]*/
    U_JT_RIGHT_JOINING, /*[R]*/
    U_JT_TRANSPARENT,   /*[T]*/
    U_JT_COUNT /* 6 */
} UJoiningType;

/**
 * Joining Group constants.
 *
 * Enumerators are implicitly numbered from 0 in declaration order.
 * U_JG_HAMZA_ON_HEH_GOAL is the one exception: it is an alias that repeats
 * the value of U_JG_TEH_MARBUTA_GOAL, and numbering continues from that
 * value plus one at U_JG_HE.
 *
 * @see UCHAR_JOINING_GROUP
 * @stable ICU 2.2
 */
typedef enum UJoiningGroup {
    /*
     * Note: UJoiningGroup constants are parsed by preparseucd.py.
     * It matches lines like
     *     U_JG_<Unicode Joining_Group value name>
     */
    U_JG_NO_JOINING_GROUP,
    U_JG_AIN,
    U_JG_ALAPH,
    U_JG_ALEF,
    U_JG_BEH,
    U_JG_BETH,
    U_JG_DAL,
    U_JG_DALATH_RISH,
    U_JG_E,
    U_JG_FEH,
    U_JG_FINAL_SEMKATH,
    U_JG_GAF,
    U_JG_GAMAL,
    U_JG_HAH,
    U_JG_TEH_MARBUTA_GOAL,  /**< @stable ICU 4.6 */
    /* Alias: same numeric value as U_JG_TEH_MARBUTA_GOAL. */
    U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,
    U_JG_HE,
    U_JG_HEH,
    U_JG_HEH_GOAL,
    U_JG_HETH,
    U_JG_KAF,
    U_JG_KAPH,
    U_JG_KNOTTED_HEH,
    U_JG_LAM,
    U_JG_LAMADH,
    U_JG_MEEM,
    U_JG_MIM,
    U_JG_NOON,
    U_JG_NUN,
    U_JG_PE,
    U_JG_QAF,
    U_JG_QAPH,
    U_JG_REH,
    U_JG_REVERSED_PE,
    U_JG_SAD,
    U_JG_SADHE,
    U_JG_SEEN,
    U_JG_SEMKATH,
    U_JG_SHIN,
    U_JG_SWASH_KAF,
    U_JG_SYRIAC_WAW,
    U_JG_TAH,
    U_JG_TAW,
    U_JG_TEH_MARBUTA,
    U_JG_TETH,
    U_JG_WAW,
    U_JG_YEH,
    U_JG_YEH_BARREE,
    U_JG_YEH_WITH_TAIL,
    U_JG_YUDH,
    U_JG_YUDH_HE,
    U_JG_ZAIN,
    U_JG_FE,        /**< @stable ICU 2.6 */
    U_JG_KHAPH,     /**< @stable ICU 2.6 */
    U_JG_ZHAIN,     /**< @stable ICU 2.6 */
    U_JG_BURUSHASKI_YEH_BARREE, /**< @stable ICU 4.0 */
    U_JG_FARSI_YEH, /**< @stable ICU 4.4 */
    U_JG_NYA,       /**< @stable ICU 4.4 */
    U_JG_ROHINGYA_YEH,  /**< @stable ICU 49 */
    U_JG_MANICHAEAN_ALEPH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_AYIN,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_BETH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_DALETH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_DHAMEDH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_FIVE,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_GIMEL,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_HETH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_HUNDRED,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_KAPH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_LAMEDH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_MEM,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_NUN,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_ONE,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_PE,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_QOPH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_RESH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_SADHE,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_SAMEKH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_TAW,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_TEN,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_TETH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_THAMEDH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_TWENTY,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_WAW,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_YODH,  /**< @stable ICU 54 */
    U_JG_MANICHAEAN_ZAYIN,  /**< @stable ICU 54 */
    U_JG_STRAIGHT_WAW,  /**< @stable ICU 54 */
    /* Last enumerator: one more than U_JG_STRAIGHT_WAW. */
    U_JG_COUNT
} UJoiningGroup;

/**
 * Grapheme Cluster Break constants.
 *
 * Every enumerator has an explicit value; they run contiguously from 0 to
 * 12, and U_GCB_COUNT is 13.
 *
 * @see UCHAR_GRAPHEME_CLUSTER_BREAK
 * @stable ICU 3.4
 */
typedef enum UGraphemeClusterBreak {
    /*
     * Note: UGraphemeClusterBreak constants are parsed by preparseucd.py.
     * It matches lines like
     *     U_GCB_<Unicode Grapheme_Cluster_Break value name>
     */
    U_GCB_OTHER = 0,            /*[XX]*/
    U_GCB_CONTROL = 1,          /*[CN]*/
    U_GCB_CR = 2,               /*[CR]*/
    U_GCB_EXTEND = 3,           /*[EX]*/
    U_GCB_L = 4,                /*[L]*/
    U_GCB_LF = 5,               /*[LF]*/
    U_GCB_LV = 6,               /*[LV]*/
    U_GCB_LVT = 7,              /*[LVT]*/
    U_GCB_T = 8,                /*[T]*/
    U_GCB_V = 9,                /*[V]*/
    U_GCB_SPACING_MARK = 10,    /*[SM]*/ /* from here on: new in Unicode 5.1/ICU 4.0 */
    U_GCB_PREPEND = 11,         /*[PP]*/
    U_GCB_REGIONAL_INDICATOR = 12,  /*[RI]*/ /* new in Unicode 6.2/ICU 50 */
    U_GCB_COUNT = 13
} UGraphemeClusterBreak;

/**
 * Word Break constants.
 * (UWordBreak is a pre-existing enum type in ubrk.h for word break status tags.)
 *
 * Every enumerator has an explicit value; they run contiguously from 0 to
 * 16, and U_WB_COUNT is 17.
 *
 * @see UCHAR_WORD_BREAK
 * @stable ICU 3.4
 */
typedef enum UWordBreakValues {
    /*
     * Note: UWordBreakValues constants are parsed by preparseucd.py.
     * It matches lines like
     *     U_WB_<Unicode Word_Break value name>
     */
    U_WB_OTHER = 0,             /*[XX]*/
    U_WB_ALETTER = 1,           /*[LE]*/
    U_WB_FORMAT = 2,            /*[FO]*/
    U_WB_KATAKANA = 3,          /*[KA]*/
    U_WB_MIDLETTER = 4,         /*[ML]*/
    U_WB_MIDNUM = 5,            /*[MN]*/
    U_WB_NUMERIC = 6,           /*[NU]*/
    U_WB_EXTENDNUMLET = 7,      /*[EX]*/
    U_WB_CR = 8,                /*[CR]*/ /* from here on: new in Unicode 5.1/ICU 4.0 */
    U_WB_EXTEND = 9,            /*[Extend]*/
    U_WB_LF = 10,               /*[LF]*/
    U_WB_MIDNUMLET =11,         /*[MB]*/
    U_WB_NEWLINE =12,           /*[NL]*/
    U_WB_REGIONAL_INDICATOR = 13,   /*[RI]*/ /* new in Unicode 6.2/ICU 50 */
    U_WB_HEBREW_LETTER = 14,    /*[HL]*/ /* from here on: new in Unicode 6.3/ICU 52 */
    U_WB_SINGLE_QUOTE = 15,     /*[SQ]*/
    U_WB_DOUBLE_QUOTE = 16,     /*[DQ]*/
    U_WB_COUNT = 17
} UWordBreakValues;

/**
 * Sentence Break constants.
* * @see UCHAR_SENTENCE_BREAK * @stable ICU 3.4 */ typedef enum USentenceBreak { /* * Note: USentenceBreak constants are parsed by preparseucd.py. * It matches lines like * U_SB_ */ U_SB_OTHER = 0, /*[XX]*/ U_SB_ATERM = 1, /*[AT]*/ U_SB_CLOSE = 2, /*[CL]*/ U_SB_FORMAT = 3, /*[FO]*/ U_SB_LOWER = 4, /*[LO]*/ U_SB_NUMERIC = 5, /*[NU]*/ U_SB_OLETTER = 6, /*[LE]*/ U_SB_SEP = 7, /*[SE]*/ U_SB_SP = 8, /*[SP]*/ U_SB_STERM = 9, /*[ST]*/ U_SB_UPPER = 10, /*[UP]*/ U_SB_CR = 11, /*[CR]*/ /* from here on: new in Unicode 5.1/ICU 4.0 */ U_SB_EXTEND = 12, /*[EX]*/ U_SB_LF = 13, /*[LF]*/ U_SB_SCONTINUE = 14, /*[SC]*/ U_SB_COUNT = 15 } USentenceBreak; /** * Line Break constants. * * @see UCHAR_LINE_BREAK * @stable ICU 2.2 */ typedef enum ULineBreak { /* * Note: ULineBreak constants are parsed by preparseucd.py. * It matches lines like * U_LB_ */ U_LB_UNKNOWN = 0, /*[XX]*/ U_LB_AMBIGUOUS = 1, /*[AI]*/ U_LB_ALPHABETIC = 2, /*[AL]*/ U_LB_BREAK_BOTH = 3, /*[B2]*/ U_LB_BREAK_AFTER = 4, /*[BA]*/ U_LB_BREAK_BEFORE = 5, /*[BB]*/ U_LB_MANDATORY_BREAK = 6, /*[BK]*/ U_LB_CONTINGENT_BREAK = 7, /*[CB]*/ U_LB_CLOSE_PUNCTUATION = 8, /*[CL]*/ U_LB_COMBINING_MARK = 9, /*[CM]*/ U_LB_CARRIAGE_RETURN = 10, /*[CR]*/ U_LB_EXCLAMATION = 11, /*[EX]*/ U_LB_GLUE = 12, /*[GL]*/ U_LB_HYPHEN = 13, /*[HY]*/ U_LB_IDEOGRAPHIC = 14, /*[ID]*/ /** Renamed from the misspelled "inseperable" in Unicode 4.0.1/ICU 3.0 @stable ICU 3.0 */ U_LB_INSEPARABLE = 15, /*[IN]*/ U_LB_INSEPERABLE = U_LB_INSEPARABLE, U_LB_INFIX_NUMERIC = 16, /*[IS]*/ U_LB_LINE_FEED = 17, /*[LF]*/ U_LB_NONSTARTER = 18, /*[NS]*/ U_LB_NUMERIC = 19, /*[NU]*/ U_LB_OPEN_PUNCTUATION = 20, /*[OP]*/ U_LB_POSTFIX_NUMERIC = 21, /*[PO]*/ U_LB_PREFIX_NUMERIC = 22, /*[PR]*/ U_LB_QUOTATION = 23, /*[QU]*/ U_LB_COMPLEX_CONTEXT = 24, /*[SA]*/ U_LB_SURROGATE = 25, /*[SG]*/ U_LB_SPACE = 26, /*[SP]*/ U_LB_BREAK_SYMBOLS = 27, /*[SY]*/ U_LB_ZWSPACE = 28, /*[ZW]*/ U_LB_NEXT_LINE = 29, /*[NL]*/ /* from here on: new in Unicode 4/ICU 2.6 */ U_LB_WORD_JOINER = 30, /*[WJ]*/ 
U_LB_H2 = 31, /*[H2]*/ /* from here on: new in Unicode 4.1/ICU 3.4 */ U_LB_H3 = 32, /*[H3]*/ U_LB_JL = 33, /*[JL]*/ U_LB_JT = 34, /*[JT]*/ U_LB_JV = 35, /*[JV]*/ U_LB_CLOSE_PARENTHESIS = 36, /*[CP]*/ /* new in Unicode 5.2/ICU 4.4 */ U_LB_CONDITIONAL_JAPANESE_STARTER = 37,/*[CJ]*/ /* new in Unicode 6.1/ICU 49 */ U_LB_HEBREW_LETTER = 38, /*[HL]*/ /* new in Unicode 6.1/ICU 49 */ U_LB_REGIONAL_INDICATOR = 39,/*[RI]*/ /* new in Unicode 6.2/ICU 50 */ U_LB_COUNT = 40 } ULineBreak; /** * Numeric Type constants. * * @see UCHAR_NUMERIC_TYPE * @stable ICU 2.2 */ typedef enum UNumericType { /* * Note: UNumericType constants are parsed by preparseucd.py. * It matches lines like * U_NT_ */ U_NT_NONE, /*[None]*/ U_NT_DECIMAL, /*[de]*/ U_NT_DIGIT, /*[di]*/ U_NT_NUMERIC, /*[nu]*/ U_NT_COUNT } UNumericType; /** * Hangul Syllable Type constants. * * @see UCHAR_HANGUL_SYLLABLE_TYPE * @stable ICU 2.6 */ typedef enum UHangulSyllableType { /* * Note: UHangulSyllableType constants are parsed by preparseucd.py. * It matches lines like * U_HST_ */ U_HST_NOT_APPLICABLE, /*[NA]*/ U_HST_LEADING_JAMO, /*[L]*/ U_HST_VOWEL_JAMO, /*[V]*/ U_HST_TRAILING_JAMO, /*[T]*/ U_HST_LV_SYLLABLE, /*[LV]*/ U_HST_LVT_SYLLABLE, /*[LVT]*/ U_HST_COUNT } UHangulSyllableType; /** * Check a binary Unicode property for a code point. * * Unicode, especially in version 3.2, defines many more properties than the * original set in UnicodeData.txt. * * The properties APIs are intended to reflect Unicode properties as defined * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). * For details about the properties see http://www.unicode.org/ucd/ . * For names of Unicode properties see the UCD file PropertyAliases.txt. * * Important: If ICU is built with UCD files from Unicode versions below 3.2, * then properties marked with "new in Unicode 3.2" are not or not fully available. * * @param c Code point to test. * @param which UProperty selector constant, identifies which binary property to check. 
* Must be UCHAR_BINARY_START<=which=0. * True for characters with general category "Nd" (decimal digit numbers) * as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII. * (That is, for letters with code points * 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.) * * In order to narrow the definition of hexadecimal digits to only ASCII * characters, use (c<=0x7f && u_isxdigit(c)). * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return TRUE if the code point is a hexadecimal digit * * @stable ICU 2.6 */ U_STABLE UBool U_EXPORT2 u_isxdigit(UChar32 c); /** * Determines whether the specified code point is a punctuation character. * True for characters with general categories "P" (punctuation). * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return TRUE if the code point is a punctuation character * * @stable ICU 2.6 */ U_STABLE UBool U_EXPORT2 u_ispunct(UChar32 c); /** * Determines whether the specified code point is a "graphic" character * (printable, excluding spaces). * TRUE for all characters except those with general categories * "Cc" (control codes), "Cf" (format controls), "Cs" (surrogates), * "Cn" (unassigned), and "Z" (separators). * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return TRUE if the code point is a "graphic" character * * @stable ICU 2.6 */ U_STABLE UBool U_EXPORT2 u_isgraph(UChar32 c); /** * Determines whether the specified code point is a "blank" or "horizontal space", * a character that visibly separates words on a line. 
* The following are equivalent definitions: * * TRUE for Unicode White_Space characters except for "vertical space controls" * where "vertical space controls" are the following characters: * U+000A (LF) U+000B (VT) U+000C (FF) U+000D (CR) U+0085 (NEL) U+2028 (LS) U+2029 (PS) * * same as * * TRUE for U+0009 (TAB) and characters with general category "Zs" (space separators) * except Zero Width Space (ZWSP, U+200B). * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return TRUE if the code point is a "blank" * * @stable ICU 2.6 */ U_STABLE UBool U_EXPORT2 u_isblank(UChar32 c); /** * Determines whether the specified code point is "defined", * which usually means that it is assigned a character. * True for general categories other than "Cn" (other, not assigned), * i.e., true for all code points mentioned in UnicodeData.txt. * * Note that non-character code points (e.g., U+FDD0) are not "defined" * (they are Cn), but surrogate code points are "defined" (Cs). * * Same as java.lang.Character.isDefined(). * * @param c the code point to be tested * @return TRUE if the code point is assigned a character * * @see u_isdigit * @see u_isalpha * @see u_isalnum * @see u_isupper * @see u_islower * @see u_istitle * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 u_isdefined(UChar32 c); /** * Determines if the specified character is a space character or not. * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. 
* * @param c the character to be tested * @return true if the character is a space character; false otherwise. * * @see u_isJavaSpaceChar * @see u_isWhitespace * @see u_isUWhiteSpace * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 u_isspace(UChar32 c); /** * Determine if the specified code point is a space character according to Java. * True for characters with general categories "Z" (separators), * which does not include control codes (e.g., TAB or Line Feed). * * Same as java.lang.Character.isSpaceChar(). * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * @param c the code point to be tested * @return TRUE if the code point is a space character according to Character.isSpaceChar() * * @see u_isspace * @see u_isWhitespace * @see u_isUWhiteSpace * @stable ICU 2.6 */ U_STABLE UBool U_EXPORT2 u_isJavaSpaceChar(UChar32 c); /** * Determines if the specified code point is a whitespace character according to Java/ICU. * A character is considered to be a Java whitespace character if and only * if it satisfies one of the following criteria: * * - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), but is not * also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space or U+202F Narrow NBSP). * - It is U+0009 HORIZONTAL TABULATION. * - It is U+000A LINE FEED. * - It is U+000B VERTICAL TABULATION. * - It is U+000C FORM FEED. * - It is U+000D CARRIAGE RETURN. * - It is U+001C FILE SEPARATOR. * - It is U+001D GROUP SEPARATOR. * - It is U+001E RECORD SEPARATOR. * - It is U+001F UNIT SEPARATOR. * * This API tries to sync with the semantics of Java's * java.lang.Character.isWhitespace(), but it may not return * the exact same results because of the Unicode version * difference. * * Note: Unicode 4.0.1 changed U+200B ZERO WIDTH SPACE from a Space Separator (Zs) * to a Format Control (Cf). Since then, isWhitespace(0x200b) returns false. 
* See http://www.unicode.org/versions/Unicode4.0.1/ * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * @param c the code point to be tested * @return TRUE if the code point is a whitespace character according to Java/ICU * * @see u_isspace * @see u_isJavaSpaceChar * @see u_isUWhiteSpace * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 u_isWhitespace(UChar32 c); /** * Determines whether the specified code point is a control character * (as defined by this function). * A control character is one of the following: * - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f) * - U_CONTROL_CHAR (Cc) * - U_FORMAT_CHAR (Cf) * - U_LINE_SEPARATOR (Zl) * - U_PARAGRAPH_SEPARATOR (Zp) * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return TRUE if the code point is a control character * * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT * @see u_isprint * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 u_iscntrl(UChar32 c); /** * Determines whether the specified code point is an ISO control code. * True for U+0000..U+001f and U+007f..U+009f (general category "Cc"). * * Same as java.lang.Character.isISOControl(). * * @param c the code point to be tested * @return TRUE if the code point is an ISO control code * * @see u_iscntrl * @stable ICU 2.6 */ U_STABLE UBool U_EXPORT2 u_isISOControl(UChar32 c); /** * Determines whether the specified code point is a printable character. * True for general categories other than "C" (controls). * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. 
* * @param c the code point to be tested * @return TRUE if the code point is a printable character * * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT * @see u_iscntrl * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 u_isprint(UChar32 c); /** * Determines whether the specified code point is a base character. * True for general categories "L" (letters), "N" (numbers), * "Mc" (spacing combining marks), and "Me" (enclosing marks). * * Note that this is different from the Unicode definition in * chapter 3.5, conformance clause D13, * which defines base characters to be all characters (not Cn) * that do not graphically combine with preceding characters (M) * and that are neither control (Cc) or format (Cf) characters. * * @param c the code point to be tested * @return TRUE if the code point is a base character according to this function * * @see u_isalpha * @see u_isdigit * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 u_isbase(UChar32 c); /** * Returns the bidirectional category value for the code point, * which is used in the Unicode bidirectional algorithm * (UAX #9 http://www.unicode.org/reports/tr9/). * Note that some unassigned code points have bidi values * of R or AL because they are in blocks that are reserved * for Right-To-Left scripts. * * Same as java.lang.Character.getDirectionality() * * @param c the code point to be tested * @return the bidirectional category (UCharDirection) value * * @see UCharDirection * @stable ICU 2.0 */ U_STABLE UCharDirection U_EXPORT2 u_charDirection(UChar32 c); /** * Determines whether the code point has the Bidi_Mirrored property. * This property is set for characters that are commonly used in * Right-To-Left contexts and need to be displayed with a "mirrored" * glyph. * * Same as java.lang.Character.isMirrored(). 
* Same as UCHAR_BIDI_MIRRORED * * @param c the code point to be tested * @return TRUE if the character has the Bidi_Mirrored property * * @see UCHAR_BIDI_MIRRORED * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 u_isMirrored(UChar32 c); /** * Maps the specified character to a "mirror-image" character. * For characters with the Bidi_Mirrored property, implementations * sometimes need a "poor man's" mapping to another Unicode * character (code point) such that the default glyph may serve * as the mirror-image of the default glyph of the specified * character. This is useful for text conversion to and from * codepages with visual order, and for displays without glyph * selection capabilities. * * @param c the code point to be mapped * @return another Unicode code point that may serve as a mirror-image * substitute, or c itself if there is no such mapping or c * does not have the Bidi_Mirrored property * * @see UCHAR_BIDI_MIRRORED * @see u_isMirrored * @stable ICU 2.0 */ U_STABLE UChar32 U_EXPORT2 u_charMirror(UChar32 c); /** * Maps the specified character to its paired bracket character. * For Bidi_Paired_Bracket_Type!=None, this is the same as u_charMirror(). * Otherwise c itself is returned. * See http://www.unicode.org/reports/tr9/ * * @param c the code point to be mapped * @return the paired bracket code point, * or c itself if there is no such mapping * (Bidi_Paired_Bracket_Type=None) * * @see UCHAR_BIDI_PAIRED_BRACKET * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE * @see u_charMirror * @stable ICU 52 */ U_STABLE UChar32 U_EXPORT2 u_getBidiPairedBracket(UChar32 c); /** * Returns the general category value for the code point. * * Same as java.lang.Character.getType(). * * @param c the code point to be tested * @return the general category (UCharCategory) value * * @see UCharCategory * @stable ICU 2.0 */ U_STABLE int8_t U_EXPORT2 u_charType(UChar32 c); /** * Get a single-bit bit set for the general category of a character. 
* This bit set can be compared bitwise with U_GC_SM_MASK, U_GC_L_MASK, etc. * Same as U_MASK(u_charType(c)). * * @param c the code point to be tested * @return a single-bit mask corresponding to the general category (UCharCategory) value * * @see u_charType * @see UCharCategory * @see U_GC_CN_MASK * @stable ICU 2.1 */ #define U_GET_GC_MASK(c) U_MASK(u_charType(c)) /** * Callback from u_enumCharTypes(), is called for each contiguous range * of code points c (where start<=cnameChoice, the character name written * into the buffer is the "modern" name or the name that was defined * in Unicode version 1.0. * The name contains only "invariant" characters * like A-Z, 0-9, space, and '-'. * Unicode 1.0 names are only retrieved if they are different from the modern * names and if the data file contains the data for them. gennames may or may * not be called with a command line option to include 1.0 names in unames.dat. * * @param code The character (code point) for which to get the name. * It must be 0<=code<=0x10ffff. * @param nameChoice Selector for which name to get. * @param buffer Destination address for copying the name. * The name will always be zero-terminated. * If there is no name, then the buffer will be set to the empty string. * @param bufferLength ==sizeof(buffer) * @param pErrorCode Pointer to a UErrorCode variable; * check for U_SUCCESS() after u_charName() * returns. * @return The length of the name, or 0 if there is no name for this character. * If the bufferLength is less than or equal to the length, then the buffer * contains the truncated name and the returned length indicates the full * length of the name. * The length does not include the zero-termination. 
* * @see UCharNameChoice * @see u_charFromName * @see u_enumCharNames * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_charName(UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode); /** * Find a Unicode character by its name and return its code point value. * The name is matched exactly and completely. * If the name does not correspond to a code point, pErrorCode * is set to U_INVALID_CHAR_FOUND. * A Unicode 1.0 name is matched only if it differs from the modern name. * Unicode names are all uppercase. Extended names are lowercase followed * by an uppercase hexadecimal number, and within angle brackets. * * @param nameChoice Selector for which name to match. * @param name The name to match. * @param pErrorCode Pointer to a UErrorCode variable * @return The Unicode value of the code point with the given name, * or an undefined value if there is no such code point. * * @see UCharNameChoice * @see u_charName * @see u_enumCharNames * @stable ICU 1.7 */ U_STABLE UChar32 U_EXPORT2 u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode); /** * Type of a callback function for u_enumCharNames() that gets called * for each Unicode character with the code point value and * the character name. * If such a function returns FALSE, then the enumeration is stopped. * * @param context The context pointer that was passed to u_enumCharNames(). * @param code The Unicode code point for the character with this name. * @param nameChoice Selector for which kind of names is enumerated. * @param name The character's name, zero-terminated. * @param length The length of the name. * @return TRUE if the enumeration should continue, FALSE to stop it. 
* * @see UCharNameChoice * @see u_enumCharNames * @stable ICU 1.7 */ typedef UBool U_CALLCONV UEnumCharNamesFn(void *context, UChar32 code, UCharNameChoice nameChoice, const char *name, int32_t length); /** * Enumerate all assigned Unicode characters between the start and limit * code points (start inclusive, limit exclusive) and call a function * for each, passing the code point value and the character name. * For Unicode 1.0 names, only those are enumerated that differ from the * modern names. * * @param start The first code point in the enumeration range. * @param limit One more than the last code point in the enumeration range * (the first one after the range). * @param fn The function that is to be called for each character name. * @param context An arbitrary pointer that is passed to the function. * @param nameChoice Selector for which kind of names to enumerate. * @param pErrorCode Pointer to a UErrorCode variable * * @see UCharNameChoice * @see UEnumCharNamesFn * @see u_charName * @see u_charFromName * @stable ICU 1.7 */ U_STABLE void U_EXPORT2 u_enumCharNames(UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice, UErrorCode *pErrorCode); /** * Return the Unicode name for a given property, as given in the * Unicode database file PropertyAliases.txt. * * In addition, this function maps the property * UCHAR_GENERAL_CATEGORY_MASK to the synthetic names "gcm" / * "General_Category_Mask". These names are not in * PropertyAliases.txt. * * @param property UProperty selector other than UCHAR_INVALID_CODE. * If out of range, NULL is returned. * * @param nameChoice selector for which name to get. If out of range, * NULL is returned. All properties have a long name. Most * have a short name, but some do not. Unicode allows for * additional names; if present these will be returned by * U_LONG_PROPERTY_NAME + i, where i=1, 2,... * * @return a pointer to the name, or NULL if either the * property or the nameChoice is out of range. 
If a given * nameChoice returns NULL, then all larger values of * nameChoice will return NULL, with one exception: if NULL is * returned for U_SHORT_PROPERTY_NAME, then * U_LONG_PROPERTY_NAME (and higher) may still return a * non-NULL value. The returned pointer is valid until * u_cleanup() is called. * * @see UProperty * @see UPropertyNameChoice * @stable ICU 2.4 */ U_STABLE const char* U_EXPORT2 u_getPropertyName(UProperty property, UPropertyNameChoice nameChoice); /** * Return the UProperty enum for a given property name, as specified * in the Unicode database file PropertyAliases.txt. Short, long, and * any other variants are recognized. * * In addition, this function maps the synthetic names "gcm" / * "General_Category_Mask" to the property * UCHAR_GENERAL_CATEGORY_MASK. These names are not in * PropertyAliases.txt. * * @param alias the property name to be matched. The name is compared * using "loose matching" as described in PropertyAliases.txt. * * @return a UProperty enum, or UCHAR_INVALID_CODE if the given name * does not match any property. * * @see UProperty * @stable ICU 2.4 */ U_STABLE UProperty U_EXPORT2 u_getPropertyEnum(const char* alias); /** * Return the Unicode name for a given property value, as given in the * Unicode database file PropertyValueAliases.txt. * * Note: Some of the names in PropertyValueAliases.txt can only be * retrieved using UCHAR_GENERAL_CATEGORY_MASK, not * UCHAR_GENERAL_CATEGORY. These include: "C" / "Other", "L" / * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P" * / "Punctuation", "S" / "Symbol", and "Z" / "Separator". * * @param property UProperty selector constant. * Must be UCHAR_BINARY_START<=which2<=radix<=36 or if the * value of c is not a valid digit in the specified * radix, -1 is returned. A character is a valid digit * if at least one of the following is true: *
 * <ul>
 * <li>The character has a decimal digit value.
 *     Such characters have the general category "Nd" (decimal digit numbers)
 *     and a Numeric_Type of Decimal.
 *     In this case the value is the character's decimal digit value.</li>
 * <li>The character is one of the uppercase Latin letters
 *     'A' through 'Z'.
 *     In this case the value is ch-'A'+10.</li>
 * <li>The character is one of the lowercase Latin letters
 *     'a' through 'z'.
 *     In this case the value is ch-'a'+10.</li>
 * <li>Latin letters from both the ASCII range (0061..007A, 0041..005A)
 *     as well as from the Fullwidth ASCII range (FF41..FF5A, FF21..FF3A)
 *     are recognized.</li>
 * </ul>
* * Same as java.lang.Character.digit(). * * @param ch the code point to be tested. * @param radix the radix. * @return the numeric value represented by the character in the * specified radix, * or -1 if there is no value or if the value exceeds the radix. * * @see UCHAR_NUMERIC_TYPE * @see u_forDigit * @see u_charDigitValue * @see u_isdigit * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_digit(UChar32 ch, int8_t radix); /** * Determines the character representation for a specific digit in * the specified radix. If the value of radix is not a * valid radix, or the value of digit is not a valid * digit in the specified radix, the null character * (U+0000) is returned. *

 * <p>
 * The radix argument is valid if it is greater than or
 * equal to 2 and less than or equal to 36.
 * The digit argument is valid if
 * 0 <= digit < radix.
 * <p>
* If the digit is less than 10, then * '0' + digit is returned. Otherwise, the value * 'a' + digit - 10 is returned. * * Same as java.lang.Character.forDigit(). * * @param digit the number to convert to a character. * @param radix the radix. * @return the char representation of the specified digit * in the specified radix. * * @see u_digit * @see u_charDigitValue * @see u_isdigit * @stable ICU 2.0 */ U_STABLE UChar32 U_EXPORT2 u_forDigit(int32_t digit, int8_t radix); /** * Get the "age" of the code point. * The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) * or assigned a character. * This can be useful to avoid emitting code points to receiving * processes that do not accept newer characters. * The data is from the UCD file DerivedAge.txt. * * @param c The code point. * @param versionArray The Unicode version number array, to be filled in. * * @stable ICU 2.1 */ U_STABLE void U_EXPORT2 u_charAge(UChar32 c, UVersionInfo versionArray); /** * Gets the Unicode version information. * The version array is filled in with the version information * for the Unicode standard that is currently used by ICU. * For example, Unicode version 3.1.1 is represented as an array with * the values { 3, 1, 1, 0 }. * * @param versionArray an output array that will be filled in with * the Unicode version number * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 u_getUnicodeVersion(UVersionInfo versionArray); #if !UCONFIG_NO_NORMALIZATION /** * Get the FC_NFKC_Closure property string for a character. * See Unicode Standard Annex #15 for details, search for "FC_NFKC_Closure" * or for "FNC": http://www.unicode.org/reports/tr15/ * * @param c The character (code point) for which to get the FC_NFKC_Closure string. * It must be 0<=c<=0x10ffff. * @param dest Destination address for copying the string. * The string will be zero-terminated if possible. 
* If there is no FC_NFKC_Closure string, * then the buffer will be set to the empty string. * @param destCapacity ==sizeof(dest) * @param pErrorCode Pointer to a UErrorCode variable. * @return The length of the string, or 0 if there is no FC_NFKC_Closure string for this character. * If the destCapacity is less than or equal to the length, then the buffer * contains the truncated name and the returned length indicates the full * length of the name. * The length does not include the zero-termination. * * @stable ICU 2.2 */ U_STABLE int32_t U_EXPORT2 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode); #endif U_CDECL_END #endif /*_UCHAR*/ /*eof*/ // utext.h /* ******************************************************************************* * * Copyright (C) 2004-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: utext.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2004oct06 * created by: Markus W. Scherer */ #ifndef __UTEXT_H__ #define __UTEXT_H__ /** * \file * \brief C API: Abstract Unicode Text API * * The Text Access API provides a means to allow text that is stored in alternative * formats to work with ICU services. ICU normally operates on text that is * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type * UnicodeString for C++ APIs. * * ICU Text Access allows other formats, such as UTF-8 or non-contiguous * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. * * There are three general classes of usage for UText: * * Application Level Use. This is the simplest usage - applications would * use one of the utext_open() functions on their input text, and pass * the resulting UText to the desired ICU service. 
* * Second is usage in ICU Services, such as break iteration, that will need to * operate on input presented to them as a UText. These implementations * will need to use the iteration and related UText functions to gain * access to the actual text. * * The third class of UText users are "text providers." These are the * UText implementations for the various text storage formats. An application * or system with a unique text storage format can implement a set of * UText provider functions for that format, which will then allow * ICU services to operate on that format. * * * Iterating over text * * Here is sample code for a forward iteration over the contents of a UText * * \code * UChar32 c; * UText *ut = whatever(); * * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { * // do whatever with the codepoint c here. * } * \endcode * * And here is similar code to iterate in the reverse direction, from the end * of the text towards the beginning. * * \code * UChar32 c; * UText *ut = whatever(); * int64_t textLength = utext_nativeLength(ut); * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) { * // do whatever with the codepoint c here. * } * \endcode * * Characters and Indexing * * Indexing into text by UText functions is nearly always in terms of the native * indexing of the underlying text storage. The storage format could be UTF-8 * or UTF-32, for example. When coding to the UText access API, no assumptions * can be made regarding the size of characters, or how far an index * may move when iterating between characters. * * All indices supplied to UText functions are pinned to the length of the * text. An out-of-bounds index is not considered to be an error, but is * adjusted to be in the range 0 <= index <= length of input text. * * * When an index position is returned from a UText function, it will be * a native index to the underlying text.
In the case of multi-unit characters, * it will always refer to the first position of the character, * never to the interior. This is essentially the same thing as saying that * a returned index will always point to a boundary between characters. * * When a native index is supplied to a UText function, all indices that * refer to any part of a multi-unit character representation are considered * to be equivalent. In the case of multi-unit characters, an incoming index * will be logically normalized to refer to the start of the character. * * It is possible to test whether a native index is on a code point boundary * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). * If the index is returned unchanged, it was on a code point boundary. If * an adjusted index is returned, the original index referred to the * interior of a character. * * Conventions for calling UText functions * * Most UText access functions have as their first parameter a (UText *) pointer, * which specifies the UText to be used. Unless otherwise noted, the * pointer must refer to a valid, open UText. Attempting to * use a closed UText or passing a NULL pointer is a programming error and * will produce undefined results or NULL pointer exceptions. * * The utext_open family of functions can either open an existing (closed) * UText, or heap allocate a new UText. Here is sample code for creating * a stack-allocated UText. * * \code * char *s = whatever(); // A utf-8 string * UErrorCode status = U_ZERO_ERROR; * UText ut = UTEXT_INITIALIZER; * utext_openUTF8(&ut, s, -1, &status); * if (U_FAILURE(status)) { * // error handling * } else { * // work with the UText * } * \endcode * * Any existing UText passed to an open function _must_ have been initialized, * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated * by an open function. Passing NULL will cause the open function to * heap-allocate and fully initialize a new UText.
* */ U_CDECL_BEGIN struct UText; typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */ /*************************************************************************************** * * C Functions for creating UText wrappers around various kinds of text strings. * ****************************************************************************************/ /** * Close function for UText instances. * Cleans up, releases any resources being held by an open UText. *

* If the UText was originally allocated by one of the utext_open functions, * the storage associated with the utext will also be freed. * If the UText storage originated with the application, as it would with * a local or static instance, the storage will not be deleted. * * An open UText can be reset to refer to new string by using one of the utext_open() * functions without first closing the UText. * * @param ut The UText to be closed. * @return NULL if the UText struct was deleted by the close. If the UText struct * was originally provided by the caller to the open function, it is * returned by this function, and may be safely used again in * a subsequent utext_open. * * @stable ICU 3.4 */ U_STABLE UText * U_EXPORT2 utext_close(UText *ut); /** * Open a read-only UText implementation for UTF-8 strings. * * \htmlonly * Any invalid UTF-8 in the input will be handled in this way: * a sequence of bytes that has the form of a truncated, but otherwise valid, * UTF-8 sequence will be replaced by a single unicode replacement character, \uFFFD. * Any other illegal bytes will each be replaced by a \uFFFD. * \endhtmlonly * * @param ut Pointer to a UText struct. If NULL, a new UText will be created. * If non-NULL, must refer to an initialized UText struct, which will then * be reset to reference the specified UTF-8 string. * @param s A UTF-8 string. Must not be NULL. * @param length The length of the UTF-8 string in bytes, or -1 if the string is * zero terminated. * @param status Errors are returned here. * @return A pointer to the UText. If a pre-allocated UText was provided, it * will always be used and returned. * @stable ICU 3.4 */ U_STABLE UText * U_EXPORT2 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status); /** * Open a read-only UText for UChar * string. * * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 
* If non-NULL, must refer to an initialized UText struct, which will then * be reset to reference the specified UChar string. * @param s A UChar (UTF-16) string * @param length The number of UChars in the input string, or -1 if the string is * zero terminated. * @param status Errors are returned here. * @return A pointer to the UText. If a pre-allocated UText was provided, it * will always be used and returned. * @stable ICU 3.4 */ U_STABLE UText * U_EXPORT2 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status); /** * Clone a UText. This is much like opening a UText where the source text is itself * another UText. * * A deep clone will copy both the UText data structures and the underlying text. * The original and cloned UText will operate completely independently; modifications * made to the text in one will not affect the other. Text providers are not * required to support deep clones. The user of clone() must check the status return * and be prepared to handle failures. * * The standard UText implementations for UTF8, UChar *, UnicodeString and * Replaceable all support deep cloning. * * The UText returned from a deep clone will be writable, assuming that the text * provider is able to support writing, even if the source UText had been made * non-writable by means of UText_freeze(). * * A shallow clone replicates only the UText data structures; it does not make * a copy of the underlying text. Shallow clones can be used as an efficient way to * have multiple iterators active in a single text string that is not being * modified. * * A shallow clone operation will not fail, barring truly exceptional conditions such * as memory allocation failures. * * Shallow UText clones should be avoided if the UText functions that modify the * text are expected to be used, either on the original or the cloned UText. * Any such modifications can cause unpredictable behavior. 
Read Only * shallow clones provide some protection against errors of this type by * disabling text modification via the cloned UText. * * A shallow clone made with the readOnly parameter == FALSE will preserve the * utext_isWritable() state of the source object. Note, however, that * write operations must be avoided while more than one UText exists that refer * to the same underlying text. * * A UText and its clone may be safely concurrently accessed by separate threads. * This is true for read access only with shallow clones, and for both read and * write access with deep clones. * It is the responsibility of the Text Provider to ensure that this thread safety * constraint is met. * * @param dest A UText struct to be filled in with the result of the clone operation, * or NULL if the clone function should heap-allocate a new UText struct. * If non-NULL, must refer to an already existing UText, which will then * be reset to become the clone. * @param src The UText to be cloned. * @param deep TRUE to request a deep clone, FALSE for a shallow clone. * @param readOnly TRUE to request that the cloned UText have read only access to the * underlying text. * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR * will be returned if the text provider is unable to clone the * original text. * @return The newly created clone, or NULL if the clone operation failed. * @stable ICU 3.4 */ U_STABLE UText * U_EXPORT2 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status); /** * Compare two UText objects for equality. * UTexts are equal if they are iterating over the same text, and * have the same iteration position within the text. * If either or both of the parameters are NULL, the comparison is FALSE. * * @param a The first of the two UTexts to compare. * @param b The other UText to be compared. * @return TRUE if the two UTexts are equal. 
* @stable ICU 3.6 */ U_STABLE UBool U_EXPORT2 utext_equals(const UText *a, const UText *b); /***************************************************************************** * * Functions to work with the text represented by a UText wrapper * *****************************************************************************/ /** * Get the length of the text. Depending on the characteristics * of the underlying text representation, this may be expensive. * @see utext_isLengthExpensive() * * * @param ut the text to be accessed. * @return the length of the text, expressed in native units. * * @stable ICU 3.4 */ U_STABLE int64_t U_EXPORT2 utext_nativeLength(UText *ut); /** * Return TRUE if calculating the length of the text could be expensive. * Finding the length of NUL terminated strings is considered to be expensive. * * Note that the value of this function may change * as the result of other operations on a UText. * Once the length of a string has been discovered, it will no longer * be expensive to report it. * * @param ut the text to be accessed. * @return TRUE if determining the length of the text could be time consuming. * @stable ICU 3.4 */ U_STABLE UBool U_EXPORT2 utext_isLengthExpensive(const UText *ut); /** * Returns the code point at the requested index, * or U_SENTINEL (-1) if it is out of bounds. * * If the specified index points to the interior of a multi-unit * character - one of the trail bytes of a UTF-8 sequence, for example - * the complete code point will be returned. * * The iteration position will be set to the start of the returned code point. * * This function is roughly equivalent to the sequence * utext_setNativeIndex(index); * utext_current32(); * (There is a subtle difference if the index is out of bounds by being less than zero - * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32() * will return the char at zero. utext_char32At(negative index), on the other hand, will * return the U_SENTINEL value of -1.)
* * @param ut the text to be accessed * @param nativeIndex the native index of the character to be accessed. If the index points * to other than the first unit of a multi-unit character, it will be adjusted * to the start of the character. * @return the code point at the specified index. * @stable ICU 3.4 */ U_STABLE UChar32 U_EXPORT2 utext_char32At(UText *ut, int64_t nativeIndex); /** * * Get the code point at the current iteration position, * or U_SENTINEL (-1) if the iteration has reached the end of * the input text. * * @param ut the text to be accessed. * @return the Unicode code point at the current iterator position. * @stable ICU 3.4 */ U_STABLE UChar32 U_EXPORT2 utext_current32(UText *ut); /** * Get the code point at the current iteration position of the UText, and * advance the position to the first index following the character. * * If the position is at the end of the text (the index following * the last character, which is also the length of the text), * return U_SENTINEL (-1) and do not advance the index. * * This is a post-increment operation. * * An inline macro version of this function, UTEXT_NEXT32(), * is available for performance critical use. * * @param ut the text to be accessed. * @return the Unicode code point at the iteration position. * @see UTEXT_NEXT32 * @stable ICU 3.4 */ U_STABLE UChar32 U_EXPORT2 utext_next32(UText *ut); /** * Move the iterator position to the character (code point) whose * index precedes the current position, and return that character. * This is a pre-decrement operation. * * If the initial position is at the start of the text (index of 0) * return U_SENTINEL (-1), and leave the position unchanged. * * An inline macro version of this function, UTEXT_PREVIOUS32(), * is available for performance critical use. * * @param ut the text to be accessed. * @return the previous UChar32 code point, or U_SENTINEL (-1) * if the iteration has reached the start of the text. 
* @see UTEXT_PREVIOUS32 * @stable ICU 3.4 */ U_STABLE UChar32 U_EXPORT2 utext_previous32(UText *ut); /** * Set the iteration index and return the code point at that index. * Leave the iteration index at the start of the following code point. * * This function is the most efficient and convenient way to * begin a forward iteration. The results are identical to those * from the sequence * \code * utext_setNativeIndex(); * utext_next32(); * \endcode * * @param ut the text to be accessed. * @param nativeIndex Iteration index, in the native units of the text provider. * @return Code point which starts at or before index, * or U_SENTINEL (-1) if it is out of bounds. * @stable ICU 3.4 */ U_STABLE UChar32 U_EXPORT2 utext_next32From(UText *ut, int64_t nativeIndex); /** * Set the iteration index, and return the code point preceding the * one specified by the initial index. Leave the iteration position * at the start of the returned code point. * * This function is the most efficient and convenient way to * begin a backwards iteration. * * @param ut the text to be accessed. * @param nativeIndex Iteration index in the native units of the text provider. * @return Code point preceding the one at the initial index, * or U_SENTINEL (-1) if it is out of bounds. * * @stable ICU 3.4 */ U_STABLE UChar32 U_EXPORT2 utext_previous32From(UText *ut, int64_t nativeIndex); /** * Get the current iterator position, which can range from 0 to * the length of the text. * The position is a native index into the input text, in whatever format it * may have (possibly UTF-8 for example), and may not always be the same as * the corresponding UChar (UTF-16) index. * The returned position will always be aligned to a code point boundary. * * @param ut the text to be accessed. * @return the current index position, in the native units of the text provider.
* @stable ICU 3.4 */ U_STABLE int64_t U_EXPORT2 utext_getNativeIndex(const UText *ut); /** * Set the current iteration position to the nearest code point * boundary at or preceding the specified index. * The index is in the native units of the original input text. * If the index is out of range, it will be pinned to be within * the range of the input text. *

* It will usually be more efficient to begin an iteration * using the functions utext_next32From() or utext_previous32From() * rather than utext_setNativeIndex(). *

* Moving the index position to an adjacent character is best done * with utext_next32(), utext_previous32() or utext_moveIndex32(). * Attempting to do direct arithmetic on the index position is * complicated by the fact that the size (in native units) of a * character depends on the underlying representation of the character * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not * easily knowable. * * @param ut the text to be accessed. * @param nativeIndex the native unit index of the new iteration position. * @stable ICU 3.4 */ U_STABLE void U_EXPORT2 utext_setNativeIndex(UText *ut, int64_t nativeIndex); /** * Move the iterator position by delta code points. The number of code points * is a signed number; a negative delta will move the iterator backwards, * towards the start of the text. *

* The index is moved by delta code points * forward or backward, but no further backward than to 0 and * no further forward than to utext_nativeLength(). * The resulting index value will be in between 0 and length, inclusive. * * @param ut the text to be accessed. * @param delta the signed number of code points to move the iteration position. * @return TRUE if the position could be moved the requested number of positions while * staying within the range [0 - text length]. * @stable ICU 3.4 */ U_STABLE UBool U_EXPORT2 utext_moveIndex32(UText *ut, int32_t delta); /** * Get the native index of the character preceding the current position. * If the iteration position is already at the start of the text, zero * is returned. * The value returned is the same as that obtained from the following sequence, * but without the side effect of changing the iteration position. * * \code * UText *ut = whatever; * ... * utext_previous32(ut); * utext_getNativeIndex(ut); * \endcode * * This function is most useful during forwards iteration, where it will get the * native index of the character most recently returned from utext_next32(). * * @param ut the text to be accessed * @return the native index of the character preceding the current index position, * or zero if the current position is at the start of the text. * @stable ICU 3.6 */ U_STABLE int64_t U_EXPORT2 utext_getPreviousNativeIndex(UText *ut); /** * * Extract text from a UText into a UChar buffer. The range of text to be extracted * is specified in the native indices of the UText provider. These may not necessarily * be UTF-16 indices. *

* The size (number of 16 bit UChars) of the data to be extracted is returned. The * full number of UChars is returned, even when the extracted text is truncated * because the specified buffer size is too small. *

* The extracted string will (if you are a user) / must (if you are a text provider) * be NUL-terminated if there is sufficient space in the destination buffer. This * terminating NUL is not included in the returned length. *

* The iteration index is left at the position following the last extracted character. * * @param ut the UText from which to extract data. * @param nativeStart the native index of the first character to extract. * If the specified index is out of range, * it will be pinned to be within 0 <= index <= textLength. * @param nativeLimit the native string index of the position following the last * character to extract. If the specified index is out of range, * it will be pinned to be within 0 <= index <= textLength. * nativeLimit must be >= nativeStart. * @param dest the UChar (UTF-16) buffer into which the extracted text is placed * @param destCapacity The size, in UChars, of the destination buffer. May be zero * for precomputing the required size. * @param status receives any error status. * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the * buffer was too small. Returns number of UChars for preflighting. * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. * * @stable ICU 3.4 */ U_STABLE int32_t U_EXPORT2 utext_extract(UText *ut, int64_t nativeStart, int64_t nativeLimit, UChar *dest, int32_t destCapacity, UErrorCode *status); /************************************************************************************ * * #define inline versions of selected performance-critical text access functions * Caution: do not use auto increment++ or decrement-- expressions * as parameters to these macros. * * For most use, where there is no extreme performance constraint, the * normal, non-inline functions are a better choice. The resulting code * will be smaller, and, if the need ever arises, easier to debug. * * These are implemented as #defines rather than real functions * because there is no fully portable way to do inline functions in plain C. * ************************************************************************************/ /** * inline version of utext_next32(), for performance-critical situations.
* * Get the code point at the current iteration position of the UText, and * advance the position to the first index following the character. * This is a post-increment operation. * Returns U_SENTINEL (-1) if the position is at the end of the * text. * * @stable ICU 3.4 */ #define UTEXT_NEXT32(ut) \ ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ ((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut)) /** * inline version of utext_previous32(), for performance-critical situations. * * Move the iterator position to the character (code point) whose * index precedes the current position, and return that character. * This is a pre-decrement operation. * Returns U_SENTINEL (-1) if the position is at the start of the text. * * @stable ICU 3.4 */ #define UTEXT_PREVIOUS32(ut) \ ((ut)->chunkOffset > 0 && \ (ut)->chunkContents[(ut)->chunkOffset-1] < 0xd800 ? \ (ut)->chunkContents[--((ut)->chunkOffset)] : utext_previous32(ut)) /** * inline version of utext_getNativeIndex(), for performance-critical situations. * * Get the current iterator position, which can range from 0 to * the length of the text. * The position is a native index into the input text, in whatever format it * may have (possibly UTF-8 for example), and may not always be the same as * the corresponding UChar (UTF-16) index. * The returned position will always be aligned to a code point boundary. * * @stable ICU 3.6 */ #define UTEXT_GETNATIVEINDEX(ut) \ ((ut)->chunkOffset <= (ut)->nativeIndexingLimit? \ (ut)->chunkNativeStart+(ut)->chunkOffset : \ (ut)->pFuncs->mapOffsetToNative(ut)) /** * inline version of utext_setNativeIndex(), for performance-critical situations. * * Set the current iteration position to the nearest code point * boundary at or preceding the specified index. * The index is in the native units of the original input text. * If the index is out of range, it will be pinned to be within * the range of the input text. 
* * Note: this macro expands to a brace-enclosed statement block, not an * expression: it yields no value, and ut and ix may each be evaluated * more than once. * * @stable ICU 3.8 */ #define UTEXT_SETNATIVEINDEX(ut, ix) \ { int64_t __offset = (ix) - (ut)->chunkNativeStart; \ if (__offset>=0 && __offset<=(int64_t)(ut)->nativeIndexingLimit) { \ (ut)->chunkOffset=(int32_t)__offset; \ } else { \ utext_setNativeIndex((ut), (ix)); } } /************************************************************************************ * * Functions related to writing or modifying the text. * These will work only with modifiable UTexts. Attempting to * modify a read-only UText will return an error status. * ************************************************************************************/ /** * Return TRUE if the text can be written (modified) with utext_replace() or * utext_copy(). For the text to be writable, the text provider must * be of a type that supports writing and the UText must not be frozen. * * Attempting to modify text when utext_isWritable() is FALSE will fail - * the text will not be modified, and an error will be returned from the function * that attempted the modification. * * @param ut the UText to be tested. * @return TRUE if the text is modifiable. * * @see utext_freeze() * @see utext_replace() * @see utext_copy() * @stable ICU 3.4 * */ U_STABLE UBool U_EXPORT2 utext_isWritable(const UText *ut); /** * Test whether there is meta data associated with the text. * @see Replaceable::hasMetaData() * * @param ut The UText to be tested * @return TRUE if the underlying text includes meta data. * @stable ICU 3.4 */ U_STABLE UBool U_EXPORT2 utext_hasMetaData(const UText *ut); /** * Replace a range of the original text with a replacement text. * * Leaves the current iteration position at the position following the * newly inserted replacement text. * * This function is only available on UText types that support writing, * that is, ones where utext_isWritable() returns TRUE. * * When using this function, there should be only a single UText opened onto the * underlying native text string.
Behavior after a replace operation * on a UText is undefined for any other additional UTexts that refer to the * modified string. * * @param ut the UText representing the text to be operated on. * @param nativeStart the native index of the start of the region to be replaced * @param nativeLimit the native index of the character following the region to be replaced. * @param replacementText pointer to the replacement text * @param replacementLength length of the replacement text, or -1 if the text is NUL terminated. * @param status receives any error status. Possible errors include * U_NO_WRITE_PERMISSION * * @return The signed number of (native) storage units by which * the length of the text expanded or contracted. * * @stable ICU 3.4 */ U_STABLE int32_t U_EXPORT2 utext_replace(UText *ut, int64_t nativeStart, int64_t nativeLimit, const UChar *replacementText, int32_t replacementLength, UErrorCode *status); /** * * Copy or move a substring from one position to another within the text, * while retaining any metadata associated with the text. * This function is used to duplicate or reorder substrings. * The destination index must not overlap the source range. * * The text to be copied or moved is inserted at destIndex; * it does not replace or overwrite any existing text. * * The iteration position is left following the newly inserted text * at the destination position. * * This function is only available on UText types that support writing, * that is, ones where utext_isWritable() returns TRUE. * * When using this function, there should be only a single UText opened onto the * underlying native text string. Behavior after a copy operation * on a UText is undefined in any other additional UTexts that refer to the * modified string. * * @param ut The UText representing the text to be operated on. 
* @param nativeStart The native index of the start of the region to be copied or moved * @param nativeLimit The native index of the character position following the region * to be copied. * @param destIndex The native destination index to which the source substring is * copied or moved. * @param move If TRUE, then the substring is moved, not copied/duplicated. * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION * * @stable ICU 3.4 */ U_STABLE void U_EXPORT2 utext_copy(UText *ut, int64_t nativeStart, int64_t nativeLimit, int64_t destIndex, UBool move, UErrorCode *status); /** *

* Freeze a UText. This prevents any modification to the underlying text itself * by means of functions operating on this UText. *

*

* Once frozen, a UText can not be unfrozen. The intent is to ensure * that the text underlying a frozen UText wrapper cannot be modified via that UText. *

*

* Caution: freezing a UText will disable changes made via the specific * frozen UText wrapper only; it will not have any effect on the ability to * directly modify the text by bypassing the UText. Any such backdoor modifications * are always an error while UText access is occurring because the underlying * text can get out of sync with UText's buffering. *

* * @param ut The UText to be frozen. * @see utext_isWritable() * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 utext_freeze(UText *ut); /** * UText provider properties (bit field indexes). * * Each constant is the index (position) of a bit, not a bit mask, so the * values must not be OR-ed together directly. * NOTE(review): the matching mask is presumably (1 << constant) - confirm * against the code that tests the provider properties field. * * @see UText * @stable ICU 3.4 */ enum { /** * It is potentially time consuming for the provider to determine the length of the text. * @stable ICU 3.4 */ UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1, /** * Text chunks remain valid and usable until the text object is modified or * deleted, not just until the next time the access() function is called * (which is the default). * @stable ICU 3.4 */ UTEXT_PROVIDER_STABLE_CHUNKS = 2, /** * The provider supports modifying the text via the replace() and copy() * functions. * @see Replaceable * @stable ICU 3.4 */ UTEXT_PROVIDER_WRITABLE = 3, /** * There is meta data associated with the text. * @see Replaceable::hasMetaData() * @stable ICU 3.4 */ UTEXT_PROVIDER_HAS_META_DATA = 4, /** * Text provider owns the text storage. * Generally occurs as the result of a deep clone of the UText. * When closing the UText, the associated text must * also be closed/deleted/freed/whatever is appropriate. * @stable ICU 3.6 */ UTEXT_PROVIDER_OWNS_TEXT = 5 }; /** * Function type declaration for UText.clone(). * * clone a UText. Much like opening a UText where the source text is itself * another UText. * * A deep clone will copy both the UText data structures and the underlying text. * The original and cloned UText will operate completely independently; modifications * made to the text in one will not affect the other. Text providers are not * required to support deep clones. The user of clone() must check the status return * and be prepared to handle failures. * * A shallow clone replicates only the UText data structures; it does not make * a copy of the underlying text. Shallow clones can be used as an efficient way to * have multiple iterators active in a single text string that is not being * modified.
* * A shallow clone operation must not fail except for truly exceptional conditions such * as memory allocation failures. * * A UText and its clone may be safely concurrently accessed by separate threads. * This is true for both shallow and deep clones. * It is the responsibility of the Text Provider to ensure that this thread safety * constraint is met. * * @param dest A UText struct to be filled in with the result of the clone operation, * or NULL if the clone function should heap-allocate a new UText struct. * @param src The UText to be cloned. * @param deep TRUE to request a deep clone, FALSE for a shallow clone. * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR * should be returned if the text provider is unable to clone the * original text. * @return The newly created clone, or NULL if the clone operation failed. * * @stable ICU 3.4 */ typedef UText * U_CALLCONV UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); /** * Function type declaration for UText.nativeLength(). * * @param ut the UText to get the length of. * @return the length, in the native units of the original text string. * @see UText * @stable ICU 3.4 */ typedef int64_t U_CALLCONV UTextNativeLength(UText *ut); /** * Function type declaration for UText.access(). Get the description of the text chunk * containing the text at a requested native index. The UText's iteration * position will be left at the requested index. If the index is out * of bounds, the iteration position will be left at the start or end * of the string, as appropriate. * * Chunks must begin and end on code point boundaries. A single code point * comprised of multiple storage units must never span a chunk boundary. * * * @param ut the UText being accessed. * @param nativeIndex Requested index of the text to be accessed. 
* @param forward If TRUE, then the returned chunk must contain text * starting from the index, so that start<=index * The size (number of 16 bit UChars) in the data to be extracted is returned. The * full amount is returned, even when the specified buffer size is smaller. *

* The extracted string will (if you are a user) / must (if you are a text provider) * be NUL-terminated if there is sufficient space in the destination buffer. * * @param ut the UText from which to extract data. * @param nativeStart the native index of the first character to extract. * @param nativeLimit the native string index of the position following the last * character to extract. * @param dest the UChar (UTF-16) buffer into which the extracted text is placed * @param destCapacity The size, in UChars, of the destination buffer. May be zero * for precomputing the required size. * @param status receives any error status. * If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for * preflighting. * @return Number of UChars in the data. Does not include a trailing NUL. * * @stable ICU 3.4 */ typedef int32_t U_CALLCONV UTextExtract(UText *ut, int64_t nativeStart, int64_t nativeLimit, UChar *dest, int32_t destCapacity, UErrorCode *status); /** * Function type declaration for UText.replace(). * * Replace a range of the original text with a replacement text. * * Leaves the current iteration position at the position following the * newly inserted replacement text. * * This function need only be implemented on UText types that support writing. * * When using this function, there should be only a single UText opened onto the * underlying native text string. The function is responsible for updating the * text chunk within the UText to reflect the updated iteration position, * taking into account any changes to the underlying string's structure caused * by the replace operation. * * @param ut the UText representing the text to be operated on. * @param nativeStart the index of the start of the region to be replaced * @param nativeLimit the index of the character following the region to be replaced. * @param replacementText pointer to the replacement text * @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL terminated.
* @param status receives any error status. Possible errors include * U_NO_WRITE_PERMISSION * * @return The signed number of (native) storage units by which * the length of the text expanded or contracted. * * @stable ICU 3.4 */ typedef int32_t U_CALLCONV UTextReplace(UText *ut, int64_t nativeStart, int64_t nativeLimit, const UChar *replacementText, int32_t replacmentLength, UErrorCode *status); /** * Function type declaration for UText.copy(). * * Copy or move a substring from one position to another within the text, * while retaining any metadata associated with the text. * This function is used to duplicate or reorder substrings. * The destination index must not overlap the source range. * * The text to be copied or moved is inserted at nativeDest; * it does not replace or overwrite any existing text. * * This function need only be implemented for UText types that support writing. * * When using this function, there should be only a single UText opened onto the * underlying native text string. The function is responsible for updating the * text chunk within the UText to reflect the updated iteration position, * taking into account any changes to the underlying string's structure caused * by the copy operation. * * @param ut The UText representing the text to be operated on. * @param nativeStart The index of the start of the region to be copied or moved * @param nativeLimit The index of the character following the region to be copied or moved. * @param nativeDest The destination index to which the source substring is copied or moved. * @param move If TRUE, then the substring is moved, not copied/duplicated. * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION * * @stable ICU 3.4 */ typedef void U_CALLCONV UTextCopy(UText *ut, int64_t nativeStart, int64_t nativeLimit, int64_t nativeDest, UBool move, UErrorCode *status); /** * Function type declaration for UText.mapOffsetToNative().
* Map from the current UChar offset within the current text chunk to * the corresponding native index in the original source text. * * This is required only for text providers that do not use native UTF-16 indexes. * * @param ut the UText. * @return Absolute (native) index corresponding to chunkOffset in the current chunk. * The returned native index should always be to a code point boundary. * * @stable ICU 3.4 */ typedef int64_t U_CALLCONV UTextMapOffsetToNative(const UText *ut); /** * Function type declaration for UText.mapIndexToUTF16(). * Map from a native index to a UChar offset within a text chunk. * Behavior is undefined if the native index does not fall within the * current chunk. * * This function is required only for text providers that do not use native UTF-16 indexes. * * @param ut The UText containing the text chunk. * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit. * @return Chunk-relative UTF-16 offset corresponding to the specified native * index. * * @stable ICU 3.4 */ typedef int32_t U_CALLCONV UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex); /** * Function type declaration for UText.utextClose(). * * A Text Provider close function is only required for provider types that make * allocations in their open function (or other functions) that must be * cleaned when the UText is closed. * * The allocation of the UText struct itself and any "extra" storage * associated with the UText is handled by the common UText implementation * and does not require provider specific cleanup in a close function. * * Most UText provider implementations do not need to implement this function. * * @param ut A UText object to be closed. * * @stable ICU 3.4 */ typedef void U_CALLCONV UTextClose(UText *ut); /** * (public) Function dispatch table for UText. * Conceptually very much like a C++ Virtual Function Table. * This struct defines the organization of the table. 
* Each text provider implementation must provide an * actual table that is initialized with the appropriate functions * for the type of text being handled. * @stable ICU 3.6 */ struct UTextFuncs { /** * (public) Function table size, sizeof(UTextFuncs) * Intended for use should the table grow to accommodate added * functions in the future, to allow tests for older format * function tables that do not contain the extensions. * * Fields are placed for optimal alignment on * 32/64/128-bit-pointer machines, by normally grouping together * 4 32-bit fields, * 4 pointers, * 2 64-bit fields * in sequence. * @stable ICU 3.6 */ int32_t tableSize; /** * (private) Alignment padding. * Do not use, reserved for use by the UText framework only. * @internal */ int32_t reserved1, /** @internal */ reserved2, /** @internal */ reserved3; /** * (public) Function pointer for UTextClone * * @see UTextClone * @stable ICU 3.6 */ UTextClone *clone; /** * (public) Function pointer for UTextNativeLength. * May be expensive to compute! * * @see UTextNativeLength * @stable ICU 3.6 */ UTextNativeLength *nativeLength; /** * (public) Function pointer for UTextAccess. * * @see UTextAccess * @stable ICU 3.6 */ UTextAccess *access; /** * (public) Function pointer for UTextExtract. * * @see UTextExtract * @stable ICU 3.6 */ UTextExtract *extract; /** * (public) Function pointer for UTextReplace. * * @see UTextReplace * @stable ICU 3.6 */ UTextReplace *replace; /** * (public) Function pointer for UTextCopy. * * @see UTextCopy * @stable ICU 3.6 */ UTextCopy *copy; /** * (public) Function pointer for UTextMapOffsetToNative. * * @see UTextMapOffsetToNative * @stable ICU 3.6 */ UTextMapOffsetToNative *mapOffsetToNative; /** * (public) Function pointer for UTextMapNativeIndexToUTF16. * * @see UTextMapNativeIndexToUTF16 * @stable ICU 3.6 */ UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16; /** * (public) Function pointer for UTextClose.
* * @see UTextClose * @stable ICU 3.6 */ UTextClose *close; /** * (private) Spare function pointer * @internal */ UTextClose *spare1; /** * (private) Spare function pointer * @internal */ UTextClose *spare2; /** * (private) Spare function pointer * @internal */ UTextClose *spare3; }; /** * Function dispatch table for UText * @see UTextFuncs */ typedef struct UTextFuncs UTextFuncs; /** * UText struct. Provides the interface between the generic UText access code * and the UText provider code that works on specific kinds of * text (UTF-8, noncontiguous UTF-16, whatever.) * * Applications that are using predefined types of text providers * to pass text data to ICU services will have no need to view the * internals of the UText structs that they open. * * @stable ICU 3.6 */ struct UText { /** * (private) Magic. Used to help detect when UText functions are handed * invalid or uninitialized UText structs. * utext_openXYZ() functions take an initialized, * but not necessarily open, UText struct as an * optional fill-in parameter. This magic field * is used to check for that initialization. * Text provider close functions must NOT clear * the magic field because that would prevent * reuse of the UText struct. * @internal */ uint32_t magic; /** * (private) Flags for managing the allocation and freeing of * memory associated with this UText. * @internal */ int32_t flags; /** * Text provider properties. This set of flags is maintained by the * text provider implementation. * @stable ICU 3.4 */ int32_t providerProperties; /** * (public) sizeOfStruct=sizeof(UText) * Allows possible backward compatible extension. * * @stable ICU 3.4 */ int32_t sizeOfStruct; /* ------ 16 byte alignment boundary ----------- */ /** * (protected) Native index of the first character position following * the current chunk. * @stable ICU 3.6 */ int64_t chunkNativeLimit; /** * (protected) Size in bytes of the extra space (pExtra).
* @stable ICU 3.4 */ int32_t extraSize; /** * (protected) The highest chunk offset where native indexing and * chunk (UTF-16) indexing correspond. For UTF-16 sources, value * will be equal to chunkLength. * * @stable ICU 3.6 */ int32_t nativeIndexingLimit; /* ---- 16 byte alignment boundary------ */ /** * (protected) Native index of the first character in the text chunk. * @stable ICU 3.6 */ int64_t chunkNativeStart; /** * (protected) Current iteration position within the text chunk (UTF-16 buffer). * This is the index to the character that will be returned by utext_next32(). * @stable ICU 3.6 */ int32_t chunkOffset; /** * (protected) Length of the text chunk (UTF-16 buffer), in UChars. * @stable ICU 3.6 */ int32_t chunkLength; /* ---- 16 byte alignment boundary-- */ /** * (protected) Pointer to a chunk of text in UTF-16 format. * May refer either to original storage of the source of the text, or * if conversion was required, to a buffer owned by the UText. * @stable ICU 3.6 */ const UChar *chunkContents; /** * (public) Pointer to Dispatch table for accessing functions for this UText. * @stable ICU 3.6 */ const UTextFuncs *pFuncs; /** * (protected) Pointer to additional space requested by the * text provider during the utext_open operation. * @stable ICU 3.4 */ void *pExtra; /** * (protected) Pointer to string or text-containing object or similar. * This is the source of the text that this UText is wrapping, in a format * that is known to the text provider functions. * @stable ICU 3.4 */ const void *context; /* --- 16 byte alignment boundary--- */ /** * (protected) Pointer fields available for use by the text provider. * Not used by UText common code. * @stable ICU 3.6 */ const void *p; /** * (protected) Pointer fields available for use by the text provider. * Not used by UText common code. * @stable ICU 3.6 */ const void *q; /** * (protected) Pointer fields available for use by the text provider. * Not used by UText common code.
* @stable ICU 3.6 */ const void *r; /** * Private field reserved for future use by the UText framework * itself. This is not to be touched by the text providers. * @internal ICU 3.4 */ void *privP; /* --- 16 byte alignment boundary--- */ /** * (protected) Integer field reserved for use by the text provider. * Not used by the UText framework, or by the client (user) of the UText. * @stable ICU 3.4 */ int64_t a; /** * (protected) Integer field reserved for use by the text provider. * Not used by the UText framework, or by the client (user) of the UText. * @stable ICU 3.4 */ int32_t b; /** * (protected) Integer field reserved for use by the text provider. * Not used by the UText framework, or by the client (user) of the UText. * @stable ICU 3.4 */ int32_t c; /* ---- 16 byte alignment boundary---- */ /** * Private field reserved for future use by the UText framework * itself. This is not to be touched by the text providers. * @internal ICU 3.4 */ int64_t privA; /** * Private field reserved for future use by the UText framework * itself. This is not to be touched by the text providers. * @internal ICU 3.4 */ int32_t privB; /** * Private field reserved for future use by the UText framework * itself. This is not to be touched by the text providers. * @internal ICU 3.4 */ int32_t privC; }; /** * Common function for use by Text Provider implementations to allocate and/or initialize * a new UText struct. To be called in the implementation of utext_open() functions. * If the supplied UText parameter is null, a new UText struct will be allocated on the heap. * If the supplied UText is already open, the provider's close function will be called * so that the struct can be reused by the open that is in progress. * * @param ut pointer to a UText struct to be re-used, or null if a new UText * should be allocated. * @param extraSpace The amount of additional space to be allocated as part * of this UText, for use by types of providers that require * additional storage. 
* @param status Errors are returned here. * @return pointer to the UText, allocated if necessary, with extra space set up if requested. * @stable ICU 3.4 */ U_STABLE UText * U_EXPORT2 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status); /** * initializer to be used with local (stack) instances of a UText * struct. UText structs must be initialized before passing * them to one of the utext_open functions. * * @stable ICU 3.6 */ #define UTEXT_INITIALIZER { \ UTEXT_MAGIC, /* magic */ \ 0, /* flags */ \ 0, /* providerProps */ \ sizeof(UText), /* sizeOfStruct */ \ 0, /* chunkNativeLimit */ \ 0, /* extraSize */ \ 0, /* nativeIndexingLimit */ \ 0, /* chunkNativeStart */ \ 0, /* chunkOffset */ \ 0, /* chunkLength */ \ NULL, /* chunkContents */ \ NULL, /* pFuncs */ \ NULL, /* pExtra */ \ NULL, /* context */ \ NULL, NULL, NULL, /* p, q, r */ \ NULL, /* privP */ \ 0, 0, 0, /* a, b, c */ \ 0, 0, 0 /* privA,B,C, */ \ } U_CDECL_END #endif // uset.h /* ******************************************************************************* * * Copyright (C) 2002-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uset.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2002mar07 * created by: Markus W. Scherer * * C version of UnicodeSet. */ /** * \file * \brief C API: Unicode Set * *

This is a C wrapper around the C++ UnicodeSet class.

*/ #ifndef __USET_H__ #define __USET_H__ #ifndef UCNV_H struct USet; /** * A UnicodeSet. Use the uset_* API to manipulate. Create with * uset_open*, and destroy with uset_close. * @stable ICU 2.4 */ typedef struct USet USet; #endif /** * Bitmask values to be passed to uset_openPatternOptions() or * uset_applyPattern() taking an option parameter. * @stable ICU 2.4 */ enum { /** * Ignore white space within patterns unless quoted or escaped. * @stable ICU 2.4 */ USET_IGNORE_SPACE = 1, /** * Enable case insensitive matching. E.g., "[ab]" with this flag * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will * match all except 'a', 'A', 'b', and 'B'. This performs a full * closure over case mappings, e.g. U+017F for s. * * The resulting set is a superset of the input for the code points but * not for the strings. * It performs a case mapping closure of the code points and adds * full case folding strings for the code points, and reduces strings of * the original set to their full case folding equivalents. * * This is designed for case-insensitive matches, for example * in regular expressions. The full code point case closure allows checking of * an input character directly against the closure set. * Strings are matched by comparing the case-folded form from the closure * set with an incremental case folding of the string in question. * * The closure set will also contain single code points if the original * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). * This is not necessary (that is, redundant) for the above matching method * but results in the same closure sets regardless of whether the original * set contained the code point or a string. * * @stable ICU 2.4 */ USET_CASE_INSENSITIVE = 2, /** * Enable case insensitive matching. E.g., "[ab]" with this flag * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will * match all except 'a', 'A', 'b', and 'B'. 
This adds the lower-, * title-, and uppercase mappings as well as the case folding * of each existing element in the set. * @stable ICU 3.2 */ USET_ADD_CASE_MAPPINGS = 4 }; /** * Argument values for whether span() and similar functions continue while * the current character is contained vs. not contained in the set. * * The functionality is straightforward for sets with only single code points, * without strings (which is the common case): * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. * - span() and spanBack() partition any string the same way when * alternating between span(USET_SPAN_NOT_CONTAINED) and * span(either "contained" condition). * - Using a complemented (inverted) set and the opposite span conditions * yields the same results. * * When a set contains multi-code point strings, then these statements may not * be true, depending on the strings in the set (for example, whether they * overlap with each other) and the string that is processed. * For a set with strings: * - The complement of the set contains the opposite set of code points, * but the same set of strings. * Therefore, complementing both the set and the span conditions * may yield different results. * - When starting spans at different positions in a string * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different * because a set string may start before the later position. * - span(USET_SPAN_SIMPLE) may be shorter than * span(USET_SPAN_CONTAINED) because it will not recursively try * all possible paths. * For example, with a set which contains the three strings "xy", "xya" and "ax", * span("xyax", USET_SPAN_CONTAINED) will return 4 but * span("xyax", USET_SPAN_SIMPLE) will return 3. * span(USET_SPAN_SIMPLE) will never be longer than * span(USET_SPAN_CONTAINED). * - With either "contained" condition, span() and spanBack() may partition * a string in different ways. 
* For example, with a set which contains the two strings "ab" and "ba", * and when processing the string "aba", * span() will yield contained/not-contained boundaries of { 0, 2, 3 } * while spanBack() will yield boundaries of { 0, 1, 3 }. * * Note: If it is important to get the same boundaries whether iterating forward * or backward through a string, then either only span() should be used and * the boundaries cached for backward operation, or an ICU BreakIterator * could be used. * * Note: Unpaired surrogates are treated like surrogate code points. * Similarly, set strings match only on code point boundaries, * never in the middle of a surrogate pair. * Illegal UTF-8 sequences are treated like U+FFFD. * When processing UTF-8 strings, malformed set strings * (strings with unpaired surrogates which cannot be converted to UTF-8) * are ignored. * * @stable ICU 3.8 */ typedef enum USetSpanCondition { /** * Continues a span() while there is no set element at the current position. * Increments by one code point at a time. * Stops before the first set element (character or string). * (For code points only, this is like while contains(current)==FALSE). * * When span() returns, the substring between where it started and the position * it returned consists only of characters that are not in the set, * and none of its strings overlap with the span. * * @stable ICU 3.8 */ USET_SPAN_NOT_CONTAINED = 0, /** * Spans the longest substring that is a concatenation of set elements (characters or strings). * (For characters only, this is like while contains(current)==TRUE). * * When span() returns, the substring between where it started and the position * it returned consists only of set elements (characters or strings) that are in the set. * * If a set contains strings, then the span will be the longest substring for which there * exists at least one non-overlapping concatenation of set elements (characters or strings). 
* This is equivalent to a POSIX regular expression for (OR of each set element)*. * (Java/ICU/Perl regex stops at the first match of an OR.) * * @stable ICU 3.8 */ USET_SPAN_CONTAINED = 1, /** * Continues a span() while there is a set element at the current position. * Increments by the longest matching element at each position. * (For characters only, this is like while contains(current)==TRUE). * * When span() returns, the substring between where it started and the position * it returned consists only of set elements (characters or strings) that are in the set. * * If a set only contains single characters, then this is the same * as USET_SPAN_CONTAINED. * * If a set contains strings, then the span will be the longest substring * with a match at each position with the longest single set element (character or string). * * Use this span condition together with other longest-match algorithms, * such as ICU converters (ucnv_getUnicodeSet()). * * @stable ICU 3.8 */ USET_SPAN_SIMPLE = 2, /** * One more than the last span condition. * @stable ICU 3.8 */ USET_SPAN_CONDITION_COUNT } USetSpanCondition; enum { /** * Capacity of USerializedSet::staticArray. * Enough for any single-code point set. * Also provides padding for nice sizeof(USerializedSet). * @stable ICU 2.4 */ USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 }; /** * A serialized form of a Unicode set. Limited manipulations are * possible directly on a serialized set. See below. * @stable ICU 2.4 */ typedef struct USerializedSet { /** * The serialized Unicode Set. * @stable ICU 2.4 */ const uint16_t *array; /** * The length of the array that contains BMP characters. * @stable ICU 2.4 */ int32_t bmpLength; /** * The total length of the array. * @stable ICU 2.4 */ int32_t length; /** * A small buffer for the array to reduce memory allocations. 
* @stable ICU 2.4 */ uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; } USerializedSet; /********************************************************************* * USet API *********************************************************************/ /** * Create an empty USet object. * Equivalent to uset_open(1, 0). * @return a newly created USet. The caller must call uset_close() on * it when done. * @stable ICU 4.2 */ U_STABLE USet* U_EXPORT2 uset_openEmpty(void); /** * Creates a USet object that contains the range of characters * start..end, inclusive. If start > end * then an empty set is created (same as using uset_openEmpty()). * @param start first character of the range, inclusive * @param end last character of the range, inclusive * @return a newly created USet. The caller must call uset_close() on * it when done. * @stable ICU 2.4 */ U_STABLE USet* U_EXPORT2 uset_open(UChar32 start, UChar32 end); /** * Creates a set from the given pattern. See the UnicodeSet class * description for the syntax of the pattern language. * @param pattern a string specifying what characters are in the set * @param patternLength the length of the pattern, or -1 if null * terminated * @param ec the error code * @stable ICU 2.4 */ U_STABLE USet* U_EXPORT2 uset_openPattern(const UChar* pattern, int32_t patternLength, UErrorCode* ec); /** * Creates a set from the given pattern. See the UnicodeSet class * description for the syntax of the pattern language. * @param pattern a string specifying what characters are in the set * @param patternLength the length of the pattern, or -1 if null * terminated * @param options bitmask for options to apply to the pattern. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. * @param ec the error code * @stable ICU 2.4 */ U_STABLE USet* U_EXPORT2 uset_openPatternOptions(const UChar* pattern, int32_t patternLength, uint32_t options, UErrorCode* ec); /** * Disposes of the storage used by a USet object. 
This function should * be called exactly once for objects returned by uset_open(). * @param set the object to dispose of * @stable ICU 2.4 */ U_STABLE void U_EXPORT2 uset_close(USet* set); /** * Returns a copy of this object. * If this set is frozen, then the clone will be frozen as well. * Use uset_cloneAsThawed() for a mutable clone of a frozen set. * @param set the original set * @return the newly allocated copy of the set * @see uset_cloneAsThawed * @stable ICU 3.8 */ U_STABLE USet * U_EXPORT2 uset_clone(const USet *set); /** * Determines whether the set has been frozen (made immutable) or not. * See the ICU4J Freezable interface for details. * @param set the set * @return TRUE/FALSE for whether the set has been frozen * @see uset_freeze * @see uset_cloneAsThawed * @stable ICU 3.8 */ U_STABLE UBool U_EXPORT2 uset_isFrozen(const USet *set); /** * Freeze the set (make it immutable). * Once frozen, it cannot be unfrozen and is therefore thread-safe * until it is deleted. * See the ICU4J Freezable interface for details. * Freezing the set may also make some operations faster, for example * uset_contains() and uset_span(). * A frozen set will not be modified. (It remains frozen.) * @param set the set to freeze; it is frozen in place (this function returns no value) * @see uset_isFrozen * @see uset_cloneAsThawed * @stable ICU 3.8 */ U_STABLE void U_EXPORT2 uset_freeze(USet *set); /** * Clone the set and make the clone mutable. * See the ICU4J Freezable interface for details. * @param set the set * @return the mutable clone * @see uset_freeze * @see uset_isFrozen * @see uset_clone * @stable ICU 3.8 */ U_STABLE USet * U_EXPORT2 uset_cloneAsThawed(const USet *set); /** * Causes the USet object to represent the range start - end. * If start > end then this USet is set to an empty range. * A frozen set will not be modified.
* @param set the object to set to the given range * @param start first character in the set, inclusive * @param end last character in the set, inclusive * @stable ICU 3.2 */ U_STABLE void U_EXPORT2 uset_set(USet* set, UChar32 start, UChar32 end); /** * Modifies the set to represent the set specified by the given * pattern. See the UnicodeSet class description for the syntax of * the pattern language. See also the User Guide chapter about UnicodeSet. * Empties the set passed before applying the pattern. * A frozen set will not be modified. * @param set The set to which the pattern is to be applied. * @param pattern A pointer to UChar string specifying what characters are in the set. * The character at pattern[0] must be a '['. * @param patternLength The length of the UChar string. -1 if NUL terminated. * @param options A bitmask for options to apply to the pattern. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. * @param status Returns an error if the pattern cannot be parsed. * @return Upon successful parse, the value is * the index of the character after the closing ']' * of the parsed pattern. * If the status code indicates failure, then the return value * is the index of the error in the source. * * @stable ICU 2.8 */ U_STABLE int32_t U_EXPORT2 uset_applyPattern(USet *set, const UChar *pattern, int32_t patternLength, uint32_t options, UErrorCode *status); /** * Modifies the set to contain those code points which have the given value * for the given binary or enumerated property, as returned by * u_getIntPropertyValue. Prior contents of this set are lost. * A frozen set will not be modified. * * @param set the object to contain the code points defined by the property * * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. * * @param value a value in the range u_getIntPropertyMinValue(prop)..
* u_getIntPropertyMaxValue(prop), with one exception. If prop is * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but * rather a mask value produced by U_GET_GC_MASK(). This allows grouped * categories such as [:L:] to be represented. * * @param ec error code input/output parameter * * @stable ICU 3.2 */ U_STABLE void U_EXPORT2 uset_applyIntPropertyValue(USet* set, UProperty prop, int32_t value, UErrorCode* ec); /** * Modifies the set to contain those code points which have the * given value for the given property. Prior contents of this * set are lost. * A frozen set will not be modified. * * @param set the object to contain the code points defined by the given * property and value alias * * @param prop a string specifying a property alias, either short or long. * The name is matched loosely. See PropertyAliases.txt for names and a * description of loose matching. If the value string is empty, then this * string is interpreted as either a General_Category value alias, a Script * value alias, a binary property alias, or a special ID. Special IDs are * matched loosely and correspond to the following sets: * * "ANY" = [\\u0000-\\U0010FFFF], * "ASCII" = [\\u0000-\\u007F], * "Assigned" = [:^Cn:]. * * @param propLength the length of the prop, or -1 if NULL * * @param value a string specifying a value alias, either short or long. * The name is matched loosely. See PropertyValueAliases.txt for names * and a description of loose matching. In addition to aliases listed, * numeric values and canonical combining classes may be expressed * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string * may also be empty. 
* * @param valueLength the length of the value, or -1 if NULL * * @param ec error code input/output parameter * * @stable ICU 3.2 */ U_STABLE void U_EXPORT2 uset_applyPropertyAlias(USet* set, const UChar *prop, int32_t propLength, const UChar *value, int32_t valueLength, UErrorCode* ec); /** * Return true if the given position, in the given pattern, appears * to be the start of a UnicodeSet pattern. * * @param pattern a string specifying the pattern * @param patternLength the length of the pattern, or -1 if NULL * @param pos the given position * @stable ICU 3.2 */ U_STABLE UBool U_EXPORT2 uset_resemblesPattern(const UChar *pattern, int32_t patternLength, int32_t pos); /** * Returns a string representation of this set. If the result of * calling this function is passed to a uset_openPattern(), it * will produce another set that is equal to this one. * @param set the set * @param result the string to receive the rules, may be NULL * @param resultCapacity the capacity of result, may be 0 if result is NULL * @param escapeUnprintable if TRUE then convert unprintable * character to their hex escape representations, \\uxxxx or * \\Uxxxxxxxx. Unprintable characters are those other than * U+000A, U+0020..U+007E. * @param ec error code. * @return length of string, possibly larger than resultCapacity * @stable ICU 2.4 */ U_STABLE int32_t U_EXPORT2 uset_toPattern(const USet* set, UChar* result, int32_t resultCapacity, UBool escapeUnprintable, UErrorCode* ec); /** * Adds the given character to the given USet. After this call, * uset_contains(set, c) will return TRUE. * A frozen set will not be modified. * @param set the object to which to add the character * @param c the character to add * @stable ICU 2.4 */ U_STABLE void U_EXPORT2 uset_add(USet* set, UChar32 c); /** * Adds all of the elements in the specified set to this set if * they're not already present. This operation effectively * modifies this set so that its value is the union of the two * sets. 
The behavior of this operation is unspecified if the specified * collection is modified while the operation is in progress. * A frozen set will not be modified. * * @param set the object to which to add the set * @param additionalSet the source set whose elements are to be added to this set. * @stable ICU 2.6 */ U_STABLE void U_EXPORT2 uset_addAll(USet* set, const USet *additionalSet); /** * Adds the given range of characters to the given USet. After this call, * uset_containsRange(set, start, end) will return TRUE. * A frozen set will not be modified. * @param set the object to which to add the range * @param start the first character of the range to add, inclusive * @param end the last character of the range to add, inclusive * @stable ICU 2.2 */ U_STABLE void U_EXPORT2 uset_addRange(USet* set, UChar32 start, UChar32 end); /** * Adds the given string to the given USet. After this call, * uset_containsString(set, str, strLen) will return TRUE. * A frozen set will not be modified. * @param set the object to which to add the string * @param str the string to add * @param strLen the length of the string or -1 if null terminated. * @stable ICU 2.4 */ U_STABLE void U_EXPORT2 uset_addString(USet* set, const UChar* str, int32_t strLen); /** * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} * If this set already contains any particular character, it has no effect on that character. * A frozen set will not be modified. * @param set the object to which to add the characters * @param str the source string * @param strLen the length of the string or -1 if null terminated. * @stable ICU 3.4 */ U_STABLE void U_EXPORT2 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); /** * Removes the given character from the given USet. After this call, * uset_contains(set, c) will return FALSE. * A frozen set will not be modified.
* @param set the object from which to remove the character * @param c the character to remove * @stable ICU 2.4 */ U_STABLE void U_EXPORT2 uset_remove(USet* set, UChar32 c); /** * Removes the given range of characters from the given USet. After this call, * uset_contains(set, c) will return FALSE for every c in start..end. * A frozen set will not be modified. * @param set the object from which to remove the range * @param start the first character of the range to remove, inclusive * @param end the last character of the range to remove, inclusive * @stable ICU 2.2 */ U_STABLE void U_EXPORT2 uset_removeRange(USet* set, UChar32 start, UChar32 end); /** * Removes the given string from the given USet. After this call, * uset_containsString(set, str, strLen) will return FALSE. * A frozen set will not be modified. * @param set the object from which to remove the string * @param str the string to remove * @param strLen the length of the string or -1 if null terminated. * @stable ICU 2.4 */ U_STABLE void U_EXPORT2 uset_removeString(USet* set, const UChar* str, int32_t strLen); /** * Removes from this set all of its elements that are contained in the * specified set. This operation effectively modifies this * set so that its value is the asymmetric set difference of * the two sets. * A frozen set will not be modified. * @param set the object from which the elements are to be removed * @param removeSet the object that defines which elements will be * removed from this set * @stable ICU 3.2 */ U_STABLE void U_EXPORT2 uset_removeAll(USet* set, const USet* removeSet); /** * Retain only the elements in this set that are contained in the * specified range. If start > end then an empty range is * retained, leaving the set empty. This is equivalent to * a boolean logic AND, or a set INTERSECTION. * A frozen set will not be modified. * * @param set the object for which to retain only the specified range * @param start first character, inclusive, of range to be retained * to this set.
* @param end last character, inclusive, of range to be retained * to this set. * @stable ICU 3.2 */ U_STABLE void U_EXPORT2 uset_retain(USet* set, UChar32 start, UChar32 end); /** * Retains only the elements in this set that are contained in the * specified set. In other words, removes from this set all of * its elements that are not contained in the specified set. This * operation effectively modifies this set so that its value is * the intersection of the two sets. * A frozen set will not be modified. * * @param set the object on which to perform the retain * @param retain set that defines which elements this set will retain * @stable ICU 3.2 */ U_STABLE void U_EXPORT2 uset_retainAll(USet* set, const USet* retain); /** * Reallocate this object's internal structures to take up the least * possible space, without changing this object's value. * A frozen set will not be modified. * * @param set the object on which to perform the compact * @stable ICU 3.2 */ U_STABLE void U_EXPORT2 uset_compact(USet* set); /** * Inverts this set. This operation modifies this set so that * its value is its complement. This operation does not affect * the multicharacter strings, if any. * A frozen set will not be modified. * @param set the set * @stable ICU 2.4 */ U_STABLE void U_EXPORT2 uset_complement(USet* set); /** * Complements in this set all elements contained in the specified * set. Any character in the other set will be removed if it is * in this set, or will be added if it is not in this set. * A frozen set will not be modified. * * @param set the set with which to complement * @param complement set that defines which elements will be xor'ed * with this set. * @stable ICU 3.2 */ U_STABLE void U_EXPORT2 uset_complementAll(USet* set, const USet* complement); /** * Removes all of the elements from this set. This set will be * empty after this call returns. * A frozen set will not be modified.
* @param set the set * @stable ICU 2.4 */ U_STABLE void U_EXPORT2 uset_clear(USet* set); /** * Close this set over the given attribute. For the attribute * USET_CASE, the result is to modify this set so that: * * 1. For each character or string 'a' in this set, all strings or * characters 'b' such that foldCase(a) == foldCase(b) are added * to this set. * * 2. For each string 'e' in the resulting set, if e != * foldCase(e), 'e' will be removed. * * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] * * (Here foldCase(x) refers to the operation u_strFoldCase, and a * == b denotes that the contents are the same, not pointer * comparison.) * * A frozen set will not be modified. * * @param set the set * * @param attributes bitmask for attributes to close over. * Currently only the USET_CASE bit is supported. Any undefined bits * are ignored. * @stable ICU 4.2 */ U_STABLE void U_EXPORT2 uset_closeOver(USet* set, int32_t attributes); /** * Remove all strings from this set. * * @param set the set * @stable ICU 4.2 */ U_STABLE void U_EXPORT2 uset_removeAllStrings(USet* set); /** * Returns TRUE if the given USet contains no characters and no * strings. * @param set the set * @return true if set is empty * @stable ICU 2.4 */ U_STABLE UBool U_EXPORT2 uset_isEmpty(const USet* set); /** * Returns TRUE if the given USet contains the given character. * This function works faster with a frozen set. * @param set the set * @param c The codepoint to check for within the set * @return true if set contains c * @stable ICU 2.4 */ U_STABLE UBool U_EXPORT2 uset_contains(const USet* set, UChar32 c); /** * Returns TRUE if the given USet contains all characters c * where start <= c && c <= end. 
* @param set the set * @param start the first character of the range to test, inclusive * @param end the last character of the range to test, inclusive * @return TRUE if set contains the range * @stable ICU 2.2 */ U_STABLE UBool U_EXPORT2 uset_containsRange(const USet* set, UChar32 start, UChar32 end); /** * Returns TRUE if the given USet contains the given string. * @param set the set * @param str the string * @param strLen the length of the string or -1 if null terminated. * @return true if set contains str * @stable ICU 2.4 */ U_STABLE UBool U_EXPORT2 uset_containsString(const USet* set, const UChar* str, int32_t strLen); /** * Returns the index of the given character within this set, where * the set is ordered by ascending code point. If the character * is not in this set, return -1. The inverse of this function is * uset_charAt(). * @param set the set * @param c the character to obtain the index for * @return an index from 0..size()-1, or -1 * @stable ICU 3.2 */ U_STABLE int32_t U_EXPORT2 uset_indexOf(const USet* set, UChar32 c); /** * Returns the character at the given index within this set, where * the set is ordered by ascending code point. If the index is * out of range, return (UChar32)-1. The inverse of this function is * uset_indexOf(). * @param set the set * @param charIndex an index from 0..size()-1 to obtain the char for * @return the character at the given index, or (UChar32)-1. * @stable ICU 3.2 */ U_STABLE UChar32 U_EXPORT2 uset_charAt(const USet* set, int32_t charIndex); /** * Returns the number of characters and strings contained in the given * USet. * @param set the set * @return a non-negative integer counting the characters and strings * contained in set * @stable ICU 2.4 */ U_STABLE int32_t U_EXPORT2 uset_size(const USet* set); /** * Returns the number of items in this set. An item is either a range * of characters or a single multicharacter string.
* @param set the set * @return a non-negative integer counting the character ranges * and/or strings contained in set * @stable ICU 2.4 */ U_STABLE int32_t U_EXPORT2 uset_getItemCount(const USet* set); /** * Returns an item of this set. An item is either a range of * characters or a single multicharacter string. * @param set the set * @param itemIndex a non-negative integer in the range 0.. * uset_getItemCount(set)-1 * @param start pointer to variable to receive first character * in range, inclusive * @param end pointer to variable to receive last character in range, * inclusive * @param str buffer to receive the string, may be NULL * @param strCapacity capacity of str, or 0 if str is NULL * @param ec error code * @return the length of the string (>= 2), or 0 if the item is a * range, in which case it is the range *start..*end, or -1 if * itemIndex is out of range * @stable ICU 2.4 */ U_STABLE int32_t U_EXPORT2 uset_getItem(const USet* set, int32_t itemIndex, UChar32* start, UChar32* end, UChar* str, int32_t strCapacity, UErrorCode* ec); /** * Returns true if set1 contains all the characters and strings * of set2. It answers the question, 'Is set1 a superset of set2?' * @param set1 set to be checked for containment * @param set2 set to be checked for containment * @return true if the test condition is met * @stable ICU 3.2 */ U_STABLE UBool U_EXPORT2 uset_containsAll(const USet* set1, const USet* set2); /** * Returns true if this set contains all the characters * of the given string. This does not check containment of grapheme * clusters, like uset_containsString. * @param set set of characters to be checked for containment * @param str string containing codepoints to be checked for containment * @param strLen the length of the string or -1 if null terminated.
* @return true if the test condition is met * @stable ICU 3.4 */ U_STABLE UBool U_EXPORT2 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); /** * Returns true if set1 contains none of the characters and strings * of set2. It answers the question, 'Is set1 a disjoint set of set2?' * @param set1 set to be checked for containment * @param set2 set to be checked for containment * @return true if the test condition is met * @stable ICU 3.2 */ U_STABLE UBool U_EXPORT2 uset_containsNone(const USet* set1, const USet* set2); /** * Returns true if set1 contains some of the characters and strings * of set2. It answers the question, 'Do set1 and set2 have an intersection?' * @param set1 set to be checked for containment * @param set2 set to be checked for containment * @return true if the test condition is met * @stable ICU 3.2 */ U_STABLE UBool U_EXPORT2 uset_containsSome(const USet* set1, const USet* set2); /** * Returns the length of the initial substring of the input string which * consists only of characters and strings that are contained in this set * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), * or only of characters and strings that are not contained * in this set (USET_SPAN_NOT_CONTAINED). * See USetSpanCondition for details. * Similar to the strspn() C library function. * Unpaired surrogates are treated according to contains() of their surrogate code points. * This function works faster with a frozen set and with a non-negative string length argument.
* @param set the set * @param s start of the string * @param length of the string; can be -1 for NUL-terminated * @param spanCondition specifies the containment condition * @return the length of the initial substring according to the spanCondition; * 0 if the start of the string does not fit the spanCondition * @stable ICU 3.8 * @see USetSpanCondition */ U_STABLE int32_t U_EXPORT2 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); /** * Returns the start of the trailing substring of the input string which * consists only of characters and strings that are contained in this set * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), * or only of characters and strings that are not contained * in this set (USET_SPAN_NOT_CONTAINED). * See USetSpanCondition for details. * Unpaired surrogates are treated according to contains() of their surrogate code points. * This function works faster with a frozen set and with a non-negative string length argument. * @param set the set * @param s start of the string * @param length of the string; can be -1 for NUL-terminated * @param spanCondition specifies the containment condition * @return the start of the trailing substring according to the spanCondition; * the string length if the end of the string does not fit the spanCondition * @stable ICU 3.8 * @see USetSpanCondition */ U_STABLE int32_t U_EXPORT2 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); /** * Returns the length of the initial substring of the input string which * consists only of characters and strings that are contained in this set * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), * or only of characters and strings that are not contained * in this set (USET_SPAN_NOT_CONTAINED). * See USetSpanCondition for details. * Similar to the strspn() C library function. * Malformed byte sequences are treated according to contains(0xfffd). 
* This function works faster with a frozen set and with a non-negative string length argument. * @param set the set * @param s start of the string (UTF-8) * @param length of the string; can be -1 for NUL-terminated * @param spanCondition specifies the containment condition * @return the length of the initial substring according to the spanCondition; * 0 if the start of the string does not fit the spanCondition * @stable ICU 3.8 * @see USetSpanCondition */ U_STABLE int32_t U_EXPORT2 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); /** * Returns the start of the trailing substring of the input string which * consists only of characters and strings that are contained in this set * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), * or only of characters and strings that are not contained * in this set (USET_SPAN_NOT_CONTAINED). * See USetSpanCondition for details. * Malformed byte sequences are treated according to contains(0xfffd). * This function works faster with a frozen set and with a non-negative string length argument. * @param set the set * @param s start of the string (UTF-8) * @param length of the string; can be -1 for NUL-terminated * @param spanCondition specifies the containment condition * @return the start of the trailing substring according to the spanCondition; * the string length if the end of the string does not fit the spanCondition * @stable ICU 3.8 * @see USetSpanCondition */ U_STABLE int32_t U_EXPORT2 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); /** * Returns true if set1 contains all of the characters and strings * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
* @param set1 set to be checked for containment * @param set2 set to be checked for containment * @return true if the test condition is met * @stable ICU 3.2 */ U_STABLE UBool U_EXPORT2 uset_equals(const USet* set1, const USet* set2); /********************************************************************* * Serialized set API *********************************************************************/ /** * Serializes this set into an array of 16-bit integers. Serialization * (currently) only records the characters in the set; multicharacter * strings are ignored. * * The array * has following format (each line is one 16-bit integer): * * length = (n+2*m) | (m!=0?0x8000:0) * bmpLength = n; present if m!=0 * bmp[0] * bmp[1] * ... * bmp[n-1] * supp-high[0] * supp-low[0] * supp-high[1] * supp-low[1] * ... * supp-high[m-1] * supp-low[m-1] * * The array starts with a header. After the header are n bmp * code points, then m supplementary code points. Either n or m * or both may be zero. n+2*m is always <= 0x7FFF. * * If there are no supplementary characters (if m==0) then the * header is one 16-bit integer, 'length', with value n. * * If there are supplementary characters (if m!=0) then the header * is two 16-bit integers. The first, 'length', has value * (n+2*m)|0x8000. The second, 'bmpLength', has value n. * * After the header the code points are stored in ascending order. * Supplementary code points are stored as most significant 16 * bits followed by least significant 16 bits. * * @param set the set * @param dest pointer to buffer of destCapacity 16-bit integers. * May be NULL only if destCapacity is zero. * @param destCapacity size of dest, or zero. Must not be negative. * @param pErrorCode pointer to the error code. Will be set to * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
* @return the total length of the serialized format, including * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other * than U_BUFFER_OVERFLOW_ERROR. * @stable ICU 2.4 */ U_STABLE int32_t U_EXPORT2 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); /** * Given a serialized array, fill in the given serialized set object. * @param fillSet pointer to result * @param src pointer to start of array * @param srcLength length of array * @return true if the given array is valid, otherwise false * @stable ICU 2.4 */ U_STABLE UBool U_EXPORT2 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); /** * Set the USerializedSet to contain the given character (and nothing * else). * @param fillSet pointer to result * @param c The codepoint to set * @stable ICU 2.4 */ U_STABLE void U_EXPORT2 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); /** * Returns TRUE if the given USerializedSet contains the given * character. * @param set the serialized set * @param c The codepoint to check for within the set * @return true if set contains c * @stable ICU 2.4 */ U_STABLE UBool U_EXPORT2 uset_serializedContains(const USerializedSet* set, UChar32 c); /** * Returns the number of disjoint ranges of characters contained in * the given serialized set. Ignores any strings contained in the * set. * @param set the serialized set * @return a non-negative integer counting the character ranges * contained in set * @stable ICU 2.4 */ U_STABLE int32_t U_EXPORT2 uset_getSerializedRangeCount(const USerializedSet* set); /** * Returns a range of characters contained in the given serialized * set. * @param set the serialized set * @param rangeIndex a non-negative integer in the range 0.. 
* uset_getSerializedRangeCount(set)-1 * @param pStart pointer to variable to receive first character * in range, inclusive * @param pEnd pointer to variable to receive last character in range, * inclusive * @return true if rangeIndex is valid, otherwise false * @stable ICU 2.4 */ U_STABLE UBool U_EXPORT2 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, UChar32* pStart, UChar32* pEnd); #endif // unorm2.h /* ******************************************************************************* * * Copyright (C) 2009-2015, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: unorm2.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2009dec15 * created by: Markus W. Scherer */ #ifndef __UNORM2_H__ #define __UNORM2_H__ /** * \file * \brief C API: New API for Unicode Normalization. * * Unicode normalization functionality for standard Unicode normalization or * for using custom mapping tables. * All instances of UNormalizer2 are unmodifiable/immutable. * Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller. * For more details see the Normalizer2 C++ class. */ /** * Constants for normalization modes. * For details about standard Unicode normalization forms * and about the algorithms which are also used with custom mapping tables * see http://www.unicode.org/unicode/reports/tr15/ * @stable ICU 4.4 */ typedef enum { /** * Decomposition followed by composition. * Same as standard NFC when using an "nfc" instance. * Same as standard NFKC when using an "nfkc" instance. * For details about standard Unicode normalization forms * see http://www.unicode.org/unicode/reports/tr15/ * @stable ICU 4.4 */ UNORM2_COMPOSE, /** * Map, and reorder canonically. * Same as standard NFD when using an "nfc" instance. * Same as standard NFKD when using an "nfkc" instance. 
* For details about standard Unicode normalization forms * see http://www.unicode.org/unicode/reports/tr15/ * @stable ICU 4.4 */ UNORM2_DECOMPOSE, /** * "Fast C or D" form. * If a string is in this form, then further decomposition without reordering * would yield the same form as DECOMPOSE. * Text in "Fast C or D" form can be processed efficiently with data tables * that are "canonically closed", that is, that provide equivalent data for * equivalent text, without having to be fully normalized. * Not a standard Unicode normalization form. * Not a unique form: Different FCD strings can be canonically equivalent. * For details see http://www.unicode.org/notes/tn5/#FCD * @stable ICU 4.4 */ UNORM2_FCD, /** * Compose only contiguously. * Also known as "FCC" or "Fast C Contiguous". * The result will often but not always be in NFC. * The result will conform to FCD which is useful for processing. * Not a standard Unicode normalization form. * For details see http://www.unicode.org/notes/tn5/#FCC * @stable ICU 4.4 */ UNORM2_COMPOSE_CONTIGUOUS } UNormalization2Mode; /** * Result values for normalization quick check functions. * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms * @stable ICU 2.0 */ typedef enum UNormalizationCheckResult { /** * The input string is not in the normalization form. * @stable ICU 2.0 */ UNORM_NO, /** * The input string is in the normalization form. * @stable ICU 2.0 */ UNORM_YES, /** * The input string may or may not be in the normalization form. * This value is only returned for composition forms like NFC and FCC, * when a backward-combining character is found for which the surrounding text * would have to be analyzed further. * @stable ICU 2.0 */ UNORM_MAYBE } UNormalizationCheckResult; /** * Opaque C service object type for the new normalization API. * @stable ICU 4.4 */ struct UNormalizer2; typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. 
@stable ICU 4.4 */ #if !UCONFIG_NO_NORMALIZATION /** * Returns a UNormalizer2 instance for Unicode NFC normalization. * Same as unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, pErrorCode). * Returns an unmodifiable singleton instance. Do not delete it. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the requested Normalizer2, if successful * @stable ICU 49 */ U_STABLE const UNormalizer2 * U_EXPORT2 unorm2_getNFCInstance(UErrorCode *pErrorCode); /** * Returns a UNormalizer2 instance for Unicode NFD normalization. * Same as unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, pErrorCode). * Returns an unmodifiable singleton instance. Do not delete it. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the requested Normalizer2, if successful * @stable ICU 49 */ U_STABLE const UNormalizer2 * U_EXPORT2 unorm2_getNFDInstance(UErrorCode *pErrorCode); /** * Returns a UNormalizer2 instance for Unicode NFKC normalization. * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, pErrorCode). * Returns an unmodifiable singleton instance. Do not delete it. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the requested Normalizer2, if successful * @stable ICU 49 */ U_STABLE const UNormalizer2 * U_EXPORT2 unorm2_getNFKCInstance(UErrorCode *pErrorCode); /** * Returns a UNormalizer2 instance for Unicode NFKD normalization. * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, pErrorCode). 
* Returns an unmodifiable singleton instance. Do not delete it. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the requested Normalizer2, if successful * @stable ICU 49 */ U_STABLE const UNormalizer2 * U_EXPORT2 unorm2_getNFKDInstance(UErrorCode *pErrorCode); /** * Returns a UNormalizer2 instance for Unicode NFKC_Casefold normalization. * Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode). * Returns an unmodifiable singleton instance. Do not delete it. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the requested Normalizer2, if successful * @stable ICU 49 */ U_STABLE const UNormalizer2 * U_EXPORT2 unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode); /** * Returns a UNormalizer2 instance which uses the specified data file * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) * and which composes or decomposes text according to the specified mode. * Returns an unmodifiable singleton instance. Do not delete it. * * Use packageName=NULL for data files that are part of ICU's own data. * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. * * @param packageName NULL for ICU built-in data, otherwise application data package name * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file * @param mode normalization mode (compose or decompose etc.) * @param pErrorCode Standard ICU error code. 
Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the requested UNormalizer2, if successful * @stable ICU 4.4 */ U_STABLE const UNormalizer2 * U_EXPORT2 unorm2_getInstance(const char *packageName, const char *name, UNormalization2Mode mode, UErrorCode *pErrorCode); /** * Constructs a filtered normalizer wrapping any UNormalizer2 instance * and a filter set. * Both are aliased and must not be modified or deleted while this object * is used. * The filter set should be frozen; otherwise the performance will suffer greatly. * @param norm2 wrapped UNormalizer2 instance * @param filterSet USet which determines the characters to be normalized * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the requested UNormalizer2, if successful * @stable ICU 4.4 */ U_STABLE UNormalizer2 * U_EXPORT2 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode); /** * Closes a UNormalizer2 instance from unorm2_openFiltered(). * Do not close instances from unorm2_getInstance()! * @param norm2 UNormalizer2 instance to be closed * @stable ICU 4.4 */ U_STABLE void U_EXPORT2 unorm2_close(UNormalizer2 *norm2); /** * Writes the normalized form of the source string to the destination string * (replacing its contents) and returns the length of the destination string. * The source and destination strings must be different buffers.
* @param norm2 UNormalizer2 instance * @param src source string * @param length length of the source string, or -1 if NUL-terminated * @param dest destination string; its contents are replaced with normalized src * @param capacity number of UChars that can be written to dest * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the length of the destination string * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 unorm2_normalize(const UNormalizer2 *norm2, const UChar *src, int32_t length, UChar *dest, int32_t capacity, UErrorCode *pErrorCode); /** * Appends the normalized form of the second string to the first string * (merging them at the boundary) and returns the length of the first string. * The result is normalized if the first string was normalized. * The first and second strings must be different buffers. * @param norm2 UNormalizer2 instance * @param first string, should be normalized * @param firstLength length of the first string, or -1 if NUL-terminated * @param firstCapacity number of UChars that can be written to first * @param second string, will be normalized * @param secondLength length of the second string, or -1 if NUL-terminated * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the length of the first string after appending * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2, UChar *first, int32_t firstLength, int32_t firstCapacity, const UChar *second, int32_t secondLength, UErrorCode *pErrorCode); /** * Appends the second string to the first string * (merging them at the boundary) and returns the length of the first string. * The result is normalized if both the strings were normalized.
* The first and second strings must be different buffers. * @param norm2 UNormalizer2 instance * @param first string, should be normalized * @param firstLength length of the first string, or -1 if NUL-terminated * @param firstCapacity number of UChars that can be written to first * @param second string, should be normalized * @param secondLength length of the second string, or -1 if NUL-terminated * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the length of the first string after appending * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 unorm2_append(const UNormalizer2 *norm2, UChar *first, int32_t firstLength, int32_t firstCapacity, const UChar *second, int32_t secondLength, UErrorCode *pErrorCode); /** * Gets the decomposition mapping of c. * Roughly equivalent to normalizing the String form of c * on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster, and except that this function * returns a negative value and does not write a string * if c does not have a decomposition mapping in this instance's data. * This function is independent of the mode of the UNormalizer2. * @param norm2 UNormalizer2 instance * @param c code point * @param decomposition String buffer which will be set to c's * decomposition mapping, if there is one. * @param capacity number of UChars that can be written to decomposition * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.)
* @return the non-negative length of c's decomposition, if there is one; otherwise a negative value * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 unorm2_getDecomposition(const UNormalizer2 *norm2, UChar32 c, UChar *decomposition, int32_t capacity, UErrorCode *pErrorCode); /** * Gets the raw decomposition mapping of c. * * This is similar to the unorm2_getDecomposition() function but returns the * raw decomposition mapping as specified in UnicodeData.txt or * (for custom data) in the mapping files processed by the gennorm2 tool. * By contrast, unorm2_getDecomposition() returns the processed, * recursively-decomposed version of this mapping. * * When used on a standard NFKC Normalizer2 instance, * unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. * * When used on a standard NFC Normalizer2 instance, * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); * in this case, the result contains either one or two code points (=1..4 UChars). * * This function is independent of the mode of the UNormalizer2. * @param norm2 UNormalizer2 instance * @param c code point * @param decomposition String buffer which will be set to c's * raw decomposition mapping, if there is one. * @param capacity number of UChars that can be written to decomposition * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value * @stable ICU 49 */ U_STABLE int32_t U_EXPORT2 unorm2_getRawDecomposition(const UNormalizer2 *norm2, UChar32 c, UChar *decomposition, int32_t capacity, UErrorCode *pErrorCode); /** * Performs pairwise composition of a & b and returns the composite if there is one. 
* * Returns a composite code point c only if c has a two-way mapping to a+b. * In standard Unicode normalization, this means that * c has a canonical decomposition to a+b * and c does not have the Full_Composition_Exclusion property. * * This function is independent of the mode of the UNormalizer2. * @param norm2 UNormalizer2 instance * @param a A (normalization starter) code point. * @param b Another code point. * @return The non-negative composite code point if there is one; otherwise a negative value. * @stable ICU 49 */ U_STABLE UChar32 U_EXPORT2 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b); /** * Gets the combining class of c. * The default implementation returns 0 * but all standard implementations return the Unicode Canonical_Combining_Class value. * @param norm2 UNormalizer2 instance * @param c code point * @return c's combining class * @stable ICU 49 */ U_STABLE uint8_t U_EXPORT2 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c); /** * Tests if the string is normalized. * Internally, in cases where the quickCheck() method would return "maybe" * (which is only possible for the two COMPOSE modes) this method * resolves to "yes" or "no" to provide a definitive result, * at the cost of doing more work in those cases. * @param norm2 UNormalizer2 instance * @param s input string * @param length length of the string, or -1 if NUL-terminated * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return TRUE if s is normalized * @stable ICU 4.4 */ U_STABLE UBool U_EXPORT2 unorm2_isNormalized(const UNormalizer2 *norm2, const UChar *s, int32_t length, UErrorCode *pErrorCode); /** * Tests if the string is normalized. * For the two COMPOSE modes, the result could be "maybe" in cases that * would take a little more work to resolve definitively. 
* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster * combination of quick check + normalization, to avoid * re-checking the "yes" prefix. * @param norm2 UNormalizer2 instance * @param s input string * @param length length of the string, or -1 if NUL-terminated * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return UNormalizationCheckResult * @stable ICU 4.4 */ U_STABLE UNormalizationCheckResult U_EXPORT2 unorm2_quickCheck(const UNormalizer2 *norm2, const UChar *s, int32_t length, UErrorCode *pErrorCode); /** * Returns the end of the normalized substring of the input string. * In other words, with end=spanQuickCheckYes(s, ec); * the substring UnicodeString(s, 0, end) * will pass the quick check with a "yes" result. * * The returned end index is usually one or more characters before the * "no" or "maybe" character: The end index is at a normalization boundary. * (See the class documentation for more about normalization boundaries.) * * When the goal is a normalized string and most input strings are expected * to be normalized already, then call this method, * and if it returns a prefix shorter than the input string, * copy that prefix and use normalizeSecondAndAppend() for the remainder. * @param norm2 UNormalizer2 instance * @param s input string * @param length length of the string, or -1 if NUL-terminated * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) 
* @return "yes" span end index * @stable ICU 4.4 */ U_STABLE int32_t U_EXPORT2 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2, const UChar *s, int32_t length, UErrorCode *pErrorCode); /** * Tests if the character always has a normalization boundary before it, * regardless of context. * For details see the Normalizer2 base class documentation. * @param norm2 UNormalizer2 instance * @param c character to test * @return TRUE if c has a normalization boundary before it * @stable ICU 4.4 */ U_STABLE UBool U_EXPORT2 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c); /** * Tests if the character always has a normalization boundary after it, * regardless of context. * For details see the Normalizer2 base class documentation. * @param norm2 UNormalizer2 instance * @param c character to test * @return TRUE if c has a normalization boundary after it * @stable ICU 4.4 */ U_STABLE UBool U_EXPORT2 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c); /** * Tests if the character is normalization-inert. * For details see the Normalizer2 base class documentation. * @param norm2 UNormalizer2 instance * @param c character to test * @return TRUE if c is normalization-inert * @stable ICU 4.4 */ U_STABLE UBool U_EXPORT2 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c); /** * Option bit for unorm_compare: * Both input strings are assumed to fulfill FCD conditions. * @stable ICU 2.2 */ #define UNORM_INPUT_IS_FCD 0x20000 /** * Option bit for unorm_compare: * Perform case-insensitive comparison. * @stable ICU 2.2 */ #define U_COMPARE_IGNORE_CASE 0x10000 #ifndef U_COMPARE_CODE_POINT_ORDER /* see also unistr.h and ustring.h */ /** * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc: * Compare strings in code point order instead of code unit order. * @stable ICU 2.2 */ #define U_COMPARE_CODE_POINT_ORDER 0x8000 #endif /** * Compares two strings for canonical equivalence. 
* Further options include case-insensitive comparison and * code point order (as opposed to code unit order). * * Canonical equivalence between two strings is defined as their normalized * forms (NFD or NFC) being identical. * This function compares strings incrementally instead of normalizing * (and optionally case-folding) both strings entirely, * improving performance significantly. * * Bulk normalization is only necessary if the strings do not fulfill the FCD * conditions. Only in this case, and only if the strings are relatively long, * is memory allocated temporarily. * For FCD strings and short non-FCD strings there is no memory allocation. * * Semantically, this is equivalent to * strcmp[CodePointOrder](NFD(foldCase(NFD(s1))), NFD(foldCase(NFD(s2)))) * where code point order and foldCase are all optional. * * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match * the case folding must be performed first, then the normalization. * * @param s1 First source string. * @param length1 Length of first source string, or -1 if NUL-terminated. * * @param s2 Second source string. * @param length2 Length of second source string, or -1 if NUL-terminated. * * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Case-sensitive comparison in code unit order, and the input strings * are quick-checked for FCD. * * - UNORM_INPUT_IS_FCD * Set if the caller knows that both s1 and s2 fulfill the FCD conditions. * If not set, the function will quickCheck for FCD * and normalize if necessary. * * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). * * - U_COMPARE_IGNORE_CASE * Set to compare strings case-insensitively using case folding, * instead of case-sensitively. * If set, then the following case folding options are used. 
* * - Options as used with case-insensitive comparisons, currently: * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * (see u_strCaseCompare for details) * * - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT * * @param pErrorCode ICU error code in/out parameter. * Must fulfill U_SUCCESS before the function call. * @return <0 or 0 or >0 as usual for string comparisons * * @see unorm_normalize * @see UNORM_FCD * @see u_strCompare * @see u_strCaseCompare * * @stable ICU 2.2 */ U_STABLE int32_t U_EXPORT2 unorm_compare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode); #endif /* !UCONFIG_NO_NORMALIZATION */ #endif /* __UNORM2_H__ */ // ucnvsel.h /* ******************************************************************************* * * Copyright (C) 2008-2011, International Business Machines * Corporation, Google and others. All Rights Reserved. * ******************************************************************************* */ /* * Author : eldawy@google.com (Mohamed Eldawy) * ucnvsel.h * * Purpose: To generate a list of encodings capable of handling * a given Unicode text * * Started 09-April-2008 */ #ifndef __ICU_UCNV_SEL_H__ #define __ICU_UCNV_SEL_H__ #if !UCONFIG_NO_CONVERSION /** * \file * * A converter selector is built with a set of encoding/charset names * and given an input string returns the set of names of the * corresponding converters which can convert the string. * * A converter selector can be serialized into a buffer and reopened * from the serialized form. */ /** * @{ * The selector data structure */ struct UConverterSelector; typedef struct UConverterSelector UConverterSelector; /** @} */ /** * Open a selector. * If converterListSize is 0, build for all available converters. * If excludedCodePoints is NULL, don't exclude any code points. * * @param converterList a pointer to encoding names needed to be involved. * Can be NULL if converterListSize==0. 
* The list and the names will be cloned, and the caller * retains ownership of the original. * @param converterListSize number of encodings in above list. * If 0, builds a selector for all available converters. * @param excludedCodePoints a set of code points to be excluded from consideration. * That is, excluded code points in a string do not change * the selection result. (They might be handled by a callback.) * Use NULL to exclude nothing. * @param whichSet what converter set to use? Use this to determine whether * to consider only roundtrip mappings or also fallbacks. * @param status an in/out ICU UErrorCode * @return the new selector * * @stable ICU 4.2 */ U_STABLE UConverterSelector* U_EXPORT2 ucnvsel_open(const char* const* converterList, int32_t converterListSize, const USet* excludedCodePoints, const UConverterUnicodeSet whichSet, UErrorCode* status); /** * Closes a selector. * If any Enumerations were returned by ucnvsel_select*, they become invalid. * They can be closed before or after calling ucnvsel_close, * but should never be used after the selector is closed. * * @see ucnvsel_selectForString * @see ucnvsel_selectForUTF8 * * @param sel selector to close * * @stable ICU 4.2 */ U_STABLE void U_EXPORT2 ucnvsel_close(UConverterSelector *sel); /** * Open a selector from its serialized form. * The buffer must remain valid and unchanged for the lifetime of the selector. * This is much faster than creating a selector from scratch. * Using a serialized form from a different machine (endianness/charset) is supported.
* * @param buffer pointer to the serialized form of a converter selector; * must be 32-bit-aligned * @param length the capacity of this buffer (can be equal to or larger than * the actual data length) * @param status an in/out ICU UErrorCode * @return the new selector * * @stable ICU 4.2 */ U_STABLE UConverterSelector* U_EXPORT2 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status); /** * Serialize a selector into a linear buffer. * The serialized form is portable to different machines. * * @param sel selector to serialize * @param buffer pointer to 32-bit-aligned memory to be filled with the * serialized form of this converter selector * @param bufferCapacity the capacity of this buffer * @param status an in/out ICU UErrorCode * @return the required buffer capacity to hold the serialized data (even if the call fails * with a U_BUFFER_OVERFLOW_ERROR, it will return the required capacity) * * @stable ICU 4.2 */ U_STABLE int32_t U_EXPORT2 ucnvsel_serialize(const UConverterSelector* sel, void* buffer, int32_t bufferCapacity, UErrorCode* status); /** * Select converters that can map all characters in a UTF-16 string, * ignoring the excluded code points. * * @param sel a selector * @param s UTF-16 string * @param length length of the string, or -1 if NUL-terminated * @param status an in/out ICU UErrorCode * @return an enumeration containing encoding names. * The returned encoding names and their order will be the same as * supplied when building the selector. * * @stable ICU 4.2 */ U_STABLE UEnumeration * U_EXPORT2 ucnvsel_selectForString(const UConverterSelector* sel, const UChar *s, int32_t length, UErrorCode *status); /** * Select converters that can map all characters in a UTF-8 string, * ignoring the excluded code points. * * @param sel a selector * @param s UTF-8 string * @param length length of the string, or -1 if NUL-terminated * @param status an in/out ICU UErrorCode * @return an enumeration containing encoding names.
* The returned encoding names and their order will be the same as * supplied when building the selector. * * @stable ICU 4.2 */ U_STABLE UEnumeration * U_EXPORT2 ucnvsel_selectForUTF8(const UConverterSelector* sel, const char *s, int32_t length, UErrorCode *status); #endif /* !UCONFIG_NO_CONVERSION */ #endif /* __ICU_UCNV_SEL_H__ */ // ucat.h /* ********************************************************************** * Copyright (c) 2003-2004, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Author: Alan Liu * Created: March 19 2003 * Since: ICU 2.6 ********************************************************************** */ #ifndef UCAT_H #define UCAT_H /** * \file * \brief C API: Message Catalog Wrappers * * This C API provides look-alike functions that deliberately resemble * the POSIX catopen, catclose, and catgets functions. The underlying * implementation is in terms of ICU resource bundles, rather than * POSIX message catalogs. * * The ICU resource bundles obey standard ICU inheritance policies. * To facilitate this, sets and messages are flattened into one tier. * This is done by creating resource bundle keys of the form * <set_num>%<msg_num> where set_num is the set number and msg_num is * the message number, formatted as decimal strings. * * Example: Consider a message catalog containing two sets: * * Set 1: Message 4 = "Good morning." * Message 5 = "Good afternoon." * Message 7 = "Good evening." * Message 8 = "Good night." * Set 4: Message 14 = "Please " * Message 19 = "Thank you." * Message 20 = "Sincerely," * * The ICU resource bundle source file, assuming it is named * "greet.txt", would look like this: * * greet * { * 1%4 { "Good morning." } * 1%5 { "Good afternoon." } * 1%7 { "Good evening." } * 1%8 { "Good night." } * * 4%14 { "Please " } * 4%19 { "Thank you."
} * 4%20 { "Sincerely," } * } * * The catgets function is commonly used in combination with functions * like printf and strftime. ICU components like message format can * be used instead, although they use a different format syntax. * There is an ICU package, icuio, that provides some of * the POSIX-style formatting API. */ U_CDECL_BEGIN /** * An ICU message catalog descriptor, analogous to nl_catd. * * @stable ICU 2.6 */ typedef UResourceBundle* u_nl_catd; /** * Open and return an ICU message catalog descriptor. The descriptor * may be passed to u_catgets() to retrieve localized strings. * * @param name string containing the full path pointing to the * directory where the resources reside followed by the package name * e.g. "/usr/resource/my_app/resources/guimessages" on a Unix system. * If NULL, ICU default data files will be used. * * Unlike POSIX, environment variables are not interpolated within the * name. * * @param locale the locale for which we want to open the resource. If * NULL, the default ICU locale will be used (see uloc_getDefault). If * strlen(locale) == 0, the root locale will be used. * * @param ec input/output error code. Upon output, * U_USING_FALLBACK_WARNING indicates that a fallback locale was * used. For example, 'de_CH' was requested, but nothing was found * there, so 'de' was used. U_USING_DEFAULT_WARNING indicates that the * default locale data or root locale data was used; neither the * requested locale nor any of its fallback locales were found. * * @return a message catalog descriptor that may be passed to * u_catgets(). If the ec parameter indicates success, then the caller * is responsible for calling u_catclose() to close the message * catalog. If the ec parameter indicates failure, then NULL will be * returned. * * @stable ICU 2.6 */ U_STABLE u_nl_catd U_EXPORT2 u_catopen(const char* name, const char* locale, UErrorCode* ec); /** * Close an ICU message catalog, given its descriptor. 
* * @param catd a message catalog descriptor to be closed. May be NULL, * in which case no action is taken. * * @stable ICU 2.6 */ U_STABLE void U_EXPORT2 u_catclose(u_nl_catd catd); /** * Retrieve a localized string from an ICU message catalog. * * @param catd a message catalog descriptor returned by u_catopen. * * @param set_num the message catalog set number. Sets need not be * numbered consecutively. * * @param msg_num the message catalog message number within the * set. Messages need not be numbered consecutively. * * @param s the default string. This is returned if the string * specified by the set_num and msg_num is not found. It must be * zero-terminated. * * @param len fill-in parameter to receive the length of the result. * May be NULL, in which case it is ignored. * * @param ec input/output error code. May be U_USING_FALLBACK_WARNING * or U_USING_DEFAULT_WARNING. U_MISSING_RESOURCE_ERROR indicates that * the set_num/msg_num tuple does not specify a valid message string * in this catalog. * * @return a pointer to a zero-terminated UChar array which lives in * an internal buffer area, typically a memory mapped/DLL file. The * caller must NOT delete this pointer. If the call is unsuccessful * for any reason, then s is returned. This includes the situation in * which ec indicates a failing error code upon entry to this * function. * * @stable ICU 2.6 */ U_STABLE const UChar* U_EXPORT2 u_catgets(u_nl_catd catd, int32_t set_num, int32_t msg_num, const UChar* s, int32_t* len, UErrorCode* ec); U_CDECL_END #endif /*UCAT_H*/ /*eof*/ // ubidi.h /* ****************************************************************************** * * Copyright (C) 1999-2013, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: ubidi.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 1999jul27 * created by: Markus W. 
Scherer, updated by Matitiahu Allouche */ #ifndef UBIDI_H #define UBIDI_H /** *\file * \brief C API: Bidi algorithm * *

Bidi algorithm for ICU

* * This is an implementation of the Unicode Bidirectional Algorithm. * The algorithm is defined in the * Unicode Standard Annex #9.

* * Note: Libraries that perform a bidirectional algorithm and * reorder strings accordingly are sometimes called "Storage Layout Engines". * ICU's Bidi and shaping (u_shapeArabic()) APIs can be used at the core of such * "Storage Layout Engines". * *

General remarks about the API:

* * In functions with an error code parameter, * the pErrorCode pointer must be valid * and the value that it points to must not indicate a failure before * the function call. Otherwise, the function returns immediately. * After the function call, the value indicates success or failure.

* * The "limit" of a sequence of characters is the position just after their * last character, i.e., one more than that position.

* * Some of the API functions provide access to "runs". * Such a "run" is defined as a sequence of characters * that are at the same embedding level * after performing the Bidi algorithm.

* * @author Markus W. Scherer * @version 1.0 * * *

Sample code for the ICU Bidi API

* *
Rendering a paragraph with the ICU Bidi API
* * This is (hypothetical) sample code that illustrates * how the ICU Bidi API could be used to render a paragraph of text. * Rendering code depends highly on the graphics system, * therefore this sample code must make a lot of assumptions, * which may or may not match any existing graphics system's properties. * *

The basic assumptions are:

 * <ul>
 * <li>Rendering is done from left to right on a horizontal line.</li>
 * <li>A run of single-style, unidirectional text can be rendered at once.</li>
 * <li>Such a run of text is passed to the graphics system with
 *     characters (code units) in logical order.</li>
 * <li>The line-breaking algorithm is very complicated
 *     and Locale-dependent -
 *     and therefore its implementation is omitted from this sample code.</li>
 * </ul>
 *
 * \code
 *#include "unicode/ubidi.h"
 *
 *typedef enum {
 *     styleNormal=0, styleSelected=1,
 *     styleBold=2, styleItalics=4,
 *     styleSuper=8, styleSub=16
 *} Style;
 *
 *typedef struct { int32_t limit; Style style; } StyleRun;
 *
 *int getTextWidth(const UChar *text, int32_t start, int32_t limit,
 *                  const StyleRun *styleRuns, int styleRunCount);
 *
 * // set *pLimit and *pStyleRunLimit for a line
 * // from text[start] and from styleRuns[styleRunStart]
 * // using ubidi_getLogicalRun(para, ...)
 *void getLineBreak(const UChar *text, int32_t start, int32_t *pLimit,
 *                  UBiDi *para,
 *                  const StyleRun *styleRuns, int styleRunStart, int *pStyleRunLimit,
 *                  int *pLineWidth);
 *
 * // render runs on a line sequentially, always from left to right
 *
 * // prepare rendering a new line
 * void startLine(UBiDiDirection textDirection, int lineWidth);
 *
 * // render a run of text and advance to the right by the run width
 * // the text[start..limit-1] is always in logical order
 * void renderRun(const UChar *text, int32_t start, int32_t limit,
 *               UBiDiDirection textDirection, Style style);
 *
 * // We could compute a cross-product
 * // from the style runs with the directional runs
 * // and then reorder it.
 * // Instead, here we iterate over each run type
 * // and render the intersections -
 * // with shortcuts in simple (and common) cases.
 * // renderParagraph() is the main function.
 *
 * // render a directional run with
 * // (possibly) multiple style runs intersecting with it
 * void renderDirectionalRun(const UChar *text,
 *                           int32_t start, int32_t limit,
 *                           UBiDiDirection direction,
 *                           const StyleRun *styleRuns, int styleRunCount) {
 *     int i;
 *
 *     // iterate over style runs
 *     if(direction==UBIDI_LTR) {
 *         int styleLimit;
 *
 *         for(i=0; i<styleRunCount; ++i) {
 *             styleLimit=styleRun[i].limit;
 *             if(start<styleLimit) {
 *                 if(styleLimit>limit) { styleLimit=limit; }
 *                 renderRun(text, start, styleLimit,
 *                           direction, styleRun[i].style);
 *                 if(styleLimit==limit) { break; }
 *                 start=styleLimit;
 *             }
 *         }
 *     } else {
 *         int styleStart;
 *
 *         for(i=styleRunCount-1; i>=0; --i) {
 *             if(i>0) {
 *                 styleStart=styleRun[i-1].limit;
 *             } else {
 *                 styleStart=0;
 *             }
 *             if(limit>=styleStart) {
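 *                 if(styleStart<start) { styleStart=start; }
 *                 renderRun(text, styleStart, limit,
 *                           direction, styleRun[i].style);
 *                 if(styleStart==start) { break; }
 *                 limit=styleStart;
 *             }
 *         }
 *     }
 * }
 *
 * // the line object represents text[start..limit-1]
 * void renderLine(UBiDi *line, const UChar *text,
 *                 int32_t start, int32_t limit,
 *                 const StyleRun *styleRuns, int styleRunCount) {
 *     UBiDiDirection direction=ubidi_getDirection(line);
 *     if(direction!=UBIDI_MIXED) {
 *         // unidirectional
 *         if(styleRunCount<=1) {
 *             renderRun(text, start, limit, direction, styleRuns[0].style);
 *         } else {
 *             renderDirectionalRun(text, start, limit,
 *                                  direction, styleRuns, styleRunCount);
 *         }
 *     } else {
 *         // mixed-directional
 *         int32_t count, i, length;
 *         UBiDiLevel level;
 *
 *         count=ubidi_countRuns(line, pErrorCode);
 *         if(U_SUCCESS(*pErrorCode)) {
 *             if(styleRunCount<=1) {
 *                 Style style=styleRuns[0].style;
 *
 *                 // iterate over directional runs
 *                 for(i=0; i<count; ++i) {
 *                     direction=ubidi_getVisualRun(line, i, &start, &length);
 *                     renderRun(text, start, start+length, direction, style);
 *                 }
 *             } else {
 *                 int32_t j;
 *
 *                 // iterate over both directional and style runs
 *                 for(i=0; i<count; ++i) {
 *                     direction=ubidi_getVisualRun(line, i, &start, &length);
 *                     renderDirectionalRun(text, start, start+length,
 *                                          direction, styleRuns, styleRunCount);
 *                 }
 *             }
 *         }
 *     }
 * }
 *
 *void renderParagraph(const UChar *text, int32_t length,
 *                     UBiDiDirection textDirection,
 *                      const StyleRun *styleRuns, int styleRunCount,
 *                      int lineWidth,
 *                      UErrorCode *pErrorCode) {
 *     UBiDi *para;
 *
 *     if(pErrorCode==NULL || U_FAILURE(*pErrorCode) || length<=0) {
 *         return;
 *     }
 *
 *     para=ubidi_openSized(length, 0, pErrorCode);
 *     if(para==NULL) { return; }
 *
 *     ubidi_setPara(para, text, length,
 *                   textDirection ? UBIDI_DEFAULT_RTL : UBIDI_DEFAULT_LTR,
 *                   NULL, pErrorCode);
 *     if(U_SUCCESS(*pErrorCode)) {
 *         UBiDiLevel paraLevel=1&ubidi_getParaLevel(para);
 *         StyleRun styleRun={ length, styleNormal };
 *         int width;
 *
 *         if(styleRuns==NULL || styleRunCount<=0) {
 *             styleRunCount=1;
 *             styleRuns=&styleRun;
 *         }
 *
 *         // assume styleRuns[styleRunCount-1].limit>=length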
 *
 *         width=getTextWidth(text, 0, length, styleRuns, styleRunCount);
 *         if(width<=lineWidth) {
 *             // everything fits onto one line
 *
 *            // prepare rendering a new line from either left or right
 *             startLine(paraLevel, width);
 *
 *             renderLine(para, text, 0, length,
 *                        styleRuns, styleRunCount);
 *         } else {
 *             UBiDi *line;
 *
 *             // we need to render several lines
 *             line=ubidi_openSized(length, 0, pErrorCode);
 *             if(line!=NULL) {
 *                 int32_t start=0, limit;
 *                 int styleRunStart=0, styleRunLimit;
 *
 *                 for(;;) {
 *                     limit=length;
 *                     styleRunLimit=styleRunCount;
 *                     getLineBreak(text, start, &limit, para,
 *                                  styleRuns, styleRunStart, &styleRunLimit,
 *                                 &width);
 *                     ubidi_setLine(para, start, limit, line, pErrorCode);
 *                     if(U_SUCCESS(*pErrorCode)) {
 *                         // prepare rendering a new line
 *                         // from either left or right
 *                         startLine(paraLevel, width);
 *
 *                         renderLine(line, text, start, limit,
 *                                    styleRuns+styleRunStart,
 *                                    styleRunLimit-styleRunStart);
 *                     }
 *                     if(limit==length) { break; }
 *                     start=limit;
 *                     styleRunStart=styleRunLimit-1;
 *                     if(start>=styleRuns[styleRunStart].limit) {
 *                         ++styleRunStart;
 *                     }
 *                 }
 *
 *                 ubidi_close(line);
 *             }
 *        }
 *    }
 *
 *     ubidi_close(para);
 *}
 *\endcode
 * 
*/ /*DOCXX_TAG*/ /*@{*/ /** * UBiDiLevel is the type of the level values in this * Bidi implementation. * It holds an embedding level and indicates the visual direction * by its bit 0 (even/odd value).

* * It can also hold non-level values for the * paraLevel and embeddingLevels * arguments of ubidi_setPara(); there: *

    *
  • bit 7 of an embeddingLevels[] * value indicates whether the using application is * specifying the level of a character to override whatever the * Bidi implementation would resolve it to.
  • *
  • paraLevel can be set to the * pseudo-level values UBIDI_DEFAULT_LTR * and UBIDI_DEFAULT_RTL.
  • *
* * @see ubidi_setPara * *

The related constants are not real, valid level values. * UBIDI_DEFAULT_XXX can be used to specify * a default for the paragraph level for * when the ubidi_setPara() function * shall determine it but there is no * strongly typed character in the input.

* * Note that the value for UBIDI_DEFAULT_LTR is even * and the one for UBIDI_DEFAULT_RTL is odd, * just like with normal LTR and RTL level values - * these special values are designed that way. Also, the implementation * assumes that UBIDI_MAX_EXPLICIT_LEVEL is odd. * * @see UBIDI_DEFAULT_LTR * @see UBIDI_DEFAULT_RTL * @see UBIDI_LEVEL_OVERRIDE * @see UBIDI_MAX_EXPLICIT_LEVEL * @stable ICU 2.0 */ typedef uint8_t UBiDiLevel; /** Paragraph level setting.

* * Constant indicating that the base direction depends on the first strong * directional character in the text according to the Unicode Bidirectional * Algorithm. If no strong directional character is present, * then set the paragraph level to 0 (left-to-right).

* * If this value is used in conjunction with reordering modes * UBIDI_REORDER_INVERSE_LIKE_DIRECT or * UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the text to reorder * is assumed to be visual LTR, and the text after reordering is required * to be the corresponding logical string with appropriate contextual * direction. The direction of the result string will be RTL if either * the rightmost or leftmost strong character of the source text is RTL * or Arabic Letter; the direction will be LTR otherwise.

* * If reordering option UBIDI_OPTION_INSERT_MARKS is set, an RLM may * be added at the beginning of the result string to ensure round trip * (that the result string, when reordered back to visual, will produce * the original source text). * @see UBIDI_REORDER_INVERSE_LIKE_DIRECT * @see UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL * @stable ICU 2.0 */ #define UBIDI_DEFAULT_LTR 0xfe /** Paragraph level setting.

* * Constant indicating that the base direction depends on the first strong * directional character in the text according to the Unicode Bidirectional * Algorithm. If no strong directional character is present, * then set the paragraph level to 1 (right-to-left).

* * If this value is used in conjunction with reordering modes * UBIDI_REORDER_INVERSE_LIKE_DIRECT or * UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the text to reorder * is assumed to be visual LTR, and the text after reordering is required * to be the corresponding logical string with appropriate contextual * direction. The direction of the result string will be RTL if either * the rightmost or leftmost strong character of the source text is RTL * or Arabic Letter, or if the text contains no strong character; * the direction will be LTR otherwise.

* * If reordering option UBIDI_OPTION_INSERT_MARKS is set, an RLM may * be added at the beginning of the result string to ensure round trip * (that the result string, when reordered back to visual, will produce * the original source text). * @see UBIDI_REORDER_INVERSE_LIKE_DIRECT * @see UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL * @stable ICU 2.0 */ #define UBIDI_DEFAULT_RTL 0xff /** * Maximum explicit embedding level. * (The maximum resolved level can be up to UBIDI_MAX_EXPLICIT_LEVEL+1). * @stable ICU 2.0 */ #define UBIDI_MAX_EXPLICIT_LEVEL 125 /** Bit flag for level input. * Overrides directional properties. * @stable ICU 2.0 */ #define UBIDI_LEVEL_OVERRIDE 0x80 /** * Special value which can be returned by the mapping functions when a logical * index has no corresponding visual index or vice-versa. This may happen * for the logical-to-visual mapping of a Bidi control when option * #UBIDI_OPTION_REMOVE_CONTROLS is specified. This can also happen * for the visual-to-logical mapping of a Bidi mark (LRM or RLM) inserted * by option #UBIDI_OPTION_INSERT_MARKS. * @see ubidi_getVisualIndex * @see ubidi_getVisualMap * @see ubidi_getLogicalIndex * @see ubidi_getLogicalMap * @stable ICU 3.6 */ #define UBIDI_MAP_NOWHERE (-1) /** * UBiDiDirection values indicate the text direction. * @stable ICU 2.0 */ enum UBiDiDirection { /** Left-to-right text. This is a 0 value. *

    *
  • As return value for ubidi_getDirection(), it means * that the source string contains no right-to-left characters, or * that the source string is empty and the paragraph level is even. *
  • As return value for ubidi_getBaseDirection(), it * means that the first strong character of the source string has * a left-to-right direction. *
* @stable ICU 2.0 */ UBIDI_LTR, /** Right-to-left text. This is a 1 value. *
    *
  • As return value for ubidi_getDirection(), it means * that the source string contains no left-to-right characters, or * that the source string is empty and the paragraph level is odd. *
  • As return value for ubidi_getBaseDirection(), it * means that the first strong character of the source string has * a right-to-left direction. *
* @stable ICU 2.0 */ UBIDI_RTL, /** Mixed-directional text. This is a 2 value. *

As return value for ubidi_getDirection(), it means * that the source string contains both left-to-right and * right-to-left characters. * @stable ICU 2.0 */ UBIDI_MIXED, /** No strongly directional text. This is a 3 value. *

As return value for ubidi_getBaseDirection(), it means * that the source string is missing or empty, or contains neither left-to-right * nor right-to-left characters. * @stable ICU 4.6 */ UBIDI_NEUTRAL }; /** @stable ICU 2.0 */ typedef enum UBiDiDirection UBiDiDirection; /** * Forward declaration of the UBiDi structure for the declaration of * the API functions. Its fields are implementation-specific.

* This structure holds information about a paragraph (or multiple paragraphs) * of text with Bidi-algorithm-related details, or about one line of * such a paragraph.

* Reordering can be done on a line, or on one or more paragraphs which are * then interpreted each as one single line. * @stable ICU 2.0 */ struct UBiDi; /** @stable ICU 2.0 */ typedef struct UBiDi UBiDi; /** * Allocate a UBiDi structure. * Such an object is initially empty. It is assigned * the Bidi properties of a piece of text containing one or more paragraphs * by ubidi_setPara() * or the Bidi properties of a line within a paragraph by * ubidi_setLine().

* This object can be reused for as long as it is not deallocated * by calling ubidi_close().

* ubidi_setPara() and ubidi_setLine() will allocate * additional memory for internal structures as necessary. * * @return An empty UBiDi object. * @stable ICU 2.0 */ U_STABLE UBiDi * U_EXPORT2 ubidi_open(void); /** * Allocate a UBiDi structure with preallocated memory * for internal structures. * This function provides a UBiDi object like ubidi_open() * with no arguments, but it also preallocates memory for internal structures * according to the sizings supplied by the caller.

* Subsequent functions will not allocate any more memory, and are thus * guaranteed not to fail because of lack of memory.

* The preallocation can be limited to some of the internal memory * by setting some values to 0 here. That means that if, e.g., * maxRunCount cannot be reasonably predetermined and should not * be set to maxLength (the only failproof value) to avoid * wasting memory, then maxRunCount could be set to 0 here * and the internal structures that are associated with it will be allocated * on demand, just like with ubidi_open(). * * @param maxLength is the maximum text or line length that internal memory * will be preallocated for. An attempt to associate this object with a * longer text will fail, unless this value is 0, which leaves the allocation * up to the implementation. * * @param maxRunCount is the maximum anticipated number of same-level runs * that internal memory will be preallocated for. An attempt to access * visual runs on an object that was not preallocated for as many runs * as the text was actually resolved to will fail, * unless this value is 0, which leaves the allocation up to the implementation.

* The number of runs depends on the actual text and may be anywhere between * 1 and maxLength. It is typically small. * * @param pErrorCode must be a valid pointer to an error code value. * * @return An empty UBiDi object with preallocated memory. * @stable ICU 2.0 */ U_STABLE UBiDi * U_EXPORT2 ubidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode *pErrorCode); /** * ubidi_close() must be called to free the memory * associated with a UBiDi object.

* * Important: * A parent UBiDi object must not be destroyed or reused if * it still has children. * If a UBiDi object has become the child * of another one (its parent) by calling * ubidi_setLine(), then the child object must * be destroyed (closed) or reused (by calling * ubidi_setPara() or ubidi_setLine()) * before the parent object. * * @param pBiDi is a UBiDi object. * * @see ubidi_setPara * @see ubidi_setLine * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_close(UBiDi *pBiDi); /** * Modify the operation of the Bidi algorithm such that it * approximates an "inverse Bidi" algorithm. This function * must be called before ubidi_setPara(). * *

The normal operation of the Bidi algorithm as described * in the Unicode Standard Annex #9 is to take text stored in logical * (keyboard, typing) order and to determine the reordering of it for visual * rendering. * Some legacy systems store text in visual order, and for operations * with standard, Unicode-based algorithms, the text needs to be transformed * to logical order. This is effectively the inverse algorithm of the * described Bidi algorithm. Note that there is no standard algorithm for * this "inverse Bidi" and that the current implementation provides only an * approximation of "inverse Bidi".

* *

With isInverse set to TRUE, * this function changes the behavior of some of the subsequent functions * in a way that they can be used for the inverse Bidi algorithm. * Specifically, runs of text with numeric characters will be treated in a * special way and may need to be surrounded with LRM characters when they are * written in reordered sequence.

* *

Output runs should be retrieved using ubidi_getVisualRun(). * Since the actual input for "inverse Bidi" is visually ordered text and * ubidi_getVisualRun() gets the reordered runs, these are actually * the runs of the logically ordered output.

* *

Calling this function with argument isInverse set to * TRUE is equivalent to calling * ubidi_setReorderingMode with argument * reorderingMode * set to #UBIDI_REORDER_INVERSE_NUMBERS_AS_L.
* Calling this function with argument isInverse set to * FALSE is equivalent to calling * ubidi_setReorderingMode with argument * reorderingMode * set to #UBIDI_REORDER_DEFAULT. * * @param pBiDi is a UBiDi object. * * @param isInverse specifies "forward" or "inverse" Bidi operation. * * @see ubidi_setPara * @see ubidi_writeReordered * @see ubidi_setReorderingMode * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_setInverse(UBiDi *pBiDi, UBool isInverse); /** * Is this Bidi object set to perform the inverse Bidi algorithm? *

Note: calling this function after setting the reordering mode with * ubidi_setReorderingMode will return TRUE if the * reordering mode was set to #UBIDI_REORDER_INVERSE_NUMBERS_AS_L, * FALSE for all other values.

* * @param pBiDi is a UBiDi object. * @return TRUE if the Bidi object is set to perform the inverse Bidi algorithm * by handling numbers as L. * * @see ubidi_setInverse * @see ubidi_setReorderingMode * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 ubidi_isInverse(UBiDi *pBiDi); /** * Specify whether block separators must be allocated level zero, * so that successive paragraphs will progress from left to right. * This function must be called before ubidi_setPara(). * Paragraph separators (B) may appear in the text. Setting them to level zero * means that all paragraph separators (including one possibly appearing * in the last text position) are kept in the reordered text after the text * that they follow in the source text. * When this feature is not enabled, a paragraph separator at the last * position of the text before reordering will go to the first position * of the reordered text when the paragraph level is odd. * * @param pBiDi is a UBiDi object. * * @param orderParagraphsLTR specifies whether paragraph separators (B) must * receive level 0, so that successive paragraphs progress from left to right. * * @see ubidi_setPara * @stable ICU 3.4 */ U_STABLE void U_EXPORT2 ubidi_orderParagraphsLTR(UBiDi *pBiDi, UBool orderParagraphsLTR); /** * Is this Bidi object set to allocate level 0 to block separators so that * successive paragraphs progress from left to right? * * @param pBiDi is a UBiDi object. * @return TRUE if the Bidi object is set to allocate level 0 to block * separators. * * @see ubidi_orderParagraphsLTR * @stable ICU 3.4 */ U_STABLE UBool U_EXPORT2 ubidi_isOrderParagraphsLTR(UBiDi *pBiDi); /** * UBiDiReorderingMode values indicate which variant of the Bidi * algorithm to use. * * @see ubidi_setReorderingMode * @stable ICU 3.6 */ typedef enum UBiDiReorderingMode { /** Regular Logical to Visual Bidi algorithm according to Unicode. * This is a 0 value. 
* @stable ICU 3.6 */ UBIDI_REORDER_DEFAULT = 0, /** Logical to Visual algorithm which handles numbers in a way which * mimics the behavior of Windows XP. * @stable ICU 3.6 */ UBIDI_REORDER_NUMBERS_SPECIAL, /** Logical to Visual algorithm grouping numbers with adjacent R characters * (reversible algorithm). * @stable ICU 3.6 */ UBIDI_REORDER_GROUP_NUMBERS_WITH_R, /** Reorder runs only to transform a Logical LTR string to the Logical RTL * string with the same display, or vice-versa.
* If this mode is set together with option * #UBIDI_OPTION_INSERT_MARKS, some Bidi controls in the source * text may be removed and other controls may be added to produce the * minimum combination which has the required display. * @stable ICU 3.6 */ UBIDI_REORDER_RUNS_ONLY, /** Visual to Logical algorithm which handles numbers like L * (same algorithm as selected by ubidi_setInverse(TRUE)). * @see ubidi_setInverse * @stable ICU 3.6 */ UBIDI_REORDER_INVERSE_NUMBERS_AS_L, /** Visual to Logical algorithm equivalent to the regular Logical to Visual * algorithm. * @stable ICU 3.6 */ UBIDI_REORDER_INVERSE_LIKE_DIRECT, /** Inverse Bidi (Visual to Logical) algorithm for the * UBIDI_REORDER_NUMBERS_SPECIAL Bidi algorithm. * @stable ICU 3.6 */ UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL, /** Number of values for reordering mode. * @stable ICU 3.6 */ UBIDI_REORDER_COUNT } UBiDiReorderingMode; /** * Modify the operation of the Bidi algorithm such that it implements some * variant to the basic Bidi algorithm or approximates an "inverse Bidi" * algorithm, depending on different values of the "reordering mode". * This function must be called before ubidi_setPara(), and stays * in effect until called again with a different argument. * *

The normal operation of the Bidi algorithm as described * in the Unicode Standard Annex #9 is to take text stored in logical * (keyboard, typing) order and to determine how to reorder it for visual * rendering.

* *

With the reordering mode set to a value other than * #UBIDI_REORDER_DEFAULT, this function changes the behavior of * some of the subsequent functions in a way such that they implement an * inverse Bidi algorithm or some other algorithm variants.

* *

Some legacy systems store text in visual order, and for operations * with standard, Unicode-based algorithms, the text needs to be transformed * into logical order. This is effectively the inverse algorithm of the * described Bidi algorithm. Note that there is no standard algorithm for * this "inverse Bidi", so a number of variants are implemented here.

* *

In other cases, it may be desirable to emulate some variant of the * Logical to Visual algorithm (e.g. one used in MS Windows), or perform a * Logical to Logical transformation.

* *
    *
  • When the reordering mode is set to #UBIDI_REORDER_DEFAULT, * the standard Bidi Logical to Visual algorithm is applied.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_NUMBERS_SPECIAL, * the algorithm used to perform Bidi transformations when calling * ubidi_setPara should approximate the algorithm used in * Microsoft Windows XP rather than strictly conform to the Unicode Bidi * algorithm. *
    * The differences between the basic algorithm and the algorithm addressed * by this option are as follows: *
      *
    • Within text at an even embedding level, the sequence "123AB" * (where AB represent R or AL letters) is transformed to "123BA" by the * Unicode algorithm and to "BA123" by the Windows algorithm.
    • *
    • Arabic-Indic numbers (AN) are handled by the Windows algorithm just * like regular numbers (EN).
    • *
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_GROUP_NUMBERS_WITH_R, * numbers located between LTR text and RTL text are associated with the RTL * text. For instance, an LTR paragraph with content "abc 123 DEF" (where * upper case letters represent RTL characters) will be transformed to * "abc FED 123" (and not "abc 123 FED"), "DEF 123 abc" will be transformed * to "123 FED abc" and "123 FED abc" will be transformed to "DEF 123 abc". * This makes the algorithm reversible and makes it useful when round trip * (from visual to logical and back to visual) must be achieved without * adding LRM characters. However, this is a variation from the standard * Unicode Bidi algorithm.
    * The source text should not contain Bidi control characters other than LRM * or RLM.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_RUNS_ONLY, * a "Logical to Logical" transformation must be performed: *
      *
    • If the default text level of the source text (argument paraLevel * in ubidi_setPara) is even, the source text will be handled as * LTR logical text and will be transformed to the RTL logical text which has * the same LTR visual display.
    • *
    • If the default level of the source text is odd, the source text * will be handled as RTL logical text and will be transformed to the * LTR logical text which has the same LTR visual display.
    • *
    * This mode may be needed when logical text which is basically Arabic or * Hebrew, with possible included numbers or phrases in English, has to be * displayed as if it had an even embedding level (this can happen if the * displaying application treats all text as if it was basically LTR). *
    * This mode may also be needed in the reverse case, when logical text which is * basically English, with possible included phrases in Arabic or Hebrew, has to * be displayed as if it had an odd embedding level. *
    * Both cases could be handled by adding LRE or RLE at the head of the text, * if the display subsystem supports these formatting controls. If it does not, * the problem may be handled by transforming the source text in this mode * before displaying it, so that it will be displayed properly.
    * The source text should not contain Bidi control characters other than LRM * or RLM.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L, an "inverse Bidi" algorithm * is applied. * Runs of text with numeric characters will be treated like LTR letters and * may need to be surrounded with LRM characters when they are written in * reordered sequence (the option #UBIDI_INSERT_LRM_FOR_NUMERIC can * be used with function ubidi_writeReordered to this end). This * mode is equivalent to calling ubidi_setInverse() with * argument isInverse set to TRUE.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_INVERSE_LIKE_DIRECT, the "direct" Logical to Visual * Bidi algorithm is used as an approximation of an "inverse Bidi" algorithm. * This mode is similar to mode #UBIDI_REORDER_INVERSE_NUMBERS_AS_L * but is closer to the regular Bidi algorithm. *
    * For example, an LTR paragraph with the content "FED 123 456 CBA" (where * upper case represents RTL characters) will be transformed to * "ABC 456 123 DEF", as opposed to "DEF 123 456 ABC" * with mode UBIDI_REORDER_INVERSE_NUMBERS_AS_L.
    * When used in conjunction with option * #UBIDI_OPTION_INSERT_MARKS, this mode generally * adds Bidi marks to the output significantly more sparingly than mode * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L with option * #UBIDI_INSERT_LRM_FOR_NUMERIC in calls to * ubidi_writeReordered.
  • * *
  • When the reordering mode is set to * #UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the Logical to Visual * Bidi algorithm used in Windows XP is used as an approximation of an "inverse Bidi" algorithm. *
    * For example, an LTR paragraph with the content "abc FED123" (where * upper case represents RTL characters) will be transformed to "abc 123DEF."
  • *
* *

In all the reordering modes specifying an "inverse Bidi" algorithm * (i.e. those with a name starting with UBIDI_REORDER_INVERSE), * output runs should be retrieved using * ubidi_getVisualRun(), and the output text with * ubidi_writeReordered(). The caller should keep in mind that in * "inverse Bidi" modes the input is actually visually ordered text and * reordered output returned by ubidi_getVisualRun() or * ubidi_writeReordered() are actually runs or character string * of logically ordered output.
* For all the "inverse Bidi" modes, the source text should not contain * Bidi control characters other than LRM or RLM.

* *

Note that option #UBIDI_OUTPUT_REVERSE of * ubidi_writeReordered has no useful meaning and should not be * used in conjunction with any value of the reordering mode specifying * "inverse Bidi" or with value UBIDI_REORDER_RUNS_ONLY. * * @param pBiDi is a UBiDi object. * @param reorderingMode specifies the required variant of the Bidi algorithm. * * @see UBiDiReorderingMode * @see ubidi_setInverse * @see ubidi_setPara * @see ubidi_writeReordered * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ubidi_setReorderingMode(UBiDi *pBiDi, UBiDiReorderingMode reorderingMode); /** * What is the requested reordering mode for a given Bidi object? * * @param pBiDi is a UBiDi object. * @return the current reordering mode of the Bidi object * @see ubidi_setReorderingMode * @stable ICU 3.6 */ U_STABLE UBiDiReorderingMode U_EXPORT2 ubidi_getReorderingMode(UBiDi *pBiDi); /** * UBiDiReorderingOption values indicate which options are * specified to affect the Bidi algorithm. * * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ typedef enum UBiDiReorderingOption { /** * option value for ubidi_setReorderingOptions: * disable all the options which can be set with this function * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ UBIDI_OPTION_DEFAULT = 0, /** * option bit for ubidi_setReorderingOptions: * insert Bidi marks (LRM or RLM) when needed to ensure correct result of * a reordering to a Logical order * *

This option must be set or reset before calling * ubidi_setPara.

* *

This option is significant only with reordering modes which generate * a result with Logical order, specifically:

*
    *
  • #UBIDI_REORDER_RUNS_ONLY
  • *
  • #UBIDI_REORDER_INVERSE_NUMBERS_AS_L
  • *
  • #UBIDI_REORDER_INVERSE_LIKE_DIRECT
  • *
  • #UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL
  • *
* *

If this option is set in conjunction with reordering mode * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L or with calling * ubidi_setInverse(TRUE), it implies * option #UBIDI_INSERT_LRM_FOR_NUMERIC * in calls to function ubidi_writeReordered().

* *

For other reordering modes, a minimum number of LRM or RLM characters * will be added to the source text after reordering it so as to ensure * round trip, i.e. when applying the inverse reordering mode on the * resulting logical text with removal of Bidi marks * (option #UBIDI_OPTION_REMOVE_CONTROLS set before calling * ubidi_setPara() or option #UBIDI_REMOVE_BIDI_CONTROLS * in ubidi_writeReordered), the result will be identical to the * source text in the first transformation. * *

This option will be ignored if specified together with option * #UBIDI_OPTION_REMOVE_CONTROLS. It inhibits option * UBIDI_REMOVE_BIDI_CONTROLS in calls to function * ubidi_writeReordered() and it implies option * #UBIDI_INSERT_LRM_FOR_NUMERIC in calls to function * ubidi_writeReordered() if the reordering mode is * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L.

* * @see ubidi_setReorderingMode * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ UBIDI_OPTION_INSERT_MARKS = 1, /** * option bit for ubidi_setReorderingOptions: * remove Bidi control characters * *

This option must be set or reset before calling * ubidi_setPara.

* *

This option nullifies option #UBIDI_OPTION_INSERT_MARKS. * It inhibits option #UBIDI_INSERT_LRM_FOR_NUMERIC in calls * to function ubidi_writeReordered() and it implies option * #UBIDI_REMOVE_BIDI_CONTROLS in calls to that function.

* * @see ubidi_setReorderingMode * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ UBIDI_OPTION_REMOVE_CONTROLS = 2, /** * option bit for ubidi_setReorderingOptions: * process the output as part of a stream to be continued * *

This option must be set or reset before calling * ubidi_setPara.

* *

This option specifies that the caller is interested in processing a large * text object in parts. * The results of the successive calls are expected to be concatenated by the * caller. Only the call for the last part will have this option bit off.

* *

When this option bit is on, ubidi_setPara() may process * less than the full source text in order to truncate the text at a meaningful * boundary. The caller should call ubidi_getProcessedLength() * immediately after calling ubidi_setPara() in order to * determine how much of the source text has been processed. * Source text beyond that length should be resubmitted in following calls to * ubidi_setPara. The processed length may be less than * the length of the source text if a character preceding the last character of * the source text constitutes a reasonable boundary (like a block separator) * for text to be continued.
* If the last character of the source text constitutes a reasonable * boundary, the whole text will be processed at once.
* If nowhere in the source text there exists * such a reasonable boundary, the processed length will be zero.
* The caller should check for such an occurrence and do one of the following: *

  • submit a larger amount of text with a better chance to include * a reasonable boundary.
  • *
  • resubmit the same text after turning off option * UBIDI_OPTION_STREAMING.
* In all cases, this option should be turned off before processing the last * part of the text.

* *

When the UBIDI_OPTION_STREAMING option is used, * it is recommended to call ubidi_orderParagraphsLTR() with * argument orderParagraphsLTR set to TRUE before * calling ubidi_setPara so that later paragraphs may be * concatenated to previous paragraphs on the right.

* * @see ubidi_setReorderingMode * @see ubidi_setReorderingOptions * @see ubidi_getProcessedLength * @see ubidi_orderParagraphsLTR * @stable ICU 3.6 */ UBIDI_OPTION_STREAMING = 4 } UBiDiReorderingOption; /** * Specify which of the reordering options * should be applied during Bidi transformations. * * @param pBiDi is a UBiDi object. * @param reorderingOptions is a combination of zero or more of the following * options: * #UBIDI_OPTION_DEFAULT, #UBIDI_OPTION_INSERT_MARKS, * #UBIDI_OPTION_REMOVE_CONTROLS, #UBIDI_OPTION_STREAMING. * * @see ubidi_getReorderingOptions * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ubidi_setReorderingOptions(UBiDi *pBiDi, uint32_t reorderingOptions); /** * What are the reordering options applied to a given Bidi object? * * @param pBiDi is a UBiDi object. * @return the current reordering options of the Bidi object * @see ubidi_setReorderingOptions * @stable ICU 3.6 */ U_STABLE uint32_t U_EXPORT2 ubidi_getReorderingOptions(UBiDi *pBiDi); /** * Set the context before a call to ubidi_setPara().

* * ubidi_setPara() computes the left-right directionality for a given piece * of text which is supplied as one of its arguments. Sometimes this piece * of text (the "main text") should be considered in context, because text * appearing before ("prologue") and/or after ("epilogue") the main text * may affect the result of this computation.

* * This function specifies the prologue and/or the epilogue for the next * call to ubidi_setPara(). The characters specified as prologue and * epilogue should not be modified by the calling program until the call * to ubidi_setPara() has returned. If successive calls to ubidi_setPara() * all need specification of a context, ubidi_setContext() must be called * before each call to ubidi_setPara(). In other words, a context is not * "remembered" after the following successful call to ubidi_setPara().

* * If a call to ubidi_setPara() specifies UBIDI_DEFAULT_LTR or * UBIDI_DEFAULT_RTL as paraLevel and is preceded by a call to * ubidi_setContext() which specifies a prologue, the paragraph level will * be computed taking in consideration the text in the prologue.

* * When ubidi_setPara() is called without a previous call to * ubidi_setContext, the main text is handled as if preceded and followed * by strong directional characters at the current paragraph level. * Calling ubidi_setContext() with specification of a prologue will change * this behavior by handling the main text as if preceded by the last * strong character appearing in the prologue, if any. * Calling ubidi_setContext() with specification of an epilogue will change * the behavior of ubidi_setPara() by handling the main text as if followed * by the first strong character or digit appearing in the epilogue, if any.

* * Note 1: if ubidi_setContext is called repeatedly without * calling ubidi_setPara, the earlier calls have no effect, * only the last call will be remembered for the next call to * ubidi_setPara.

* * Note 2: calling ubidi_setContext(pBiDi, NULL, 0, NULL, 0, &errorCode) * cancels any previous setting of non-empty prologue or epilogue. * The next call to ubidi_setPara() will process no * prologue or epilogue.

* * Note 3: users must be aware that even after setting the context * before a call to ubidi_setPara() to perform e.g. a logical to visual * transformation, the resulting string may not be identical to what it * would have been if all the text, including prologue and epilogue, had * been processed together.
* Example (upper case letters represent RTL characters):
*   prologue = "abc DE"
*   epilogue = none
*   main text = "FGH xyz"
*   paraLevel = UBIDI_LTR
*   display without prologue = "HGF xyz" * ("HGF" is adjacent to "xyz")
*   display with prologue = "abc HGFED xyz" * ("HGF" is not adjacent to "xyz")
* * @param pBiDi is a paragraph UBiDi object. * * @param prologue is a pointer to the text which precedes the text that * will be specified in a coming call to ubidi_setPara(). * If there is no prologue to consider, then proLength * must be zero and this pointer can be NULL. * * @param proLength is the length of the prologue; if proLength==-1 * then the prologue must be zero-terminated. * Otherwise proLength must be >= 0. If proLength==0, it means * that there is no prologue to consider. * * @param epilogue is a pointer to the text which follows the text that * will be specified in a coming call to ubidi_setPara(). * If there is no epilogue to consider, then epiLength * must be zero and this pointer can be NULL. * * @param epiLength is the length of the epilogue; if epiLength==-1 * then the epilogue must be zero-terminated. * Otherwise epiLength must be >= 0. If epiLength==0, it means * that there is no epilogue to consider. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_setPara * @stable ICU 4.8 */ U_STABLE void U_EXPORT2 ubidi_setContext(UBiDi *pBiDi, const UChar *prologue, int32_t proLength, const UChar *epilogue, int32_t epiLength, UErrorCode *pErrorCode); /** * Perform the Unicode Bidi algorithm. It is defined in the * Unicode Standard Annex #9, * version 13, * also described in The Unicode Standard, Version 4.0.

* * This function takes a piece of plain text containing one or more paragraphs, * with or without externally specified embedding levels from styled * text and computes the left-right-directionality of each character.

* * If the entire text is all of the same directionality, then * the function may not perform all the steps described by the algorithm, * i.e., some levels may not be the same as if all steps were performed. * This is not relevant for unidirectional text.
* For example, in pure LTR text with numbers the numbers would get * a resolved level of 2 higher than the surrounding text according to * the algorithm. This implementation may set all resolved levels to * the same value in such a case.

* * The text can be composed of multiple paragraphs. Occurrence of a block * separator in the text terminates a paragraph, and whatever comes next starts * a new paragraph. The exception to this rule is when a Carriage Return (CR) * is followed by a Line Feed (LF). Both CR and LF are block separators, but * in that case, the pair of characters is considered as terminating the * preceding paragraph, and a new paragraph will be started by a character * coming after the LF. * * @param pBiDi A UBiDi object allocated with ubidi_open() * which will be set to contain the reordering information, * especially the resolved levels for all the characters in text. * * @param text is a pointer to the text that the Bidi algorithm will be performed on. * This pointer is stored in the UBiDi object and can be retrieved * with ubidi_getText().
* Note: the text must be (at least) length long. * * @param length is the length of the text; if length==-1 then * the text must be zero-terminated. * * @param paraLevel specifies the default level for the text; * it is typically 0 (LTR) or 1 (RTL). * If the function shall determine the paragraph level from the text, * then paraLevel can be set to * either #UBIDI_DEFAULT_LTR * or #UBIDI_DEFAULT_RTL; if the text contains multiple * paragraphs, the paragraph level shall be determined separately for * each paragraph; if a paragraph does not include any strongly typed * character, then the desired default is used (0 for LTR or 1 for RTL). * Any other value between 0 and #UBIDI_MAX_EXPLICIT_LEVEL * is also valid, with odd levels indicating RTL. * * @param embeddingLevels (in) may be used to preset the embedding and override levels, * ignoring characters like LRE and PDF in the text. * A level overrides the directional property of its corresponding * (same index) character if the level has the * #UBIDI_LEVEL_OVERRIDE bit set.

* Except for that bit, it must be * paraLevel<=embeddingLevels[]<=UBIDI_MAX_EXPLICIT_LEVEL, * with one exception: a level of zero may be specified for a paragraph * separator even if paraLevel>0 when multiple paragraphs * are submitted in the same call to ubidi_setPara().

* Caution: A copy of this pointer, not of the levels, * will be stored in the UBiDi object; * the embeddingLevels array must not be * deallocated before the UBiDi structure is destroyed or reused, * and the embeddingLevels * should not be modified to avoid unexpected results on subsequent Bidi operations. * However, the ubidi_setPara() and * ubidi_setLine() functions may modify some or all of the levels.

* After the UBiDi object is reused or destroyed, the caller * must take care of the deallocation of the embeddingLevels array.

* Note: the embeddingLevels array must be * at least length long. * This pointer can be NULL if this * value is not necessary. * * @param pErrorCode must be a valid pointer to an error code value. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, UBiDiLevel paraLevel, UBiDiLevel *embeddingLevels, UErrorCode *pErrorCode); /** * ubidi_setLine() sets a UBiDi to * contain the reordering information, especially the resolved levels, * for all the characters in a line of text. This line of text is * specified by referring to a UBiDi object representing * this information for a piece of text containing one or more paragraphs, * and by specifying a range of indexes in this text.

* In the new line object, the indexes will range from 0 to limit-start-1.

* * This is used after calling ubidi_setPara() * for a piece of text, and after line-breaking on that text. * It is not necessary if each paragraph is treated as a single line.

* * After line-breaking, rules (L1) and (L2) for the treatment of * trailing WS and for reordering are performed on * a UBiDi object that represents a line.

* * Important: pLineBiDi shares data with * pParaBiDi. * You must destroy or reuse pLineBiDi before pParaBiDi. * In other words, you must destroy or reuse the UBiDi object for a line * before the object for its parent paragraph.

* * The text pointer that was stored in pParaBiDi is also copied, * and start is added to it so that it points to the beginning of the * line for this object. * * @param pParaBiDi is the parent paragraph object. It must have been set * by a successful call to ubidi_setPara. * * @param start is the line's first index into the text. * * @param limit is just behind the line's last index into the text * (its last index +1).
* It must be 0<=start<limit<=containing paragraph limit. * If the specified line crosses a paragraph boundary, the function * will terminate with error code U_ILLEGAL_ARGUMENT_ERROR. * * @param pLineBiDi is the object that will now represent a line of the text. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_setPara * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_setLine(const UBiDi *pParaBiDi, int32_t start, int32_t limit, UBiDi *pLineBiDi, UErrorCode *pErrorCode); /** * Get the directionality of the text. * * @param pBiDi is the paragraph or line UBiDi object. * * @return a value of UBIDI_LTR, UBIDI_RTL * or UBIDI_MIXED * that indicates if the entire text * represented by this object is unidirectional, * and which direction, or if it is mixed-directional. * Note - The value UBIDI_NEUTRAL is never returned from this method. * * @see UBiDiDirection * @stable ICU 2.0 */ U_STABLE UBiDiDirection U_EXPORT2 ubidi_getDirection(const UBiDi *pBiDi); /** * Gets the base direction of the text provided according * to the Unicode Bidirectional Algorithm. The base direction * is derived from the first character in the string with bidirectional * character type L, R, or AL. If the first such character has type L, * UBIDI_LTR is returned. If the first such character has * type R or AL, UBIDI_RTL is returned. If the string does * not contain any character of these types, then * UBIDI_NEUTRAL is returned. * * This is a lightweight function for use when only the base direction * is needed and no further bidi processing of the text is needed. * * @param text is a pointer to the text whose base * direction is needed. * Note: the text must be (at least) @c length long. * * @param length is the length of the text; * if length==-1 then the text * must be zero-terminated. 
* * @return UBIDI_LTR, UBIDI_RTL, * UBIDI_NEUTRAL * * @see UBiDiDirection * @stable ICU 4.6 */ U_STABLE UBiDiDirection U_EXPORT2 ubidi_getBaseDirection(const UChar *text, int32_t length ); /** * Get the pointer to the text. * * @param pBiDi is the paragraph or line UBiDi object. * * @return The pointer to the text that the UBiDi object was created for. * * @see ubidi_setPara * @see ubidi_setLine * @stable ICU 2.0 */ U_STABLE const UChar * U_EXPORT2 ubidi_getText(const UBiDi *pBiDi); /** * Get the length of the text. * * @param pBiDi is the paragraph or line UBiDi object. * * @return The length of the text that the UBiDi object was created for. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubidi_getLength(const UBiDi *pBiDi); /** * Get the paragraph level of the text. * * @param pBiDi is the paragraph or line UBiDi object. * * @return The paragraph level. If there are multiple paragraphs, their * level may vary if the required paraLevel is UBIDI_DEFAULT_LTR or * UBIDI_DEFAULT_RTL. In that case, the level of the first paragraph * is returned. * * @see UBiDiLevel * @see ubidi_getParagraph * @see ubidi_getParagraphByIndex * @stable ICU 2.0 */ U_STABLE UBiDiLevel U_EXPORT2 ubidi_getParaLevel(const UBiDi *pBiDi); /** * Get the number of paragraphs. * * @param pBiDi is the paragraph or line UBiDi object. * * @return The number of paragraphs. * @stable ICU 3.4 */ U_STABLE int32_t U_EXPORT2 ubidi_countParagraphs(UBiDi *pBiDi); /** * Get a paragraph, given a position within the text. * This function returns information about a paragraph.
* Note: if the paragraph index is known, it is more efficient to * retrieve the paragraph information using ubidi_getParagraphByIndex().

* * @param pBiDi is the paragraph or line UBiDi object. * * @param charIndex is the index of a character within the text, in the * range [0..ubidi_getProcessedLength(pBiDi)-1]. * * @param pParaStart will receive the index of the first character of the * paragraph in the text. * This pointer can be NULL if this * value is not necessary. * * @param pParaLimit will receive the limit of the paragraph. * The l-value that you point to here may be the * same expression (variable) as the one for * charIndex. * This pointer can be NULL if this * value is not necessary. * * @param pParaLevel will receive the level of the paragraph. * This pointer can be NULL if this * value is not necessary. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The index of the paragraph containing the specified position. * * @see ubidi_getProcessedLength * @stable ICU 3.4 */ U_STABLE int32_t U_EXPORT2 ubidi_getParagraph(const UBiDi *pBiDi, int32_t charIndex, int32_t *pParaStart, int32_t *pParaLimit, UBiDiLevel *pParaLevel, UErrorCode *pErrorCode); /** * Get a paragraph, given the index of this paragraph. * * This function returns information about a paragraph.

* * @param pBiDi is the paragraph UBiDi object. * * @param paraIndex is the number of the paragraph, in the * range [0..ubidi_countParagraphs(pBiDi)-1]. * * @param pParaStart will receive the index of the first character of the * paragraph in the text. * This pointer can be NULL if this * value is not necessary. * * @param pParaLimit will receive the limit of the paragraph. * This pointer can be NULL if this * value is not necessary. * * @param pParaLevel will receive the level of the paragraph. * This pointer can be NULL if this * value is not necessary. * * @param pErrorCode must be a valid pointer to an error code value. * * @stable ICU 3.4 */ U_STABLE void U_EXPORT2 ubidi_getParagraphByIndex(const UBiDi *pBiDi, int32_t paraIndex, int32_t *pParaStart, int32_t *pParaLimit, UBiDiLevel *pParaLevel, UErrorCode *pErrorCode); /** * Get the level for one character. * * @param pBiDi is the paragraph or line UBiDi object. * * @param charIndex the index of a character. It must be in the range * [0..ubidi_getProcessedLength(pBiDi)]. * * @return The level for the character at charIndex (0 if charIndex is not * in the valid range). * * @see UBiDiLevel * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_STABLE UBiDiLevel U_EXPORT2 ubidi_getLevelAt(const UBiDi *pBiDi, int32_t charIndex); /** * Get an array of levels for each character.

* * Note that this function may allocate memory under some * circumstances, unlike ubidi_getLevelAt(). * * @param pBiDi is the paragraph or line UBiDi object, whose * text length must be strictly positive. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The levels array for the text, * or NULL if an error occurs. * * @see UBiDiLevel * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_STABLE const UBiDiLevel * U_EXPORT2 ubidi_getLevels(UBiDi *pBiDi, UErrorCode *pErrorCode); /** * Get a logical run. * This function returns information about a run and is used * to retrieve runs in logical order.

* This is especially useful for line-breaking on a paragraph. * * @param pBiDi is the paragraph or line UBiDi object. * * @param logicalPosition is a logical position within the source text. * * @param pLogicalLimit will receive the limit of the corresponding run. * The l-value that you point to here may be the * same expression (variable) as the one for * logicalPosition. * This pointer can be NULL if this * value is not necessary. * * @param pLevel will receive the level of the corresponding run. * This pointer can be NULL if this * value is not necessary. * * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_getLogicalRun(const UBiDi *pBiDi, int32_t logicalPosition, int32_t *pLogicalLimit, UBiDiLevel *pLevel); /** * Get the number of runs. * This function may invoke the actual reordering on the * UBiDi object, after ubidi_setPara() * may have resolved only the levels of the text. Therefore, * ubidi_countRuns() may have to allocate memory, * and may fail doing so. * * @param pBiDi is the paragraph or line UBiDi object. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The number of runs. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubidi_countRuns(UBiDi *pBiDi, UErrorCode *pErrorCode); /** * Get one run's logical start, length, and directionality, * which can be 0 for LTR or 1 for RTL. * In an RTL run, the character at the logical start is * visually on the right of the displayed run. * The length is the number of characters in the run.

* ubidi_countRuns() should be called * before the runs are retrieved. * * @param pBiDi is the paragraph or line UBiDi object. * * @param runIndex is the number of the run in visual order, in the * range [0..ubidi_countRuns(pBiDi)-1]. * * @param pLogicalStart is the first logical character index in the text. * The pointer may be NULL if this index is not needed. * * @param pLength is the number of characters (at least one) in the run. * The pointer may be NULL if this is not needed. * * @return the directionality of the run, * UBIDI_LTR==0 or UBIDI_RTL==1, * never UBIDI_MIXED, * never UBIDI_NEUTRAL. * * @see ubidi_countRuns * * Example: *

 * \code
 * int32_t i, count=ubidi_countRuns(pBiDi),
 *         logicalStart, visualIndex=0, length;
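 * for(i=0; i<count; ++i) {
 *     if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, i, &logicalStart, &length)) {
 *         do { // LTR
 *             show_char(text[logicalStart++], visualIndex++);
 *         } while(--length>0);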
 *     } else {
 *         logicalStart+=length;  // logicalLimit
 *         do { // RTL
 *             show_char(text[--logicalStart], visualIndex++);
 *         } while(--length>0);
 *     }
 * }
 *\endcode
 * 
* * Note that in right-to-left runs, code like this places * second surrogates before first ones (which is generally a bad idea) * and combining characters before base characters. *

* Use of ubidi_writeReordered(), optionally with the * #UBIDI_KEEP_BASE_COMBINING option, can be considered in order * to avoid these issues. * @stable ICU 2.0 */ U_STABLE UBiDiDirection U_EXPORT2 ubidi_getVisualRun(UBiDi *pBiDi, int32_t runIndex, int32_t *pLogicalStart, int32_t *pLength); /** * Get the visual position from a logical text position. * If such a mapping is used many times on the same * UBiDi object, then calling * ubidi_getLogicalMap() is more efficient.

* * The value returned may be #UBIDI_MAP_NOWHERE if there is no * visual position because the corresponding text character is a Bidi control * removed from output by the option #UBIDI_OPTION_REMOVE_CONTROLS. *

* When the visual output is altered by using options of * ubidi_writeReordered() such as UBIDI_INSERT_LRM_FOR_NUMERIC, * UBIDI_KEEP_BASE_COMBINING, UBIDI_OUTPUT_REVERSE, * UBIDI_REMOVE_BIDI_CONTROLS, the visual position returned may not * be correct. It is advised to use, when possible, reordering options * such as UBIDI_OPTION_INSERT_MARKS and UBIDI_OPTION_REMOVE_CONTROLS. *

* Note that in right-to-left runs, this mapping places * second surrogates before first ones (which is generally a bad idea) * and combining characters before base characters. * Use of ubidi_writeReordered(), optionally with the * #UBIDI_KEEP_BASE_COMBINING option can be considered instead * of using the mapping, in order to avoid these issues. * * @param pBiDi is the paragraph or line UBiDi object. * * @param logicalIndex is the index of a character in the text. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The visual position of this character. * * @see ubidi_getLogicalMap * @see ubidi_getLogicalIndex * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubidi_getVisualIndex(UBiDi *pBiDi, int32_t logicalIndex, UErrorCode *pErrorCode); /** * Get the logical text position from a visual position. * If such a mapping is used many times on the same * UBiDi object, then calling * ubidi_getVisualMap() is more efficient.

* * The value returned may be #UBIDI_MAP_NOWHERE if there is no * logical position because the corresponding text character is a Bidi mark * inserted in the output by option #UBIDI_OPTION_INSERT_MARKS. *

* This is the inverse function to ubidi_getVisualIndex(). *

* When the visual output is altered by using options of * ubidi_writeReordered() such as UBIDI_INSERT_LRM_FOR_NUMERIC, * UBIDI_KEEP_BASE_COMBINING, UBIDI_OUTPUT_REVERSE, * UBIDI_REMOVE_BIDI_CONTROLS, the logical position returned may not * be correct. It is advised to use, when possible, reordering options * such as UBIDI_OPTION_INSERT_MARKS and UBIDI_OPTION_REMOVE_CONTROLS. * * @param pBiDi is the paragraph or line UBiDi object. * * @param visualIndex is the visual position of a character. * * @param pErrorCode must be a valid pointer to an error code value. * * @return The index of this character in the text. * * @see ubidi_getVisualMap * @see ubidi_getVisualIndex * @see ubidi_getResultLength * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubidi_getLogicalIndex(UBiDi *pBiDi, int32_t visualIndex, UErrorCode *pErrorCode); /** * Get a logical-to-visual index map (array) for the characters in the UBiDi * (paragraph or line) object. *

* Some values in the map may be #UBIDI_MAP_NOWHERE if the * corresponding text characters are Bidi controls removed from the visual * output by the option #UBIDI_OPTION_REMOVE_CONTROLS. *

* When the visual output is altered by using options of * ubidi_writeReordered() such as UBIDI_INSERT_LRM_FOR_NUMERIC, * UBIDI_KEEP_BASE_COMBINING, UBIDI_OUTPUT_REVERSE, * UBIDI_REMOVE_BIDI_CONTROLS, the visual positions returned may not * be correct. It is advised to use, when possible, reordering options * such as UBIDI_OPTION_INSERT_MARKS and UBIDI_OPTION_REMOVE_CONTROLS. *

* Note that in right-to-left runs, this mapping places * second surrogates before first ones (which is generally a bad idea) * and combining characters before base characters. * Use of ubidi_writeReordered(), optionally with the * #UBIDI_KEEP_BASE_COMBINING option can be considered instead * of using the mapping, in order to avoid these issues. * * @param pBiDi is the paragraph or line UBiDi object. * * @param indexMap is a pointer to an array of ubidi_getProcessedLength() * indexes which will reflect the reordering of the characters. * If option #UBIDI_OPTION_INSERT_MARKS is set, the number * of elements allocated in indexMap must be no less than * ubidi_getResultLength(). * The array does not need to be initialized.

* The index map will result in indexMap[logicalIndex]==visualIndex. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_getVisualMap * @see ubidi_getVisualIndex * @see ubidi_getProcessedLength * @see ubidi_getResultLength * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_getLogicalMap(UBiDi *pBiDi, int32_t *indexMap, UErrorCode *pErrorCode); /** * Get a visual-to-logical index map (array) for the characters in the UBiDi * (paragraph or line) object. *

* Some values in the map may be #UBIDI_MAP_NOWHERE if the * corresponding text characters are Bidi marks inserted in the visual output * by the option #UBIDI_OPTION_INSERT_MARKS. *

* When the visual output is altered by using options of * ubidi_writeReordered() such as UBIDI_INSERT_LRM_FOR_NUMERIC, * UBIDI_KEEP_BASE_COMBINING, UBIDI_OUTPUT_REVERSE, * UBIDI_REMOVE_BIDI_CONTROLS, the logical positions returned may not * be correct. It is advised to use, when possible, reordering options * such as UBIDI_OPTION_INSERT_MARKS and UBIDI_OPTION_REMOVE_CONTROLS. * * @param pBiDi is the paragraph or line UBiDi object. * * @param indexMap is a pointer to an array of ubidi_getResultLength() * indexes which will reflect the reordering of the characters. * If option #UBIDI_OPTION_REMOVE_CONTROLS is set, the number * of elements allocated in indexMap must be no less than * ubidi_getProcessedLength(). * The array does not need to be initialized.

* The index map will result in indexMap[visualIndex]==logicalIndex. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_getLogicalMap * @see ubidi_getLogicalIndex * @see ubidi_getProcessedLength * @see ubidi_getResultLength * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_getVisualMap(UBiDi *pBiDi, int32_t *indexMap, UErrorCode *pErrorCode); /** * This is a convenience function that does not use a UBiDi object. * It is intended to be used for when an application has determined the levels * of objects (character sequences) and just needs to have them reordered (L2). * This is equivalent to using ubidi_getLogicalMap() on a * UBiDi object. * * @param levels is an array with length levels that have been determined by * the application. * * @param length is the number of levels in the array, or, semantically, * the number of objects to be reordered. * It must be length>0. * * @param indexMap is a pointer to an array of length * indexes which will reflect the reordering of the characters. * The array does not need to be initialized.

* The index map will result in indexMap[logicalIndex]==visualIndex. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_reorderLogical(const UBiDiLevel *levels, int32_t length, int32_t *indexMap); /** * This is a convenience function that does not use a UBiDi object. * It is intended to be used for when an application has determined the levels * of objects (character sequences) and just needs to have them reordered (L2). * This is equivalent to using ubidi_getVisualMap() on a * UBiDi object. * * @param levels is an array with length levels that have been determined by * the application. * * @param length is the number of levels in the array, or, semantically, * the number of objects to be reordered. * It must be length>0. * * @param indexMap is a pointer to an array of length * indexes which will reflect the reordering of the characters. * The array does not need to be initialized.

* The index map will result in indexMap[visualIndex]==logicalIndex. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_reorderVisual(const UBiDiLevel *levels, int32_t length, int32_t *indexMap); /** * Invert an index map. * The index mapping of the first map is inverted and written to * the second one. * * @param srcMap is an array with length elements * which defines the original mapping from a source array containing * length elements to a destination array. * Some elements of the source array may have no mapping in the * destination array. In that case, their value will be * the special value UBIDI_MAP_NOWHERE. * All elements must be >=0 or equal to UBIDI_MAP_NOWHERE. * Some elements may have a value >= length, if the * destination array has more elements than the source array. * There must be no duplicate indexes (two or more elements with the * same value except UBIDI_MAP_NOWHERE). * * @param destMap is an array with a number of elements equal to 1 + the highest * value in srcMap. * destMap will be filled with the inverse mapping. * If element with index i in srcMap has a value k different * from UBIDI_MAP_NOWHERE, this means that element i of * the source array maps to element k in the destination array. * The inverse map will have value i in its k-th element. * For all elements of the destination array which do not map to * an element in the source array, the corresponding element in the * inverse map will have a value equal to UBIDI_MAP_NOWHERE. * * @param length is the length of each array. 
* @see UBIDI_MAP_NOWHERE * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubidi_invertMap(const int32_t *srcMap, int32_t *destMap, int32_t length); /** option flags for ubidi_writeReordered() */ /** * option bit for ubidi_writeReordered(): * keep combining characters after their base characters in RTL runs * * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_KEEP_BASE_COMBINING 1 /** * option bit for ubidi_writeReordered(): * replace characters with the "mirrored" property in RTL runs * by their mirror-image mappings * * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_DO_MIRRORING 2 /** * option bit for ubidi_writeReordered(): * surround the run with LRMs if necessary; * this is part of the approximate "inverse Bidi" algorithm * *

This option does not imply corresponding adjustment of the index * mappings.

* * @see ubidi_setInverse * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_INSERT_LRM_FOR_NUMERIC 4 /** * option bit for ubidi_writeReordered(): * remove Bidi control characters * (this does not affect #UBIDI_INSERT_LRM_FOR_NUMERIC) * *

This option does not imply corresponding adjustment of the index * mappings.

* * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_REMOVE_BIDI_CONTROLS 8 /** * option bit for ubidi_writeReordered(): * write the output in reverse order * *

This has the same effect as calling ubidi_writeReordered() * first without this option, and then calling * ubidi_writeReverse() without mirroring. * Doing this in the same step is faster and avoids a temporary buffer. * An example for using this option is output to a character terminal that * is designed for RTL scripts and stores text in reverse order.

* * @see ubidi_writeReordered * @stable ICU 2.0 */ #define UBIDI_OUTPUT_REVERSE 16 /** * Get the length of the source text processed by the last call to * ubidi_setPara(). This length may be different from the length * of the source text if option #UBIDI_OPTION_STREAMING * has been set. *
* Note that whenever the length of the text affects the execution or the * result of a function, it is the processed length which must be considered, * except for ubidi_setPara (which receives unprocessed source * text) and ubidi_getLength (which returns the original length * of the source text).
* In particular, the processed length is the one to consider in the following * cases: *
 *   - maximum value of the limit argument of ubidi_setLine
 *   - maximum value of the charIndex argument of ubidi_getParagraph
 *   - maximum value of the charIndex argument of ubidi_getLevelAt
 *   - number of elements in the array returned by ubidi_getLevels
 *   - maximum value of the logicalPosition argument of ubidi_getLogicalRun
 *   - maximum value of the logicalIndex argument of ubidi_getVisualIndex
 *   - number of elements filled in the *indexMap argument of ubidi_getLogicalMap
 *   - length of text processed by ubidi_writeReordered
* * @param pBiDi is the paragraph UBiDi object. * * @return The length of the part of the source text processed by * the last call to ubidi_setPara. * @see ubidi_setPara * @see UBIDI_OPTION_STREAMING * @stable ICU 3.6 */ U_STABLE int32_t U_EXPORT2 ubidi_getProcessedLength(const UBiDi *pBiDi); /** * Get the length of the reordered text resulting from the last call to * ubidi_setPara(). This length may be different from the length * of the source text if option #UBIDI_OPTION_INSERT_MARKS * or option #UBIDI_OPTION_REMOVE_CONTROLS has been set. *
* This resulting length is the one to consider in the following cases: *
    *
  • maximum value of the visualIndex argument of * ubidi_getLogicalIndex
  • *
  • number of elements of the *indexMap argument of * ubidi_getVisualMap
  • *
* Note that this length stays identical to the source text length if * Bidi marks are inserted or removed using option bits of * ubidi_writeReordered, or if option * #UBIDI_REORDER_INVERSE_NUMBERS_AS_L has been set. * * @param pBiDi is the paragraph UBiDi object. * * @return The length of the reordered text resulting from * the last call to ubidi_setPara. * @see ubidi_setPara * @see UBIDI_OPTION_INSERT_MARKS * @see UBIDI_OPTION_REMOVE_CONTROLS * @stable ICU 3.6 */ U_STABLE int32_t U_EXPORT2 ubidi_getResultLength(const UBiDi *pBiDi); U_CDECL_BEGIN /** * value returned by UBiDiClassCallback callbacks when * there is no need to override the standard Bidi class for a given code point. * @see UBiDiClassCallback * @stable ICU 3.6 */ #define U_BIDI_CLASS_DEFAULT U_CHAR_DIRECTION_COUNT /** * Callback type declaration for overriding default Bidi class values with * custom ones. *

Usually, the function pointer will be propagated to a UBiDi * object by calling the ubidi_setClassCallback() function; * then the callback will be invoked by the UBA implementation any time the * class of a character is to be determined.

* * @param context is a pointer to the callback private data. * * @param c is the code point to get a Bidi class for. * * @return The directional property / Bidi class for the given code point * c if the default class has been overridden, or * #U_BIDI_CLASS_DEFAULT if the standard Bidi class value * for c is to be used. * @see ubidi_setClassCallback * @see ubidi_getClassCallback * @stable ICU 3.6 */ typedef UCharDirection U_CALLCONV UBiDiClassCallback(const void *context, UChar32 c); U_CDECL_END /** * Retrieve the Bidi class for a given code point. *

If a #UBiDiClassCallback callback is defined and returns a * value other than #U_BIDI_CLASS_DEFAULT, that value is used; * otherwise the default class determination mechanism is invoked.

* * @param pBiDi is the paragraph UBiDi object. * * @param c is the code point whose Bidi class must be retrieved. * * @return The Bidi class for character c based * on the given pBiDi instance. * @see UBiDiClassCallback * @stable ICU 3.6 */ U_STABLE UCharDirection U_EXPORT2 ubidi_getCustomizedClass(UBiDi *pBiDi, UChar32 c); /** * Set the callback function and callback data used by the UBA * implementation for Bidi class determination. *

This may be useful for assigning Bidi classes to PUA characters, or * for special application needs. For instance, an application may want to * handle all spaces like L or R characters (according to the base direction) * when creating the visual ordering of logical lines which are part of a report * organized in columns: there should not be interaction between adjacent * cells.

* * @param pBiDi is the paragraph UBiDi object. * * @param newFn is the new callback function pointer. * * @param newContext is the new callback context pointer. This can be NULL. * * @param oldFn fillin: Returns the old callback function pointer. This can be * NULL. * * @param oldContext fillin: Returns the old callback's context. This can be * NULL. * * @param pErrorCode must be a valid pointer to an error code value. * * @see ubidi_getClassCallback * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ubidi_setClassCallback(UBiDi *pBiDi, UBiDiClassCallback *newFn, const void *newContext, UBiDiClassCallback **oldFn, const void **oldContext, UErrorCode *pErrorCode); /** * Get the current callback function used for Bidi class determination. * * @param pBiDi is the paragraph UBiDi object. * * @param fn fillin: Returns the callback function pointer. * * @param context fillin: Returns the callback's private context. * * @see ubidi_setClassCallback * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ubidi_getClassCallback(UBiDi *pBiDi, UBiDiClassCallback **fn, const void **context); /** * Take a UBiDi object containing the reordering * information for a piece of text (one or more paragraphs) set by * ubidi_setPara() or for a line of text set by * ubidi_setLine() and write a reordered string to the * destination buffer. * * This function preserves the integrity of characters with multiple * code units and (optionally) combining characters. * Characters in RTL runs can be replaced by mirror-image characters * in the destination buffer. Note that "real" mirroring has * to be done in a rendering engine by glyph selection * and that for many "mirrored" characters there are no * Unicode characters as mirror-image equivalents. * There are also options to insert or remove Bidi control * characters; see the description of the destSize * and options parameters and of the option bit flags. 
* * @param pBiDi A pointer to a UBiDi object that * is set by ubidi_setPara() or * ubidi_setLine() and contains the reordering * information for the text that it was defined for, * as well as a pointer to that text.

* The text was aliased (only the pointer was stored * without copying the contents) and must not have been modified * since the ubidi_setPara() call. * * @param dest A pointer to where the reordered text is to be copied. * The source text and dest[destSize] * must not overlap. * * @param destSize The size of the dest buffer, * in number of UChars. * If the UBIDI_INSERT_LRM_FOR_NUMERIC * option is set, then the destination length could be * as large as * ubidi_getLength(pBiDi)+2*ubidi_countRuns(pBiDi). * If the UBIDI_REMOVE_BIDI_CONTROLS option * is set, then the destination length may be less than * ubidi_getLength(pBiDi). * If none of these options is set, then the destination length * will be exactly ubidi_getProcessedLength(pBiDi). * * @param options A bit set of options for the reordering that control * how the reordered text is written. * The options include mirroring the characters on a code * point basis and inserting LRM characters, which is used * especially for transforming visually stored text * to logically stored text (although this is still an * imperfect implementation of an "inverse Bidi" algorithm * because it uses the "forward Bidi" algorithm at its core). * The available options are: * #UBIDI_DO_MIRRORING, * #UBIDI_INSERT_LRM_FOR_NUMERIC, * #UBIDI_KEEP_BASE_COMBINING, * #UBIDI_OUTPUT_REVERSE, * #UBIDI_REMOVE_BIDI_CONTROLS * * @param pErrorCode must be a valid pointer to an error code value. * * @return The length of the output string. * * @see ubidi_getProcessedLength * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubidi_writeReordered(UBiDi *pBiDi, UChar *dest, int32_t destSize, uint16_t options, UErrorCode *pErrorCode); /** * Reverse a Right-To-Left run of Unicode text. * * This function preserves the integrity of characters with multiple * code units and (optionally) combining characters. * Characters can be replaced by mirror-image characters * in the destination buffer. 
Note that "real" mirroring has * to be done in a rendering engine by glyph selection * and that for many "mirrored" characters there are no * Unicode characters as mirror-image equivalents. * There are also options to insert or remove Bidi control * characters. * * This function is the implementation for reversing RTL runs as part * of ubidi_writeReordered(). For detailed descriptions * of the parameters, see there. * Since no Bidi controls are inserted here, the output string length * will never exceed srcLength. * * @see ubidi_writeReordered * * @param src A pointer to the RTL run text. * * @param srcLength The length of the RTL run. * * @param dest A pointer to where the reordered text is to be copied. * src[srcLength] and dest[destSize] * must not overlap. * * @param destSize The size of the dest buffer, * in number of UChars. * If the UBIDI_REMOVE_BIDI_CONTROLS option * is set, then the destination length may be less than * srcLength. * If this option is not set, then the destination length * will be exactly srcLength. * * @param options A bit set of options for the reordering that control * how the reordered text is written. * See the options parameter in ubidi_writeReordered(). * * @param pErrorCode must be a valid pointer to an error code value. * * @return The length of the output string. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubidi_writeReverse(const UChar *src, int32_t srcLength, UChar *dest, int32_t destSize, uint16_t options, UErrorCode *pErrorCode); /*#define BIDI_SAMPLE_CODE*/ /*@}*/ #endif // stringtriebuilder.h /* ******************************************************************************* * Copyright (C) 2010-2012,2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: stringtriebuilder.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2010dec24 * created by: Markus W. 
Scherer */ #ifndef __STRINGTRIEBUILDER_H__ #define __STRINGTRIEBUILDER_H__ /** * \file * \brief C++ API: Builder API for trie builders */ // Forward declaration. struct UHashtable; typedef struct UHashtable UHashtable; /** * Build options for BytesTrieBuilder and CharsTrieBuilder. * @stable ICU 4.8 */ enum UStringTrieBuildOption { /** * Builds a trie quickly. * @stable ICU 4.8 */ USTRINGTRIE_BUILD_FAST, /** * Builds a trie more slowly, attempting to generate * a shorter but equivalent serialization. * This build option also uses more memory. * * This option can be effective when many integer values are the same * and string/byte sequence suffixes can be shared. * Runtime speed is not expected to improve. * @stable ICU 4.8 */ USTRINGTRIE_BUILD_SMALL }; #endif // __STRINGTRIEBUILDER_H__ // putil.h /* ****************************************************************************** * * Copyright (C) 1997-2014, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * * FILE NAME : putil.h * * Date Name Description * 05/14/98 nos Creation (content moved here from utypes.h). * 06/17/99 erm Added IEEE_754 * 07/22/98 stephen Added IEEEremainder, max, min, trunc * 08/13/98 stephen Added isNegativeInfinity, isPositiveInfinity * 08/24/98 stephen Added longBitsFromDouble * 03/02/99 stephen Removed openFile(). Added AS400 support. * 04/15/99 stephen Converted to C * 11/15/99 helena Integrated S/390 changes for IEEE support. * 01/11/00 helena Added u_getVersion. 
****************************************************************************** */ #ifndef PUTIL_H #define PUTIL_H /** * \file * \brief C API: Platform Utilities */ /*==========================================================================*/ /* Platform utilities */ /*==========================================================================*/ /** * Platform utilities isolate the platform dependencies of the * library. For each platform which this code is ported to, these * functions may have to be re-implemented. */ /** @} */ /** * Convert char characters to UChar characters. * This utility function is useful only for "invariant characters" * that are encoded in the platform default encoding. * They are a small, constant subset of the encoding and include * just the latin letters, digits, and some punctuation. * For details, see U_CHARSET_FAMILY. * * @param cs Input string, points to length * character bytes from a subset of the platform encoding. * @param us Output string, points to memory for length * Unicode characters. * @param length The number of characters to convert; this may * include the terminating NUL. * * @see U_CHARSET_FAMILY * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 u_charsToUChars(const char *cs, UChar *us, int32_t length); /** * Convert UChar characters to char characters. * This utility function is useful only for "invariant characters" * that can be encoded in the platform default encoding. * They are a small, constant subset of the encoding and include * just the latin letters, digits, and some punctuation. * For details, see U_CHARSET_FAMILY. * * @param us Input string, points to length * Unicode characters that can be encoded with the * codepage-invariant subset of the platform encoding. * @param cs Output string, points to memory for length * character bytes. * @param length The number of characters to convert; this may * include the terminating NUL.
* * @see U_CHARSET_FAMILY * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 u_UCharsToChars(const UChar *us, char *cs, int32_t length); #endif // ustring.h /* ********************************************************************** * Copyright (C) 1998-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File ustring.h * * Modification History: * * Date Name Description * 12/07/98 bertrand Creation. ****************************************************************************** */ #ifndef USTRING_H #define USTRING_H /** * \def UBRK_TYPEDEF_UBREAK_ITERATOR * @internal */ #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR # define UBRK_TYPEDEF_UBREAK_ITERATOR /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/ typedef struct UBreakIterator UBreakIterator; #endif /** * \file * \brief C API: Unicode string handling functions * * These C API functions provide general Unicode string handling. * * Some functions are equivalent in name, signature, and behavior to the ANSI C * functions. (For example, they do not check for bad arguments like NULL string pointers.) * In some cases, only the thread-safe variant of such a function is implemented here * (see u_strtok_r()). * * Other functions provide more Unicode-specific functionality like locale-specific * upper/lower-casing and string comparison in code point order. * * ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units. * UTF-16 encodes each Unicode code point with either one or two UChar code units. * (This is the default form of Unicode, and a forward-compatible extension of the original, * fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0 * in 1996.) * * Some APIs accept a 32-bit UChar32 value for a single code point. * * ICU also handles 16-bit Unicode text with unpaired surrogates. * Such text is not well-formed UTF-16. 
* Code-point-related functions treat unpaired surrogates as surrogate code points, * i.e., as separate units. * * Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings), * it is much more efficient even for random access because the code unit values * for single-unit characters vs. lead units vs. trail units are completely disjoint. * This means that it is easy to determine character (code point) boundaries from * random offsets in the string. * * Unicode (UTF-16) string processing is optimized for the single-unit case. * Although it is important to support supplementary characters * (which use pairs of lead/trail code units called "surrogates"), * their occurrence is rare. Almost all characters in modern use require only * a single UChar code unit (i.e., their code point values are <=0xffff). * * For more details see the User Guide Strings chapter (http://icu-project.org/userguide/strings.html). * For a discussion of the handling of unpaired surrogates see also * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18. */ /** * \defgroup ustring_ustrlen String Length * \ingroup ustring_strlen */ /*@{*/ /** * Determine the length of an array of UChar. * * @param s The array of UChars, NUL (U+0000) terminated. * @return The number of UChars in s, minus the terminator. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strlen(const UChar *s); /*@}*/ /** * Count Unicode code points in the length UChar code units of the string. * A code point may occupy either one or two UChar code units. * Counting code points involves reading all code units. * * This function is basically the inverse of the U16_FWD_N() macro (see utf.h). * * @param s The input string. * @param length The number of UChar code units to be checked, or -1 to count all * code points before the first NUL (U+0000). * @return The number of code points in the specified code units.
* @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_countChar32(const UChar *s, int32_t length); /** * Check if the string contains more Unicode code points than a certain number. * This is more efficient than counting all code points in the entire string * and comparing that number with a threshold. * This function may not need to scan the string at all if the length is known * (not -1 for NUL-termination) and falls within a certain range, and * never needs to count more than 'number+1' code points. * Logically equivalent to (u_countChar32(s, length)>number). * A Unicode code point may occupy either one or two UChar code units. * * @param s The input string. * @param length The length of the string, or -1 if it is NUL-terminated. * @param number The number of code points in the string is compared against * the 'number' parameter. * @return Boolean value for whether the string contains more Unicode code points * than 'number'. Same as (u_countChar32(s, length)>number). * @stable ICU 2.4 */ U_STABLE UBool U_EXPORT2 u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number); /** * Concatenate two ustrings. Appends a copy of src, * including the null terminator, to dst. The initial copied * character from src overwrites the null terminator in dst. * * @param dst The destination string. * @param src The source string. * @return A pointer to dst. * @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_strcat(UChar *dst, const UChar *src); /** * Concatenate two ustrings. * Appends at most n characters from src to dst. * Adds a terminating NUL. * If src is too long, then only n-1 characters will be copied * before the terminating NUL. * If n<=0 then dst is not modified. * * @param dst The destination string. * @param src The source string (can be NULL/invalid if n<=0). * @param n The maximum number of characters to append; no-op if <=0. * @return A pointer to dst.
* @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_strncat(UChar *dst, const UChar *src, int32_t n); /** * Find the first occurrence of a substring in a string. * The substring is found at code point boundaries. * That means that if the substring begins with * a trail surrogate or ends with a lead surrogate, * then it is found only if these surrogates stand alone in the text. * Otherwise, the substring edge units would be matched against * halves of surrogate pairs. * * @param s The string to search (NUL-terminated). * @param substring The substring to find (NUL-terminated). * @return A pointer to the first occurrence of substring in s, * or s itself if the substring is empty, * or NULL if substring is not in s. * @stable ICU 2.0 * * @see u_strrstr * @see u_strFindFirst * @see u_strFindLast */ U_STABLE UChar * U_EXPORT2 u_strstr(const UChar *s, const UChar *substring); /** * Find the first occurrence of a substring in a string. * The substring is found at code point boundaries. * That means that if the substring begins with * a trail surrogate or ends with a lead surrogate, * then it is found only if these surrogates stand alone in the text. * Otherwise, the substring edge units would be matched against * halves of surrogate pairs. * * @param s The string to search. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated. * @param substring The substring to find (NUL-terminated). * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated. * @return A pointer to the first occurrence of substring in s, * or s itself if the substring is empty, * or NULL if substring is not in s. * @stable ICU 2.4 * * @see u_strstr * @see u_strFindLast */ U_STABLE UChar * U_EXPORT2 u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength); /** * Find the first occurrence of a BMP code point in a string. 
* A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (NUL-terminated). * @param c The BMP code point to find. * @return A pointer to the first occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.0 * * @see u_strchr32 * @see u_memchr * @see u_strstr * @see u_strFindFirst */ U_STABLE UChar * U_EXPORT2 u_strchr(const UChar *s, UChar c); /** * Find the first occurrence of a code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (NUL-terminated). * @param c The code point to find. * @return A pointer to the first occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.0 * * @see u_strchr * @see u_memchr32 * @see u_strstr * @see u_strFindFirst */ U_STABLE UChar * U_EXPORT2 u_strchr32(const UChar *s, UChar32 c); /** * Find the last occurrence of a substring in a string. * The substring is found at code point boundaries. * That means that if the substring begins with * a trail surrogate or ends with a lead surrogate, * then it is found only if these surrogates stand alone in the text. * Otherwise, the substring edge units would be matched against * halves of surrogate pairs. * * @param s The string to search (NUL-terminated). * @param substring The substring to find (NUL-terminated). * @return A pointer to the last occurrence of substring in s, * or s itself if the substring is empty, * or NULL if substring is not in s. * @stable ICU 2.4 * * @see u_strstr * @see u_strFindFirst * @see u_strFindLast */ U_STABLE UChar * U_EXPORT2 u_strrstr(const UChar *s, const UChar *substring); /** * Find the last occurrence of a substring in a string. * The substring is found at code point boundaries. 
* That means that if the substring begins with * a trail surrogate or ends with a lead surrogate, * then it is found only if these surrogates stand alone in the text. * Otherwise, the substring edge units would be matched against * halves of surrogate pairs. * * @param s The string to search. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated. * @param substring The substring to find (NUL-terminated). * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated. * @return A pointer to the last occurrence of substring in s, * or s itself if the substring is empty, * or NULL if substring is not in s. * @stable ICU 2.4 * * @see u_strstr * @see u_strFindLast */ U_STABLE UChar * U_EXPORT2 u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength); /** * Find the last occurrence of a BMP code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (NUL-terminated). * @param c The BMP code point to find. * @return A pointer to the last occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.4 * * @see u_strrchr32 * @see u_memrchr * @see u_strrstr * @see u_strFindLast */ U_STABLE UChar * U_EXPORT2 u_strrchr(const UChar *s, UChar c); /** * Find the last occurrence of a code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (NUL-terminated). * @param c The code point to find. * @return A pointer to the last occurrence of c in s * or NULL if c is not in s. 
* @stable ICU 2.4 * * @see u_strrchr * @see u_memchr32 * @see u_strrstr * @see u_strFindLast */ U_STABLE UChar * U_EXPORT2 u_strrchr32(const UChar *s, UChar32 c); /** * Locates the first occurrence in the string string of any of the characters * in the string matchSet. * Works just like C's strpbrk but with Unicode. * * @param string The string in which to search, NUL-terminated. * @param matchSet A NUL-terminated string defining a set of code points * for which to search in the text string. * @return A pointer to the character in string that matches one of the * characters in matchSet, or NULL if no such character is found. * @stable ICU 2.0 */ U_STABLE UChar * U_EXPORT2 u_strpbrk(const UChar *string, const UChar *matchSet); /** * Returns the number of consecutive characters in string, * beginning with the first, that do not occur somewhere in matchSet. * Works just like C's strcspn but with Unicode. * * @param string The string in which to search, NUL-terminated. * @param matchSet A NUL-terminated string defining a set of code points * for which to search in the text string. * @return The number of initial characters in string that do not * occur in matchSet. * @see u_strspn * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strcspn(const UChar *string, const UChar *matchSet); /** * Returns the number of consecutive characters in string, * beginning with the first, that occur somewhere in matchSet. * Works just like C's strspn but with Unicode. * * @param string The string in which to search, NUL-terminated. * @param matchSet A NUL-terminated string defining a set of code points * for which to search in the text string. * @return The number of initial characters in string that do * occur in matchSet. * @see u_strcspn * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strspn(const UChar *string, const UChar *matchSet); /** * The string tokenizer API allows an application to break a string into * tokens.
Unlike strtok(), the saveState (the current pointer within the * original string) is maintained in saveState. In the first call, the * argument src is a pointer to the string. In subsequent calls to * return successive tokens of that string, src must be specified as * NULL. The value saveState is set by this function to maintain the * function's position within the string, and on each subsequent call * you must give this argument the same variable. This function does * handle surrogate pairs. This function is similar to strtok_r(), * the POSIX Threads Extension (1003.1c-1995) version. * * @param src String containing token(s). This string will be modified. * After the first call to u_strtok_r(), this argument must * be NULL to get to the next token. * @param delim Set of delimiter characters (Unicode code points). * @param saveState The current pointer within the original string, * which is set by this function. The saveState * parameter should be the address of a local variable of type * UChar *. (i.e. define "UChar *myLocalSaveState" and use * &myLocalSaveState for this parameter). * @return A pointer to the next token found in src, or NULL * when there are no more tokens. * @stable ICU 2.0 */ U_STABLE UChar * U_EXPORT2 u_strtok_r(UChar *src, const UChar *delim, UChar **saveState); /** * Compare two Unicode strings for bitwise equality (code unit order). * * @param s1 A string to compare. * @param s2 A string to compare. * @return 0 if s1 and s2 are bitwise equal; a negative * value if s1 is bitwise less than s2; a positive * value if s1 is bitwise greater than s2. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strcmp(const UChar *s1, const UChar *s2); /** * Compare two Unicode strings in code point order. * See u_strCompare for details. * * @param s1 A string to compare. * @param s2 A string to compare.
* @return a negative/zero/positive integer corresponding to whether * the first string is less than/equal to/greater than the second one * in code point order * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strcmpCodePointOrder(const UChar *s1, const UChar *s2); /** * Compare two Unicode strings (binary order). * * The comparison can be done in code unit order or in code point order. * They differ only in UTF-16 when * comparing supplementary code points (U+10000..U+10ffff) * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff). * In code unit order, high BMP code points sort after supplementary code points * because they are stored as pairs of surrogates which are at U+d800..U+dfff. * * This function works with strings of different explicitly specified lengths * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. * NUL-terminated strings are possible with length arguments of -1. * * @param s1 First source string. * @param length1 Length of first source string, or -1 if NUL-terminated. * * @param s2 Second source string. * @param length2 Length of second source string, or -1 if NUL-terminated. * * @param codePointOrder Choose between code unit order (FALSE) * and code point order (TRUE). * * @return <0 or 0 or >0 as usual for string comparisons * * @stable ICU 2.2 */ U_STABLE int32_t U_EXPORT2 u_strCompare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, UBool codePointOrder); /** * Compare two Unicode strings (binary order) * as presented by UCharIterator objects. * Works otherwise just like u_strCompare(). * * Both iterators are reset to their start positions. * When the function returns, it is undefined where the iterators * have stopped. * * @param iter1 First source string iterator. * @param iter2 Second source string iterator. * @param codePointOrder Choose between code unit order (FALSE) * and code point order (TRUE).
* * @return <0 or 0 or >0 as usual for string comparisons * * @see u_strCompare * * @stable ICU 2.6 */ U_STABLE int32_t U_EXPORT2 u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder); #ifndef U_COMPARE_CODE_POINT_ORDER /* see also unistr.h and unorm.h */ /** * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc: * Compare strings in code point order instead of code unit order. * @stable ICU 2.2 */ #define U_COMPARE_CODE_POINT_ORDER 0x8000 #endif /** * Compare two strings case-insensitively using full case folding. * This is equivalent to * u_strCompare(u_strFoldCase(s1, options), * u_strFoldCase(s2, options), * (options&U_COMPARE_CODE_POINT_ORDER)!=0). * * The comparison can be done in UTF-16 code unit order or in code point order. * They differ only when comparing supplementary code points (U+10000..U+10ffff) * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff). * In code unit order, high BMP code points sort after supplementary code points * because they are stored as pairs of surrogates which are at U+d800..U+dfff. * * This functions works with strings of different explicitly specified lengths * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. * NUL-terminated strings are possible with length arguments of -1. * * @param s1 First source string. * @param length1 Length of first source string, or -1 if NUL-terminated. * * @param s2 Second source string. * @param length2 Length of second source string, or -1 if NUL-terminated. * * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Comparison in code unit order with default case folding. * * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. 
* * @return <0 or 0 or >0 as usual for string comparisons * * @stable ICU 2.2 */ U_STABLE int32_t U_EXPORT2 u_strCaseCompare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode); /** * Compare two ustrings for bitwise equality. * Compares at most n characters. * * @param ucs1 A string to compare (can be NULL/invalid if n<=0). * @param ucs2 A string to compare (can be NULL/invalid if n<=0). * @param n The maximum number of characters to compare; always returns 0 if n<=0. * @return 0 if ucs1 and ucs2 are bitwise equal; a negative * value if ucs1 is bitwise less than ucs2; a positive * value if ucs1 is bitwise greater than ucs2. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strncmp(const UChar *ucs1, const UChar *ucs2, int32_t n); /** * Compare two Unicode strings in code point order. * This is different in UTF-16 from u_strncmp() if supplementary characters are present. * For details, see u_strCompare(). * * @param s1 A string to compare. * @param s2 A string to compare. * @param n The maximum number of characters to compare. * @return a negative/zero/positive integer corresponding to whether * the first string is less than/equal to/greater than the second one * in code point order * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n); /** * Compare two strings case-insensitively using full case folding. * This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)). * * @param s1 A string to compare. * @param s2 A string to compare. * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Comparison in code unit order with default case folding. * * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * (see u_strFoldCase for details). 
* * @return A negative, zero, or positive integer indicating the comparison result. 
* @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options); /** * Compare two strings case-insensitively using full case folding. * This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options), * u_strFoldCase(s2, at most n, options)). * * @param s1 A string to compare. * @param s2 A string to compare. * @param n The maximum number of characters in each string to case-fold and then compare. * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Comparison in code unit order with default case folding. * * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * (see u_strFoldCase for details). * * @return A negative, zero, or positive integer indicating the comparison result. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options); /** * Compare two strings case-insensitively using full case folding. * This is equivalent to u_strcmp(u_strFoldCase(s1, length, options), * u_strFoldCase(s2, length, options)). * * @param s1 A string to compare. * @param s2 A string to compare. * @param length The number of characters in each string to case-fold and then compare. * @param options A bit set of options: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: * Comparison in code unit order with default case folding. * * - U_COMPARE_CODE_POINT_ORDER * Set to choose code point order instead of code unit order * (see u_strCompare for details). * * - U_FOLD_CASE_EXCLUDE_SPECIAL_I * (see u_strFoldCase for details). * * @return A negative, zero, or positive integer indicating the comparison result. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options); /** * Copy a ustring. Adds a null terminator. 
* * @param dst The destination string. * @param src The source string. * @return A pointer to dst. 
* @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_strcpy(UChar *dst, const UChar *src); /** * Copy a ustring. * Copies at most n characters. The result will be null terminated * if the length of src is less than n. * * @param dst The destination string. * @param src The source string (can be NULL/invalid if n<=0). * @param n The maximum number of characters to copy; no-op if <=0. * @return A pointer to dst. * @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_strncpy(UChar *dst, const UChar *src, int32_t n); #if !UCONFIG_NO_CONVERSION /** * Copy a byte string encoded in the default codepage to a ustring. * Adds a null terminator. * Performs a host byte to UChar conversion * * @param dst The destination string. * @param src The source string. * @return A pointer to dst. * @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_uastrcpy(UChar *dst, const char *src ); /** * Copy a byte string encoded in the default codepage to a ustring. * Copies at most n characters. The result will be null terminated * if the length of src is less than n. * Performs a host byte to UChar conversion * * @param dst The destination string. * @param src The source string. * @param n The maximum number of characters to copy. * @return A pointer to dst. * @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_uastrncpy(UChar *dst, const char *src, int32_t n); /** * Copy ustring to a byte string encoded in the default codepage. * Adds a null terminator. * Performs a UChar to host byte conversion * * @param dst The destination string. * @param src The source string. * @return A pointer to dst. * @stable ICU 2.0 */ U_STABLE char* U_EXPORT2 u_austrcpy(char *dst, const UChar *src ); /** * Copy ustring to a byte string encoded in the default codepage. * Copies at most n characters. The result will be null terminated * if the length of src is less than n. * Performs a UChar to host byte conversion * * @param dst The destination string. * @param src The source string. 
* @param n The maximum number of characters to copy. * @return A pointer to dst. * @stable ICU 2.0 */ U_STABLE char* U_EXPORT2 u_austrncpy(char *dst, const UChar *src, int32_t n ); #endif /** * Synonym for memcpy(), but with UChars only. * @param dest The destination string * @param src The source string (can be NULL/invalid if count<=0) * @param count The number of characters to copy; no-op if <=0 * @return A pointer to dest * @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_memcpy(UChar *dest, const UChar *src, int32_t count); /** * Synonym for memmove(), but with UChars only. * @param dest The destination string * @param src The source string (can be NULL/invalid if count<=0) * @param count The number of characters to move; no-op if <=0 * @return A pointer to dest * @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_memmove(UChar *dest, const UChar *src, int32_t count); /** * Initialize count characters of dest to c. * * @param dest The destination string. * @param c The character to initialize the string. * @param count The maximum number of characters to set. * @return A pointer to dest. * @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_memset(UChar *dest, UChar c, int32_t count); /** * Compare the first count UChars of each buffer. * * @param buf1 The first string to compare. * @param buf2 The second string to compare. * @param count The maximum number of UChars to compare. * @return When buf1 < buf2, a negative number is returned. * When buf1 == buf2, 0 is returned. * When buf1 > buf2, a positive number is returned. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count); /** * Compare two Unicode strings in code point order. * This is different in UTF-16 from u_memcmp() if supplementary characters are present. * For details, see u_strCompare(). * * @param s1 A string to compare. * @param s2 A string to compare. * @param count The maximum number of characters to compare. 
* @return a negative/zero/positive integer corresponding to whether * the first string is less than/equal to/greater than the second one * in code point order * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count); /** * Find the first occurrence of a BMP code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (contains count UChars). * @param c The BMP code point to find. * @param count The length of the string. * @return A pointer to the first occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.0 * * @see u_strchr * @see u_memchr32 * @see u_strFindFirst */ U_STABLE UChar* U_EXPORT2 u_memchr(const UChar *s, UChar c, int32_t count); /** * Find the first occurrence of a code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (contains count UChars). * @param c The code point to find. * @param count The length of the string. * @return A pointer to the first occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.0 * * @see u_strchr32 * @see u_memchr * @see u_strFindFirst */ U_STABLE UChar* U_EXPORT2 u_memchr32(const UChar *s, UChar32 c, int32_t count); /** * Find the last occurrence of a BMP code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (contains count UChars). * @param c The BMP code point to find. * @param count The length of the string. * @return A pointer to the last occurrence of c in s * or NULL if c is not in s. 
* @stable ICU 2.4 * * @see u_strrchr * @see u_memrchr32 * @see u_strFindLast */ U_STABLE UChar* U_EXPORT2 u_memrchr(const UChar *s, UChar c, int32_t count); /** * Find the last occurrence of a code point in a string. * A surrogate code point is found only if its match in the text is not * part of a surrogate pair. * A NUL character is found at the string terminator. * * @param s The string to search (contains count UChars). * @param c The code point to find. * @param count The length of the string. * @return A pointer to the last occurrence of c in s * or NULL if c is not in s. * @stable ICU 2.4 * * @see u_strrchr32 * @see u_memrchr * @see u_strFindLast */ U_STABLE UChar* U_EXPORT2 u_memrchr32(const UChar *s, UChar32 c, int32_t count); /** * Unicode String literals in C. * We need one macro to declare a variable for the string * and to statically preinitialize it if possible, * and a second macro to dynamically initialize such a string variable if necessary. * * The macros are defined for maximum performance. * They work only for strings that contain "invariant characters", i.e., * only Latin letters, digits, and some punctuation. * See utypes.h for details. * * A pair of macros for a single string must be used with the same * parameters. * The string parameter must be a C string literal. * The length of the string, not including the terminating * NUL, must be specified as a constant. * The U_STRING_DECL macro should be invoked exactly once for one * such string variable before it is used. * * Usage: * <pre>
 *    U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
 *    U_STRING_DECL(ustringVar2, "jumps 5%", 8);
 *    static UBool didInit=FALSE;
 * 
 *    int32_t function() {
 *        if(!didInit) {
 *            U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
 *            U_STRING_INIT(ustringVar2, "jumps 5%", 8);
 *            didInit=TRUE;
 *        }
 *        return u_strcmp(ustringVar1, ustringVar2);
 *    }
 * </pre>
* * Note that the macros will NOT consistently work if their argument is another #define. * The following will not work on all platforms, don't use it. * * <pre>
 *     #define GLUCK "Mr. Gluck"
 *     U_STRING_DECL(var, GLUCK, 9)
 *     U_STRING_INIT(var, GLUCK, 9)
 * </pre>
* * Instead, use the string literal "Mr. Gluck" as the argument to both macro * calls. * * * @stable ICU 2.0 */ #if defined(U_DECLARE_UTF16) # define U_STRING_DECL(var, cs, length) static const UChar *var=(const UChar *)U_DECLARE_UTF16(cs) /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16))) # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=cs /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) #else # define U_STRING_DECL(var, cs, length) static UChar var[(length)+1] /**@stable ICU 2.0 */ # define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, length+1) #endif /** * Unescape a string of characters and write the resulting * Unicode characters to the destination buffer. The following escape * sequences are recognized: * * \\uhhhh 4 hex digits; h in [0-9A-Fa-f] * \\Uhhhhhhhh 8 hex digits * \\xhh 1-2 hex digits * \\x{h...} 1-8 hex digits * \\ooo 1-3 octal digits; o in [0-7] * \\cX control-X; X is masked with 0x1F * * as well as the standard ANSI C escapes: * * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A, * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B, * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C * * Anything else following a backslash is generically escaped. For * example, "[a\\-z]" returns "[a-z]". * * If an escape sequence is ill-formed, this method returns an empty * string. An example of an ill-formed sequence is "\\u" followed by * fewer than 4 hex digits. * * The above characters are recognized in the compiler's codepage, * that is, they are coded as 'u', '\\', etc. 
Characters that are * not parts of escape sequences are converted using u_charsToUChars(). * * This function is similar to UnicodeString::unescape() but not * identical to it. The latter takes a source UnicodeString, so it * does escape recognition but no conversion. * * @param src a zero-terminated string of invariant characters * @param dest pointer to buffer to receive converted and unescaped * text and, if there is room, a zero terminator. May be NULL for * preflighting, in which case no UChars will be written, but the * return value will still be valid. On error, an empty string is * stored here (if possible). * @param destCapacity the number of UChars that may be written at * dest. Ignored if dest == NULL. * @return the length of unescaped string. * @see u_unescapeAt * @see UnicodeString#unescape() * @see UnicodeString#unescapeAt() * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_unescape(const char *src, UChar *dest, int32_t destCapacity); U_CDECL_BEGIN /** * Callback function for u_unescapeAt() that returns a character of * the source text given an offset and a context pointer. The context * pointer will be whatever is passed into u_unescapeAt(). * * @param offset pointer to the offset that will be passed to u_unescapeAt(). * @param context an opaque pointer passed directly into u_unescapeAt() * @return the character represented by the escape sequence at * offset * @see u_unescapeAt * @stable ICU 2.0 */ typedef UChar (U_CALLCONV *UNESCAPE_CHAR_AT)(int32_t offset, void *context); U_CDECL_END /** * Unescape a single sequence. The character at offset-1 is assumed * (without checking) to be a backslash. This method takes a callback * pointer to a function that returns the UChar at a given offset. By * varying this callback, ICU functions are able to unescape char* * strings, UnicodeString objects, and UFILE pointers. * * If offset is out of range, or if the escape sequence is ill-formed, * (UChar32)0xFFFFFFFF is returned. 
See documentation of u_unescape() * for a list of recognized sequences. * * @param charAt callback function that returns a UChar of the source * text given an offset and a context pointer. * @param offset pointer to the offset that will be passed to charAt. * The offset value will be updated upon return to point after the * last parsed character of the escape sequence. On error the offset * is unchanged. * @param length the number of characters in the source text. The * last character of the source text is considered to be at offset * length-1. * @param context an opaque pointer passed directly into charAt. * @return the character represented by the escape sequence at * offset, or (UChar32)0xFFFFFFFF on error. * @see u_unescape() * @see UnicodeString#unescape() * @see UnicodeString#unescapeAt() * @stable ICU 2.0 */ U_STABLE UChar32 U_EXPORT2 u_unescapeAt(UNESCAPE_CHAR_AT charAt, int32_t *offset, int32_t length, void *context); /** * Uppercase the characters in a string. * Casing is locale-dependent and context-sensitive. * The result may be longer or shorter than the original. * The source string and the destination buffer are allowed to overlap. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string. It may be greater than destCapacity. In that case, * only some of the result was written to the destination buffer. 
* @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strToUpper(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); /** * Lowercase the characters in a string. * Casing is locale-dependent and context-sensitive. * The result may be longer or shorter than the original. * The source string and the destination buffer are allowed to overlap. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string. It may be greater than destCapacity. In that case, * only some of the result was written to the destination buffer. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strToLower(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); #if !UCONFIG_NO_BREAK_ITERATION /** * Titlecase a string. * Casing is locale-dependent and context-sensitive. * Titlecasing uses a break iterator to find the first characters of words * that are to be titlecased. It titlecases those characters and lowercases * all others. * * The titlecase break iterator can be provided to customize for arbitrary * styles, using rules and dictionaries beyond the standard iterators. * It may be more efficient to always provide an iterator to avoid * opening and closing one for each string. 
* The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * * This function uses only the setText(), first() and next() methods of the * provided break iterator. * * The result may be longer or shorter than the original. * The source string and the destination buffer are allowed to overlap. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param titleIter A break iterator to find the first characters of words * that are to be titlecased. * If none is provided (NULL), then a standard titlecase * break iterator is opened. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string. It may be greater than destCapacity. In that case, * only some of the result was written to the destination buffer. * @stable ICU 2.1 */ U_STABLE int32_t U_EXPORT2 u_strToTitle(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UBreakIterator *titleIter, const char *locale, UErrorCode *pErrorCode); #endif /** * Case-folds the characters in a string. * * Case-folding is locale-independent and not context-sensitive, * but there is an option for whether to include or exclude mappings for dotted I * and dotless i that are marked with 'T' in CaseFolding.txt. * * The result may be longer or shorter than the original. * The source string and the destination buffer are allowed to overlap. * * @param dest A buffer for the result string. 
The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string. It may be greater than destCapacity. In that case, * only some of the result was written to the destination buffer. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 u_strFoldCase(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, uint32_t options, UErrorCode *pErrorCode); #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION /** * Convert a UTF-16 string to a wchar_t string. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then * this function simply calls the fast, dedicated function for that. * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of wchar_t's). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. 
* @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. * @stable ICU 2.0 */ U_STABLE wchar_t* U_EXPORT2 u_strToWCS(wchar_t *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert a wchar_t string to UTF-16. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then * this function simply calls the fast, dedicated function for that. * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. 
* @stable ICU 2.0 */ U_STABLE UChar* U_EXPORT2 u_strFromWCS(UChar *dest, int32_t destCapacity, int32_t *pDestLength, const wchar_t *src, int32_t srcLength, UErrorCode *pErrorCode); #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */ /** * Convert a UTF-16 string to UTF-8. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of chars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. * @stable ICU 2.0 * @see u_strToUTF8WithSub * @see u_strFromUTF8 */ U_STABLE char* U_EXPORT2 u_strToUTF8(char *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert a UTF-8 string to UTF-16. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). 
If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The pointer to destination buffer. * @stable ICU 2.0 * @see u_strFromUTF8WithSub * @see u_strFromUTF8Lenient */ U_STABLE UChar* U_EXPORT2 u_strFromUTF8(UChar *dest, int32_t destCapacity, int32_t *pDestLength, const char *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert a UTF-16 string to UTF-8. * * Same as u_strToUTF8() except for the additional subchar which is output for * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8(). * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of chars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. 
If -1, then src must be zero-terminated. * @param subchar The substitution character to use in place of an illegal input sequence, * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. * A substitution character can be any valid Unicode code point (up to U+10FFFF) * except for surrogate code points (U+D800..U+DFFF). * The recommended value is U+FFFD "REPLACEMENT CHARACTER". * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @see u_strToUTF8 * @see u_strFromUTF8WithSub * @stable ICU 3.6 */ U_STABLE char* U_EXPORT2 u_strToUTF8WithSub(char *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); /** * Convert a UTF-8 string to UTF-16. * * Same as u_strFromUTF8() except for the additional subchar which is output for * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8(). * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. 
If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param subchar The substitution character to use in place of an illegal input sequence, * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. * A substitution character can be any valid Unicode code point (up to U+10FFFF) * except for surrogate code points (U+D800..U+DFFF). * The recommended value is U+FFFD "REPLACEMENT CHARACTER". * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @see u_strFromUTF8 * @see u_strFromUTF8Lenient * @see u_strToUTF8WithSub * @stable ICU 3.6 */ U_STABLE UChar* U_EXPORT2 u_strFromUTF8WithSub(UChar *dest, int32_t destCapacity, int32_t *pDestLength, const char *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); /** * Convert a UTF-8 string to UTF-16. * * Same as u_strFromUTF8() except that this function is designed to be very fast, * which it achieves by being lenient about malformed UTF-8 sequences. * This function is intended for use in environments where UTF-8 text is * expected to be well-formed. * * Its semantics are: * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text. * - The function will not read beyond the input string, nor write beyond * the destCapacity. 
* - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not * be well-formed UTF-16. * The function will resynchronize to valid code point boundaries * within a small number of code points after an illegal sequence. * - Non-shortest forms are not detected and will result in "spoofing" output. * * For further performance improvement, if srcLength is given (>=0), * then it must be destCapacity>=srcLength. * * There is no inverse u_strToUTF8Lenient() function because there is practically * no performance gain from not checking that a UTF-16 string is well-formed. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * Unlike for other ICU functions, if srcLength>=0 then it * must be destCapacity>=srcLength. * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * Unlike for other ICU functions, if srcLength>=0 but * destCapacity=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. 
* @see u_strToUTF32 * @see u_strFromUTF32WithSub * @stable ICU 4.2 */ U_STABLE UChar32* U_EXPORT2 u_strToUTF32WithSub(UChar32 *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); /** * Convert a UTF-32 string to UTF-16. * * Same as u_strFromUTF32() except for the additional subchar which is output for * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32(). * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param subchar The substitution character to use in place of an illegal input sequence, * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. * A substitution character can be any valid Unicode code point (up to U+10FFFF) * except for surrogate code points (U+D800..U+DFFF). * The recommended value is U+FFFD "REPLACEMENT CHARACTER". * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. 
Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @see u_strFromUTF32 * @see u_strToUTF32WithSub * @stable ICU 4.2 */ U_STABLE UChar* U_EXPORT2 u_strFromUTF32WithSub(UChar *dest, int32_t destCapacity, int32_t *pDestLength, const UChar32 *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); /** * Convert a 16-bit Unicode string to Java Modified UTF-8. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8 * * This function behaves according to the documentation for Java DataOutput.writeUTF() * except that it does not encode the output length in the destination buffer * and does not have an output length restriction. * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String) * * The input string need not be well-formed UTF-16. * (Therefore there is no subchar parameter.) * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of chars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. 
Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @stable ICU 4.4 * @see u_strToUTF8WithSub * @see u_strFromJavaModifiedUTF8WithSub */ U_STABLE char* U_EXPORT2 u_strToJavaModifiedUTF8( char *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Convert a Java Modified UTF-8 string to a 16-bit Unicode string. * If the input string is not well-formed and no substitution char is specified, * then the U_INVALID_CHAR_FOUND error code is set. * * This function behaves according to the documentation for Java DataInput.readUTF() * except that it takes a length parameter rather than * interpreting the first two input bytes as the length. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF() * * The output string may not be well-formed UTF-16. * * @param dest A buffer for the result string. The result will be zero-terminated if * the buffer is large enough. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the * result without writing any of the result string (pre-flighting). * @param pDestLength A pointer to receive the number of units written to the destination. If * pDestLength!=NULL then *pDestLength is always set to the * number of output units corresponding to the transformation of * all the input units, even in case of a buffer overflow. * @param src The original source string * @param srcLength The length of the original string. If -1, then src must be zero-terminated. * @param subchar The substitution character to use in place of an illegal input sequence, * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead. * A substitution character can be any valid Unicode code point (up to U+10FFFF) * except for surrogate code points (U+D800..U+DFFF). 
* The recommended value is U+FFFD "REPLACEMENT CHARACTER". * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0. * Set to 0 if no substitutions occur or subchar<0. * pNumSubstitutions can be NULL. * @param pErrorCode Pointer to a standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return The pointer to destination buffer. * @see u_strFromUTF8WithSub * @see u_strFromUTF8Lenient * @see u_strToJavaModifiedUTF8 * @stable ICU 4.4 */ U_STABLE UChar* U_EXPORT2 u_strFromJavaModifiedUTF8WithSub( UChar *dest, int32_t destCapacity, int32_t *pDestLength, const char *src, int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode); #endif // ucasemap.h /* ******************************************************************************* * * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: ucasemap.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2005may06 * created by: Markus W. Scherer * * Case mapping service object and functions using it. */ #ifndef __UCASEMAP_H__ #define __UCASEMAP_H__ /** * \file * \brief C API: Unicode case mapping functions using a UCaseMap service object. * * The service object takes care of memory allocations, data loading, and setup * for the attributes, as usual. * * Currently, the functionality provided here does not overlap with uchar.h * and ustring.h, except for ucasemap_toTitle(). * * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings. */ /** * UCaseMap is an opaque service object for newer ICU case mapping functions. * Older functions did not use a service object. 
* @stable ICU 3.4 */ struct UCaseMap; typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */ /** * Open a UCaseMap service object for a locale and a set of options. * The locale ID and options are preprocessed so that functions using the * service object need not process them in each call. * * @param locale ICU locale ID, used for language-dependent * upper-/lower-/title-casing according to the Unicode standard. * Usual semantics: ""=root, NULL=default locale, etc. * @param options Options bit set, used for case folding and string comparisons. * Same flags as for u_foldCase(), u_strFoldCase(), * u_strCaseCompare(), etc. * Use 0 or U_FOLD_CASE_DEFAULT for default behavior. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return Pointer to a UCaseMap service object, if successful. * * @see U_FOLD_CASE_DEFAULT * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I * @see U_TITLECASE_NO_LOWERCASE * @see U_TITLECASE_NO_BREAK_ADJUSTMENT * @stable ICU 3.4 */ U_STABLE UCaseMap * U_EXPORT2 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode); /** * Close a UCaseMap service object. * @param csm Object to be closed. * @stable ICU 3.4 */ U_STABLE void U_EXPORT2 ucasemap_close(UCaseMap *csm); /** * Get the locale ID that is used for language-dependent case mappings. * @param csm UCaseMap service object. * @return locale ID * @stable ICU 3.4 */ U_STABLE const char * U_EXPORT2 ucasemap_getLocale(const UCaseMap *csm); /** * Get the options bit set that is used for case folding and string comparisons. * @param csm UCaseMap service object. * @return options bit set * @stable ICU 3.4 */ U_STABLE uint32_t U_EXPORT2 ucasemap_getOptions(const UCaseMap *csm); /** * Set the locale ID that is used for language-dependent case mappings. * * @param csm UCaseMap service object. * @param locale Locale ID, see ucasemap_open(). 
* @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see ucasemap_open * @stable ICU 3.4 */ U_STABLE void U_EXPORT2 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode); /** * Set the options bit set that is used for case folding and string comparisons. * * @param csm UCaseMap service object. * @param options Options bit set, see ucasemap_open(). * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see ucasemap_open * @stable ICU 3.4 */ U_STABLE void U_EXPORT2 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode); /** * Do not lowercase non-initial parts of words when titlecasing. * Option bit for titlecasing APIs that take an options bit set. * * By default, titlecasing will titlecase the first cased character * of a word and lowercase all other characters. * With this option, the other characters will not be modified. * * @see ucasemap_setOptions * @see ucasemap_toTitle * @see ucasemap_utf8ToTitle * @see UnicodeString::toTitle * @stable ICU 3.8 */ #define U_TITLECASE_NO_LOWERCASE 0x100 /** * Do not adjust the titlecasing indexes from BreakIterator::next() indexes; * titlecase exactly the characters at breaks from the iterator. * Option bit for titlecasing APIs that take an options bit set. * * By default, titlecasing will take each break iterator index, * adjust it by looking for the next cased character, and titlecase that one. * Other characters are lowercased. * * This follows Unicode 4 & 5 section 3.13 Default Case Operations: * * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex * #29, "Text Boundaries." Between each pair of word boundaries, find the first * cased character F. If F exists, map F to default_title(F); then map each * subsequent character C to default_lower(C). 
* * @see ucasemap_setOptions * @see ucasemap_toTitle * @see ucasemap_utf8ToTitle * @see UnicodeString::toTitle * @see U_TITLECASE_NO_LOWERCASE * @stable ICU 3.8 */ #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200 #if !UCONFIG_NO_BREAK_ITERATION /** * Get the break iterator that is used for titlecasing. * Do not modify the returned break iterator. * @param csm UCaseMap service object. * @return titlecasing break iterator * @stable ICU 3.8 */ U_STABLE const UBreakIterator * U_EXPORT2 ucasemap_getBreakIterator(const UCaseMap *csm); /** * Set the break iterator that is used for titlecasing. * The UCaseMap service object releases a previously set break iterator * and "adopts" this new one, taking ownership of it. * It will be released in a subsequent call to ucasemap_setBreakIterator() * or ucasemap_close(). * * Break iterator operations are not thread-safe. Therefore, titlecasing * functions use non-const UCaseMap objects. It is not possible to titlecase * strings concurrently using the same UCaseMap. * * @param csm UCaseMap service object. * @param iterToAdopt Break iterator to be adopted for titlecasing. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * * @see ucasemap_toTitle * @see ucasemap_utf8ToTitle * @stable ICU 3.8 */ U_STABLE void U_EXPORT2 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode); /** * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(), * except that it takes ucasemap_setOptions() into account and has performance * advantages from being able to use a UCaseMap object for multiple case mapping * operations, saving setup time. * * Casing is locale-dependent and context-sensitive. * Titlecasing uses a break iterator to find the first characters of words * that are to be titlecased. It titlecases those characters and lowercases * all others. (This can be modified with ucasemap_setOptions().)
* * Note: This function takes a non-const UCaseMap pointer because it will * open a default break iterator if no break iterator was set yet, * and effectively call ucasemap_setBreakIterator(); * also because the break iterator is stateful and will be modified during * the iteration. * * The titlecase break iterator can be provided to customize for arbitrary * styles, using rules and dictionaries beyond the standard iterators. * The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * * This function uses only the setUText(), first(), next() and close() methods of the * provided break iterator. * * The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. This pointer is non-const! * See the note above for details. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. * The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. * * @see u_strToTitle * @stable ICU 3.8 */ U_STABLE int32_t U_EXPORT2 ucasemap_toTitle(UCaseMap *csm, UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); #endif /** * Lowercase the characters in a UTF-8 string. * Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. * The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. * * @see u_strToLower * @stable ICU 3.4 */ U_STABLE int32_t U_EXPORT2 ucasemap_utf8ToLower(const UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); /** * Uppercase the characters in a UTF-8 string. * Casing is locale-dependent and context-sensitive. * The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. * The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 
* @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. * * @see u_strToUpper * @stable ICU 3.4 */ U_STABLE int32_t U_EXPORT2 ucasemap_utf8ToUpper(const UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); #if !UCONFIG_NO_BREAK_ITERATION /** * Titlecase a UTF-8 string. * Casing is locale-dependent and context-sensitive. * Titlecasing uses a break iterator to find the first characters of words * that are to be titlecased. It titlecases those characters and lowercases * all others. (This can be modified with ucasemap_setOptions().) * * Note: This function takes a non-const UCaseMap pointer because it will * open a default break iterator if no break iterator was set yet, * and effectively call ucasemap_setBreakIterator(); * also because the break iterator is stateful and will be modified during * the iteration. * * The titlecase break iterator can be provided to customize for arbitrary * styles, using rules and dictionaries beyond the standard iterators. * The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * * This function uses only the setUText(), first(), next() and close() methods of the * provided break iterator. * * The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. This pointer is non-const! * See the note above for details. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. * The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of bytes). 
If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. * @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. * * @see u_strToTitle * @see U_TITLECASE_NO_LOWERCASE * @see U_TITLECASE_NO_BREAK_ADJUSTMENT * @stable ICU 3.8 */ U_STABLE int32_t U_EXPORT2 ucasemap_utf8ToTitle(UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); #endif /** * Case-folds the characters in a UTF-8 string. * * Case-folding is locale-independent and not context-sensitive, * but there is an option for whether to include or exclude mappings for dotted I * and dotless i that are marked with 'T' in CaseFolding.txt. * * The result may be longer or shorter than the original. * The source string and the destination buffer must not overlap. * * @param csm UCaseMap service object. * @param dest A buffer for the result string. The result will be NUL-terminated if * the buffer is large enough. * The contents is undefined in case of failure. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then * dest may be NULL and the function will only return the length of the result * without writing any of the result string. * @param src The original string. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. * @param pErrorCode Must be a valid pointer to an error code value, * which must not indicate a failure before the function call. 
* @return The length of the result string, if successful - or in case of a buffer overflow, * in which case it will be greater than destCapacity. * * @see u_strFoldCase * @see ucasemap_setOptions * @see U_FOLD_CASE_DEFAULT * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I * @stable ICU 3.8 */ U_STABLE int32_t U_EXPORT2 ucasemap_utf8FoldCase(const UCaseMap *csm, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); #endif // unistr.h /* ********************************************************************** * Copyright (C) 1998-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File unistr.h * * Modification History: * * Date Name Description * 09/25/98 stephen Creation. * 11/11/98 stephen Changed per 11/9 code review. * 04/20/99 stephen Overhauled per 4/16 code review. * 11/18/99 aliu Made to inherit from Replaceable. Added method * handleReplaceBetween(); other methods unchanged. * 06/25/01 grhoten Remove dependency on iostream. ****************************************************************************** */ #ifndef UNISTR_H #define UNISTR_H /** * \file * \brief C++ API: Unicode String */ struct UConverter; // unicode/ucnv.h #ifndef U_COMPARE_CODE_POINT_ORDER /* see also ustring.h and unorm.h */ /** * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc: * Compare strings in code point order instead of code unit order. * @stable ICU 2.2 */ #define U_COMPARE_CODE_POINT_ORDER 0x8000 #endif #ifndef USTRING_H /** * \ingroup ustring_ustrlen */ U_STABLE int32_t U_EXPORT2 u_strlen(const UChar *s); #endif /** * \def U_STRING_CASE_MAPPER_DEFINED * @internal */ #ifndef U_STRING_CASE_MAPPER_DEFINED #define U_STRING_CASE_MAPPER_DEFINED /** * Internal string case mapping function type. 
* @internal */ typedef int32_t U_CALLCONV UStringCaseMapper(const UCaseMap *csm, UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); #endif #endif // parseerr.h /* ********************************************************************** * Copyright (C) 1999-2005, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 03/14/00 aliu Creation. * 06/27/00 aliu Change from C++ class to C struct ********************************************************************** */ #ifndef PARSEERR_H #define PARSEERR_H /** * \file * \brief C API: Parse Error Information */ /** * The capacity of the context strings in UParseError. * @stable ICU 2.0 */ enum { U_PARSE_CONTEXT_LEN = 16 }; /** * A UParseError struct is used to return detailed information about * parsing errors. It is used by ICU parsing engines that parse long * rules, patterns, or programs, where the text being parsed is long * enough that more information than a UErrorCode is needed to * localize the error. * *

The line, offset, and context fields are optional; parsing * engines may choose not to use them. * *

The preContext and postContext strings include some part of the * context surrounding the error. If the source text is "let for=7" * and "for" is the error (e.g., because it is a reserved word), then * some examples of what a parser might produce are the following: * *

 * preContext   postContext
 * ""           ""            The parser does not support context
 * "let "       "=7"          Pre- and post-context only
 * "let "       "for=7"       Pre- and post-context and error text
 * ""           "for"         Error text only
 * 
* *

Examples of engines which use UParseError (or may use it in the * future) are Transliterator, RuleBasedBreakIterator, and * RegexPattern. * * @stable ICU 2.0 */ typedef struct UParseError { /** * The line on which the error occurred. If the parser uses this * field, it sets it to the line number of the source text line on * which the error appears, which will be a value >= 1. If the * parser does not support line numbers, the value will be <= 0. * @stable ICU 2.0 */ int32_t line; /** * The character offset to the error. If the line field is >= 1, * then this is the offset from the start of the line. Otherwise, * this is the offset from the start of the text. If the parser * does not support this field, it will have a value < 0. * @stable ICU 2.0 */ int32_t offset; /** * Textual context before the error. Null-terminated. The empty * string if not supported by parser. * @stable ICU 2.0 */ UChar preContext[U_PARSE_CONTEXT_LEN]; /** * The error itself and/or textual context after the error. * Null-terminated. The empty string if not supported by parser. * @stable ICU 2.0 */ UChar postContext[U_PARSE_CONTEXT_LEN]; } UParseError; #endif // usprep.h /* ******************************************************************************* * * Copyright (C) 2003-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: usprep.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2003jul2 * created by: Ram Viswanadha */ #ifndef __USPREP_H__ #define __USPREP_H__ /** * \file * \brief C API: Implements the StringPrep algorithm. */ /** * * StringPrep API implements the StringPrep framework as described by RFC 3454. * StringPrep prepares Unicode strings for use in network protocols. * Profiles of StringPrep are sets of rules and data according to which the * Unicode Strings are prepared.
Each profile contains tables which describe * how a code point should be treated. The tables are broadly classified into *

    *
  • Unassigned Table: Contains code points that are unassigned * in the Unicode Version supported by StringPrep. Currently * RFC 3454 supports Unicode 3.2.
  • *
  • Prohibited Table: Contains code points that are prohibited from * the output of the StringPrep processing function.
  • *
  • Mapping Table: Contains code points that are deleted from the output or case mapped.
  • *
* * The procedure for preparing Unicode strings: *
    *
  1. Map: For each character in the input, check if it has a mapping * and, if so, replace it with its mapping.
  2. *
  3. Normalize: Possibly normalize the result of step 1 using Unicode * normalization.
  4. *
  5. Prohibit: Check for any characters that are not allowed in the * output. If any are found, return an error.
  6. *
  7. Check bidi: Possibly check for right-to-left characters, and if * any are found, make sure that the whole string satisfies the * requirements for bidirectional strings. If the string does not * satisfy the requirements for bidirectional strings, return an * error.
  8. *
* @author Ram Viswanadha */ #if !UCONFIG_NO_IDNA /** * The StringPrep profile * @stable ICU 2.8 */ typedef struct UStringPrepProfile UStringPrepProfile; /** * Option to prohibit processing of unassigned code points in the input * * @see usprep_prepare * @stable ICU 2.8 */ #define USPREP_DEFAULT 0x0000 /** * Option to allow processing of unassigned code points in the input * * @see usprep_prepare * @stable ICU 2.8 */ #define USPREP_ALLOW_UNASSIGNED 0x0001 /** * enums for the standard stringprep profile types * supported by usprep_openByType. * @see usprep_openByType * @stable ICU 4.2 */ typedef enum UStringPrepProfileType { /** * RFC3491 Nameprep * @stable ICU 4.2 */ USPREP_RFC3491_NAMEPREP, /** * RFC3530 nfs4_cs_prep * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_CS_PREP, /** * RFC3530 nfs4_cs_prep with case insensitive option * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_CS_PREP_CI, /** * RFC3530 nfs4_cis_prep * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_CIS_PREP, /** * RFC3530 nfs4_mixed_prep for prefix * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_MIXED_PREP_PREFIX, /** * RFC3530 nfs4_mixed_prep for suffix * @stable ICU 4.2 */ USPREP_RFC3530_NFS4_MIXED_PREP_SUFFIX, /** * RFC3722 iSCSI * @stable ICU 4.2 */ USPREP_RFC3722_ISCSI, /** * RFC3920 XMPP Nodeprep * @stable ICU 4.2 */ USPREP_RFC3920_NODEPREP, /** * RFC3920 XMPP Resourceprep * @stable ICU 4.2 */ USPREP_RFC3920_RESOURCEPREP, /** * RFC4011 Policy MIB Stringprep * @stable ICU 4.2 */ USPREP_RFC4011_MIB, /** * RFC4013 SASLprep * @stable ICU 4.2 */ USPREP_RFC4013_SASLPREP, /** * RFC4505 trace * @stable ICU 4.2 */ USPREP_RFC4505_TRACE, /** * RFC4518 LDAP * @stable ICU 4.2 */ USPREP_RFC4518_LDAP, /** * RFC4518 LDAP for case ignore, numeric and stored prefix * matching rules * @stable ICU 4.2 */ USPREP_RFC4518_LDAP_CI } UStringPrepProfileType; /** * Creates a StringPrep profile from the data file. * * @param path string containing the full path pointing to the directory * where the profile reside followed by the package name * e.g. 
"/usr/resource/my_app/profiles/mydata" on a Unix system. * if NULL, ICU default data files will be used. * @param fileName name of the profile file to be opened * @param status ICU error code in/out parameter. Must not be NULL. * Must fulfill U_SUCCESS before the function call. * @return Pointer to UStringPrepProfile that is opened. Should be closed by * calling usprep_close() * @see usprep_close() * @stable ICU 2.8 */ U_STABLE UStringPrepProfile* U_EXPORT2 usprep_open(const char* path, const char* fileName, UErrorCode* status); /** * Creates a StringPrep profile for the specified profile type. * * @param type The profile type * @param status ICU error code in/out parameter. Must not be NULL. * Must fulfill U_SUCCESS before the function call. * @return Pointer to UStringPrepProfile that is opened. Should be closed by * calling usprep_close() * @see usprep_close() * @stable ICU 4.2 */ U_STABLE UStringPrepProfile* U_EXPORT2 usprep_openByType(UStringPrepProfileType type, UErrorCode* status); /** * Closes the profile * @param profile The profile to close * @stable ICU 2.8 */ U_STABLE void U_EXPORT2 usprep_close(UStringPrepProfile* profile); /** * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC), * checks for prohited and BiDi characters in the order defined by RFC 3454 * depending on the options specified in the profile. * * @param prep The profile to use * @param src Pointer to UChar buffer containing the string to prepare * @param srcLength Number of characters in the source string * @param dest Pointer to the destination buffer to receive the output * @param destCapacity The capacity of destination array * @param options A bit set of options: * * - USPREP_DEFAULT Prohibit processing of unassigned code points in the input * * - USPREP_ALLOW_UNASSIGNED Treat the unassigned code points are in the input * as normal Unicode code points. 
* * @param parseError Pointer to UParseError struct to receive information on position * of error if an error is encountered. Can be NULL. * @param status ICU in/out error code parameter. * U_INVALID_CHAR_FOUND if src contains * unmatched single surrogates. * U_INDEX_OUTOFBOUNDS_ERROR if src contains * too many code points. * U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough * @return The number of UChars in the destination buffer * @stable ICU 2.8 */ U_STABLE int32_t U_EXPORT2 usprep_prepare( const UStringPrepProfile* prep, const UChar* src, int32_t srcLength, UChar* dest, int32_t destCapacity, int32_t options, UParseError* parseError, UErrorCode* status ); #endif /* #if !UCONFIG_NO_IDNA */ #endif // uidna.h /* ******************************************************************************* * * Copyright (C) 2003-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uidna.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2003feb1 * created by: Ram Viswanadha */ #ifndef __UIDNA_H__ #define __UIDNA_H__ #if !UCONFIG_NO_IDNA /** * \file * \brief C API: Internationalizing Domain Names in Applications (IDNA) * * IDNA2008 is implemented according to UTS #46, see the IDNA C++ class in idna.h. * * The C API functions which do take a UIDNA * service object pointer * implement UTS #46 and IDNA2008. * * IDNA2003 is obsolete. * The C API functions which do not take a service object pointer * implement IDNA2003. They are all deprecated. */ /* * IDNA option bit set values. */ enum { /** * Default options value: None of the other options are set. * For use in static worker and factory methods. * @stable ICU 2.6 */ UIDNA_DEFAULT=0, /** * Option to check whether the input conforms to the STD3 ASCII rules, * for example the restriction of labels to LDH characters * (ASCII Letters, Digits and Hyphen-Minus).
* For use in static worker and factory methods. * @stable ICU 2.6 */ UIDNA_USE_STD3_RULES=2, /** * IDNA option to check for whether the input conforms to the BiDi rules. * For use in static worker and factory methods. *

This option is ignored by the IDNA2003 implementation. * (IDNA2003 always performs a BiDi check.) * @stable ICU 4.6 */ UIDNA_CHECK_BIDI=4, /** * IDNA option to check for whether the input conforms to the CONTEXTJ rules. * For use in static worker and factory methods. *

This option is ignored by the IDNA2003 implementation. * (The CONTEXTJ check is new in IDNA2008.) * @stable ICU 4.6 */ UIDNA_CHECK_CONTEXTJ=8, /** * IDNA option for nontransitional processing in ToASCII(). * For use in static worker and factory methods. *

By default, ToASCII() uses transitional processing. *

This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @stable ICU 4.6 */ UIDNA_NONTRANSITIONAL_TO_ASCII=0x10, /** * IDNA option for nontransitional processing in ToUnicode(). * For use in static worker and factory methods. *

By default, ToUnicode() uses transitional processing. *

This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @stable ICU 4.6 */ UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20, /** * IDNA option to check for whether the input conforms to the CONTEXTO rules. * For use in static worker and factory methods. *

This option is ignored by the IDNA2003 implementation. * (The CONTEXTO check is new in IDNA2008.) *

This is for use by registries for IDNA2008 conformance. * UTS #46 does not require the CONTEXTO check. * @stable ICU 49 */ UIDNA_CHECK_CONTEXTO=0x40 }; /** * Opaque C service object type for the new IDNA API. * @stable ICU 4.6 */ struct UIDNA; typedef struct UIDNA UIDNA; /**< C typedef for struct UIDNA. @stable ICU 4.6 */ /** * Returns a UIDNA instance which implements UTS #46. * Returns an unmodifiable instance, owned by the caller. * Cache it for multiple operations, and uidna_close() it when done. * The instance is thread-safe, that is, it can be used concurrently. * * For details about the UTS #46 implementation see the IDNA C++ class in idna.h. * * @param options Bit set to modify the processing and error checking. * See option bit set values in uidna.h. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return the UTS #46 UIDNA instance, if successful * @stable ICU 4.6 */ U_STABLE UIDNA * U_EXPORT2 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode); /** * Closes a UIDNA instance. * @param idna UIDNA instance to be closed * @stable ICU 4.6 */ U_STABLE void U_EXPORT2 uidna_close(UIDNA *idna); /** * Output container for IDNA processing errors. * Initialize with UIDNA_INFO_INITIALIZER: * \code * UIDNAInfo info = UIDNA_INFO_INITIALIZER; * int32_t length = uidna_nameToASCII(..., &info, &errorCode); * if(U_SUCCESS(errorCode) && info.errors!=0) { ... } * \endcode * @stable ICU 4.6 */ typedef struct UIDNAInfo { /** sizeof(UIDNAInfo) @stable ICU 4.6 */ int16_t size; /** * Set to TRUE if transitional and nontransitional processing produce different results. * For details see C++ IDNAInfo::isTransitionalDifferent(). * @stable ICU 4.6 */ UBool isTransitionalDifferent; UBool reservedB3; /**< Reserved field, do not use. @internal */ /** * Bit set indicating IDNA processing errors.
0 if no errors. * See UIDNA_ERROR_... constants. * @stable ICU 4.6 */ uint32_t errors; int32_t reservedI2; /**< Reserved field, do not use. @internal */ int32_t reservedI3; /**< Reserved field, do not use. @internal */ } UIDNAInfo; /** * Static initializer for a UIDNAInfo struct. * Sets size to sizeof(UIDNAInfo) and every other field, including the * reserved ones, to FALSE or 0. * @stable ICU 4.6 */ #define UIDNA_INFO_INITIALIZER { \ (int16_t)sizeof(UIDNAInfo), \ FALSE, FALSE, \ 0, 0, 0 } /** * Converts a single domain name label into its ASCII form for DNS lookup. * If any processing step fails, then pInfo->errors will be non-zero and * the result might not be an ASCII string. * The label might be modified according to the types of errors. * Labels with severe errors will be left in (or turned into) their Unicode form. * * The UErrorCode indicates an error only in exceptional cases, * such as a U_MEMORY_ALLOCATION_ERROR. * * @param idna UIDNA instance * @param label Input domain name label * @param length Label length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 uidna_labelToASCII(const UIDNA *idna, const UChar *label, int32_t length, UChar *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a single domain name label into its Unicode form for human-readable display. * If any processing step fails, then pInfo->errors will be non-zero. * The label might be modified according to the types of errors. * * The UErrorCode indicates an error only in exceptional cases, * such as a U_MEMORY_ALLOCATION_ERROR.
* * @param idna UIDNA instance * @param label Input domain name label * @param length Label length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 uidna_labelToUnicode(const UIDNA *idna, const UChar *label, int32_t length, UChar *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a whole domain name into its ASCII form for DNS lookup. * If any processing step fails, then pInfo->errors will be non-zero and * the result might not be an ASCII string. * The domain name might be modified according to the types of errors. * Labels with severe errors will be left in (or turned into) their Unicode form. * * The UErrorCode indicates an error only in exceptional cases, * such as a U_MEMORY_ALLOCATION_ERROR. * * @param idna UIDNA instance * @param name Input domain name * @param length Domain name length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 uidna_nameToASCII(const UIDNA *idna, const UChar *name, int32_t length, UChar *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a whole domain name into its Unicode form for human-readable display.
* If any processing step fails, then pInfo->errors will be non-zero. * The domain name might be modified according to the types of errors. * * The UErrorCode indicates an error only in exceptional cases, * such as a U_MEMORY_ALLOCATION_ERROR. * * @param idna UIDNA instance * @param name Input domain name * @param length Domain name length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 uidna_nameToUnicode(const UIDNA *idna, const UChar *name, int32_t length, UChar *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /* UTF-8 versions of the processing methods --------------------------------- */ /** * Converts a single domain name label into its ASCII form for DNS lookup. * UTF-8 version of uidna_labelToASCII(), same behavior. * * @param idna UIDNA instance * @param label Input domain name label * @param length Label length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.)
* @return destination string length * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 uidna_labelToASCII_UTF8(const UIDNA *idna, const char *label, int32_t length, char *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a single domain name label into its Unicode form for human-readable display. * UTF-8 version of uidna_labelToUnicode(), same behavior. * Note: this name has no underscore before "UTF8", unlike uidna_labelToASCII_UTF8(). * * @param idna UIDNA instance * @param label Input domain name label * @param length Label length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 uidna_labelToUnicodeUTF8(const UIDNA *idna, const char *label, int32_t length, char *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a whole domain name into its ASCII form for DNS lookup. * UTF-8 version of uidna_nameToASCII(), same behavior. * * @param idna UIDNA instance * @param name Input domain name * @param length Domain name length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.)
* @return destination string length * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 uidna_nameToASCII_UTF8(const UIDNA *idna, const char *name, int32_t length, char *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /** * Converts a whole domain name into its Unicode form for human-readable display. * UTF-8 version of uidna_nameToUnicode(), same behavior. * Note: this name has no underscore before "UTF8", unlike uidna_nameToASCII_UTF8(). * * @param idna UIDNA instance * @param name Input domain name * @param length Domain name length, or -1 if NUL-terminated * @param dest Destination string buffer * @param capacity Destination buffer capacity * @param pInfo Output container of IDNA processing details. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return destination string length * @stable ICU 4.6 */ U_STABLE int32_t U_EXPORT2 uidna_nameToUnicodeUTF8(const UIDNA *idna, const char *name, int32_t length, char *dest, int32_t capacity, UIDNAInfo *pInfo, UErrorCode *pErrorCode); /* * IDNA error bit set values. * When a domain name or label fails a processing step or does not meet the * validity criteria, then one or more of these error bits are set. */ enum { /** * A non-final domain name label (or the whole domain name) is empty. * @stable ICU 4.6 */ UIDNA_ERROR_EMPTY_LABEL=1, /** * A domain name label is longer than 63 bytes. * (See STD13/RFC1034 3.1. Name space specifications and terminology.) * This is only checked in ToASCII operations, and only if the output label is all-ASCII. * @stable ICU 4.6 */ UIDNA_ERROR_LABEL_TOO_LONG=2, /** * A domain name is longer than 255 bytes in its storage form. * (See STD13/RFC1034 3.1. Name space specifications and terminology.) * This is only checked in ToASCII operations, and only if the output domain name is all-ASCII.
* @stable ICU 4.6 */ UIDNA_ERROR_DOMAIN_NAME_TOO_LONG=4, /** * A label starts with a hyphen-minus ('-'). * @stable ICU 4.6 */ UIDNA_ERROR_LEADING_HYPHEN=8, /** * A label ends with a hyphen-minus ('-'). * @stable ICU 4.6 */ UIDNA_ERROR_TRAILING_HYPHEN=0x10, /** * A label contains hyphen-minus ('-') in the third and fourth positions. * @stable ICU 4.6 */ UIDNA_ERROR_HYPHEN_3_4=0x20, /** * A label starts with a combining mark. * @stable ICU 4.6 */ UIDNA_ERROR_LEADING_COMBINING_MARK=0x40, /** * A label or domain name contains disallowed characters. * @stable ICU 4.6 */ UIDNA_ERROR_DISALLOWED=0x80, /** * A label starts with "xn--" but does not contain valid Punycode. * That is, an xn-- label failed Punycode decoding. * @stable ICU 4.6 */ UIDNA_ERROR_PUNYCODE=0x100, /** * A label contains a dot (full stop). * This can occur in an input string for a single-label function. * @stable ICU 4.6 */ UIDNA_ERROR_LABEL_HAS_DOT=0x200, /** * An ACE label does not contain a valid label string. * The label was successfully ACE (Punycode) decoded but the resulting * string had severe validation errors. For example, * it might contain characters that are not allowed in ACE labels, * or it might not be normalized. * @stable ICU 4.6 */ UIDNA_ERROR_INVALID_ACE_LABEL=0x400, /** * A label does not meet the IDNA BiDi requirements (for right-to-left characters). * @stable ICU 4.6 */ UIDNA_ERROR_BIDI=0x800, /** * A label does not meet the IDNA CONTEXTJ requirements. * @stable ICU 4.6 */ UIDNA_ERROR_CONTEXTJ=0x1000, /** * A label does not meet the IDNA CONTEXTO requirements for punctuation characters. * Some punctuation characters "Would otherwise have been DISALLOWED" * but are allowed in certain contexts. (RFC 5892) * @stable ICU 49 */ UIDNA_ERROR_CONTEXTO_PUNCTUATION=0x2000, /** * A label does not meet the IDNA CONTEXTO requirements for digits. * Arabic-Indic Digits (U+066x) must not be mixed with Extended Arabic-Indic Digits (U+06Fx).
* @stable ICU 49 */ UIDNA_ERROR_CONTEXTO_DIGITS=0x4000 }; #endif /* #if !UCONFIG_NO_IDNA */ #endif // ubrk.h /* ****************************************************************************** * Copyright (C) 1996-2015, International Business Machines Corporation and others. * All Rights Reserved. ****************************************************************************** */ #ifndef UBRK_H #define UBRK_H /** * A text-break iterator. * For usage in C programs. */ #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR # define UBRK_TYPEDEF_UBREAK_ITERATOR /** * Opaque type representing an ICU Break iterator object. * @stable ICU 2.0 */ typedef struct UBreakIterator UBreakIterator; #endif #if !UCONFIG_NO_BREAK_ITERATION /** * \file * \brief C API: BreakIterator * *

BreakIterator C API

* * The BreakIterator C API defines methods for finding the location * of boundaries in text. A UBreakIterator maintains a * current position and scans over text, returning the index of characters * where boundaries occur. *

* Line boundary analysis determines where a text string can be broken * when line-wrapping. The mechanism correctly handles punctuation and * hyphenated words. *

* Note: The locale keyword "lb" can be used to modify line break * behavior according to the CSS level 3 line-break options, see * http://dev.w3.org/csswg/css-text/#line-breaking. For example: * "ja@lb=strict", "zh@lb=loose". *

* Sentence boundary analysis allows selection with correct * interpretation of periods within numbers and abbreviations, and * trailing punctuation marks such as quotation marks and parentheses. *

* Note: The locale keyword "ss" can be used to enable use of * segmentation suppression data (preventing breaks in English after * abbreviations such as "Mr." or "Est.", for example), as follows: * "en@ss=standard". *

* Word boundary analysis is used by search and replace functions, as * well as within text editing applications that allow the user to * select words with a double click. Word selection provides correct * interpretation of punctuation marks within and following * words. Characters that are not part of a word, such as symbols or * punctuation marks, have word-breaks on both sides. *

* Character boundary analysis identifies the boundaries of * "Extended Grapheme Clusters", which are groupings of codepoints * that should be treated as character-like units for many text operations. * Please see Unicode Standard Annex #29, Unicode Text Segmentation, * http://www.unicode.org/reports/tr29/ for additional information * on grapheme clusters and guidelines on their use. *

* Title boundary analysis locates all positions, * typically starts of words, that should be set to Title Case * when title casing the text. *

* The text boundary positions are found according to the rules * described in Unicode Standard Annex #29, Unicode Text Segmentation, and * Unicode Standard Annex #14, Unicode Line Breaking Algorithm. These * are available at http://www.unicode.org/reports/tr29/ and * http://www.unicode.org/reports/tr14/, respectively. *

* In addition to the plain C API defined in this header file, an * object oriented C++ API with equivalent functionality is defined in the * file brkiter.h. *

* Code snippets illustrating the use of the Break Iterator APIs * are available in the ICU User Guide, * http://icu-project.org/userguide/boundaryAnalysis.html * and in the sample program icu/source/samples/break/break.cpp */ /** The possible types of text boundaries. @stable ICU 2.0 */ typedef enum UBreakIteratorType { /** Character breaks @stable ICU 2.0 */ UBRK_CHARACTER = 0, /** Word breaks @stable ICU 2.0 */ UBRK_WORD = 1, /** Line breaks @stable ICU 2.0 */ UBRK_LINE = 2, /** Sentence breaks @stable ICU 2.0 */ UBRK_SENTENCE = 3, /** NOTE(review): appears to be a count/limit sentinel rather than a break type. * The value 4 is skipped in this header, presumably for a break type * omitted from this SDK build - confirm against upstream ICU ubrk.h. */ UBRK_COUNT = 5 } UBreakIteratorType; /** Value indicating all text boundaries have been returned. * @stable ICU 2.0 */ #define UBRK_DONE ((int32_t) -1) /** * Enum constants for the word break tags returned by * ubrk_getRuleStatus(). A range of values is defined for each category of * word, to allow for further subdivisions of a category in future releases. * Applications should check for tag values falling within the range, rather * than for single individual values. * @stable ICU 2.2 */ typedef enum UWordBreak { /** Tag value for "words" that do not fit into any of the other categories. * Includes spaces and most punctuation. */ UBRK_WORD_NONE = 0, /** Upper bound for tags for uncategorized words. */ UBRK_WORD_NONE_LIMIT = 100, /** Tag value for words that appear to be numbers, lower limit. */ UBRK_WORD_NUMBER = 100, /** Tag value for words that appear to be numbers, upper limit. */ UBRK_WORD_NUMBER_LIMIT = 200, /** Tag value for words that contain letters, excluding * hiragana, katakana or ideographic characters, lower limit.
*/ UBRK_WORD_LETTER = 200, /** Tag value for words containing letters, upper limit */ UBRK_WORD_LETTER_LIMIT = 300, /** Tag value for words containing kana characters, lower limit */ UBRK_WORD_KANA = 300, /** Tag value for words containing kana characters, upper limit */ UBRK_WORD_KANA_LIMIT = 400, /** Tag value for words containing ideographic characters, lower limit */ UBRK_WORD_IDEO = 400, /** Tag value for words containing ideographic characters, upper limit */ UBRK_WORD_IDEO_LIMIT = 500 } UWordBreak; /** * Enum constants for the line break tags returned by ubrk_getRuleStatus(). * A range of values is defined for each category of * line break, to allow for further subdivisions of a category in future releases. * Applications should check for tag values falling within the range, rather * than for single individual values. * @stable ICU 2.8 */ typedef enum ULineBreakTag { /** Tag value for soft line breaks, positions at which a line break * is acceptable but not required */ UBRK_LINE_SOFT = 0, /** Upper bound for soft line breaks. */ UBRK_LINE_SOFT_LIMIT = 100, /** Tag value for a hard, or mandatory line break */ UBRK_LINE_HARD = 100, /** Upper bound for hard line breaks. */ UBRK_LINE_HARD_LIMIT = 200 } ULineBreakTag; /** * Enum constants for the sentence break tags returned by ubrk_getRuleStatus(). * A range of values is defined for each category of * sentence, to allow for further subdivisions of a category in future releases. * Applications should check for tag values falling within the range, rather * than for single individual values. * @stable ICU 2.8 */ typedef enum USentenceBreakTag { /** Tag value for sentences ending with a sentence terminator * ('.', '?', '!', etc.) character, possibly followed by a * hard separator (CR, LF, PS, etc.) */ UBRK_SENTENCE_TERM = 0, /** Upper bound for tags for sentences ended by sentence terminators.
*/ UBRK_SENTENCE_TERM_LIMIT = 100, /** Tag value for sentences that do not contain an ending * sentence terminator ('.', '?', '!', etc.) character, but * are ended only by a hard separator (CR, LF, PS, etc.) or end of input. */ UBRK_SENTENCE_SEP = 100, /** Upper bound for tags for sentences ended by a separator. */ UBRK_SENTENCE_SEP_LIMIT = 200 } USentenceBreakTag; /** * Open a new UBreakIterator for locating text boundaries for a specified locale. * A UBreakIterator may be used for detecting character, line, word, * and sentence breaks in text. * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD, * UBRK_LINE, UBRK_SENTENCE * @param locale The locale specifying the text-breaking conventions. Note that * locale keys such as "lb" and "ss" may be used to modify text break behavior, * see general discussion of BreakIterator C API. * @param text The text to be iterated over. * @param textLength The number of characters in text, or -1 if null-terminated. * @param status A UErrorCode to receive any errors. * @return A UBreakIterator for the specified locale. * @see ubrk_openRules * @stable ICU 2.0 */ U_STABLE UBreakIterator* U_EXPORT2 ubrk_open(UBreakIteratorType type, const char *locale, const UChar *text, int32_t textLength, UErrorCode *status); /** * Open a new UBreakIterator for locating text boundaries using specified breaking rules. * The rule syntax is ... (TBD) * @param rules A set of rules specifying the text breaking conventions. * @param rulesLength The number of characters in rules, or -1 if null-terminated. * @param text The text to be iterated over. May be null, in which case ubrk_setText() is * used to specify the text to be iterated. * @param textLength The number of characters in text, or -1 if null-terminated. * @param parseErr Receives position and context information for any syntax errors * detected while parsing the rules.
* @param status A UErrorCode to receive any errors. * @return A UBreakIterator for the specified rules. * @see ubrk_open * @stable ICU 2.2 */ U_STABLE UBreakIterator* U_EXPORT2 ubrk_openRules(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status); /** * Thread safe cloning operation * @param bi iterator to be cloned * @param stackBuffer Deprecated functionality as of ICU 52, use NULL.
* user allocated space for the new clone. If NULL new memory will be allocated. * If buffer is not large enough, new memory will be allocated. * Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. * @param pBufferSize Deprecated functionality as of ICU 52, use NULL or 1.
* pointer to size of allocated space. * If *pBufferSize == 0, a sufficient size for use in cloning will * be returned ('pre-flighting') * If *pBufferSize is not enough for a stack-based safe clone, * new memory will be allocated. * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_WARNING, is used if any allocations were necessary. * @return pointer to the new clone * @stable ICU 2.0 */ U_STABLE UBreakIterator * U_EXPORT2 ubrk_safeClone( const UBreakIterator *bi, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); /** * Close a UBreakIterator. * Once closed, a UBreakIterator may no longer be used. * @param bi The break iterator to close. * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubrk_close(UBreakIterator *bi); /** * Sets an existing iterator to point to a new piece of text * @param bi The iterator to use * @param text The text to be set * @param textLength The length of the text * @param status The error code * @stable ICU 2.0 */ U_STABLE void U_EXPORT2 ubrk_setText(UBreakIterator* bi, const UChar* text, int32_t textLength, UErrorCode* status); /** * Sets an existing iterator to point to a new piece of text. * * All index positions returned by break iterator functions are * native indices from the UText. For example, when breaking UTF-8 * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc. * will be UTF-8 string indices, not UTF-16 positions. * * @param bi The iterator to use * @param text The text to be set. * This function makes a shallow clone of the supplied UText. This means * that the caller is free to immediately close or otherwise reuse the * UText that was passed as a parameter, but that the underlying text itself * must not be altered while being referenced by the break iterator.
* @param status The error code * @stable ICU 3.4 */ U_STABLE void U_EXPORT2 ubrk_setUText(UBreakIterator* bi, UText* text, UErrorCode* status); /** * Determine the most recently-returned text boundary. * * @param bi The break iterator to use. * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous, * \ref ubrk_first, or \ref ubrk_last. * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubrk_current(const UBreakIterator *bi); /** * Advance the iterator to the boundary following the current boundary. * * @param bi The break iterator to use. * @return The character index of the next text boundary, or UBRK_DONE * if all text boundaries have been returned. * @see ubrk_previous * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubrk_next(UBreakIterator *bi); /** * Set the iterator position to the boundary preceding the current boundary. * * @param bi The break iterator to use. * @return The character index of the preceding text boundary, or UBRK_DONE * if all text boundaries have been returned. * @see ubrk_next * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubrk_previous(UBreakIterator *bi); /** * Set the iterator position to zero, the start of the text being scanned. * @param bi The break iterator to use. * @return The new iterator position (zero). * @see ubrk_last * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubrk_first(UBreakIterator *bi); /** * Set the iterator position to the index immediately beyond the last character in the text being scanned. * This is not the same as the last character. * @param bi The break iterator to use. * @return The character offset immediately beyond the last character in the * text being scanned. * @see ubrk_first * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubrk_last(UBreakIterator *bi); /** * Set the iterator position to the first boundary preceding the specified offset. * The new position is always smaller than offset, or UBRK_DONE. * @param bi The break iterator to use.
* @param offset The offset to begin scanning. * @return The text boundary preceding offset, or UBRK_DONE. * @see ubrk_following * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubrk_preceding(UBreakIterator *bi, int32_t offset); /** * Advance the iterator to the first boundary following the specified offset. * The value returned is always greater than offset, or UBRK_DONE. * @param bi The break iterator to use. * @param offset The offset to begin scanning. * @return The text boundary following offset, or UBRK_DONE. * @see ubrk_preceding * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubrk_following(UBreakIterator *bi, int32_t offset); /** * Get a locale for which text breaking information is available. * A UBreakIterator in a locale returned by this function will perform the correct * text breaking for the locale. * @param index The index of the desired locale. * @return A locale for which number text breaking information is available, or 0 if none. * @see ubrk_countAvailable * @stable ICU 2.0 */ U_STABLE const char* U_EXPORT2 ubrk_getAvailable(int32_t index); /** * Determine how many locales have text breaking information available. * This function is most useful as determining the loop ending condition for * calls to \ref ubrk_getAvailable. * @return The number of locales for which text breaking information is available. * @see ubrk_getAvailable * @stable ICU 2.0 */ U_STABLE int32_t U_EXPORT2 ubrk_countAvailable(void); /** * Returns true if the specfied position is a boundary position. As a side * effect, leaves the iterator pointing to the first boundary position at * or after "offset". * @param bi The break iterator to use. * @param offset the offset to check. * @return True if "offset" is a boundary position. * @stable ICU 2.0 */ U_STABLE UBool U_EXPORT2 ubrk_isBoundary(UBreakIterator *bi, int32_t offset); /** * Return the status from the break rule that determined the most recently * returned break position. 
The values appear in the rule source * within brackets, {123}, for example. For rules that do not specify a * status, a default value of 0 is returned. *

* For word break iterators, the possible values are defined in enum UWordBreak. * @stable ICU 2.2 */ U_STABLE int32_t U_EXPORT2 ubrk_getRuleStatus(UBreakIterator *bi); /** * Get the statuses from the break rules that determined the most recently * returned break position. The values appear in the rule source * within brackets, {123}, for example. The default status value for rules * that do not explicitly provide one is zero. *

* For word break iterators, the possible values are defined in enum UWordBreak.
 * @param bi        The break iterator to use
 * @param fillInVec an array to be filled in with the status values.
 * @param capacity  the length of the supplied vector.  A length of zero causes
 *                  the function to return the number of status values, in the
 *                  normal way, without attempting to store any values.
 * @param status    receives error codes.
 * @return          The number of rule status values from rules that determined
 *                  the most recent boundary returned by the break iterator.
 * @stable ICU 3.0
 */
U_STABLE int32_t U_EXPORT2 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);

/**
 * Return the locale of the break iterator. You can choose between the valid and
 * the actual locale.
 * @param bi break iterator
 * @param type locale type (valid or actual)
 * @param status error code
 * @return locale string
 * @stable ICU 2.8
 */
U_STABLE const char* U_EXPORT2 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);

/**
 * Set the subject text string upon which the break iterator is operating
 * without changing any other aspect of the state.
 * The new and previous text strings must have the same content.
 *
 * This function is intended for use in environments where ICU is operating on
 * strings that may move around in memory.  It provides a mechanism for notifying
 * ICU that the string has been relocated, and providing a new UText to access the
 * string in its new position.
 *
 * Note that the break iterator never copies the underlying text
 * of a string being processed, but always operates directly on the original text
 * provided by the user. Refreshing simply drops the references to the old text
 * and replaces them with references to the new.
 *
 * Caution:  this function is normally used only by very specialized
 *           system-level code.   One example use case is with garbage collection
 *           that moves the text in memory.
*
 * @param bi     The break iterator.
 * @param text   The new (moved) text string.
 * @param status Receives errors detected by this function.
 *
 * @stable ICU 49
 */
U_STABLE void U_EXPORT2 ubrk_refreshUText(UBreakIterator *bi, UText *text, UErrorCode *status);

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif /* NOTE(review): presumably closes the ubrk.h include guard opened before this chunk; confirm */

// messagepattern.h
/*
*******************************************************************************
*   Copyright (C) 2011-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  messagepattern.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2011mar14
*   created by: Markus W. Scherer
*/

#ifndef __MESSAGEPATTERN_H__
#define __MESSAGEPATTERN_H__

/**
 * \file
 * \brief C++ API: MessagePattern class: Parses and represents ICU MessageFormat patterns.
 */

#if !UCONFIG_NO_FORMATTING

/**
 * Mode for when an apostrophe starts quoted literal text for MessageFormat output.
 * The default is DOUBLE_OPTIONAL unless overridden via uconfig.h
 * (UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE).
 *

* A pair of adjacent apostrophes always results in a single apostrophe in the output, * even when the pair is between two single, text-quoting apostrophes. *

* The following table shows examples of desired MessageFormat.format() output * with the pattern strings that yield that output. *

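* <table border>
 *   <tr>
 *     <th>Desired output</th>
 *     <th>DOUBLE_OPTIONAL</th>
 *     <th>DOUBLE_REQUIRED</th>
 *   </tr>
 *   <tr>
 *     <td>I see {many}</td>
 *     <td>I see '{many}'</td>
 *     <td>(same)</td>
 *   </tr>
 *   <tr>
 *     <td>I said {'Wow!'}</td>
 *     <td>I said '{''Wow!''}'</td>
 *     <td>(same)</td>
 *   </tr>
 *   <tr>
 *     <td>I don't know</td>
 *     <td>I don't know OR<br> I don''t know</td>
 *     <td>I don''t know</td>
 *   </tr>
 * </table>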
* @stable ICU 4.8
 * @see UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE
 */
enum UMessagePatternApostropheMode {
    /**
     * A literal apostrophe is represented by
     * either a single or a double apostrophe pattern character.
     * Within a MessageFormat pattern, a single apostrophe only starts quoted literal text
     * if it immediately precedes a curly brace {},
     * or a pipe symbol | if inside a choice format,
     * or a pound symbol # if inside a plural format.
     *
     * This is the default behavior starting with ICU 4.8.
     * @stable ICU 4.8
     */
    UMSGPAT_APOS_DOUBLE_OPTIONAL,
    /**
     * A literal apostrophe must be represented by
     * a double apostrophe pattern character.
     * A single apostrophe always starts quoted literal text.
     *
     * This is the behavior of ICU 4.6 and earlier, and of the JDK.
     * @stable ICU 4.8
     */
    UMSGPAT_APOS_DOUBLE_REQUIRED
};
/**
 * @stable ICU 4.8
 */
typedef enum UMessagePatternApostropheMode UMessagePatternApostropheMode;

/**
 * MessagePattern::Part type constants.
 * @stable ICU 4.8
 */
enum UMessagePatternPartType {
    /**
     * Start of a message pattern (main or nested).
     * The length is 0 for the top-level message
     * and for a choice argument sub-message, otherwise 1 for the '{'.
     * The value indicates the nesting level, starting with 0 for the main message.
     *
     * There is always a later MSG_LIMIT part.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_MSG_START,
    /**
     * End of a message pattern (main or nested).
     * The length is 0 for the top-level message and
     * the last sub-message of a choice argument,
     * otherwise 1 for the '}' or (in a choice argument style) the '|'.
     * The value indicates the nesting level, starting with 0 for the main message.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_MSG_LIMIT,
    /**
     * Indicates a substring of the pattern string which is to be skipped when formatting.
     * For example, an apostrophe that begins or ends quoted text
     * would be indicated with such a part.
     * The value is undefined and currently always 0.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_SKIP_SYNTAX,
    /**
     * Indicates that a syntax character needs to be inserted for auto-quoting.
     * The length is 0.
     * The value is the character code of the insertion character. (U+0027=APOSTROPHE)
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_INSERT_CHAR,
    /**
     * Indicates a syntactic (non-escaped) # symbol in a plural variant.
     * When formatting, replace this part's substring with the
     * (value-offset) for the plural argument value.
     * The value is undefined and currently always 0.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_REPLACE_NUMBER,
    /**
     * Start of an argument.
     * The length is 1 for the '{'.
     * The value is the ordinal value of the ArgType. Use getArgType().
     *
     * This part is followed by either an ARG_NUMBER or ARG_NAME,
     * followed by optional argument sub-parts (see UMessagePatternArgType constants)
     * and finally an ARG_LIMIT part.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_ARG_START,
    /**
     * End of an argument.
     * The length is 1 for the '}'.
     * The value is the ordinal value of the ArgType. Use getArgType().
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_ARG_LIMIT,
    /**
     * The argument number, provided by the value.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_ARG_NUMBER,
    /**
     * The argument name.
     * The value is undefined and currently always 0.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_ARG_NAME,
    /**
     * The argument type.
     * The value is undefined and currently always 0.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_ARG_TYPE,
    /**
     * The argument style text.
     * The value is undefined and currently always 0.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_ARG_STYLE,
    /**
     * A selector substring in a "complex" argument style.
     * The value is undefined and currently always 0.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_ARG_SELECTOR,
    /**
     * An integer value, for example the offset or an explicit selector value
     * in a PluralFormat style.
     * The part value is the integer value.
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_ARG_INT,
    /**
     * A numeric value, for example the offset or an explicit selector value
     * in a PluralFormat style.
     * The part value is an index into an internal array of numeric values;
     * use getNumericValue().
     * @stable ICU 4.8
     */
    UMSGPAT_PART_TYPE_ARG_DOUBLE
};
/**
 * @stable ICU 4.8
 */
typedef enum UMessagePatternPartType UMessagePatternPartType;

/**
 * Argument type constants.
 * Returned by Part.getArgType() for ARG_START and ARG_LIMIT parts.
 *
 * Messages nested inside an argument are each delimited by MSG_START and MSG_LIMIT,
 * with a nesting level one greater than the surrounding message.
 * @stable ICU 4.8
 */
enum UMessagePatternArgType {
    /**
     * The argument has no specified type.
     * @stable ICU 4.8
     */
    UMSGPAT_ARG_TYPE_NONE,
    /**
     * The argument has a "simple" type which is provided by the ARG_TYPE part.
     * An ARG_STYLE part might follow that.
     * @stable ICU 4.8
     */
    UMSGPAT_ARG_TYPE_SIMPLE,
    /**
     * The argument is a ChoiceFormat with one or more
     * ((ARG_INT | ARG_DOUBLE), ARG_SELECTOR, message) tuples.
     * @stable ICU 4.8
     */
    UMSGPAT_ARG_TYPE_CHOICE,
    /**
     * The argument is a cardinal-number PluralFormat with an optional ARG_INT or ARG_DOUBLE offset
     * (e.g., offset:1)
     * and one or more (ARG_SELECTOR [explicit-value] message) tuples.
     * If the selector has an explicit value (e.g., =2), then
     * that value is provided by the ARG_INT or ARG_DOUBLE part preceding the message.
     * Otherwise the message immediately follows the ARG_SELECTOR.
     * @stable ICU 4.8
     */
    UMSGPAT_ARG_TYPE_PLURAL,
    /**
     * The argument is a SelectFormat with one or more (ARG_SELECTOR, message) pairs.
     * @stable ICU 4.8
     */
    UMSGPAT_ARG_TYPE_SELECT,
    /**
     * The argument is an ordinal-number PluralFormat
     * with the same style parts sequence and semantics as UMSGPAT_ARG_TYPE_PLURAL.
     * @stable ICU 50
     */
    UMSGPAT_ARG_TYPE_SELECTORDINAL
};
/**
 * @stable ICU 4.8
 */
typedef enum UMessagePatternArgType UMessagePatternArgType;

/**
 * \def UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE
 * Returns TRUE if the argument type has a plural style part sequence and semantics,
 * for example UMSGPAT_ARG_TYPE_PLURAL and UMSGPAT_ARG_TYPE_SELECTORDINAL.
 * Note: the macro expands its argument twice, so pass an expression
 * without side effects.
 * @stable ICU 50
 */
#define UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE(argType) \
    ((argType)==UMSGPAT_ARG_TYPE_PLURAL || (argType)==UMSGPAT_ARG_TYPE_SELECTORDINAL)

enum {
    /**
     * Return value from MessagePattern.validateArgumentName() for when
     * the string is a valid "pattern identifier" but not a number.
     * @stable ICU 4.8
     */
    UMSGPAT_ARG_NAME_NOT_NUMBER=-1,

    /**
     * Return value from MessagePattern.validateArgumentName() for when
     * the string is invalid.
     * It might not be a valid "pattern identifier",
     * or it may have only ASCII digits but there is a leading zero or the number is too large.
     * @stable ICU 4.8
     */
    UMSGPAT_ARG_NAME_NOT_VALID=-2
};

/**
 * Special value that is returned by getNumericValue(Part) when no
 * numeric value is defined for a part.
 * @see MessagePattern.getNumericValue()
 * @stable ICU 4.8
 */
#define UMSGPAT_NO_NUMERIC_VALUE ((double)(-123456789))

#endif  // !UCONFIG_NO_FORMATTING

#endif  // __MESSAGEPATTERN_H__

// icudataver.h
/*
******************************************************************************
*
*   Copyright (C) 2009-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*/

/**
 * \file
 * \brief C API: access to ICU Data Version number
 */

#ifndef __ICU_DATA_VER_H__
#define __ICU_DATA_VER_H__

/**
 * @stable ICU 49
 */
#define U_ICU_VERSION_BUNDLE "icuver"

/**
 * @stable ICU 49
 */
#define U_ICU_DATA_KEY "DataVersion"

/**
 * Retrieves the data version from icuver and stores it in dataVersionFillin.
 *
 * @param dataVersionFillin icuver data version information to be filled in if not-null
 * @param status stores the error code from the calls to resource bundle
 *
 * @stable ICU 49
 */
U_STABLE void U_EXPORT2 u_getDataVersion(UVersionInfo dataVersionFillin, UErrorCode *status);

#endif /* __ICU_DATA_VER_H__ */

// caniter.h
/*
 *******************************************************************************
 * Copyright (C) 1996-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

#ifndef CANITER_H
#define CANITER_H

#if !UCONFIG_NO_NORMALIZATION

/**
 * \file
 * \brief C++ API: Canonical Iterator
 */

/** Should permutation skip characters with combining class zero
 *  Should be either TRUE or FALSE. This is a compile time option
 *  @stable ICU 2.4
 */
#ifndef CANITER_SKIP_ZEROES
#define CANITER_SKIP_ZEROES TRUE
#endif /* CANITER_SKIP_ZEROES */

#endif /* #if !UCONFIG_NO_NORMALIZATION */

#endif /* CANITER_H */

#endif // (NTDDI_VERSION >= NTDDI_WIN10_RS2)